[SEBA-356] Sadis-server logs to kafka Change-Id: Ia1473621e5f8c12818fff6d4dffe708f3cc146cd

commit: 9a2772a1846566b498d4fb1fefa3959d0de823eb [log] [tgz]
author: Matteo Scandolo <matteo.scandolo@gmail.com> Mon Nov 19 14:56:26 2018 -0800
committer: Matteo Scandolo <matteo.scandolo@gmail.com> Mon Dec 10 13:15:52 2018 -0800
tree: 996f9cb56ed03fb317229f581b6bdc69ca6d375f
parent: 1f49bf54632d764fb201c185fc576511d73626d8 [diff]
diff --git a/Dockerfile b/Dockerfile
index 9f97dbf..a8abc63 100644
--- a/Dockerfile
+++ b/Dockerfile

@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# docker build -t sadis-server:candidate .
+# docker build -t opencord/sadis-server:candidate .
 
-FROM golang:1.7-alpine as builder
+FROM golang:1.10-stretch as builder
 MAINTAINER Open Networking Foundation <info@opennetworking.org>
 
 WORKDIR /go
 ADD . /go/src/gerrit.opencord.org/sadis-server
-RUN go build -o /build/entry-point gerrit.opencord.org/sadis-server
+RUN CGO_ENABLED=0 GOOS=linux go build -o /build/entry-point gerrit.opencord.org/sadis-server
 
 FROM alpine:3.5
 MAINTAINER Open Networking Foundation <info@opennetworking.org>

diff --git a/Gopkg.lock b/Gopkg.lock
index 0adeea4..ff66754 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock

@@ -2,12 +2,68 @@
 
 
 [[projects]]
-  digest = "1:69b1cc331fca23d702bd72f860c6a647afd0aa9fcbc1d0659b1365e26546dd70"
-  name = "github.com/Sirupsen/logrus"
+  digest = "1:c99bd4548f502371b98c77534239a514c9a1e715d468af3c108db06186aa692a"
+  name = "github.com/DataDog/zstd"
   packages = ["."]
   pruneopts = "UT"
-  revision = "bcd833dfe83d3cebad139e4a29ed79cb2318bf95"
-  version = "v1.2.0"
+  revision = "aebefd9fcb99f22cd691ef778a12ed68f0e6a1ab"
+  version = "v1.3.4"
+
+[[projects]]
+  digest = "1:a59a467c541a1bf8b06e4fad6113028c959be6573b78ceca9f8020cd0d2127fc"
+  name = "github.com/Shopify/sarama"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "879f631812a30a580659e8035e7cda9994bb99ac"
+  version = "v1.20.0"
+
+[[projects]]
+  digest = "1:ffe9824d294da03b391f44e1ae8281281b4afc1bdaa9588c9097785e3af10cec"
+  name = "github.com/davecgh/go-spew"
+  packages = ["spew"]
+  pruneopts = "UT"
+  revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73"
+  version = "v1.1.1"
+
+[[projects]]
+  digest = "1:1f0c7ab489b407a7f8f9ad16c25a504d28ab461517a971d341388a56156c1bd7"
+  name = "github.com/eapache/go-resiliency"
+  packages = ["breaker"]
+  pruneopts = "UT"
+  revision = "ea41b0fad31007accc7f806884dcdf3da98b79ce"
+  version = "v1.1.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:79f16588b5576b1b3cd90e48d2374cc9a1a8776862d28d8fd0f23b0e15534967"
+  name = "github.com/eapache/go-xerial-snappy"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "776d5712da21bc4762676d614db1d8a64f4238b0"
+
+[[projects]]
+  digest = "1:444b82bfe35c83bbcaf84e310fb81a1f9ece03edfed586483c869e2c046aef69"
+  name = "github.com/eapache/queue"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "44cc805cf13205b55f69e14bcb69867d1ae92f98"
+  version = "v1.1.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:1cc1edfbf8585a36c546d9b475f7fc568e3a74289d87f0e48942f76f59778b33"
+  name = "github.com/gfremex/logrus-kafka-hook"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "f62e125fcbfec116f56051e523699d5cce1d33f8"
+
+[[projects]]
+  branch = "master"
+  digest = "1:4a0c6bb4805508a6287675fac876be2ac1182539ca8a32468d8128882e9d5009"
+  name = "github.com/golang/snappy"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "2e65f85255dbc3072edf28d6b5b8efc472979f5a"
 
 [[projects]]
   digest = "1:c79fb010be38a59d657c48c6ba1d003a8aa651fa56b579d959d74573b7dff8e1"
@@ -42,6 +98,33 @@
   version = "v1.0.1"
 
 [[projects]]
+  digest = "1:e39a5ee8fcbec487f8fc68863ef95f2b025e0739b0e4aa55558a2b4cf8f0ecf0"
+  name = "github.com/pierrec/lz4"
+  packages = [
+    ".",
+    "internal/xxh32",
+  ]
+  pruneopts = "UT"
+  revision = "635575b42742856941dbc767b44905bb9ba083f6"
+  version = "v2.0.7"
+
+[[projects]]
+  branch = "master"
+  digest = "1:d38f81081a389f1466ec98192cf9115a82158854d6f01e1c23e2e7554b97db71"
+  name = "github.com/rcrowley/go-metrics"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "3113b8401b8a98917cde58f8bbd42a1b1c03b1fd"
+
+[[projects]]
+  digest = "1:69b1cc331fca23d702bd72f860c6a647afd0aa9fcbc1d0659b1365e26546dd70"
+  name = "github.com/sirupsen/logrus"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "bcd833dfe83d3cebad139e4a29ed79cb2318bf95"
+  version = "v1.2.0"
+
+[[projects]]
   branch = "master"
   digest = "1:3f3a05ae0b95893d90b9b3b5afdb79a9b3d96e4e36e099d841ae602e4aca0da8"
   name = "golang.org/x/crypto"
@@ -64,9 +147,10 @@
   analyzer-name = "dep"
   analyzer-version = 1
   input-imports = [
-    "github.com/Sirupsen/logrus",
+    "github.com/gfremex/logrus-kafka-hook",
     "github.com/gorilla/mux",
     "github.com/kelseyhightower/envconfig",
+    "github.com/sirupsen/logrus",
   ]
   solver-name = "gps-cdcl"
   solver-version = 1

diff --git a/handlers.go b/handlers.go
index c85f239..fc28647 100644
--- a/handlers.go
+++ b/handlers.go

@@ -15,6 +15,7 @@
 
 import (
 	"encoding/json"
+	"github.com/sirupsen/logrus"
 	"net"
 	"net/http"
 	"strconv"
@@ -26,7 +27,9 @@
 	vars := mux.Vars(r)
 	sadisRequestID := vars["id"]
 
-	log.Infof("Looking for object %s in XOS database", sadisRequestID)
+	log.WithFields(logrus.Fields{
+		"sadisId": sadisRequestID,
+	}).Infof("Looking for object %s in XOS database", sadisRequestID)
 
 	defer r.Body.Close()
 
@@ -112,7 +115,9 @@
 		}
 	}
 
-	log.Infof("Couldn't find object %s in XOS database", sadisRequestID)
+	log.WithFields(logrus.Fields{
+		"sadisId": sadisRequestID,
+	}).Infof("Couldn't find object %s in XOS database", sadisRequestID)
 
 	http.NotFound(w, r)
 }

diff --git a/sadisServer.go b/sadisServer.go
index a7e1bdc..b2c56ec 100644
--- a/sadisServer.go
+++ b/sadisServer.go

@@ -19,30 +19,34 @@
 	"net/http"
 	"os"
 
-	"github.com/Sirupsen/logrus"
+	"github.com/sirupsen/logrus"
 
 	"github.com/gorilla/mux"
 	"github.com/kelseyhightower/envconfig"
+	lkh "github.com/gfremex/logrus-kafka-hook"
 )
 
 const appName = "SADISSERVER"
 
 type Config struct {
-	Port       int    `default:"8000" desc:"port on which to listen for requests"`
-	Xos		   string `default:"127.0.0.1:8181" desc:"connection string with which to connect to XOS"`
-	Username   string `default:"admin@opencord.org" desc:"username with which to connect to XOS"`
-	Password   string `default:"letmein" desc:"password with which to connect to XOS"`
-	LogLevel   string `default:"info" envconfig:"LOG_LEVEL" desc:"detail level for logging"`
-	LogFormat  string `default:"text" envconfig:"LOG_FORMAT" desc:"log output format, text or json"`
+	Port         int    `default:"8000" desc:"port on which to listen for requests"`
+	Xos		     string `default:"127.0.0.1:8181" desc:"connection string with which to connect to XOS"`
+	Username     string `default:"admin@opencord.org" desc:"username with which to connect to XOS"`
+	Password     string `default:"letmein" desc:"password with which to connect to XOS"`
+	LogLevel     string `default:"info" envconfig:"LOG_LEVEL" desc:"detail level for logging"`
+	LogFormat    string `default:"text" envconfig:"LOG_FORMAT" desc:"log output format, text or json"`
+	KafkaBroker  string `default:"" desc:"url of the kafka broker"`
 
 	connect string
 }
 
-var log = logrus.New()
+var logger = logrus.New()
+var log *logrus.Entry
 var appFlags = flag.NewFlagSet("", flag.ContinueOnError)
+var config Config
 
-func main() {
-	config := Config{}
+func init()  {
+	config = Config{}
 	appFlags.Usage = func() {
 		envconfig.Usage(appName, &config)
 	}
@@ -56,35 +60,63 @@
 
 	err := envconfig.Process(appName, &config)
 	if err != nil {
-		log.Fatalf("[ERROR] Unable to parse configuration options : %s", err)
+		logger.Fatalf("[ERROR] Unable to parse configuration options : %s", err)
 	}
 
-	switch config.LogFormat {
-	case "json":
-		log.Formatter = &logrus.JSONFormatter{}
-	default:
-		log.Formatter = &logrus.TextFormatter{
-			FullTimestamp: true,
-			ForceColors:   true,
+	if len(config.KafkaBroker) > 0 {
+		logger.Debug("Setting up kafka integration")
+		hook, err := lkh.NewKafkaHook(
+			"kh",
+			[]logrus.Level{logrus.DebugLevel, logrus.InfoLevel, logrus.WarnLevel, logrus.ErrorLevel},
+			&logrus.JSONFormatter{
+				FieldMap: logrus.FieldMap{
+					logrus.FieldKeyTime:  "@timestamp",
+					logrus.FieldKeyLevel: "level",
+					logrus.FieldKeyMsg:   "message",
+				},
+			},
+			[]string{config.KafkaBroker},
+		)
+
+		if err != nil {
+			logger.Error(err)
 		}
+
+		logger.Hooks.Add(hook)
 	}
 
 	level, err := logrus.ParseLevel(config.LogLevel)
 	if err != nil {
 		level = logrus.WarnLevel
 	}
-	log.Level = level
+	logger.Level = level
 
-	log.Infof(`Configuration:
-        PORT:       %d
-        XOS:        %s
-        USERNAME:   %s
-        PASSWORD:   %s
-        LOG_LEVEL:  %s
-        LOG_FORMAT: %s`,
-		config.Port, config.Xos,
-		config.Username, config.Password,
-		config.LogLevel, config.LogFormat)
+	switch config.LogFormat {
+	case "json":
+		logger.Formatter = &logrus.JSONFormatter{}
+	default:
+		logger.Formatter = &logrus.TextFormatter{
+			FullTimestamp: true,
+			ForceColors:   true,
+		}
+	}
+
+	log = logger.WithField("topics", []string{"sadis-server.log"})
+}
+
+func main() {
+
+
+
+	log.WithFields(logrus.Fields{
+		"PORT": config.Port,
+		"XOS": config.Xos,
+		"USERNAME": config.Username,
+		"PASSWORD": config.Password,
+		"LOG_LEVEL": config.LogLevel,
+		"LOG_FORMAT": config.LogFormat,
+		"KAFKA_BROKER": config.KafkaBroker,
+	}).Infof(`Sadis-server started`)
 
 	router := mux.NewRouter()
 	router.HandleFunc("/subscriber/{id}", config.getSubscriberHandler)

diff --git a/vendor/github.com/DataDog/zstd/.travis.yml b/vendor/github.com/DataDog/zstd/.travis.yml
new file mode 100644
index 0000000..88c152e
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/.travis.yml

@@ -0,0 +1,18 @@
+language: go
+
+go:
+  - 1.8.x
+  - 1.9.x
+  - 1.10.x
+
+os:
+  - linux
+  - osx
+
+install:
+  - "wget http://sun.aei.polsl.pl/~sdeor/corpus/mr.bz2"
+  - "bzip2 -d mr.bz2"
+script:
+  - "go build"
+  - "PAYLOAD=`pwd`/mr go test -v"
+  - "PAYLOAD=`pwd`/mr go test -bench ."

diff --git a/vendor/github.com/DataDog/zstd/LICENSE b/vendor/github.com/DataDog/zstd/LICENSE
new file mode 100644
index 0000000..345c1eb
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/LICENSE

@@ -0,0 +1,27 @@
+Simplified BSD License
+
+Copyright (c) 2016, Datadog <info@datadoghq.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

diff --git a/vendor/github.com/DataDog/zstd/README.md b/vendor/github.com/DataDog/zstd/README.md
new file mode 100644
index 0000000..6c02e16
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/README.md

@@ -0,0 +1,120 @@
+# Zstd Go Wrapper
+
+[C Zstd Homepage](https://github.com/Cyan4973/zstd)
+
+The current headers and C files are from *v1.3.4* (Commit
+[2555975](https://github.com/facebook/zstd/releases/tag/v1.3.4)).
+
+## Usage
+
+There are two main APIs:
+
+* simple Compress/Decompress
+* streaming API (io.Reader/io.Writer)
+
+The compress/decompress APIs mirror that of lz4, while the streaming API was
+designed to be a drop-in replacement for zlib.
+
+### Simple `Compress/Decompress`
+
+
+```go
+// Compress compresses the byte array given in src and writes it to dst.
+// If you already have a buffer allocated, you can pass it to prevent allocation
+// If not, you can pass nil as dst.
+// If the buffer is too small, it will be reallocated, resized, and returned bu the function
+// If dst is nil, this will allocate the worst case size (CompressBound(src))
+Compress(dst, src []byte) ([]byte, error)
+```
+
+```go
+// CompressLevel is the same as Compress but you can pass another compression level
+CompressLevel(dst, src []byte, level int) ([]byte, error)
+```
+
+```go
+// Decompress will decompress your payload into dst.
+// If you already have a buffer allocated, you can pass it to prevent allocation
+// If not, you can pass nil as dst (allocates a 4*src size as default).
+// If the buffer is too small, it will retry 3 times by doubling the dst size
+// After max retries, it will switch to the slower stream API to be sure to be able
+// to decompress. Currently switches if compression ratio > 4*2**3=32.
+Decompress(dst, src []byte) ([]byte, error)
+```
+
+### Stream API
+
+```go
+// NewWriter creates a new object that can optionally be initialized with
+// a precomputed dictionary. If dict is nil, compress without a dictionary.
+// The dictionary array should not be changed during the use of this object.
+// You MUST CALL Close() to write the last bytes of a zstd stream and free C objects.
+NewWriter(w io.Writer) *Writer
+NewWriterLevel(w io.Writer, level int) *Writer
+NewWriterLevelDict(w io.Writer, level int, dict []byte) *Writer
+
+// Write compresses the input data and write it to the underlying writer
+(w *Writer) Write(p []byte) (int, error)
+
+// Close flushes the buffer and frees C zstd objects
+(w *Writer) Close() error
+```
+
+```go
+// NewReader returns a new io.ReadCloser that will decompress data from the
+// underlying reader.  If a dictionary is provided to NewReaderDict, it must
+// not be modified until Close is called.  It is the caller's responsibility
+// to call Close, which frees up C objects.
+NewReader(r io.Reader) io.ReadCloser
+NewReaderDict(r io.Reader, dict []byte) io.ReadCloser
+```
+
+### Benchmarks (benchmarked with v0.5.0)
+
+The author of Zstd also wrote lz4. Zstd is intended to occupy a speed/ratio
+level similar to what zlib currently provides.  In our tests, the can always
+be made to be better than zlib by chosing an appropriate level while still
+keeping compression and decompression time faster than zlib.
+
+You can run the benchmarks against your own payloads by using the Go benchmarks tool.
+Just export your payload filepath as the `PAYLOAD` environment variable and run the benchmarks:
+
+```go
+go test -bench .
+```
+
+Compression of a 7Mb pdf zstd (this wrapper) vs [czlib](https://github.com/DataDog/czlib):
+```
+BenchmarkCompression               5     221056624 ns/op      67.34 MB/s
+BenchmarkDecompression           100      18370416 ns/op     810.32 MB/s
+
+BenchmarkFzlibCompress             2     610156603 ns/op      24.40 MB/s
+BenchmarkFzlibDecompress          20      81195246 ns/op     183.33 MB/s
+```
+
+Ratio is also better by a margin of ~20%.
+Compression speed is always better than zlib on all the payloads we tested;
+However, [czlib](https://github.com/DataDog/czlib) has optimisations that make it
+faster at decompressiong small payloads:
+
+```
+Testing with size: 11... czlib: 8.97 MB/s, zstd: 3.26 MB/s
+Testing with size: 27... czlib: 23.3 MB/s, zstd: 8.22 MB/s
+Testing with size: 62... czlib: 31.6 MB/s, zstd: 19.49 MB/s
+Testing with size: 141... czlib: 74.54 MB/s, zstd: 42.55 MB/s
+Testing with size: 323... czlib: 155.14 MB/s, zstd: 99.39 MB/s
+Testing with size: 739... czlib: 235.9 MB/s, zstd: 216.45 MB/s
+Testing with size: 1689... czlib: 116.45 MB/s, zstd: 345.64 MB/s
+Testing with size: 3858... czlib: 176.39 MB/s, zstd: 617.56 MB/s
+Testing with size: 8811... czlib: 254.11 MB/s, zstd: 824.34 MB/s
+Testing with size: 20121... czlib: 197.43 MB/s, zstd: 1339.11 MB/s
+Testing with size: 45951... czlib: 201.62 MB/s, zstd: 1951.57 MB/s
+```
+
+zstd starts to shine with payloads > 1KB
+
+### Stability - Current state: STABLE
+
+The C library seems to be pretty stable and according to the author has been tested and fuzzed.
+
+For the Go wrapper, the test cover most usual cases and we have succesfully tested it on all staging and prod data.

diff --git a/vendor/github.com/DataDog/zstd/ZSTD_LICENSE b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE
new file mode 100644
index 0000000..a793a80
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE

@@ -0,0 +1,30 @@
+BSD License
+
+For Zstandard software
+
+Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

diff --git a/vendor/github.com/DataDog/zstd/bitstream.h b/vendor/github.com/DataDog/zstd/bitstream.h
new file mode 100644
index 0000000..f7f389f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/bitstream.h

@@ -0,0 +1,471 @@
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2017, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "mem.h"            /* unaligned access routines */
+#include "error_private.h"  /* error codes and messages */
+
+
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(BIT_DEBUG) && (BIT_DEBUG>=1)
+#  include <assert.h>
+#else
+#  ifndef assert
+#    define assert(condition) ((void)0)
+#  endif
+#endif
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct
+{
+    size_t bitContainer;
+    unsigned bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+    const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+    assert(val != 0);
+    {
+#   if defined(_MSC_VER)   /* Visual */
+        unsigned long r=0;
+        _BitScanReverse ( &r, val );
+        return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+        return 31 - __builtin_clz (val);
+#   else   /* Software version */
+        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
+                                                 11, 14, 16, 18, 22, 25,  3, 30,
+                                                  8, 12, 20, 28, 15, 17, 24,  7,
+                                                 19, 27, 23,  6, 26,  5,  4, 31 };
+        U32 v = val;
+        v |= v >> 1;
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+#   endif
+    }
+}
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {
+    0,          1,         3,         7,         0xF,       0x1F,
+    0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
+    0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
+    0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
+    0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(size_t)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+    if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+/*! BIT_addBits() :
+ *  can add up to 31 bits into `bitC`.
+ *  Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+                            size_t value, unsigned nbBits)
+{
+    MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+    assert(nbBits < BIT_MASK_SIZE);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+                                size_t value, unsigned nbBits)
+{
+    assert((value>>nbBits) == 0);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    assert(bitC->ptr <= bitC->endPtr);
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* endMark */
+    BIT_flushBits(bitC);
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+    return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+*  bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ *  Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                /* fall-through */
+
+        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                /* fall-through */
+
+        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                /* fall-through */
+
+        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+                /* fall-through */
+
+        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+                /* fall-through */
+
+        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                /* fall-through */
+
+        default: break;
+        }
+        {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
+        }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+{
+    return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008  /* experimental */
+#  if defined(__x86_64__)
+    if (sizeof(bitContainer)==8)
+        return _bextr_u64(bitContainer, start, nbBits);
+    else
+#  endif
+        return _bextr_u32(bitContainer, start, nbBits);
+#else
+    assert(nbBits < BIT_MASK_SIZE);
+    return (bitContainer >> start) & BIT_mask[nbBits];
+#endif
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    assert(nbBits >= 1);
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Pay attention to not read more than nbBits contained into local register.
+ * @return : extracted value. */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_readBitsFast() :
+ *  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    assert(nbBits >= 1);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_reloadDStream() :
+ *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+ *  This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->limitPtr) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */

diff --git a/vendor/github.com/DataDog/zstd/compiler.h b/vendor/github.com/DataDog/zstd/compiler.h
new file mode 100644
index 0000000..e90a3bc
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/compiler.h

@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+/* force inlining */
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to elimininate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+#  define HINT_INLINE static INLINE_KEYWORD
+#else
+#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  ifdef __GNUC__
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0  /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if (defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \
+      && (defined(__x86_64__) || defined(_M_X86)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/* prefetch */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#elif defined(__GNUC__)
+#  define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+#else
+#  define PREFETCH(ptr)   /* disabled */
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */

diff --git a/vendor/github.com/DataDog/zstd/cover.c b/vendor/github.com/DataDog/zstd/cover.c
new file mode 100644
index 0000000..b5a3957
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/cover.c

@@ -0,0 +1,1048 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/*-*************************************
+*  Constants
+***************************************/
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+/*-*************************************
+* Hash table
+***************************************
+* A small specialized hash map for storing activeDmers.
+* The map does not resize, so if it becomes full it will loop forever.
+* Thus, the map must be large enough to store every value.
+* The map implements linear probing and keeps its load less than 0.5.
+*/
+
+#define MAP_EMPTY_VALUE ((U32)-1)
+typedef struct COVER_map_pair_t_s {
+  U32 key;
+  U32 value;
+} COVER_map_pair_t;
+
+typedef struct COVER_map_s {
+  COVER_map_pair_t *data;
+  U32 sizeLog;
+  U32 size;
+  U32 sizeMask;
+} COVER_map_t;
+
+/**
+ * Clear the map.
+ */
+static void COVER_map_clear(COVER_map_t *map) {
+  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
+}
+
+/**
+ * Initializes a map of the given size.
+ * Returns 1 on success and 0 on failure.
+ * The map must be destroyed with COVER_map_destroy().
+ * The map is only guaranteed to be large enough to hold size elements.
+ */
+static int COVER_map_init(COVER_map_t *map, U32 size) {
+  map->sizeLog = ZSTD_highbit32(size) + 2;
+  map->size = (U32)1 << map->sizeLog;
+  map->sizeMask = map->size - 1;
+  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
+  if (!map->data) {
+    map->sizeLog = 0;
+    map->size = 0;
+    return 0;
+  }
+  COVER_map_clear(map);
+  return 1;
+}
+
+/**
+ * Internal hash function
+ */
+static const U32 prime4bytes = 2654435761U;
+static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
+  return (key * prime4bytes) >> (32 - map->sizeLog);
+}
+
+/**
+ * Helper function that returns the index that a key should be placed into.
+ */
+static U32 COVER_map_index(COVER_map_t *map, U32 key) {
+  const U32 hash = COVER_map_hash(map, key);
+  U32 i;
+  for (i = hash;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *pos = &map->data[i];
+    if (pos->value == MAP_EMPTY_VALUE) {
+      return i;
+    }
+    if (pos->key == key) {
+      return i;
+    }
+  }
+}
+
+/**
+ * Returns the pointer to the value for key.
+ * If key is not in the map, it is inserted and the value is set to 0.
+ * The map must not be full.
+ */
+static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
+  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
+  if (pos->value == MAP_EMPTY_VALUE) {
+    pos->key = key;
+    pos->value = 0;
+  }
+  return &pos->value;
+}
+
+/**
+ * Deletes key from the map if present.
+ */
+static void COVER_map_remove(COVER_map_t *map, U32 key) {
+  U32 i = COVER_map_index(map, key);
+  COVER_map_pair_t *del = &map->data[i];
+  U32 shift = 1;
+  if (del->value == MAP_EMPTY_VALUE) {
+    return;
+  }
+  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *const pos = &map->data[i];
+    /* If the position is empty we are done */
+    if (pos->value == MAP_EMPTY_VALUE) {
+      del->value = MAP_EMPTY_VALUE;
+      return;
+    }
+    /* If pos can be moved to del do so */
+    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
+      del->key = pos->key;
+      del->value = pos->value;
+      del = pos;
+      shift = 1;
+    } else {
+      ++shift;
+    }
+  }
+}
+
+/**
+ * Destroyes a map that is inited with COVER_map_init().
+ */
+static void COVER_map_destroy(COVER_map_t *map) {
+  if (map->data) {
+    free(map->data);
+  }
+  map->data = NULL;
+  map->size = 0;
+}
+
+/*-*************************************
+* Context
+***************************************/
+
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  U32 *suffix;
+  size_t suffixSize;
+  U32 *freqs;
+  U32 *dmerAt;
+  unsigned d;
+} COVER_ctx_t;
+
+/* We need a global context for qsort... */
+static COVER_ctx_t *g_ctx = NULL;
+
+/*-*************************************
+*  Helper functions
+***************************************/
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t sum = 0;
+  size_t i;
+  for (i = 0; i < nbSamples; ++i) {
+    sum += samplesSizes[i];
+  }
+  return sum;
+}
+
+/**
+ * Returns -1 if the dmer at lp is less than the dmer at rp.
+ * Return 0 if the dmers at lp and rp are equal.
+ * Returns 1 if the dmer at lp is greater than the dmer at rp.
+ */
+static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  U32 const lhs = *(U32 const *)lp;
+  U32 const rhs = *(U32 const *)rp;
+  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
+}
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
+  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
+  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
+  if (lhs < rhs) {
+    return -1;
+  }
+  return (lhs > rhs);
+}
+
+/**
+ * Same as COVER_cmp() except ties are broken by pointer value
+ * NOTE: g_ctx must be set to call this function.  A global is required because
+ * qsort doesn't take an opaque pointer.
+ */
+static int COVER_strict_cmp(const void *lp, const void *rp) {
+  int result = COVER_cmp(g_ctx, lp, rp);
+  if (result == 0) {
+    result = lp < rp ? -1 : 1;
+  }
+  return result;
+}
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_strict_cmp8(const void *lp, const void *rp) {
+  int result = COVER_cmp8(g_ctx, lp, rp);
+  if (result == 0) {
+    result = lp < rp ? -1 : 1;
+  }
+  return result;
+}
+
+/**
+ * Returns the first pointer in [first, last) whose element does not compare
+ * less than value.  If no such element exists it returns last.
+ */
+static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
+                                       size_t value) {
+  size_t count = last - first;
+  while (count != 0) {
+    size_t step = count / 2;
+    const size_t *ptr = first;
+    ptr += step;
+    if (*ptr < value) {
+      first = ++ptr;
+      count -= step + 1;
+    } else {
+      count = step;
+    }
+  }
+  return first;
+}
+
+/**
+ * Generic groupBy function.
+ * Groups an array sorted by cmp into groups with equivalent values.
+ * Calls grp for each group.
+ */
+static void
+COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
+              int (*cmp)(COVER_ctx_t *, const void *, const void *),
+              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
+  const BYTE *ptr = (const BYTE *)data;
+  size_t num = 0;
+  while (num < count) {
+    const BYTE *grpEnd = ptr + size;
+    ++num;
+    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
+      grpEnd += size;
+      ++num;
+    }
+    grp(ctx, ptr, grpEnd);
+    ptr = grpEnd;
+  }
+}
+
+/*-*************************************
+*  Cover functions
+***************************************/
+
+/**
+ * Called on each group of positions with the same dmer.
+ * Counts the frequency of each dmer and saves it in the suffix array.
+ * Fills `ctx->dmerAt`.
+ */
+static void COVER_group(COVER_ctx_t *ctx, const void *group,
+                        const void *groupEnd) {
+  /* The group consists of all the positions with the same first d bytes. */
+  const U32 *grpPtr = (const U32 *)group;
+  const U32 *grpEnd = (const U32 *)groupEnd;
+  /* The dmerId is how we will reference this dmer.
+   * This allows us to map the whole dmer space to a much smaller space, the
+   * size of the suffix array.
+   */
+  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
+  /* Count the number of samples this dmer shows up in */
+  U32 freq = 0;
+  /* Details */
+  const size_t *curOffsetPtr = ctx->offsets;
+  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
+  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
+   * different sample than the last.
+   */
+  size_t curSampleEnd = ctx->offsets[0];
+  for (; grpPtr != grpEnd; ++grpPtr) {
+    /* Save the dmerId for this position so we can get back to it. */
+    ctx->dmerAt[*grpPtr] = dmerId;
+    /* Dictionaries only help for the first reference to the dmer.
+     * After that zstd can reference the match from the previous reference.
+     * So only count each dmer once for each sample it is in.
+     */
+    if (*grpPtr < curSampleEnd) {
+      continue;
+    }
+    freq += 1;
+    /* Binary search to find the end of the sample *grpPtr is in.
+     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
+     * search because the loop is over.
+     */
+    if (grpPtr + 1 != grpEnd) {
+      const size_t *sampleEndPtr =
+          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
+      curSampleEnd = *sampleEndPtr;
+      curOffsetPtr = sampleEndPtr + 1;
+    }
+  }
+  /* At this point we are never going to look at this segment of the suffix
+   * array again.  We take advantage of this fact to save memory.
+   * We store the frequency of the dmer in the first position of the group,
+   * which is dmerId.
+   */
+  ctx->suffix[dmerId] = freq;
+}
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ * Selects the best segment in an epoch.
+ * Segments of are scored according to the function:
+ *
+ * Let F(d) be the frequency of dmer d.
+ * Let S_i be the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer d is in the dictionay we set F(d) = 0.
+ */
+static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
+                                           COVER_map_t *activeDmers, U32 begin,
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+  /* Reset the activeDmers in the segment */
+  COVER_map_clear(activeDmers);
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* The dmerId for the dmer at the next position */
+    U32 newDmer = ctx->dmerAt[activeSegment.end];
+    /* The entry in activeDmers for this dmerId */
+    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
+    /* If the dmer isn't already present in the segment add its score. */
+    if (*newDmerOcc == 0) {
+      /* The paper suggest using the L-0.5 norm, but experiments show that it
+       * doesn't help.
+       */
+      activeSegment.score += freqs[newDmer];
+    }
+    /* Add the dmer to the segment */
+    activeSegment.end += 1;
+    *newDmerOcc += 1;
+
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      U32 delDmer = ctx->dmerAt[activeSegment.begin];
+      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
+      activeSegment.begin += 1;
+      *delDmerOcc -= 1;
+      /* If this is the last occurence of the dmer, subtract its score */
+      if (*delDmerOcc == 0) {
+        COVER_map_remove(activeDmers, delDmer);
+        activeSegment.score -= freqs[delDmer];
+      }
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      U32 freq = freqs[ctx->dmerAt[pos]];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /* Zero out the frequency of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      freqs[ctx->dmerAt[pos]] = 0;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Check the validity of the parameters.
+ * Returns non-zero if the parameters are valid and 0 otherwise.
+ */
+static int COVER_checkParameters(ZDICT_cover_params_t parameters,
+                                 size_t maxDictSize) {
+  /* k and d are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  return 1;
+}
+
+/**
+ * Clean up a context initialized with `COVER_ctx_init()`.
+ */
+static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+  if (ctx->suffix) {
+    free(ctx->suffix);
+    ctx->suffix = NULL;
+  }
+  if (ctx->freqs) {
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+  }
+  if (ctx->dmerAt) {
+    free(ctx->dmerAt);
+    ctx->dmerAt = NULL;
+  }
+  if (ctx->offsets) {
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can used multiple
+ * times.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `COVER_ctx_destroy()`.
+ */
+static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
+               (U32)totalSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  /* Partial suffix array */
+  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* Maps index to the dmerID */
+  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    COVER_ctx_destroy(ctx);
+    return 0;
+  }
+  ctx->freqs = NULL;
+  ctx->d = d;
+
+  /* Fill offsets from the samlesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
+  {
+    /* suffix is a partial suffix array.
+     * It only sorts suffixes by their first parameters.d bytes.
+     * The sort is stable, so each dmer group is sorted by position in input.
+     */
+    U32 i;
+    for (i = 0; i < ctx->suffixSize; ++i) {
+      ctx->suffix[i] = i;
+    }
+    /* qsort doesn't take an opaque pointer, so pass as a global */
+    g_ctx = ctx;
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  /* For each dmer group (group of positions with the same first d bytes):
+   * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
+   *    (groupBeginPtr - suffix).  This allows us to go from position to
+   *    dmerID so we can look up values in freq.
+   * 2. We calculate how many samples the dmer occurs in and save it in
+   *    freqs[dmerId].
+   */
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
+                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
+  ctx->freqs = ctx->suffix;
+  ctx->suffix = NULL;
+  return 1;
+}
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
+                                    COVER_map_t *activeDmers, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_cover_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
+  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = COVER_selectSegment(
+        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
+  COVER_ctx_t ctx;
+  COVER_map_t activeDmers;
+
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
+  /* Checks */
+  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
+    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  /* Initialize context and activeDmers */
+  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                      parameters.d)) {
+    return ERROR(GENERIC);
+  }
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    COVER_ctx_destroy(&ctx);
+    return ERROR(GENERIC);
+  }
+
+  DISPLAYLEVEL(2, "Building dictionary\n");
+  {
+    const size_t tail =
+        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
+                              dictBufferCapacity, parameters);
+    const size_t dictionarySize = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+    if (!ZSTD_isError(dictionarySize)) {
+      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                   (U32)dictionarySize);
+    }
+    COVER_ctx_destroy(&ctx);
+    COVER_map_destroy(&activeDmers);
+    return dictionarySize;
+  }
+}
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+static void COVER_best_init(COVER_best_t *best) {
+  if (best==NULL) return; /* compatible with init on NULL */
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+  best->liveJobs = 0;
+  best->dict = NULL;
+  best->dictSize = 0;
+  best->compressedSize = (size_t)-1;
+  memset(&best->parameters, 0, sizeof(best->parameters));
+}
+
+/**
+ * Wait until liveJobs == 0.
+ */
+static void COVER_best_wait(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  while (best->liveJobs != 0) {
+    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+  }
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+static void COVER_best_destroy(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  COVER_best_wait(best);
+  if (best->dict) {
+    free(best->dict);
+  }
+  ZSTD_pthread_mutex_destroy(&best->mutex);
+  ZSTD_pthread_cond_destroy(&best->cond);
+}
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+static void COVER_best_start(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  ++best->liveJobs;
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                              ZDICT_cover_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  {
+    size_t liveJobs;
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+    /* If the new dictionary is better */
+    if (compressedSize < best->compressedSize) {
+      /* Allocate space if necessary */
+      if (!best->dict || best->dictSize < dictSize) {
+        if (best->dict) {
+          free(best->dict);
+        }
+        best->dict = malloc(dictSize);
+        if (!best->dict) {
+          best->compressedSize = ERROR(GENERIC);
+          best->dictSize = 0;
+          return;
+        }
+      }
+      /* Save the dictionary, parameters, and size */
+      memcpy(best->dict, dict, dictSize);
+      best->dictSize = dictSize;
+      best->parameters = parameters;
+      best->compressedSize = compressedSize;
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+    if (liveJobs == 0) {
+      ZSTD_pthread_cond_broadcast(&best->cond);
+    }
+  }
+}
+
+/**
+ * Parameters for COVER_tryParameters().
+ */
+typedef struct COVER_tryParameters_data_s {
+  const COVER_ctx_t *ctx;
+  COVER_best_t *best;
+  size_t dictBufferCapacity;
+  ZDICT_cover_params_t parameters;
+} COVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and upates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void COVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
+  const COVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  COVER_map_t activeDmers;
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    goto _cleanup;
+  }
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
+                                              dictBufferCapacity, parameters);
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      for (i = 0; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    totalCompressedSize = dictBufferCapacity;
+    for (i = 0; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    if (dst) {
+      free(dst);
+    }
+  }
+
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  COVER_map_destroy(&activeDmers);
+  if (dict) {
+    free(dict);
+  }
+  if (freqs) {
+    free(freqs);
+  }
+}
+
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters) {
+  /* constants */
+  const unsigned nbThreads = parameters->nbThreads;
+  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+  const unsigned kIterations =
+      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+  /* Local variables */
+  const int displayLevel = parameters->zParams.notificationLevel;
+  unsigned iteration = 1;
+  unsigned d;
+  unsigned k;
+  COVER_best_t best;
+  POOL_ctx *pool = NULL;
+
+  /* Checks */
+  if (kMinK < kMaxD || kMaxK < kMinK) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  if (nbThreads > 1) {
+    pool = POOL_create(nbThreads, 1);
+    if (!pool) {
+      return ERROR(memory_allocation);
+    }
+  }
+  /* Initialization */
+  COVER_best_init(&best);
+  /* Turn down global display level to clean up display at level 2 and below */
+  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+  /* Loop through d first because each new value needs a new context */
+  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                    kIterations);
+  for (d = kMinD; d <= kMaxD; d += 2) {
+    /* Initialize the context for this value of d */
+    COVER_ctx_t ctx;
+    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return ERROR(GENERIC);
+    }
+    /* Loop through k reusing the same context */
+    for (k = kMinK; k <= kMaxK; k += kStepSize) {
+      /* Prepare the arguments */
+      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
+          sizeof(COVER_tryParameters_data_t));
+      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+      if (!data) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+        COVER_best_destroy(&best);
+        COVER_ctx_destroy(&ctx);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      data->ctx = &ctx;
+      data->best = &best;
+      data->dictBufferCapacity = dictBufferCapacity;
+      data->parameters = *parameters;
+      data->parameters.k = k;
+      data->parameters.d = d;
+      data->parameters.steps = kSteps;
+      data->parameters.zParams.notificationLevel = g_displayLevel;
+      /* Check the parameters */
+      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
+        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+        free(data);
+        continue;
+      }
+      /* Call the function and pass ownership of data to it */
+      COVER_best_start(&best);
+      if (pool) {
+        POOL_add(pool, &COVER_tryParameters, data);
+      } else {
+        COVER_tryParameters(data);
+      }
+      /* Print status */
+      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                         (U32)((iteration * 100) / kIterations));
+      ++iteration;
+    }
+    COVER_best_wait(&best);
+    COVER_ctx_destroy(&ctx);
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+  /* Fill the output buffer and parameters with output of the best parameters */
+  {
+    const size_t dictSize = best.dictSize;
+    if (ZSTD_isError(best.compressedSize)) {
+      const size_t compressedSize = best.compressedSize;
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return compressedSize;
+    }
+    *parameters = best.parameters;
+    memcpy(dictBuffer, best.dict, dictSize);
+    COVER_best_destroy(&best);
+    POOL_free(pool);
+    return dictSize;
+  }
+}

diff --git a/vendor/github.com/DataDog/zstd/cpu.h b/vendor/github.com/DataDog/zstd/cpu.h
new file mode 100755
index 0000000..4eb48e3
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/cpu.h

@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#ifdef _MSC_VER
+    int reg[4];
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block like the normal cpuid branch below, but gcc
+     * reserves ebx for use of its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1)
+          :);
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\r"
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */

diff --git a/vendor/github.com/DataDog/zstd/divsufsort.c b/vendor/github.com/DataDog/zstd/divsufsort.c
new file mode 100644
index 0000000..60cceb0
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/divsufsort.c

@@ -0,0 +1,1913 @@
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Compiler specifics -*/
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4244)
+#  pragma warning(disable : 4127)    /* C4127 : Condition expression is constant */
+#endif
+
+
+/*- Dependencies -*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "divsufsort.h"
+
+/*- Constants -*/
+#if defined(INLINE)
+# undef INLINE
+#endif
+#if !defined(INLINE)
+# define INLINE __inline
+#endif
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256)
+#endif
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96)
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#define SS_SMERGE_STACKSIZE (32)
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+
+/*- Macros -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+static const int lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+int
+ss_ilg(int n) {
+#if SS_BLOCKSIZE == 0
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];
+#else
+  return (n & 0xff00) ?
+          8 + lg_table[(n >> 8) & 0xff] :
+          0 + lg_table[(n >> 0) & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+static const int sqq_table[256] = {
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+static INLINE
+int
+ss_isqrt(int x) {
+  int y, e;
+
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  e = (x & 0xffff0000) ?
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
+    y = (y + 1 + x / y) >> 1;
+  } else if(e >= 8) {
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    return sqq_table[x] >> 4;
+  }
+
+  return (x < (y * y)) ? y - 1 : y;
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares two suffixes. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *U1, *U2, *U1n, *U2n;
+
+  for(U1 = T + depth + *p1,
+      U2 = T + depth + *p2,
+      U1n = T + *(p1 + 1) + 2,
+      U2n = T + *(p2 + 1) + 2;
+      (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
+      ++U1, ++U2) {
+  }
+
+  return U1 < U1n ?
+        (U2 < U2n ? *U1 - *U2 : 1) :
+        (U2 < U2n ? -1 : 0);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertionsort for small size groups */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  for(i = last - 2; first <= i; --i) {
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
+      if(last <= j) { break; }
+    }
+    if(r == 0) { *j = ~*j; }
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *t;
+  if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
+  if(Td[PA[*v2]] > Td[PA[*v3]]) {
+    if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return ss_median3(Td, PA, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = ss_median3(Td, PA, first, first + t, first + (t << 1));
+  middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+  last   = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }
+    if(b <= a) { break; }
+    t = ~*b;
+    *b = *a;
+    *a = t;
+  }
+  if(first < a) { *first = ~*first; }
+  return a;
+}
+
+/* Multikey introsort for medium size groups. */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);
+      continue;
+    }
+
+    Td = T + depth;
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+    if(limit < 0) {
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      if((a - first) <= (last - a)) {
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {
+      c = b - 1;
+
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+      a = first + (b - a), c = last - (d - c);
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {
+      limit += 1;
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) {
+  int t;
+  for(; 0 < n; --n, ++a, ++b) {
+    t = *a, *a = *b, *b = t;
+  }
+}
+
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) {
+  int *a, *b, t;
+  int l, r;
+  l = middle - first, r = last - middle;
+  for(; (0 < l) && (0 < r);) {
+    if(l == r) { ss_blockswap(first, middle, l); break; }
+    if(l < r) {
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+    else                { x = 0; p = PA +  *(last - 1); }
+    for(a = first, len = middle - first, half = len >> 1, r = -1;
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      if(r == 0) { *a = ~*a; }
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    --last;
+    if(x != 0) { while(*--last < 0) { } }
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;
+  ss_blockswap(buf, first, middle - first);
+
+  for(t = *(a = first), b = buf, c = middle;;) {
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;
+  ss_blockswap(buf, middle, last - middle);
+
+  x = 0;
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* D&C based merge. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    if((last - middle) <= bufsize) {
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    if((middle - first) <= bufsize) {
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      if((l - first) <= (last - r)) {
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;
+  } else {
+    middle = last, limit = 0;
+  }
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    curbufsize = last - (a + SS_BLOCKSIZE);
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+int
+tr_ilg(int n) {
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups. */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r;
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; }
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = ISAd[SA[k = j++]];
+    if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *t;
+  if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
+  if(ISAd[*v2] > ISAd[*v3]) {
+    if(ISAd[*v1] > ISAd[*v3]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
+  if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+  if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+  if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return tr_median3(ISAd, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = tr_median3(ISAd, first, first + t, first + (t << 1));
+  middle = tr_median3(ISAd, middle - t, middle, middle + t);
+  last   = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return tr_median3(ISAd, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+  int chance;
+  int remain;
+  int incval;
+  int count;
+};
+
+static INLINE
+void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance;
+  budget->remain = budget->incval = incval;
+}
+
+static INLINE
+int
+trbudget_check(trbudget_t *budget, int size) {
+  if(size <= budget->remain) { budget->remain -= size; return 1; }
+  if(budget->chance == 0) { budget->count += size; return 0; }
+  budget->remain += budget->incval - size;
+  budget->chance -= 1;
+  return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last,
+             int **pa, int **pb, int v) {
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+  }
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+  for(; b < c;) {
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) {
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; }
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last;
+}
+
+static
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* sort suffixes of middle partition
+     by using sorted order of suffixes of left and right partition. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA;
+    }
+  }
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+static
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) {
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1;
+  lastrank = -1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+
+  lastrank = -1;
+  for(e = d; first <= e; --e) {
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+static
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) {
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
+  int *a, *b, *c;
+  int t;
+  int v, x = 0;
+  int incr = ISAd - ISA;
+  int limit, next;
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) {
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c;
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
+          first = a;
+        }
+        if(first < last) {
+          a = first; do { *a = ~*a; } while(*++a < 0);
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3;
+      continue;
+    }
+
+    if(limit-- == 0) {
+      tr_heapsort(ISAd, first, last - first);
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) {
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else {
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted;
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; }
+      else {
+        if(skip != 0) { *(first + skip) = skip; skip = 0; }
+        last = SA + ISA[t] + 1;
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; }
+          else { skip = first - last; }
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*. */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n, int openMP) {
+  int *PAb, *ISAb, *buf;
+#ifdef LIBBSC_OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef LIBBSC_OPENMP
+  int d0, d1;
+#endif
+  (void)openMP;
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m;
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    PAb = SA + n - m; ISAb = SA + m;
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef LIBBSC_OPENMP
+    if (openMP)
+    {
+        buf = SA + m;
+        c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1)
+        {
+          bufsize = (n - (2 * m)) / omp_get_num_threads();
+          curbuf = buf + omp_get_thread_num() * bufsize;
+          k = 0;
+          for(;;) {
+            #pragma omp critical(sssort_lock)
+            {
+              if(0 < (l = j)) {
+                d0 = c0, d1 = c1;
+                do {
+                  k = BUCKET_BSTAR(d0, d1);
+                  if(--d1 <= d0) {
+                    d1 = ALPHABET_SIZE - 1;
+                    if(--d0 < 0) { break; }
+                  }
+                } while(((l - k) <= 1) && (0 < (l = k)));
+                c0 = d0, c1 = d1, j = k;
+              }
+            }
+            if(l == 0) { break; }
+            sssort(T, PAb, SA + k, SA + l,
+                   curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+          }
+        }
+    }
+    else
+    {
+        buf = SA + m, bufsize = n - (2 * m);
+        for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+          for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+            i = BUCKET_BSTAR(c0, c1);
+            if(1 < (j - i)) {
+              sssort(T, PAb, SA + i, SA + j,
+                     buf, bufsize, 2, n, *(SA + i) == (m - 1));
+            }
+          }
+        }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m);
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) {
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j;
+        if(i <= 0) { break; }
+      }
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of tyoe B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array by using the sorted order of type B* suffixes. */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s;
+          c0 = T[--s];
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s;
+    }
+  }
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes. */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0;
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes. */
+static
+int
+construct_BWT_indexes(const unsigned char *T, int *SA,
+                      int *bucket_A, int *bucket_B,
+                      int n, int m,
+                      unsigned char * num_indexes, int * indexes) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  int mod = n / 8;
+  {
+      mod |= mod >> 1;  mod |= mod >> 2;
+      mod |= mod >> 4;  mod |= mod >> 8;
+      mod |= mod >> 16; mod >>= 1;
+
+      *num_indexes = (unsigned char)((n - 1) / (mod + 1));
+  }
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA;
+
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  if (T[n - 2] < c2) {
+    if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA;
+    *k++ = ~((int)T[n - 2]);
+  }
+  else {
+    *k++ = n - 1;
+  }
+
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+
+      if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA;
+
+      c0 = T[--s];
+      *i = c0;
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      if((0 < s) && (T[s - 1] < c0)) {
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA;
+          *k++ = ~((int)T[s - 1]);
+      } else
+        *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP) {
+  int *bucket_A, *bucket_B;
+  int m;
+  int err = 0;
+
+  /* Check arguments. */
+  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
+  else if(n == 0) { return 0; }
+  else if(n == 1) { SA[0] = 0; return 0; }
+  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
+
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Suffixsort. */
+  if((bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP);
+    construct_SA(T, SA, bucket_A, bucket_B, n, m);
+  } else {
+    err = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+
+  return err;
+}
+
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) {
+  int *B;
+  int *bucket_A, *bucket_B;
+  int m, pidx, i;
+
+  /* Check arguments. */
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); }
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Burrows-Wheeler Transform. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n, openMP);
+
+    if (num_indexes == NULL || indexes == NULL) {
+        pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+    } else {
+        pidx = construct_BWT_indexes(T, B, bucket_A, bucket_B, n, m, num_indexes, indexes);
+    }
+
+    /* Copy to output string. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; }
+    for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; }
+    pidx += 1;
+  } else {
+    pidx = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); }
+
+  return pidx;
+}

diff --git a/vendor/github.com/DataDog/zstd/divsufsort.h b/vendor/github.com/DataDog/zstd/divsufsort.h
new file mode 100644
index 0000000..5440994
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/divsufsort.h

@@ -0,0 +1,67 @@
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string. (can be T)
+ * @param A [0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */

diff --git a/vendor/github.com/DataDog/zstd/entropy_common.c b/vendor/github.com/DataDog/zstd/entropy_common.c
new file mode 100644
index 0000000..b37a082
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/entropy_common.c

@@ -0,0 +1,221 @@
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
+#include "huf.h"
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) & (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0 += 24;
+                if (ip < iend-5) {
+                    ip += 2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount   += 16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0 += 3;
+                bitStream >>= 2;
+                bitCount += 2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            } else {
+                bitStream >>= 2;
+        }   }
+        {   int const max = (2*threshold-1) - remaining;
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= count < 0 ? -count : count;   /* -1 means +1 */
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(corruption_detected);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BIT_highbit32(rest);
+            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}

diff --git a/vendor/github.com/DataDog/zstd/error_private.c b/vendor/github.com/DataDog/zstd/error_private.c
new file mode 100644
index 0000000..d004ee6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/error_private.c

@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+    static const char* const notErrorCode = "Unspecified error code";
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
+        /* following error codes are not stable and may be removed or changed in a future version */
+    case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+    case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+    case PREFIX(maxCode):
+    default: return notErrorCode;
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/error_private.h b/vendor/github.com/DataDog/zstd/error_private.h
new file mode 100644
index 0000000..0d2fa7e
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/error_private.h

@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>        /* size_t */
+#include "zstd_errors.h"  /* enum list */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */

diff --git a/vendor/github.com/DataDog/zstd/errors.go b/vendor/github.com/DataDog/zstd/errors.go
new file mode 100644
index 0000000..38db0d5
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/errors.go

@@ -0,0 +1,35 @@
+package zstd
+
+/*
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+*/
+import "C"
+
+// ErrorCode is an error returned by the zstd library.
+type ErrorCode int
+
+// Error returns the error string given by zstd
+func (e ErrorCode) Error() string {
+	return C.GoString(C.ZSTD_getErrorName(C.size_t(e)))
+}
+
+func cIsError(code int) bool {
+	return int(C.ZSTD_isError(C.size_t(code))) != 0
+}
+
+// getError returns an error for the return code, or nil if it's not an error
+func getError(code int) error {
+	if code < 0 && cIsError(code) {
+		return ErrorCode(code)
+	}
+	return nil
+}
+
+// IsDstSizeTooSmallError returns whether the error correspond to zstd standard sDstSizeTooSmall error
+func IsDstSizeTooSmallError(e error) bool {
+	if e != nil && e.Error() == "Destination buffer is too small" {
+		return true
+	}
+	return false
+}

diff --git a/vendor/github.com/DataDog/zstd/fse.h b/vendor/github.com/DataDog/zstd/fse.h
new file mode 100644
index 0000000..6a1d272
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse.h

@@ -0,0 +1,704 @@
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define FSE_PUBLIC_API
+#endif
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSE_compress() :
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
+    @return : size of compressed data (<= dstCapacity).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSE_isError() .
+
+    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+    Both parameters can be defined as '0' to mean : use default value
+    @return : size of compressed data
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_count():
+    Provides the precise count of each byte within a table 'count'.
+    'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+    *maxSymbolValuePtr will be updated if detected smaller than initial value.
+    @return : the count of the most frequent symbol (which is not identified).
+              if return == srcSize, there is only one symbol.
+              Can also return an error code, which can be tested with FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+    It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+    @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+    normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+    'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+    @return : tableLog,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+    Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+    Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+    Compactly save 'normalizedCounter' into 'buffer'.
+    @return : size of the compressed table,
+              or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*! Constructor and Destructor of FSE_CTable.
+    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+    Builds `ct`, which must be already allocated, using FSE_createCTable().
+    @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+    Compress `src` using `ct` into `dst` which must be already allocated.
+    @return : size of compressed data (<= `dstCapacity`),
+              or 0 if compressed data could not fit into `dst`,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSE_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+    Builds 'dt', which must be already allocated, using FSE_createDTable().
+    return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+#endif  /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned
+ */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace);
+
+/** FSE_countFast() :
+ *  same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr
+ */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of minimum `1024` unsigned
+ */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace);
+
+/*! FSE_count_simple() :
+ * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+typedef enum {
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their body are included in next section.
+*/
+typedef struct {
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = FSE_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif

diff --git a/vendor/github.com/DataDog/zstd/fse_compress.c b/vendor/github.com/DataDog/zstd/fse_compress.c
new file mode 100644
index 0000000..cb8f1fa
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse_compress.c

@@ -0,0 +1,849 @@
+/* ******************************************************************
+   FSE : Finite State Entropy encoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+    U32 highThreshold = tableSize-1;
+
+    /* CTable header */
+    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* For explanations on how to distribute symbol values over the table :
+    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u<=maxSymbolValue+1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+        }   }
+        cumul[maxSymbolValue+1] = tableSize+1;
+    }
+
+    /* Spread symbols */
+    {   U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            int nbOccurences;
+            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0: break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                symbolTT[s].deltaFindState = total - 1;
+                total ++;
+                break;
+            default :
+                {
+                    U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+                    U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                    total +=  normalizedCounter[s];
+    }   }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];   /* memset() is not necessary, even if static analyzer complain about it */
+    return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
+}
+
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                                       unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    bitStream = 0;
+    bitCount  = 0;
+    /* Table Size */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = tableLog+1;
+
+    while (remaining>1) {  /* stops at 1 */
+        if (previous0) {
+            unsigned start = charnum;
+            while (!normalizedCounter[charnum]) charnum++;
+            while (charnum >= start+24) {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (charnum >= start+3) {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (charnum-start) << bitCount;
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[charnum++];
+            int const max = (2*threshold-1)-remaining;
+            remaining -= count < 0 ? -count : count;
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previous0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+}
+
+
+
+/*-**************************************************************
+*  Counting histogram
+****************************************************************/
+/*! FSE_count_simple
+    This function counts byte values within `src`, and store the histogram into table `count`.
+    It doesn't use any additional memory.
+    But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
+    For this reason, prefer using a table `count` with 256 elements.
+    @return : count of most numerous element.
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                        const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+
+    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    { U32 s; for (s=0; s<=maxSymbolValue; s++) if (count[s] > max) max = count[s]; }
+
+    return (size_t)max;
+}
+
+
+/* FSE_count_parallel_wksp() :
+ * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
+ * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`.
+ * @return : largest histogram frequency, or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
+static size_t FSE_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax, unsigned* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, maxSymbolValue + 1);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          unsigned* workSpace)
+{
+    if (sourceSize < 1500) /* heuristic threshold */
+        return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters);
+}
+
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (*maxSymbolValuePtr < 255)
+        return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+    *maxSymbolValuePtr = 255;
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters);
+}
+
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+    FSE_CTable is a variable size structure which contains :
+    `U16 tableLog;`
+    `U16 maxSymbolValue;`
+    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
+    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
+Allocation is manual (C standard does not support variable-size structures).
+*/
+size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t size;
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+
+/* Secondary normalization method.
+   To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = -1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = -1;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            printf("%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+    }   }
+
+    return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*)ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` size must be `(1<<tableLog)`.
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32   count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+    void* scratchBuffer = (void*)(CTable + CTableSize);
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned*)scratchBuffer) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t scratchBuffer;
+    FSE_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */

diff --git a/vendor/github.com/DataDog/zstd/fse_decompress.c b/vendor/github.com/DataDog/zstd/fse_decompress.c
new file mode 100644
index 0000000..4c66c3b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse_decompress.c

@@ -0,0 +1,309 @@
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; }
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+    /* normal FSE decoding mode */
+    size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(NCountLength)) return NCountLength;
+    //if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+    if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+    ip += NCountLength;
+    cSrcSize -= NCountLength;
+
+    CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+    return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace);   /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */

diff --git a/vendor/github.com/DataDog/zstd/huf.h b/vendor/github.com/DataDog/zstd/huf.h
new file mode 100644
index 0000000..b4645b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf.h

@@ -0,0 +1,327 @@
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h>    /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ *        HUF symbols remain "private" (internal symbols for library only).
+ *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+#  define HUF_PUBLIC_API
+#endif
+
+
+/* ========================== */
+/* ***  simple functions  *** */
+/* ========================== */
+
+/** HUF_compress() :
+ *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/** HUF_decompress() :
+ *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ *  into already allocated buffer 'dst', of minimum size 'dstSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ *  Note : in contrast with FSE, HUF_decompress can regenerate
+ *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+ *         because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ *           or an error code, which can be tested using HUF_isError()
+ */
+HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
+
+/* Error Management */
+HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
+
+
+/* ***   Advanced function   *** */
+
+/** HUF_compress2() :
+ *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+ *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
+#define HUF_WORKSPACE_SIZE (6 << 10)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     unsigned maxSymbolValue, unsigned tableLog,
+                                     void* workSpace, size_t wkspSize);
+
+#endif   /* HUF_H_298734234 */
+
+/* ******************************************************************
+ *  WARNING !!
+ *  The following section contains advanced and experimental definitions
+ *  which shall never be used in the context of a dynamic library,
+ *  because they are not guaranteed to remain stable in the future.
+ *  Only consider them in association with static linking.
+ * *****************************************************************/
+#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+#define HUF_H_HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+#include "mem.h"   /* U32 */
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX  255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15  /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue)   ((maxSymbolValue)+1)   /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
+    void* name##hv = &(name##hb); \
+    HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+ *  HUF detailed API
+ * ****************************************/
+
+/*! HUF_compress() does the following:
+ *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ *  2. (optional) refine tableLog using HUF_optimalTableLog()
+ *  3. build Huffman table from count using HUF_buildCTable()
+ *  4. save Huffman table to memory buffer using HUF_writeCTable()
+ *  5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ *  The following API allows targeting specific sub-functions for advanced tasks.
+ *  For example, it's possible to compress several blocks using the same 'CTable',
+ *  or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+
+typedef enum {
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+ *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ *  Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+ *  Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+ *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of sing or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif

diff --git a/vendor/github.com/DataDog/zstd/huf_compress.c b/vendor/github.com/DataDog/zstd/huf_compress.c
new file mode 100644
index 0000000..83230b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf_compress.c

@@ -0,0 +1,788 @@
+/* ******************************************************************
+   Huffman encoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+
+/* **************************************************************
+*  Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32 maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+    U32 count[HUF_TABLELOG_MAX+1];
+    S16 norm[HUF_TABLELOG_MAX+1];
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize) );
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+
+struct HUF_CElt_s {
+  U16  val;
+  BYTE nbBits;
+};   /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation.
+    @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
+{
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+
+     /* check conditions */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)
+        huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+    /* attempt weights compression by FSE */
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)
+        op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            CTable[n].nbBits = (BYTE)(tableLog + 1 - w);
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        /* determine stating value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+    }
+
+    *maxSymbolValuePtr = nbSymbols - 1;
+    return readSize;
+}
+
+
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    if (largestBits <= maxNbBits) return largestBits;   /* early exit : no elt > maxNbBits */
+
+    /* there are several too large elements (at least >= 2) */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        U32 n = lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }  /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;   /* n end at index of smallest symbol using < maxNbBits */
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost is necessarily a multiple of baseCost */
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+            int pos;
+
+            /* Get pos of last (smallest) symbol per rank */
+            memset(rankLast, 0xF0, sizeof(rankLast));
+            {   U32 currentNbBits = maxNbBits;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
+                    rankLast[maxNbBits-currentNbBits] = pos;
+            }   }
+
+            while (totalCost > 0) {
+                U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 highPos = rankLast[nBitsToDecrease];
+                    U32 lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+                /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+                while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+                    nBitsToDecrease ++;
+                totalCost -= 1 << (nBitsToDecrease-1);
+                if (rankLast[nBitsToDecrease-1] == noSymbol)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];   /* this rank is no longer empty */
+                huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+                if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
+                    rankLast[nBitsToDecrease] = noSymbol;
+                else {
+                    rankLast[nBitsToDecrease]--;
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                        rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
+            }   }   /* while (totalCost > 0) */
+
+            while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
+                if (rankLast[1] == noSymbol) {  /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    huffNode[n+1].nbBits--;
+                    rankLast[1] = n+1;
+                    totalCost++;
+                    continue;
+                }
+                huffNode[ rankLast[1] + 1 ].nbBits--;
+                rankLast[1]++;
+                totalCost ++;
+    }   }   }   /* there are several too large elements (at least >= 2) */
+
+    return maxNbBits;
+}
+
+
+typedef struct {
+    U32 base;
+    U32 current;
+} rankPos;
+
+static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
+{
+    rankPos rank[32];
+    U32 n;
+
+    memset(rank, 0, sizeof(rank));
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 r = BIT_highbit32(count[n] + 1);
+        rank[r].base ++;
+    }
+    for (n=30; n>0; n--) rank[n-1].base += rank[n].base;
+    for (n=0; n<32; n++) rank[n].current = rank[n].base;
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 const c = count[n];
+        U32 const r = BIT_highbit32(c+1) + 1;
+        U32 pos = rank[r].current++;
+        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) {
+            huffNode[pos] = huffNode[pos-1];
+            pos--;
+        }
+        huffNode[pos].count = c;
+        huffNode[pos].byte  = (BYTE)n;
+    }
+}
+
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned.
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+    nodeElt* const huffNode0 = (nodeElt*)workSpace;
+    nodeElt* const huffNode = huffNode0+1;
+    U32 n, nonNullRank;
+    int lowS, lowN;
+    U16 nodeNb = STARTNODE;
+    U32 nodeRoot;
+
+    /* safety checks */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue);
+
+    /* init for parents */
+    nonNullRank = maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+    huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    /* enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+    /* fill result into tree (val, nbBits) */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+        U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+        if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        /* determine stating value per rank */
+        {   U16 min = 0;
+            for (n=maxNbBits; n>0; n--) {
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   /* assign value within rank, symbol order */
+    }
+
+    return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ * @return : maxNbBits
+ *  Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
+{
+    huffNodeTable nodeTable;
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable));
+}
+
+static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += CTable[s].nbBits * count[s];
+    }
+    return nbBits >> 3;
+}
+
+static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+  int bad = 0;
+  int s;
+  for (s = 0; s <= (int)maxSymbolValue; ++s) {
+    bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+  }
+  return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+#define HUF_FLUSHBITS(s)  BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+    size_t n;
+    BIT_CStream_t bitC;
+
+    /* init */
+    if (dstSize < 8) return 0;   /* not enough space to compress */
+    { size_t const initErr = BIT_initCStream(&bitC, op, oend-op);
+      if (HUF_isError(initErr)) return 0; }
+
+    n = srcSize & ~3;  /* join to mod 4 */
+    switch (srcSize & 3)
+    {
+        case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+                 HUF_FLUSHBITS_2(&bitC);
+		 /* fall-through */
+        case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+                 HUF_FLUSHBITS_1(&bitC);
+		 /* fall-through */
+        case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+                 HUF_FLUSHBITS(&bitC);
+		 /* fall-through */
+        case 0 : /* fall-through */
+        default: break;
+    }
+
+    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
+        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+        HUF_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+        HUF_FLUSHBITS(&bitC);
+    }
+
+    return BIT_closeCStream(&bitC);
+}
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+                                      const void* src, size_t srcSize,
+                                      const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    if (bmi2) {
+        return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+    }
+    return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    (void)bmi2;
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, int bmi2)
+{
+    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    if (dstSize < 6 + 1 + 1 + 1 + 8) return 0;   /* minimum space to compress successfully */
+    if (srcSize < 12) return 0;   /* no saving possible : too small input */
+    op += 6;   /* jumpTable */
+
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+2, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+4, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) );
+        if (cSize==0) return 0;
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t HUF_compressCTable_internal(
+                BYTE* const ostart, BYTE* op, BYTE* const oend,
+                const void* src, size_t srcSize,
+                unsigned singleStream, const HUF_CElt* CTable, const int bmi2)
+{
+    size_t const cSize = singleStream ?
+                         HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) :
+                         HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2);
+    if (HUF_isError(cSize)) { return cSize; }
+    if (cSize==0) { return 0; }   /* uncompressible */
+    op += cSize;
+    /* check compressibility */
+    if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
+    return op-ostart;
+}
+
+typedef struct {
+    U32 count[HUF_SYMBOLVALUE_MAX + 1];
+    HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+    huffNodeTable nodeTable;
+} HUF_compress_tables_t;
+
+/* HUF_compress_internal() :
+ * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+static size_t HUF_compress_internal (
+                void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog,
+                unsigned singleStream,
+                void* workSpace, size_t wkspSize,
+                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+                const int bmi2)
+{
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    /* checks & inits */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
+    if (!srcSize) return 0;  /* Uncompressed */
+    if (!dstSize) return 0;  /* cannot fit anything within dst budget */
+    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
+    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+    /* Heuristic : If old table is valid, use it for small inputs */
+    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
+        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
+        if (largest <= (srcSize >> 7)+1) return 0;   /* heuristic : probably not compressible enough */
+    }
+
+    /* Check validity of previous table */
+    if ( repeat
+      && *repeat == HUF_repeat_check
+      && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+        *repeat = HUF_repeat_none;
+    }
+    /* Heuristic : use existing table for small inputs */
+    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Build Huffman Tree */
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp(table->CTable, table->count,
+                                                maxSymbolValue, huffLog,
+                                                table->nodeTable, sizeof(table->nodeTable)) );
+        huffLog = (U32)maxBits;
+        /* Zero unused symbols in CTable, so we can check it for validity */
+        memset(table->CTable + (maxSymbolValue + 1), 0,
+               sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+    }
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+        /* Check if using previous huffman table is beneficial */
+        if (repeat && *repeat != HUF_repeat_none) {
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+                return HUF_compressCTable_internal(ostart, op, oend,
+                                                   src, srcSize,
+                                                   singleStream, oldHufTable, bmi2);
+        }   }
+
+        /* Use the new huffman table */
+        if (hSize + 12ul >= srcSize) { return 0; }
+        op += hSize;
+        if (repeat) { *repeat = HUF_repeat_none; }
+        if (oldHufTable)
+            memcpy(oldHufTable, table->CTable, sizeof(table->CTable));  /* Save new table */
+    }
+    return HUF_compressCTable_internal(ostart, op, oend,
+                                       src, srcSize,
+                                       singleStream, table->CTable, bmi2);
+}
+
+
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize, hufTable,
+                                 repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                 unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /*4 streams*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * re-use an existing huffman compression table */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /* 4 streams */,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
+}

diff --git a/vendor/github.com/DataDog/zstd/huf_decompress.c b/vendor/github.com/DataDog/zstd/huf_decompress.c
new file mode 100644
index 0000000..73f5c46
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf_decompress.c

@@ -0,0 +1,1096 @@
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"  /* BIT_* */
+#include "compiler.h"
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
+
+
+/* **************************************************************
+*  Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+    DTableDesc dtd;
+    memcpy(&dtd, table, sizeof(dtd));
+    return dtd;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+{
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+
+    U32* rankVal;
+    BYTE* huffWeight;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+    huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+    HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* Table header */
+    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Calculate starting value for each rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 const current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill DTable */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;
+            U32 u;
+            HUF_DEltX2 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (u = rankVal[w]; u < rankVal[w] + length; u++)
+                dt[u] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_readDTableX2_wksp(DTable, src, srcSize,
+                                 workSpace, sizeof(workSpace));
+}
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BIT_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)  \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* [0-3] symbols remaining */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const void* dtPtr = DTable + 1;
+    const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+    BIT_DStream_t bitD;
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal = BIT_DStream_unfinished;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+            BIT_reloadDStream(&bitD1);
+            BIT_reloadDStream(&bitD2);
+            BIT_reloadDStream(&bitD3);
+            BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        /* note : should not be necessary : op# advance in lock step, and we control op4.
+         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+    }   }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BIT_DStream_t bitD;
+
+    /* Init */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+                                               const void *cSrc,
+                                               size_t cSrcSize,
+                                               const HUF_DTable *DTable);
+#if DYNAMIC_BMI2
+
+#define X(fn)                                                               \
+                                                                            \
+    static size_t fn##_default(                                             \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2(                       \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        if (bmi2) {                                                         \
+            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
+        }                                                                   \
+        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
+    }
+
+#else
+
+#define X(fn)                                                               \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        (void)bmi2;                                                         \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }
+
+#endif
+
+X(HUF_decompress1X2_usingDTable_internal)
+X(HUF_decompress4X2_usingDTable_internal)
+X(HUF_decompress1X4_usingDTable_internal)
+X(HUF_decompress4X4_usingDTable_internal)
+
+#undef X
+
+
+size_t HUF_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
+                                                workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+}
+
+
+size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/* HUF_fillDTableX4Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    {   U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+            const U32 symbol = sortedSymbols[s].symbol;
+            const U32 weight = sortedSymbols[s].weight;
+            const U32 nbBits = nbBitsBaseline - weight;
+            const U32 length = 1 << (sizeLog-nbBits);
+            const U32 start = rankVal[weight];
+            U32 i = start;
+            const U32 end = start + length;
+
+            MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+            DElt.nbBits = (BYTE)(nbBits + consumed);
+            DElt.length = 2;
+            do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+            rankVal[weight] += length;
+    }   }
+}
+
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUF_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 const end = start + length;
+                U32 u;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
+                             size_t srcSize, void* workSpace,
+                             size_t wkspSize)
+{
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr;
+    U32 *rankStart;
+
+    rankValCol_t* rankVal;
+    U32* rankStats;
+    U32* rankStart0;
+    sortedSymbol_t* sortedSymbol;
+    BYTE* weightList;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+    rankStats = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 1;
+    rankStart0 = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 2;
+    sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+    spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+    weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+    rankStart = rankStart0 + 1;
+    memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    /* memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUF_fillDTableX4(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+  U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+  return HUF_readDTableX4_wksp(DTable, src, srcSize,
+                               workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize,
+                                               workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize,
+                                         workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    assert(dstSize > 0);
+    assert(dstSize <= 128 KB);
+    /* decoder timing evaluation */
+    {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
+        U32 const D256 = (U32)(dstSize >> 8);
+        U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+        U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+        DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, to reduce cache eviction */
+        return DTime1 < DTime0;
+}   }
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[2] = { HUF_decompress4X2, HUF_decompress4X4 };
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+}
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                         workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+                                     size_t dstSize, const void* cSrc,
+                                     size_t cSrcSize, void* workSpace,
+                                     size_t wkspSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize):
+                        HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+    }
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                  const void* cSrc, size_t cSrcSize,
+                                  void* workSpace, size_t wkspSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize):
+                        HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize);
+    }
+}
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                             const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                      workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+                        HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/mem.h b/vendor/github.com/DataDog/zstd/mem.h
new file mode 100644
index 0000000..47d2300
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/mem.h

@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+/* code only tested on 32 and 64 bits systems */
+#define MEM_STATIC_ASSERT(c)   { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef   uint8_t BYTE;
+  typedef  uint16_t U16;
+  typedef   int16_t S16;
+  typedef  uint32_t U32;
+  typedef   int32_t S32;
+  typedef  uint64_t U64;
+  typedef   int64_t S64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) || defined(__GNUC__)
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+    __pragma( pack(push, 1) )
+    typedef struct { U16 v; } unalign16;
+    typedef struct { U32 v; } unalign32;
+    typedef struct { U64 v; } unalign64;
+    typedef struct { size_t v; } unalignArch;
+    __pragma( pack(pop) )
+#else
+    typedef struct { U16 v; } __attribute__((packed)) unalign16;
+    typedef struct { U32 v; } __attribute__((packed)) unalign32;
+    typedef struct { U64 v; } __attribute__((packed)) unalign64;
+    typedef struct { size_t v; } __attribute__((packed)) unalignArch;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+    return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */

diff --git a/vendor/github.com/DataDog/zstd/pool.c b/vendor/github.com/DataDog/zstd/pool.c
new file mode 100644
index 0000000..773488b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/pool.c

@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include <stddef.h>  /* size_t */
+#include "pool.h"
+#include "zstd_internal.h"  /* ZSTD_malloc, ZSTD_free */
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "threading.h"   /* pthread adaptation */
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+    POOL_function function;
+    void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+    ZSTD_customMem customMem;
+    /* Keep track of the threads */
+    ZSTD_pthread_t *threads;
+    size_t numThreads;
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;
+    size_t queueTail;
+    size_t queueSize;
+
+    /* The number of threads working on jobs */
+    size_t numThreadsBusy;
+    /* Indicates if the queue is empty */
+    int queueEmpty;
+
+    /* The mutex protects the queue */
+    ZSTD_pthread_mutex_t queueMutex;
+    /* Condition variable for pushers to wait on when the queue is full */
+    ZSTD_pthread_cond_t queuePushCond;
+    /* Condition variables for poppers to wait on when the queue is empty */
+    ZSTD_pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+   Work thread for the thread pool.
+   Waits for jobs and executes them.
+   @returns : NULL on failure else non-null.
+*/
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+        while (ctx->queueEmpty && !ctx->shutdown) {
+            ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* empty => shutting down: so stop */
+        if (ctx->queueEmpty) {
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+            return opaque;
+        }
+        /* Pop a job off the queue */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            ctx->numThreadsBusy++;
+            ctx->queueEmpty = ctx->queueHead == ctx->queueTail;
+            /* Unlock the mutex, signal a pusher, and run the job */
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+
+            job.function(job.opaque);
+
+            /* If the intended queue size was 0, signal after finishing job */
+            if (ctx->queueSize == 1) {
+                ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+                ctx->numThreadsBusy--;
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+        }   }
+    }  /* for (;;) */
+    /* Unreachable */
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+    POOL_ctx* ctx;
+    /* Check the parameters */
+    if (!numThreads) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
+    if (!ctx) { return NULL; }
+    /* Initialize the job queue.
+     * It needs one extra space since one space is wasted to differentiate empty
+     * and full queues.
+     */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    ctx->numThreadsBusy = 0;
+    ctx->queueEmpty = 1;
+    (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+    (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+    (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->numThreads = 0;
+    ctx->customMem = customMem;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->numThreads = i;
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->numThreads = numThreads;
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+    /* Shut down the queue */
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads */
+    ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->numThreads; ++i) {
+            ZSTD_pthread_join(ctx->threads[i], NULL);
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+    if (!ctx) { return; }
+    POOL_join(ctx);
+    ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+    ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+    ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+    ZSTD_free(ctx->queue, ctx->customMem);
+    ZSTD_free(ctx->threads, ctx->customMem);
+    ZSTD_free(ctx, ctx->customMem);
+}
+
+size_t POOL_sizeof(POOL_ctx *ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    return sizeof(*ctx)
+        + ctx->queueSize * sizeof(POOL_job)
+        + ctx->numThreads * sizeof(ZSTD_pthread_t);
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * If the queueSize is 1 (the pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free and no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+    if (ctx->queueSize > 1) {
+        return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
+    } else {
+        return ctx->numThreadsBusy == ctx->numThreads ||
+               !ctx->queueEmpty;
+    }
+}
+
+
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+    POOL_job const job = {function, opaque};
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;
+
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+    ZSTD_pthread_cond_signal(&ctx->queuePopCond);
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (isQueueFull(ctx)) {
+        ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        return 0;
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return 1;
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+    int dummy;
+};
+static POOL_ctx g_ctx;
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+    (void)numThreads;
+    (void)queueSize;
+    (void)customMem;
+    return &g_ctx;
+}
+
+void POOL_free(POOL_ctx* ctx) {
+    assert(!ctx || ctx == &g_ctx);
+    (void)ctx;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+    return 1;
+}
+
+size_t POOL_sizeof(POOL_ctx* ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    assert(ctx == &g_ctx);
+    return sizeof(*ctx);
+}
+
+#endif  /* ZSTD_MULTITHREAD */

diff --git a/vendor/github.com/DataDog/zstd/pool.h b/vendor/github.com/DataDog/zstd/pool.h
new file mode 100644
index 0000000..a57e9b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/pool.h

@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+#include "zstd.h"
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ *  Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ *  The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem);
+
+/*! POOL_free() :
+    Free a thread pool returned by POOL_create().
+*/
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_sizeof() :
+    return memory usage of pool returned by POOL_create().
+*/
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+    The function type that can be added to a thread pool.
+*/
+typedef void (*POOL_function)(void*);
+/*! POOL_add_function :
+    The function type for a generic thread pool add function.
+*/
+typedef void (*POOL_add_function)(void*, POOL_function, void*);
+
+/*! POOL_add() :
+    Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+    Possibly blocks until there is room in the queue.
+    Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed.
+*/
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+    Add the job `function(opaque)` to the thread pool if a worker is available.
+    return immediately otherwise.
+   @return : 1 if successful, 0 if not.
+*/
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif

diff --git a/vendor/github.com/DataDog/zstd/threading.c b/vendor/github.com/DataDog/zstd/threading.c
new file mode 100644
index 0000000..8be8c8d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/threading.c

@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+/**
+ * This file will hold wrapper for systems, which do not support pthreads
+ */
+
+/* create fake symbol to avoid empty trnaslation unit warning */
+int g_ZSTD_threading_useles_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+#include "threading.h"
+
+
+/* ===  Implementation  === */
+
+static unsigned __stdcall worker(void *arg)
+{
+    ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg;
+    thread->arg = thread->start_routine(thread->arg);
+    return 0;
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{
+    (void)unused;
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL);
+
+    if (!thread->handle)
+        return errno;
+    else
+        return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr)
+{
+    DWORD result;
+
+    if (!thread.handle) return 0;
+
+    result = WaitForSingleObject(thread.handle, INFINITE);
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (value_ptr) *value_ptr = thread.arg;
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD */

diff --git a/vendor/github.com/DataDog/zstd/threading.h b/vendor/github.com/DataDog/zstd/threading.h
new file mode 100644
index 0000000..d806c89
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/threading.h

@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+    HANDLE handle;
+    void* (*start_routine)(void*);
+    void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b)         pthread_join((a),(b))
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))
+
+/* do not use ZSTD_pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */

diff --git a/vendor/github.com/DataDog/zstd/update.txt b/vendor/github.com/DataDog/zstd/update.txt
new file mode 100644
index 0000000..1de939f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/update.txt

@@ -0,0 +1,56 @@
+./lib/common/bitstream.h
+./lib/common/compiler.h
+./lib/compress/zstd_compress_internal.h
+./lib/compress/zstd_fast.h
+./lib/compress/zstd_double_fast.h
+./lib/compress/zstd_lazy.h
+./lib/compress/zstd_ldm.h
+./lib/dictBuilder/cover.c
+./lib/dictBuilder/divsufsort.c
+./lib/dictBuilder/divsufsort.h
+./lib/common/entropy_common.c
+./lib/common/error_private.c
+./lib/common/error_private.h
+./lib/compress/fse_compress.c
+./lib/common/fse_decompress.c
+./lib/common/fse.h
+./lib/compress/huf_compress.c
+./lib/decompress/huf_decompress.c
+./lib/common/huf.h
+./lib/common/mem.h
+./lib/common/pool.c
+./lib/common/pool.h
+./lib/common/threading.c
+./lib/common/threading.h
+./lib/common/xxhash.c
+./lib/common/xxhash.h
+./lib/deprecated/zbuff_common.c
+./lib/deprecated/zbuff_compress.c
+./lib/deprecated/zbuff_decompress.c
+./lib/deprecated/zbuff.h
+./lib/dictBuilder/zdict.c
+./lib/dictBuilder/zdict.h
+./lib/common/zstd_common.c
+./lib/compress/zstd_compress.c
+./lib/decompress/zstd_decompress.c
+./lib/common/zstd_errors.h
+./lib/zstd.h
+./lib/common/zstd_internal.h
+./lib/legacy/zstd_legacy.h
+./lib/compress/zstd_opt.c
+./lib/compress/zstd_opt.h
+./lib/legacy/zstd_v01.c
+./lib/legacy/zstd_v01.h
+./lib/legacy/zstd_v02.c
+./lib/legacy/zstd_v02.h
+./lib/legacy/zstd_v03.c
+./lib/legacy/zstd_v03.h
+./lib/legacy/zstd_v04.c
+./lib/legacy/zstd_v04.h
+./lib/legacy/zstd_v05.c
+./lib/legacy/zstd_v05.h
+./lib/legacy/zstd_v06.c
+./lib/legacy/zstd_v06.h
+./lib/legacy/zstd_v07.c
+./lib/legacy/zstd_v07.h
+

diff --git a/vendor/github.com/DataDog/zstd/xxhash.c b/vendor/github.com/DataDog/zstd/xxhash.c
new file mode 100644
index 0000000..9d9c0e9
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/xxhash.c

@@ -0,0 +1,875 @@
+/*
+*  xxHash - Fast Hash algorithm
+*  Copyright (C) 2012-2016, Yann Collet
+*
+*  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions are
+*  met:
+*
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following disclaimer
+*  in the documentation and/or other materials provided with the
+*  distribution.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*  You can contact the author at :
+*  - xxHash homepage: http://www.xxhash.com
+*  - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
+#  define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY
+#endif
+#include "xxhash.h"
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+    typedef  int32_t S32;
+    typedef uint64_t U64;
+#  else
+    typedef unsigned char      BYTE;
+    typedef unsigned short     U16;
+    typedef unsigned int       U32;
+    typedef   signed int       S32;
+    typedef unsigned long long U64;   /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+#  endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int g_one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/* *************************************
+*  Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 =  668265263U;
+static const U32 PRIME32_5 =  374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 =  1609587929392839161ULL;
+static const U64 PRIME64_4 =  9650029242287828579ULL;
+static const U64 PRIME64_5 =  2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+*  Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+*  Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+    seed += input * PRIME32_2;
+    seed  = XXH_rotl32(seed, 13);
+    seed *= PRIME32_1;
+    return seed;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_CREATESTATE_STATIC(state);
+    XXH32_reset(state, seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+    acc += input * PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= PRIME64_1;
+    return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * PRIME64_1 + PRIME64_4;
+    return acc;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_CREATESTATE_STATIC(state);
+    XXH64_reset(state, seed);
+    XXH64_update(state, input, len);
+    return XXH64_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+*  Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-4);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME32_1 + PRIME32_2;
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME32_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-8);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME64_1 + PRIME64_2;
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME64_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len_32 += (unsigned)len;
+    state->large_len |= (len>=16) | (state->total_len_32>=16);
+
+    if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (unsigned)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* some data left from previous update */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {   const U32* p32 = state->mem32;
+            state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+            state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+            state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do {
+            v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+            v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+            v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+            v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32) {  /* fill in tmp buffer */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* tmp buffer is full */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do {
+            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32) {
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/* **************************
+*  Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}

diff --git a/vendor/github.com/DataDog/zstd/xxhash.h b/vendor/github.com/DataDog/zstd/xxhash.h
new file mode 100644
index 0000000..9bad1f5
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/xxhash.h

@@ -0,0 +1,305 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MumurHash 3a    2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+*  API modifier
+******************************/
+/** XXH_PRIVATE_API
+*   This is useful if you want to include xxhash functions in `static` mode
+*   in order to inline them, and remove their symbol from the public list.
+*   Methodology :
+*     #define XXH_PRIVATE_API
+*     #include "xxhash.h"
+*   `xxhash.c` is automatically included.
+*   It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+#    define XXH_PUBLIC_API static   /* this version may generate warnings for unused static functions; disable the relevant warning */
+#  endif
+#else
+#  define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    6
+#define XXH_VERSION_RELEASE  2
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Simple Hash Functions
+******************************/
+typedef unsigned int       XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+*/
+
+
+/* ****************************
+*  Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than single-call functions, due to state management.
+For small input, prefer `XXH32()` and `XXH64()` .
+
+XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, input must be allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced anytime, by using XXH*_digest().
+This function returns the nn-bits hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and generate some new hashes later on, by calling again XXH*_digest().
+
+When done, free XXH state space if it was allocated dynamically.
+*/
+
+
+/* **************************
+*  Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* ! C99 */
+#  define restrict   /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+*  Canonical representation
+****************************/
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+*  The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+*  These functions allow transformation of hash result into and from its canonical format.
+*  This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+   This section contains definitions which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   They shall only be used with static linking.
+   Never use these definitions in association with dynamic linking !
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+   statically, on stack, or in a struct for example.
+   Do not use members directly. */
+
+   struct XXH32_state_s {
+       unsigned total_len_32;
+       unsigned large_len;
+       unsigned v1;
+       unsigned v2;
+       unsigned v3;
+       unsigned v4;
+       unsigned mem32[4];   /* buffer defined as U32 for alignment */
+       unsigned memsize;
+       unsigned reserved;   /* never read nor write, will be removed in a future version */
+   };   /* typedef'd to XXH32_state_t */
+
+   struct XXH64_state_s {
+       unsigned long long total_len;
+       unsigned long long v1;
+       unsigned long long v2;
+       unsigned long long v3;
+       unsigned long long v4;
+       unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+       unsigned memsize;
+       unsigned reserved[2];          /* never read nor write, will be removed in a future version */
+   };   /* typedef'd to XXH64_state_t */
+
+
+#  ifdef XXH_PRIVATE_API
+#    include "xxhash.c"   /* include xxhash functions as `static`, for inlining */
+#  endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif

diff --git a/vendor/github.com/DataDog/zstd/zbuff.h b/vendor/github.com/DataDog/zstd/zbuff.h
new file mode 100644
index 0000000..a93115d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff.h

@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* ***************************************************************
+*  NOTES/WARNINGS
+******************************************************************/
+/* The streaming API defined here is deprecated.
+ * Consider migrating towards ZSTD_compressStream() API in `zstd.h`
+ * See 'lib/README.md'.
+ *****************************************************************/
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_BUFFERED_H_23987
+#define ZSTD_BUFFERED_H_23987
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>      /* size_t */
+#include "zstd.h"        /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
+
+
+/* ***************************************************************
+*  Compiler specifics
+*****************************************************************/
+/* Deprecation warnings */
+/* Should these warnings be a problem,
+   it is generally possible to disable them,
+   typically with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
+#  define ZBUFF_DEPRECATED(message) ZSTDLIB_API  /* disable deprecation warnings */
+#else
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
+#  elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ >= 3)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler")
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API
+#  endif
+#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+/* This is the easier "buffered" streaming API,
+*  using an internal buffer to lift all restrictions on user-provided buffers
+*  which can be any size, any place, for both input and output.
+*  ZBUFF and ZSTD are 100% interoperable,
+*  frames created by one can be decoded by the other one */
+
+typedef ZSTD_CStream ZBUFF_CCtx;
+ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeCStream")   size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initCStream")           size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
+ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+
+ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr);
+ZBUFF_DEPRECATED("use ZSTD_flushStream")    size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+ZBUFF_DEPRECATED("use ZSTD_endStream")      size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+
+/*-*************************************************
+*  Streaming compression - howto
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Start by initializing ZBUF_CCtx.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume input stream.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush().
+*  The nb of bytes written into `dst` will be reported into *dstCapacityPtr.
+*  Note that the function cannot output more than *dstCapacityPtr,
+*  therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  In which case, call again ZBUFF_compressFlush() to complete the flush.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize()
+*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency)
+*  output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
+*  By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
+* **************************************************/
+
+
+typedef ZSTD_DStream ZBUFF_DCtx;
+ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeDStream")   size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initDStream")           size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFF_DCtx object is required to track streaming operations.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            1 when there is still some data left within internal buffer to flush,
+*            >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency),
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize()
+*  output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFF_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZBUFF_DEPRECATED("use ZSTD_isError")      unsigned ZBUFF_isError(size_t errorCode);
+ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZBUFF_DEPRECATED("use ZSTD_CStreamInSize")  size_t ZBUFF_recommendedCInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamInSize")  size_t ZBUFF_recommendedDInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void);
+
+#endif  /* ZSTD_BUFFERED_H_23987 */
+
+
+#ifdef ZBUFF_STATIC_LINKING_ONLY
+#ifndef ZBUFF_STATIC_H_30298098432
+#define ZBUFF_STATIC_H_30298098432
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used in association with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Dependency ---*/
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters, ZSTD_customMem */
+#include "zstd.h"
+
+
+/*--- Custom memory allocator ---*/
+/*! ZBUFF_createCCtx_advanced() :
+ *  Create a ZBUFF compression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZBUFF_createDCtx_advanced() :
+ *  Create a ZBUFF decompression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem);
+
+
+/*--- Advanced Streaming Initialization ---*/
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                               const void* dict, size_t dictSize,
+                                               ZSTD_parameters params, unsigned long long pledgedSrcSize);
+
+
+#endif    /* ZBUFF_STATIC_H_30298098432 */
+#endif    /* ZBUFF_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif

diff --git a/vendor/github.com/DataDog/zstd/zbuff_common.c b/vendor/github.com/DataDog/zstd/zbuff_common.c
new file mode 100644
index 0000000..661b9b0
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_common.c

@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "error_private.h"
+#include "zbuff.h"
+
+/*-****************************************
+*  ZBUFF Error Management  (deprecated)
+******************************************/
+
+/*! ZBUFF_isError() :
+*   tells if a return value is an error code */
+unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); }
+/*! ZBUFF_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }

diff --git a/vendor/github.com/DataDog/zstd/zbuff_compress.c b/vendor/github.com/DataDog/zstd/zbuff_compress.c
new file mode 100644
index 0000000..f39c60d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_compress.c

@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+/*-***********************************************************
+*  Streaming compression
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
+*  Note that it will not output more than *dstCapacityPtr.
+*  Therefore, some content might still be left into its internal buffer if dst buffer is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value.
+*  output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed.
+* ***********************************************************/
+
+ZBUFF_CCtx* ZBUFF_createCCtx(void)
+{
+    return ZSTD_createCStream();
+}
+
+ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createCStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc)
+{
+    return ZSTD_freeCStream(zbc);
+}
+
+
+/* ======   Initialization   ====== */
+
+size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                   const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* preserve "0 == unknown" behavior */
+    return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, pledgedSrcSize);
+}
+
+
+size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel);
+}
+
+size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel)
+{
+    return ZSTD_initCStream(zbc, compressionLevel);
+}
+
+/* ======   Compression   ====== */
+
+
+size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
+                              void* dst, size_t* dstCapacityPtr,
+                        const void* src, size_t* srcSizePtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src = src;
+    inBuff.pos = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_compressStream(zbc, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+
+/* ======   Finalize   ====== */
+
+size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_flushStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_endStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedCInSize(void)  { return ZSTD_CStreamInSize(); }
+size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); }

diff --git a/vendor/github.com/DataDog/zstd/zbuff_decompress.c b/vendor/github.com/DataDog/zstd/zbuff_decompress.c
new file mode 100644
index 0000000..923c22b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_decompress.c

@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+ZBUFF_DCtx* ZBUFF_createDCtx(void)
+{
+    return ZSTD_createDStream();
+}
+
+ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_freeDStream(zbd);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    return ZSTD_initDStream_usingDict(zbd, dict, dictSize);
+}
+
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_initDStream(zbd);
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    size_t result;
+    outBuff.dst  = dst;
+    outBuff.pos  = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src  = src;
+    inBuff.pos  = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_decompressStream(zbd, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedDInSize(void)  { return ZSTD_DStreamInSize(); }
+size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); }

diff --git a/vendor/github.com/DataDog/zstd/zdict.c b/vendor/github.com/DataDog/zstd/zdict.c
new file mode 100644
index 0000000..7d24e49
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zdict.c

@@ -0,0 +1,1108 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*-**************************************
+*  Tuning parameters
+****************************************/
+#define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
+#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
+
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  define _LARGEFILE_SOURCE
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  define _LARGEFILE64_SOURCE
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>        /* malloc, free */
+#include <string.h>        /* memset */
+#include <stdio.h>         /* fprintf, fopen, ftello64 */
+#include <time.h>          /* clock */
+
+#include "mem.h"           /* read */
+#include "fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"           /* HUF_buildCTable, HUF_writeCTable */
+#include "zstd_internal.h" /* includes zstd.h */
+#include "xxhash.h"        /* XXH64 */
+#include "divsufsort.h"
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE_DEFAULT 10000
+
+#define NOISELENGTH 32
+
+static const int g_compressionLevel_default = 3;
+static const U32 g_selectivity_default = 9;
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
+#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+
+static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
+
+static void ZDICT_printHex(const void* ptr, size_t length)
+{
+    const BYTE* const b = (const BYTE*)ptr;
+    size_t u;
+    for (u=0; u<length; u++) {
+        BYTE c = b[u];
+        if (c<32 || c>126) c = '.';   /* non-printable char */
+        DISPLAY("%c", c);
+    }
+}
+
+
+/*-********************************************************
+*  Helper functions
+**********************************************************/
+unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
+
+const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
+{
+    if (dictSize < 8) return 0;
+    if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
+    return MEM_readLE32((const char*)dictBuffer + 4);
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static unsigned ZDICT_NbCommonBytes (size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+/*! ZDICT_count() :
+    Count the nb of common bytes between 2 pointers.
+    Note : this function presumes end of buffer followed by noisy guard band.
+*/
+static size_t ZDICT_count(const void* pIn, const void* pMatch)
+{
+    const char* const pStart = (const char*)pIn;
+    for (;;) {
+        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+        if (!diff) {
+            pIn = (const char*)pIn+sizeof(size_t);
+            pMatch = (const char*)pMatch+sizeof(size_t);
+            continue;
+        }
+        pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
+        return (size_t)((const char*)pIn - pStart);
+    }
+}
+
+
+typedef struct {
+    U32 pos;
+    U32 length;
+    U32 savings;
+} dictItem;
+
+static void ZDICT_initDictItem(dictItem* d)
+{
+    d->pos = 1;
+    d->length = 0;
+    d->savings = (U32)(-1);
+}
+
+
+#define LLIMIT 64          /* heuristic determined experimentally */
+#define MINMATCHLENGTH 7   /* heuristic determined experimentally */
+static dictItem ZDICT_analyzePos(
+                       BYTE* doneMarks,
+                       const int* suffix, U32 start,
+                       const void* buffer, U32 minRatio, U32 notificationLevel)
+{
+    U32 lengthList[LLIMIT] = {0};
+    U32 cumulLength[LLIMIT] = {0};
+    U32 savings[LLIMIT] = {0};
+    const BYTE* b = (const BYTE*)buffer;
+    size_t maxLength = LLIMIT;
+    size_t pos = suffix[start];
+    U32 end = start;
+    dictItem solution;
+
+    /* init */
+    memset(&solution, 0, sizeof(solution));
+    doneMarks[pos] = 1;
+
+    /* trivial repetition cases */
+    if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
+       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
+       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
+        /* skip and mark segment */
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
+            doneMarks[pos+u] = 1;
+        return solution;
+    }
+
+    /* look forward */
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);
+    }
+
+    /* look backward */
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }
+
+    /* exit if not found a minimum nb of repetitions */
+    if (end-start < minRatio) {
+        U32 idx;
+        for(idx=start; idx<end; idx++)
+            doneMarks[suffix[idx]] = 1;
+        return solution;
+    }
+
+    {   int i;
+        U32 searchLength;
+        U32 refinedStart = start;
+        U32 refinedEnd = end;
+
+        DISPLAYLEVEL(4, "\n");
+        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
+        DISPLAYLEVEL(4, "\n");
+
+        for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
+            BYTE currentChar = 0;
+            U32 currentCount = 0;
+            U32 currentID = refinedStart;
+            U32 id;
+            U32 selectedCount = 0;
+            U32 selectedID = currentID;
+            for (id =refinedStart; id < refinedEnd; id++) {
+                if (b[suffix[id] + searchLength] != currentChar) {
+                    if (currentCount > selectedCount) {
+                        selectedCount = currentCount;
+                        selectedID = currentID;
+                    }
+                    currentID = id;
+                    currentChar = b[ suffix[id] + searchLength];
+                    currentCount = 0;
+                }
+                currentCount ++;
+            }
+            if (currentCount > selectedCount) {  /* for last */
+                selectedCount = currentCount;
+                selectedID = currentID;
+            }
+
+            if (selectedCount < minRatio)
+                break;
+            refinedStart = selectedID;
+            refinedEnd = refinedStart + selectedCount;
+        }
+
+        /* evaluate gain based on new ref */
+        start = refinedStart;
+        pos = suffix[refinedStart];
+        end = start;
+        memset(lengthList, 0, sizeof(lengthList));
+
+        /* look forward */
+        {   size_t length;
+            do {
+                end++;
+                length = ZDICT_count(b + pos, b + suffix[end]);
+                if (length >= LLIMIT) length = LLIMIT-1;
+                lengthList[length]++;
+            } while (length >=MINMATCHLENGTH);
+        }
+
+        /* look backward */
+        {   size_t length = MINMATCHLENGTH;
+            while ((length >= MINMATCHLENGTH) & (start > 0)) {
+                length = ZDICT_count(b + pos, b + suffix[start - 1]);
+                if (length >= LLIMIT) length = LLIMIT - 1;
+                lengthList[length]++;
+                if (length >= MINMATCHLENGTH) start--;
+            }
+        }
+
+        /* largest useful length */
+        memset(cumulLength, 0, sizeof(cumulLength));
+        cumulLength[maxLength-1] = lengthList[maxLength-1];
+        for (i=(int)(maxLength-2); i>=0; i--)
+            cumulLength[i] = cumulLength[i+1] + lengthList[i];
+
+        for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
+        maxLength = i;
+
+        /* reduce maxLength in case of final into repetitive data */
+        {   U32 l = (U32)maxLength;
+            BYTE const c = b[pos + maxLength-1];
+            while (b[pos+l-2]==c) l--;
+            maxLength = l;
+        }
+        if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
+
+        /* calculate savings */
+        savings[5] = 0;
+        for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
+            savings[i] = savings[i-1] + (lengthList[i] * (i-3));
+
+        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+                     (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
+
+        solution.pos = (U32)pos;
+        solution.length = (U32)maxLength;
+        solution.savings = savings[maxLength];
+
+        /* mark positions done */
+        {   U32 id;
+            for (id=start; id<end; id++) {
+                U32 p, pEnd, length;
+                U32 const testedPos = suffix[id];
+                if (testedPos == pos)
+                    length = solution.length;
+                else {
+                    length = (U32)ZDICT_count(b+pos, b+testedPos);
+                    if (length > solution.length) length = solution.length;
+                }
+                pEnd = (U32)(testedPos + length);
+                for (p=testedPos; p<pEnd; p++)
+                    doneMarks[p] = 1;
+    }   }   }
+
+    return solution;
+}
+
+
+static int isIncluded(const void* in, const void* container, size_t length)
+{
+    const char* const ip = (const char*) in;
+    const char* const into = (const char*) container;
+    size_t u;
+
+    for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
+        if (ip[u] != into[u]) break;
+    }
+
+    return u==length;
+}
+
+/*! ZDICT_tryMerge() :
+    check if dictItem can be merged, do it if possible
+    @return : id of destination elt, 0 if not merged
+*/
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
+{
+    const U32 tableSize = table->pos;
+    const U32 eltEnd = elt.pos + elt.length;
+    const char* const buf = (const char*) buffer;
+
+    /* tail overlap */
+    U32 u; for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+        if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
+            /* append */
+            U32 const addedLength = table[u].pos - elt.pos;
+            table[u].length += addedLength;
+            table[u].pos = elt.pos;
+            table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            elt = table[u];
+            /* sort : improve rank */
+            while ((u>1) && (table[u-1].savings < elt.savings))
+            table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+    }   }
+
+    /* front overlap */
+    for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+
+        if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
+            /* append */
+            int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            if (addedLength > 0) {   /* otherwise, elt fully included into existing */
+                table[u].length += addedLength;
+                table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            }
+            /* sort : improve rank */
+            elt = table[u];
+            while ((u>1) && (table[u-1].savings < elt.savings))
+                table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+        }
+
+        if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
+            if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
+                size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+                table[u].pos = elt.pos;
+                table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+                table[u].length = MIN(elt.length, table[u].length + 1);
+                return u;
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static void ZDICT_removeDictItem(dictItem* table, U32 id)
+{
+    /* convention : table[0].pos stores nb of elts */
+    U32 const max = table[0].pos;
+    U32 u;
+    if (!id) return;   /* protection, should never happen */
+    for (u=id; u<max-1; u++)
+        table[u] = table[u+1];
+    table->pos--;
+}
+
+
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
+{
+    /* merge if possible */
+    U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
+    if (mergeId) {
+        U32 newMerge = 1;
+        while (newMerge) {
+            newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
+            if (newMerge) ZDICT_removeDictItem(table, mergeId);
+            mergeId = newMerge;
+        }
+        return;
+    }
+
+    /* insert */
+    {   U32 current;
+        U32 nextElt = table->pos;
+        if (nextElt >= maxSize) nextElt = maxSize-1;
+        current = nextElt-1;
+        while (table[current].savings < elt.savings) {
+            table[current+1] = table[current];
+            current--;
+        }
+        table[current+1] = elt;
+        table->pos = nextElt+1;
+    }
+}
+
+
+static U32 ZDICT_dictSize(const dictItem* dictList)
+{
+    U32 u, dictSize = 0;
+    for (u=1; u<dictList[0].pos; u++)
+        dictSize += dictList[u].length;
+    return dictSize;
+}
+
+
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
+                            const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
+                            const size_t* fileSizes, unsigned nbFiles,
+                            U32 minRatio, U32 notificationLevel)
+{
+    int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
+    int* const suffix = suffix0+1;
+    U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
+    BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
+    U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
+    size_t result = 0;
+    clock_t displayClock = 0;
+    clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
+
+#   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
+            if (ZDICT_clockSpan(displayClock) > refreshRate)  \
+            { displayClock = clock(); DISPLAY(__VA_ARGS__); \
+            if (notificationLevel>=4) fflush(stderr); } }
+
+    /* init */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
+        result = ERROR(memory_allocation);
+        goto _cleanup;
+    }
+    if (minRatio < MINRATIO) minRatio = MINRATIO;
+    memset(doneMarks, 0, bufferSize+16);
+
+    /* limit sample set size (divsufsort limitation)*/
+    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
+    while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
+
+    /* sort */
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
+    {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
+        if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
+    }
+    suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
+    suffix0[0] = (int)bufferSize;           /* leads into noise */
+    /* build reverse suffix sort */
+    {   size_t pos;
+        for (pos=0; pos < bufferSize; pos++)
+            reverseSuffix[suffix[pos]] = (U32)pos;
+        /* note filePos tracks borders between samples.
+           It's not used at this stage, but planned to become useful in a later update */
+        filePos[0] = 0;
+        for (pos=1; pos<nbFiles; pos++)
+            filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
+    }
+
+    DISPLAYLEVEL(2, "finding patterns ... \n");
+    DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
+
+    {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
+            dictItem solution;
+            if (doneMarks[cursor]) { cursor++; continue; }
+            solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
+            if (solution.length==0) { cursor++; continue; }
+            ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
+            cursor += solution.length;
+            DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
+    }   }
+
+_cleanup:
+    free(suffix0);
+    free(reverseSuffix);
+    free(doneMarks);
+    free(filePos);
+    return result;
+}
+
+
+static void ZDICT_fillNoise(void* buffer, size_t length)
+{
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned acc = prime1;
+    size_t p=0;;
+    for (p=0; p<length; p++) {
+        acc *= prime2;
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
+    }
+}
+
+
+typedef struct
+{
+    ZSTD_CCtx* ref;    /* contains reference to dictionary */
+    ZSTD_CCtx* zc;     /* working context */
+    void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
+} EStats_ress_t;
+
+#define MAXREPOFFSET 1024
+
+static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
+                              U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
+{
+    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
+    size_t cSize;
+
+    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
+    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    }
+    cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
+    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
+
+    if (cSize) {  /* if == 0; block is not compressible */
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
+
+        /* literals stats */
+        {   const BYTE* bytePtr;
+            for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
+                countLit[*bytePtr]++;
+        }
+
+        /* seqStats */
+        {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+            ZSTD_seqToCodes(seqStorePtr);
+
+            {   const BYTE* codePtr = seqStorePtr->ofCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->mlCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->llCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
+            }
+
+            if (nbSeq >= 2) { /* rep offsets */
+                const seqDef* const seq = seqStorePtr->sequencesStart;
+                U32 offset1 = seq[0].offset - 3;
+                U32 offset2 = seq[1].offset - 3;
+                if (offset1 >= MAXREPOFFSET) offset1 = 0;
+                if (offset2 >= MAXREPOFFSET) offset2 = 0;
+                repOffsets[offset1] += 3;
+                repOffsets[offset2] += 1;
+    }   }   }
+}
+
+static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
+{
+    size_t total=0;
+    unsigned u;
+    for (u=0; u<nbFiles; u++) total += fileSizes[u];
+    return total;
+}
+
+typedef struct { U32 offset; U32 count; } offsetCount_t;
+
+static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
+{
+    U32 u;
+    table[ZSTD_REP_NUM].offset = val;
+    table[ZSTD_REP_NUM].count = count;
+    for (u=ZSTD_REP_NUM; u>0; u--) {
+        offsetCount_t tmp;
+        if (table[u-1].count >= table[u].count) break;
+        tmp = table[u-1];
+        table[u-1] = table[u];
+        table[u] = tmp;
+    }
+}
+
+/* ZDICT_flatLit() :
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;
+    countLit[0]   = 4;
+    countLit[253] = 1;
+    countLit[254] = 1;
+}
+
+#define OFFCODE_MAX 30  /* only applicable to first block */
+static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
+                                   unsigned compressionLevel,
+                             const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
+                             const void* dictBuffer, size_t  dictBufferSize,
+                                   unsigned notificationLevel)
+{
+    U32 countLit[256];
+    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
+    U32 offcodeCount[OFFCODE_MAX+1];
+    short offcodeNCount[OFFCODE_MAX+1];
+    U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
+    U32 matchLengthCount[MaxML+1];
+    short matchLengthNCount[MaxML+1];
+    U32 litLengthCount[MaxLL+1];
+    short litLengthNCount[MaxLL+1];
+    U32 repOffset[MAXREPOFFSET];
+    offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
+    EStats_ress_t esr;
+    ZSTD_parameters params;
+    U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
+    size_t pos = 0, errorCode;
+    size_t eSize = 0;
+    size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
+    size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
+    BYTE* dstPtr = (BYTE*)dstBuffer;
+
+    /* init */
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
+    esr.ref = ZSTD_createCCtx();
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.ref || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
+    if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
+    for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
+    for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+    for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+    for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
+    memset(repOffset, 0, sizeof(repOffset));
+    repOffset[1] = repOffset[4] = repOffset[8] = 1;
+    memset(bestRepOffset, 0, sizeof(bestRepOffset));
+    if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
+    params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
+    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
+        if (ZSTD_isError(beginResult)) {
+            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
+            eSize = ERROR(GENERIC);
+            goto _cleanup;
+    }   }
+
+    /* collect stats on all samples */
+    for (u=0; u<nbFiles; u++) {
+        ZDICT_countEStats(esr, params,
+                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
+                         (const char*)srcBuffer + pos, fileSizes[u],
+                          notificationLevel);
+        pos += fileSizes[u];
+    }
+
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
+    }
+
+    /* looking for most common first offsets */
+    {   U32 offset;
+        for (offset=1; offset<MAXREPOFFSET; offset++)
+            ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
+    }
+    /* note : the result of this phase should be used to better appreciate the impact on statistics */
+
+    total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
+    errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
+        goto _cleanup;
+    }
+    Offlog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
+    errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
+        goto _cleanup;
+    }
+    mlLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
+    errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
+        goto _cleanup;
+    }
+    llLog = (U32)errorCode;
+
+    /* write result to buffer */
+    {   size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
+        if (HUF_isError(hhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "HUF_writeCTable error \n");
+            goto _cleanup;
+        }
+        dstPtr += hhSize;
+        maxDstSize -= hhSize;
+        eSize += hhSize;
+    }
+
+    {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
+        if (FSE_isError(ohSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += ohSize;
+        maxDstSize -= ohSize;
+        eSize += ohSize;
+    }
+
+    {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
+        if (FSE_isError(mhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += mhSize;
+        maxDstSize -= mhSize;
+        eSize += mhSize;
+    }
+
+    {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
+        if (FSE_isError(lhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += lhSize;
+        maxDstSize -= lhSize;
+        eSize += lhSize;
+    }
+
+    if (maxDstSize<12) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
+        goto _cleanup;
+    }
+# if 0
+    MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
+    MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
+    MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
+#else
+    /* at this stage, we don't use the result of "most common first offset",
+       as the impact of statistics is not properly evaluated */
+    MEM_writeLE32(dstPtr+0, repStartValue[0]);
+    MEM_writeLE32(dstPtr+4, repStartValue[1]);
+    MEM_writeLE32(dstPtr+8, repStartValue[2]);
+#endif
+    eSize += 12;
+
+_cleanup:
+    ZSTD_freeCCtx(esr.ref);
+    ZSTD_freeCCtx(esr.zc);
+    free(esr.workPlace);
+
+    return eSize;
+}
+
+
+
+size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                          const void* customDictContent, size_t dictContentSize,
+                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                          ZDICT_params_t params)
+{
+    size_t hSize;
+#define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
+    BYTE header[HBUFFSIZE];
+    int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+
+    /* check conditions */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
+    if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
+    if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
+
+    /* dictionary header */
+    MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+        U32 const dictID = params.dictID ? params.dictID : compliantID;
+        MEM_writeLE32(header+4, dictID);
+    }
+    hSize = 8;
+
+    /* entropy tables */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  customDictContent, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* copy elements in final buffer ; note : src and dst buffer can overlap */
+    if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
+    {   size_t const dictSize = hSize + dictContentSize;
+        char* dictEnd = (char*)dictBuffer + dictSize;
+        memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
+        memcpy(dictBuffer, header, hSize);
+        return dictSize;
+    }
+}
+
+
+size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                                 ZDICT_params_t params)
+{
+    int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+    size_t hSize = 8;
+
+    /* calculate entropy tables */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* add dictionary header (after entropy tables) */
+    MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+        U32 const dictID = params.dictID ? params.dictID : compliantID;
+        MEM_writeLE32((char*)dictBuffer+4, dictID);
+    }
+
+    if (hSize + dictContentSize < dictBufferCapacity)
+        memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
+    return MIN(dictBufferCapacity, hSize+dictContentSize);
+}
+
+
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
+*   Warning : `samplesBuffer` must be followed by noisy guard band.
+*   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
+*/
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params)
+{
+    U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
+    dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
+    unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
+    unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
+    size_t const targetDictSize = maxDictSize;
+    size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    size_t dictSize = 0;
+    U32 const notificationLevel = params.zParams.notificationLevel;
+
+    /* checks */
+    if (!dictList) return ERROR(memory_allocation);
+    if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
+
+    /* init */
+    ZDICT_initDictItem(dictList);
+
+    /* build dictionary */
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
+
+    /* display best matches */
+    if (params.zParams.notificationLevel>= 3) {
+        U32 const nb = MIN(25, dictList[0].pos);
+        U32 const dictContentSize = ZDICT_dictSize(dictList);
+        U32 u;
+        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
+        DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
+        for (u=1; u<nb; u++) {
+            U32 const pos = dictList[u].pos;
+            U32 const length = dictList[u].length;
+            U32 const printedLength = MIN(40, length);
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+                return ERROR(GENERIC);   /* should never happen */
+            DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+                         u, length, pos, dictList[u].savings);
+            ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
+            DISPLAYLEVEL(3, "| \n");
+    }   }
+
+
+    /* create dictionary */
+    {   U32 dictContentSize = ZDICT_dictSize(dictList);
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
+        if (dictContentSize < targetDictSize/4) {
+            DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+            if (samplesBuffSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
+            if (minRep > MINRATIO) {
+                DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+                DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+            }
+        }
+
+        if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
+            U32 proposedSelectivity = selectivity-1;
+            while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
+            DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
+            DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
+            DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
+        }
+
+        /* limit dictionary size */
+        {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
+            U32 currentSize = 0;
+            U32 n; for (n=1; n<max; n++) {
+                currentSize += dictList[n].length;
+                if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
+            }
+            dictList->pos = n;
+            dictContentSize = currentSize;
+        }
+
+        /* build dict content */
+        {   U32 u;
+            BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
+            for (u=1; u<dictList->pos; u++) {
+                U32 l = dictList[u].length;
+                ptr -= l;
+                if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
+                memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }   }
+
+        dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
+                                                             samplesBuffer, samplesSizes, nbSamples,
+                                                             params.zParams);
+    }
+
+    /* clean up */
+    free(dictList);
+    return dictSize;
+}
+
+
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
+{
+    size_t result;
+    void* newBuff;
+    size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
+
+    newBuff = malloc(sBuffSize + NOISELENGTH);
+    if (!newBuff) return ERROR(memory_allocation);
+
+    memcpy(newBuff, samplesBuffer, sBuffSize);
+    ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+
+    result =
+        ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+                                            samplesSizes, nbSamples, params);
+    free(newBuff);
+    return result;
+}
+
+
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{
+    ZDICT_cover_params_t params;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
+    memset(&params, 0, sizeof(params));
+    params.d = 8;
+    params.steps = 4;
+    /* Default to level 6 since no compression level information is available */
+    params.zParams.compressionLevel = 6;
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+    params.zParams.notificationLevel = ZSTD_DEBUG;
+#endif
+    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+                                               samplesBuffer, samplesSizes, nbSamples,
+                                               &params);
+}
+
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{
+    ZDICT_params_t params;
+    memset(&params, 0, sizeof(params));
+    return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
+                                                     samplesBuffer, samplesSizes, nbSamples,
+                                                     params);
+}

diff --git a/vendor/github.com/DataDog/zstd/zdict.h b/vendor/github.com/DataDog/zstd/zdict.h
new file mode 100644
index 0000000..ad459c2
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zdict.h

@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef DICTBUILDER_H_001
+#define DICTBUILDER_H_001
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*======  Dependencies  ======*/
+#include <stddef.h>  /* size_t */
+
+
+/* =====   ZDICTLIB_API : control library symbols visibility   ===== */
+#ifndef ZDICTLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZDICTLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZDICTLIB_API ZDICTLIB_VISIBILITY
+#endif
+
+
+/*! ZDICT_trainFromBuffer():
+ *  Train a dictionary from an array of samples.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+/*======   Helper functions   ======*/
+ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);  /**< extracts dictID; @return zero if error (not a valid dictionary) */
+ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
+ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
+
+
+
+#ifdef ZDICT_STATIC_LINKING_ONLY
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+typedef struct {
+    int      compressionLevel;   /* optimize for a specific zstd compression level; 0 means default */
+    unsigned notificationLevel;  /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID;             /* force dictID value; 0 means auto mode (32-bits random value) */
+} ZDICT_params_t;
+
+/*! ZDICT_cover_params_t:
+ *  k and d are the only required parameters.
+ *  For others, value 0 means default.
+ */
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    ZDICT_params_t zParams;
+} ZDICT_cover_params_t;
+
+
+/*! ZDICT_trainFromBuffer_cover():
+ *  Train a dictionary from an array of samples using the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+          void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_cover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations and picks the best parameters.
+ * `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ *
+ * All of the parameters d, k, steps are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+          void* dictBuffer, size_t dictBufferCapacity,
+    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t* parameters);
+
+/*! ZDICT_finalizeDictionary():
+ * Given a custom content as a basis for dictionary, and a set of samples,
+ * finalize dictionary by adding headers and statistics.
+ *
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
+ *
+ * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+ * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
+ *           or an error code, which can be tested by ZDICT_isError().
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
+ * Note 2: dictBuffer and dictContent can overlap
+ */
+#define ZDICT_CONTENTSIZE_MIN 128
+#define ZDICT_DICTSIZE_MIN    256
+ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* dictContent, size_t dictContentSize,
+                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_params_t parameters);
+
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;
+} ZDICT_legacy_params_t;
+
+/*! ZDICT_trainFromBuffer_legacy():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ *  Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_legacy_params_t parameters);
+
+/* Deprecation warnings */
+/* It is generally possible to disable deprecation warnings from compiler,
+   for example with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
+#  define ZDICT_DEPRECATED(message) ZDICTLIB_API   /* disable deprecation warnings */
+#else
+#  define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
+#  elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
+#  elif (ZDICT_GCC_VERSION >= 301)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API
+#  endif
+#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
+
+ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+#endif   /* ZDICT_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* DICTBUILDER_H_001 */

diff --git a/vendor/github.com/DataDog/zstd/zstd.go b/vendor/github.com/DataDog/zstd/zstd.go
new file mode 100644
index 0000000..ac3839f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd.go

@@ -0,0 +1,139 @@
+package zstd
+
+/*
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "stdint.h"  // for uintptr_t
+
+// The following *_wrapper function are used for removing superflouos
+// memory allocations when calling the wrapped functions from Go code.
+// See https://github.com/golang/go/issues/24450 for details.
+
+static size_t ZSTD_compress_wrapper(uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize, int compressionLevel) {
+	return ZSTD_compress((void*)dst, maxDstSize, (const void*)src, srcSize, compressionLevel);
+}
+
+static size_t ZSTD_decompress_wrapper(uintptr_t dst, size_t maxDstSize, uintptr_t src, size_t srcSize) {
+	return ZSTD_decompress((void*)dst, maxDstSize, (const void *)src, srcSize);
+}
+
+*/
+import "C"
+import (
+	"bytes"
+	"errors"
+	"io/ioutil"
+	"unsafe"
+)
+
+// Defines best and standard values for zstd cli
+const (
+	BestSpeed          = 1
+	BestCompression    = 20
+	DefaultCompression = 5
+)
+
+var (
+	// ErrEmptySlice is returned when there is nothing to compress
+	ErrEmptySlice = errors.New("Bytes slice is empty")
+)
+
+// CompressBound returns the worst case size needed for a destination buffer,
+// which can be used to preallocate a destination buffer or select a previously
+// allocated buffer from a pool.
+// See zstd.h to mirror implementation of ZSTD_COMPRESSBOUND
+func CompressBound(srcSize int) int {
+	lowLimit := 128 << 10 // 128 kB
+	var margin int
+	if srcSize < lowLimit {
+		margin = (lowLimit - srcSize) >> 11
+	}
+	return srcSize + (srcSize >> 8) + margin
+}
+
+// cCompressBound is a cgo call to check the go implementation above against the c code.
+func cCompressBound(srcSize int) int {
+	return int(C.ZSTD_compressBound(C.size_t(srcSize)))
+}
+
+// Compress src into dst.  If you have a buffer to use, you can pass it to
+// prevent allocation.  If it is too small, or if nil is passed, a new buffer
+// will be allocated and returned.
+func Compress(dst, src []byte) ([]byte, error) {
+	return CompressLevel(dst, src, DefaultCompression)
+}
+
+// CompressLevel is the same as Compress but you can pass a compression level
+func CompressLevel(dst, src []byte, level int) ([]byte, error) {
+	if len(src) == 0 {
+		return []byte{}, ErrEmptySlice
+	}
+	bound := CompressBound(len(src))
+	if cap(dst) >= bound {
+		dst = dst[0:bound] // Reuse dst buffer
+	} else {
+		dst = make([]byte, bound)
+	}
+
+	cWritten := C.ZSTD_compress_wrapper(
+		C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))),
+		C.size_t(len(dst)),
+		C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))),
+		C.size_t(len(src)),
+		C.int(level))
+
+	written := int(cWritten)
+	// Check if the return is an Error code
+	if err := getError(written); err != nil {
+		return nil, err
+	}
+	return dst[:written], nil
+}
+
+// Decompress src into dst.  If you have a buffer to use, you can pass it to
+// prevent allocation.  If it is too small, or if nil is passed, a new buffer
+// will be allocated and returned.
+func Decompress(dst, src []byte) ([]byte, error) {
+	decompress := func(dst, src []byte) ([]byte, error) {
+
+		cWritten := C.ZSTD_decompress_wrapper(
+			C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))),
+			C.size_t(len(dst)),
+			C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))),
+			C.size_t(len(src)))
+
+		written := int(cWritten)
+		// Check error
+		if err := getError(written); err != nil {
+			return nil, err
+		}
+		return dst[:written], nil
+	}
+
+	if dst == nil {
+		// Attempt to use zStd to determine decompressed size (may result in error or 0)
+		size := int(C.size_t(C.ZSTD_getDecompressedSize(unsafe.Pointer(&src[0]), C.size_t(len(src)))))
+
+		if err := getError(size); err != nil {
+			return nil, err
+		}
+
+		if size > 0 {
+			dst = make([]byte, size)
+		} else {
+			dst = make([]byte, len(src)*3) // starting guess
+		}
+	}
+	for i := 0; i < 3; i++ { // 3 tries to allocate a bigger buffer
+		result, err := decompress(dst, src)
+		if !IsDstSizeTooSmallError(err) {
+			return result, err
+		}
+		dst = make([]byte, len(dst)*2) // Grow buffer by 2
+	}
+
+	// We failed getting a dst buffer of correct size, use stream API
+	r := NewReader(bytes.NewReader(src))
+	defer r.Close()
+	return ioutil.ReadAll(r)
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd.h b/vendor/github.com/DataDog/zstd/zstd.h
new file mode 100644
index 0000000..6405da6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd.h

@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ======   Dependency   ======*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm,
+  targeting real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression functions.
+  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
+  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  Compression can be done in:
+    - a single step (described as Simple API)
+    - a single step, reusing a context (described as Explicit context)
+    - unbounded multiple steps (described as Streaming compression)
+  The compression ratio achievable on small data can be highly improved using a dictionary in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
+
+  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
+  Advanced experimental APIs shall never be used with a dynamic library.
+  They are not "stable", their definition may change in the future. Only static linking is allowed.
+*********************************************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    3
+#define ZSTD_VERSION_RELEASE  4
+
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* added in v1.3.0 */
+
+
+/***************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize to regenerate.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : added in v1.3.0
+ *  `src` should point to the start of a ZSTD encoded frame.
+ *  `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ *  @return : - decompressed size of the frame in `src`, if known
+ *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 4 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * `src` is the start of a zstd compressed frame.
+ * @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+
+/*======  Helper functions  ======*/
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+
+
+/***************************************
+*  Explicit context
+***************************************/
+/*= Compression context
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution in multi-threaded environments. */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()) */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/**********************************
+ *  Bulk processing dictionary API
+ *********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+ *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression level is decided during dictionary creation.
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
+
+/****************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use ZSTD_CStream in situations where many streaming operations will be achieved consecutively,
+*  since it will play nicer with system's memory, by re-using already allocated memory.
+*  Use one separate ZSTD_CStream per thread for parallel execution.
+*
+*  Start a new compression by initializing ZSTD_CStream.
+*  Use ZSTD_initCStream() to start a new compression operation.
+*  Use ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for a compression which requires a dictionary (experimental section)
+*
+*  Use ZSTD_compressStream() repetitively to consume input stream.
+*  The function will automatically update both `pos` fields.
+*  Note that it may not consume the entire input, in which case `pos < size`,
+*  and it's up to the caller to present again remaining data.
+*  @return : a size hint, preferred nb of bytes to use as input for next function call
+*            or an error code, which can be tested using ZSTD_isError().
+*            Note 1 : it's just a hint, to help latency a little, any other value will work fine.
+*            Note 2 : size hint is guaranteed to be <= ZSTD_CStreamInSize()
+*
+*  At any moment, it's possible to flush whatever data remains within internal buffer, using ZSTD_flushStream().
+*  `output->pos` will be updated.
+*  Note that some content might still be left within internal buffer if `output->size` is too small.
+*  @return : nb of bytes still present within internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  ZSTD_endStream() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  ZSTD_endStream() may not be able to flush full data if `output->size` is too small.
+*  In which case, call again ZSTD_endStream() to complete the flush.
+*  @return : 0 if frame fully completed and fully flushed,
+             or >0 if some data is still present within internal buffer
+                  (value is minimum size estimation for remaining data to flush, but it could be more)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block in all circumstances. */
+
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation,
+*   or ZSTD_initDStream_usingDict() if decompression requires a dictionary.
+*   @return : recommended first input size
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            an error code, which can be tested using ZSTD_isError(),
+*            any other value > 0, which means there is still some decoding to do to complete current frame.
+*            The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, continue to consider them separated. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
+
+#endif  /* ZSTD_H_235446 */
+
+
+
+/****************************************************************************************
+ * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * They are provided for advanced scenarios.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* >= v0.7.0 */
+
+#define ZSTD_WINDOWLOG_MAX_32   30
+#define ZSTD_WINDOWLOG_MAX_64   31
+#define ZSTD_WINDOWLOG_MAX    ((unsigned)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN      10
+#define ZSTD_HASHLOG_MAX      ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN         6
+#define ZSTD_CHAINLOG_MAX_32    29
+#define ZSTD_CHAINLOG_MAX_64    30
+#define ZSTD_CHAINLOG_MAX     ((unsigned)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN       ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX       17
+#define ZSTD_SEARCHLOG_MAX     (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN       1
+#define ZSTD_SEARCHLENGTH_MAX    7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MIN    3   /* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MIN    1   /* only used by btopt, btultra and btfast */
+#define ZSTD_LDM_MINMATCH_MIN    4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX 5   /* minimum input size to know frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN    6
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* for static allocation */
+static const size_t ZSTD_frameHeaderSize_prefix = ZSTD_FRAMEHEADERSIZE_PREFIX;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/*--- Advanced types ---*/
+typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
+               ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   /* from faster to stronger */
+
+typedef struct {
+    unsigned windowLog;      /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;       /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;        /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;      /**< nb of searches : larger == more compression, slower */
+    unsigned searchLength;   /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;   /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;
+} ZSTD_compressionParameters;
+
+typedef struct {
+    unsigned contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    unsigned checksumFlag;    /**< 1: generate a 32-bits checksum at end of frame, for error detection */
+    unsigned noDictIDFlag;    /**< 1: no dictID will be saved into frame header (if dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef enum {
+    ZSTD_dct_auto=0,      /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent,  /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict     /* refuses to load a dictionary if it does not respect Zstandard's specification */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef,      /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+
+
+/***************************************
+*  Frame size functions
+***************************************/
+
+/*! ZSTD_findFrameCompressedSize() :
+ *  `src` should point to the start of a ZSTD encoded frame or skippable frame
+ *  `srcSize` must be >= first frame size
+ *  @return : the compressed size of the first frame starting at `src`,
+ *            suitable to pass to `ZSTD_decompress` or similar,
+ *            or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findDecompressedSize() :
+ *  `src` should point the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary exactly at `srcSize` bytes after `src`)
+ *  @return : - decompressed size of all data in all successive frames
+ *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ *            read each contained frame header.  This is fast as most of the data is skipped,
+ *            however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_frameHeaderSize() :
+*   `src` should point to the start of a ZSTD frame
+*   `srcSize` must be >= ZSTD_frameHeaderSize_prefix.
+*   @return : size of the Frame Header */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+
+/***************************************
+*  Memory management
+***************************************/
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the current memory usage of selected object.
+ *  Object memory usage can evolve when re-used. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *  ZSTD_estimateCCtxSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CCtx size estimation is only correct for single-threaded compression. */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, which additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-bytes aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_customMem customMem);
+
+
+
+/***************************************
+*  Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression
+ *  Dictionary content is simply referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+*   @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+*   `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+*   All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+*   Ensure param values remain within authorized range */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ *  optimize params for a given `srcSize` and `dictSize`.
+ *  both values are optional, select `0` if unknown. */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+*   Same as ZSTD_compress_usingDict(), with fine-tune control over each compression parameter */
+ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const void* dict,size_t dictSize,
+                                  ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+*   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_CDict* cdict, ZSTD_frameParameters fParams);
+
+
+/*--- Advanced decompression functions ---*/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompressed the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/********************************************************************
+*  Advanced streaming functions
+********************************************************************/
+
+/*=====   Advanced Streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */
+ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dm_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.*/
+ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
+                                             ZSTD_parameters params, unsigned long long pledgedSrcSize);  /**< pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. dict is loaded with ZSTD_dm_auto and ZSTD_dlm_byCopy. */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);  /**< note : cdict will just be referenced, and must outlive compression session */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize);  /**< same as ZSTD_initCStream_usingCDict(), with control over frame parameters. pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. */
+
+/*! ZSTD_resetCStream() :
+ *  start a new compression job, using same parameters from previous job.
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place..
+ *  Note that zcs must be init at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+    unsigned long long ingested;
+    unsigned long long consumed;
+    unsigned long long produced;
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been ingested (read from input)
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Therefore, (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Can report progression inside worker threads (multi-threading and non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+
+
+/*=====   Advanced Streaming decompression functions  =====*/
+typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
+ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);   /* obsolete : this API will be removed in a future version */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: no dictionary will be used if dict == NULL or dictSize < 8 */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  /**< note : ddict is referenced, it must outlive decompression session */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);  /**< re-use decompression parameters from previous init; saves dictionary loading */
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recover after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           errorCode, which can be tested using ZSTD_isError().
+
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+  Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+  As a consequence, check that values remain within valid application range.
+  For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+  Each application can set its own limits, depending on local restrictions.
+  For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+  There are multiple ways to guarantee this condition.
+
+  The most memory efficient way is to use a round buffer of sufficient size.
+  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+  At which point, decoding can resume from the beginning of the buffer.
+  Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+  There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+  Finally, if you control the compression process, you can also ignore all buffer size rules,
+  as long as the encoder and decoder progress in "lock-step",
+  aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are setup, start decompression, with ZSTD_decompressBegin().
+  If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by decompressor.
+  The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+  For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+} ZSTD_frameHeader;
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+/* ============================================ */
+/**       New advanced API (experimental)       */
+/* ============================================ */
+
+/* notes on API design :
+ *   In this proposal, parameters are pushed one by one into an existing context,
+ *   and then applied on all subsequent compression jobs.
+ *   When no parameter is ever provided, CCtx is created with compression level ZSTD_CLEVEL_DEFAULT.
+ *
+ *   This API is intended to replace all others advanced / experimental API entry points.
+ *   But it stands a reasonable chance to become "stable", after a reasonable testing period.
+ */
+
+/* note on naming convention :
+ *   Initially, the API favored names like ZSTD_setCCtxParameter() .
+ *   In this proposal, convention is changed towards ZSTD_CCtx_setParameter() .
+ *   The main driver is that it identifies more clearly the target object type.
+ *   It feels clearer when considering multiple targets :
+ *   ZSTD_CDict_setParameter() (rather than ZSTD_setCDictParameter())
+ *   ZSTD_CCtxParams_setParameter()  (rather than ZSTD_setCCtxParamsParameter() )
+ *   etc...
+ */
+
+/* note on enum design :
+ * All enum will be pinned to explicit values before reaching "stable API" status */
+
+typedef enum {
+    /* Opened question : should we have a format ZSTD_f_auto ?
+     * Today, it would mean exactly the same as ZSTD_f_zstd1.
+     * But, in the future, should several formats become supported,
+     * on the compression side, it would mean "default format".
+     * On the decompression side, it would mean "automatic format detection",
+     * so that ZSTD_f_zstd1 would mean "accept *only* zstd frames".
+     * Since meaning is a little different, another option could be to define different enums for compression and decompression.
+     * This question could be kept for later, when there are actually multiple formats to support,
+     * but there is also the question of pinning enum values, and pinning value `0` is especially important */
+    ZSTD_f_zstd1 = 0,        /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless,  /* Variant of zstd frame format, without initial 4-bytes magic number.
+                              * Useful to save 4 bytes per generated frame.
+                              * Decoder cannot recognise automatically this format, requiring instructions. */
+} ZSTD_format_e;
+
+typedef enum {
+    /* compression format */
+    ZSTD_p_format = 10,      /* See ZSTD_format_e enum definition.
+                              * Cast selected format as unsigned for ZSTD_CCtx_setParameter() compatibility. */
+
+    /* compression parameters */
+    ZSTD_p_compressionLevel=100, /* Update all compression parameters according to pre-defined cLevel table
+                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
+                              * Special: value 0 means "do not change cLevel".
+                              * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type.
+                              * Note 2 : setting a level sets all default values of other compression parameters.
+                              * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */
+    ZSTD_p_windowLog,        /* Maximum allowed back-reference distance, expressed as power of 2.
+                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+                              * Special: value 0 means "use default windowLog".
+                              * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27)
+                              *       requires explicitly allowing such window size during decompression stage. */
+    ZSTD_p_hashLog,          /* Size of the probe table, as a power of 2.
+                              * Resulting table size is (1 << (hashLog+2)).
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Larger tables improve compression ratio of strategies <= dFast,
+                              * and improve speed of strategies > dFast.
+                              * Special: value 0 means "use default hashLog". */
+    ZSTD_p_chainLog,         /* Size of the full-search table, as a power of 2.
+                              * Resulting table size is (1 << (chainLog+2)).
+                              * Larger tables result in better and slower compression.
+                              * This parameter is useless when using "fast" strategy.
+                              * Special: value 0 means "use default chainLog". */
+    ZSTD_p_searchLog,        /* Number of search attempts, as a power of 2.
+                              * More attempts result in better and slower compression.
+                              * This parameter is useless when using "fast" and "dFast" strategies.
+                              * Special: value 0 means "use default searchLog". */
+    ZSTD_p_minMatch,         /* Minimum size of searched matches (note : repCode matches can be smaller).
+                              * Larger values make faster compression and decompression, but decrease ratio.
+                              * Must be clamped between ZSTD_SEARCHLENGTH_MIN and ZSTD_SEARCHLENGTH_MAX.
+                              * Note that currently, for all strategies < btopt, effective minimum is 4.
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_p_targetLength,     /* Impact of this field depends on strategy.
+                              * For strategies btopt & btultra:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
+    ZSTD_p_compressionStrategy, /* See ZSTD_strategy enum definition.
+                              * Cast selected strategy as unsigned for ZSTD_CCtx_setParameter() compatibility.
+                              * The higher the value of selected strategy, the more complex it is,
+                              * resulting in stronger and slower compression.
+                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_p_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                         * This parameter is designed to improve compression ratio
+                                         * for large inputs, by finding large matches at long distance.
+                                         * It increases memory usage and window size.
+                                         * Note: enabling this parameter increases ZSTD_p_windowLog to 128 MB
+                                         * except when expressly set to a different value. */
+    ZSTD_p_ldmHashLog,       /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * default: windowlog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_p_ldmMinMatch,      /* Minimum match size for long distance matcher.
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_p_ldmBucketSizeLog, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX .
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_p_ldmHashEveryLog,  /* Frequency of inserting/looking up entries in the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashEveryLog". */
+
+    /* frame parameters */
+    ZSTD_p_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+                              * Content size must be known at the beginning of compression,
+                              * it is provided using ZSTD_CCtx_setPledgedSrcSize() */
+    ZSTD_p_checksumFlag,     /* A 32-bits checksum of content is written at end of frame (default:0) */
+    ZSTD_p_dictIDFlag,       /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+    /* multi-threading parameters */
+    /* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD).
+     * They return an error otherwise. */
+    ZSTD_p_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode :
+                              * ZSTD_compress_generic() consumes some input, flush some output if possible, and immediately gives back control to caller,
+                              * while compression work is performed in parallel, within worker threads.
+                              * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+    ZSTD_p_jobSize,          /* Size of a compression job. This value is enforced only in non-blocking mode.
+                              * Each compression job is completed in parallel, so this value indirectly controls the nb of active threads.
+                              * 0 means default, which is dynamically determined based on compression parameters.
+                              * Job size must be a minimum of overlapSize, or 1 MB, whichever is largest.
+                              * The minimum size is automatically and transparently enforced */
+    ZSTD_p_overlapSizeLog,   /* Size of previous input reloaded at the beginning of each job.
+                              * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */
+
+    /* =================================================================== */
+    /* experimental parameters - no stability guaranteed                   */
+    /* =================================================================== */
+
+    ZSTD_p_compressLiterals=1000, /* control huffman compression of literals (enabled) by default.
+                              * disabling it improves speed and decreases compression ratio by a large amount.
+                              * note : this setting is automatically updated when changing compression level.
+                              *        positive compression levels set ZSTD_p_compressLiterals to 1.
+                              *        negative compression levels set ZSTD_p_compressLiterals to 0. */
+
+    ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize,
+                              * even when referencing into Dictionary content (default:0) */
+
+} ZSTD_cParameter;
+
+
+/*! ZSTD_CCtx_setParameter() :
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Setting a parameter is generally only possible during frame initialization (before starting compression),
+ *  except for a few exceptions which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *  Note : when `value` is an enum, cast it to unsigned for proper type checking.
+ *  @result : informational value (typically, value being set clamped correctly),
+ *            or an error code (which can be tested with ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  This value will be controlled at the end, and result in error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : 0 means zero, empty.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is default value for any new compression job.
+ *  Note 2 : If all data is provided and consumed in a single round,
+ *           this value is overriden by srcSize instead. */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary will be used for all future compression jobs.
+ *           To return to "no-dictionary" situation, load a NULL dictionary
+ *  Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
+ *           For this reason, compression parameters cannot be changed anymore after loading a dictionary.
+ *           It's also a CPU consuming operation, with non-negligible impact on latency.
+ *  Note 3 :`dict` content will be copied internally.
+ *           Use ZSTD_CCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compression jobs.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supercede any compression parameter previously set within CCtx.
+ *  The dictionary will remain valid for future compression jobs using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : adding a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Adding a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  Decompression need same prefix to properly regenerate data.
+ *  Prefix is **only used once**. Tables are discarded at end of compression job.
+ *  Subsequent compression jobs will be done without prefix (if none is explicitly referenced).
+ *  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ *  Note 1 : Prefix buffer is referenced. It must outlive compression job.
+ *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU consuming operation, with non-negligible impact on latency.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_reset() :
+ *  Return a CCtx to clean state.
+ *  Useful after an error, or to interrupt an ongoing compression job and start a new one.
+ *  Any internal data not yet flushed is cancelled.
+ *  Dictionary (if any) is dropped.
+ *  All parameters are back to default values.
+ *  It's possible to modify compression parameters after a reset.
+ */
+ZSTDLIB_API void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);
+
+
+
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal conditions */
+    ZSTD_e_flush,      /* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */
+    ZSTD_e_end         /* flush any remaining data and close current frame. Any additional data starts a new frame. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compress_generic() :
+ *  Behave about the same as ZSTD_compressStream. To note :
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_setParameter()
+ *  - Compression parameters cannot be changed once compression is started.
+ *  - outpot->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - outpot->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - In single-thread mode (default), function is blocking : it completed its job before returning to caller.
+ *  - In multi-thread mode, function is non-blocking : it just acquires a copy of input, and distribute job to internal worker threads,
+ *                                                     and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *                                                     The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte.
+ *  - Exception : in multi-threading mode, if the first call requests a ZSTD_e_end directive, it is blocking : it will complete compression before giving back control to caller.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
+                                          ZSTD_outBuffer* output,
+                                          ZSTD_inBuffer* input,
+                                          ZSTD_EndDirective endOp);
+
+
+/*! ZSTD_compress_generic_simpleArgs() :
+ *  Same as ZSTD_compress_generic(),
+ *  but using only integral types as arguments.
+ *  Argument list is larger than ZSTD_{in,out}Buffer,
+ *  but can be helpful for binders from dynamic languages
+ *  which have troubles handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compress_generic_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParam_setParameter() : Push parameters one by one into
+ *                                    an existing ZSTD_CCtx_params structure.
+ *                                    This is similar to
+ *                                    ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                    an existing CCtx.
+ *                                    These parameters will be applied to
+ *                                    all subsequent compression jobs.
+ *  - ZSTD_compress_generic() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ *  for static allocation for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+
+/*! ZSTD_CCtxParam_setParameter() :
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ *  Note : when `value` is an enum, cast it to unsigned for proper type checking.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParam_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started,
+ *    if nbWorkers==0, this will have no impact until a new compression is started.
+ *    if nbWorkers>=1, new parameters will be picked up at next job,
+ *       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+
+/*===   Advanced parameters for decompression API  ===*/
+
+/* The following parameters must be set after creating a ZSTD_DCtx* (or ZSTD_DStream*) object,
+ * but before starting decompression of a frame.
+ */
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : `dict` content will be copied internally.
+ *            Use ZSTD_DCtx_loadDictionary_byReference()
+ *            to reference dictionary content instead.
+ *            In which case, the dictionary buffer must outlive its users.
+ *  Note 2 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to select
+ *           how dictionary content will be interpreted and loaded.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special : adding a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  Prefix is **only used once**. It must be explicitly referenced before each frame.
+ *  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_DDict instead.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ *  Note 2 : Prefix buffer is referenced. It must outlive compression job.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode.
+ *  Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ *  This is useful to prevent a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in direct mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_MAX)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+
+/*! ZSTD_decompress_generic() :
+ *  Behave the same as ZSTD_decompressStream.
+ *  Decompression parameters cannot be changed once decompression is started.
+ * @return : an error code, which can be tested using ZSTD_isError()
+ *           if >0, a hint, nb of expected input bytes for next invocation.
+ *           `0` means : a frame has just been fully decoded and flushed.
+ */
+ZSTDLIB_API size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx,
+                                           ZSTD_outBuffer* output,
+                                           ZSTD_inBuffer* input);
+
+
+/*! ZSTD_decompress_generic_simpleArgs() :
+ *  Same as ZSTD_decompress_generic(),
+ *  but using only integral types as arguments.
+ *  Argument list is larger than ZSTD_{in,out}Buffer,
+ *  but can be helpful for binders from dynamic languages
+ *  which have troubles handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompress_generic_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/*! ZSTD_DCtx_reset() :
+ *  Return a DCtx to clean state.
+ *  If a decompression was ongoing, any internal data not yet flushed is cancelled.
+ *  All parameters are back to default values, including sticky ones.
+ *  Dictionary (if any) is dropped.
+ *  Parameters can be modified again after a reset.
+ */
+ZSTDLIB_API void ZSTD_DCtx_reset(ZSTD_DCtx* dctx);
+
+
+
+/* ============================ */
+/**       Block level API       */
+/* ============================ */
+
+/*!
+    Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    User will have to take in charge required information to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block size, consider using the regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger.
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!!
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
+*/
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX   (1<<ZSTD_BLOCKSIZELOG_MAX)   /* define, for static allocation */
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
+
+
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif

diff --git a/vendor/github.com/DataDog/zstd/zstd_common.c b/vendor/github.com/DataDog/zstd/zstd_common.c
new file mode 100644
index 0000000..bccc948
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_common.c

@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>      /* malloc, calloc, free */
+#include <string.h>      /* memset */
+#include "error_private.h"
+#include "zstd_internal.h"
+
+
+/*-****************************************
+*  Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTD_isError() :
+ *  tells if a return value is an error code */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ *  provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getError() :
+ *  convert a `size_t` function result into a proper ZSTD_errorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ *  provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+
+/*! g_debuglog_enable :
+ *  turn on/off debug traces (global switch) */
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 2)
+int g_debuglog_enable = 1;
+#endif
+
+
+/*=**************************************************************
+*  Custom allocator
+****************************************************************/
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return malloc(size);
+}
+
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        memset(ptr, 0, size);
+        return ptr;
+    }
+    return calloc(1, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            free(ptr);
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_compress.c b/vendor/github.com/DataDog/zstd/zstd_compress.c
new file mode 100644
index 0000000..2aa26da
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_compress.c

@@ -0,0 +1,3449 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*-*************************************
+*  Tuning parameters
+***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+#  define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <string.h>         /* memset */
+#include "cpu.h"
+#include "mem.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_compress_internal.h"
+#include "zstd_fast.h"
+#include "zstd_double_fast.h"
+#include "zstd_lazy.h"
+#include "zstd_opt.h"
+#include "zstd_ldm.h"
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+size_t ZSTD_compressBound(size_t srcSize) {
+    return ZSTD_COMPRESSBOUND(srcSize);
+}
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+struct ZSTD_CDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictContentSize;
+    void* workspace;
+    size_t workspaceSize;
+    ZSTD_matchState_t matchState;
+    ZSTD_compressedBlockState_t cBlockState;
+    ZSTD_compressionParameters cParams;
+    ZSTD_customMem customMem;
+    U32 dictID;
+};  /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{
+    return ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_STATIC_ASSERT(zcss_init==0);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+    {   ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_calloc(sizeof(ZSTD_CCtx), customMem);
+        if (!cctx) return NULL;
+        cctx->customMem = customMem;
+        cctx->requestedParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+        cctx->requestedParams.fParams.contentSizeFlag = 1;
+        cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+        return cctx;
+    }
+}
+
+ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize)
+{
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*) workspace;
+    if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL;  /* minimum size */
+    if ((size_t)workspace & 7) return NULL;  /* must be 8-aligned */
+    memset(workspace, 0, workspaceSize);   /* may be a bit generous, could memset be smaller ? */
+    cctx->staticSize = workspaceSize;
+    cctx->workSpace = (void*)(cctx+1);
+    cctx->workSpaceSize = workspaceSize - sizeof(ZSTD_CCtx);
+
+    /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+    if (cctx->workSpaceSize < HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t)) return NULL;
+    assert(((size_t)cctx->workSpace & (sizeof(void*)-1)) == 0);   /* ensure correct alignment */
+    cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)cctx->workSpace;
+    cctx->blockState.nextCBlock = cctx->blockState.prevCBlock + 1;
+    {
+        void* const ptr = cctx->blockState.nextCBlock + 1;
+        cctx->entropyWorkspace = (U32*)ptr;
+    }
+    cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+    return cctx;
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return 0;   /* support free on NULL */
+    if (cctx->staticSize) return ERROR(memory_allocation);   /* not compatible with static CCtx */
+    ZSTD_free(cctx->workSpace, cctx->customMem); cctx->workSpace = NULL;
+    ZSTD_freeCDict(cctx->cdictLocal); cctx->cdictLocal = NULL;
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL;
+#endif
+    ZSTD_free(cctx, cctx->customMem);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+
+static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    return ZSTDMT_sizeof_CCtx(cctx->mtctx);
+#else
+    (void) cctx;
+    return 0;
+#endif
+}
+
+
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*cctx) + cctx->workSpaceSize
+           + ZSTD_sizeof_CDict(cctx->cdictLocal)
+           + ZSTD_sizeof_mtctx(cctx);
+}
+
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{
+    return ZSTD_sizeof_CCtx(zcs);  /* same object */
+}
+
+/* private API call, for dictBuilder only */
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
+{
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+    if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
+    if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
+    if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog;
+    if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog;
+    if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength;
+    if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
+    if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
+    return cParams;
+}
+
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+        ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params cctxParams;
+    memset(&cctxParams, 0, sizeof(cctxParams));
+    cctxParams.cParams = cParams;
+    cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;  /* should not matter, as all cParams are presumed properly defined */
+    assert(!ZSTD_checkCParams(cParams));
+    cctxParams.fParams.contentSizeFlag = 1;
+    return cctxParams;
+}
+
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
+        ZSTD_customMem customMem)
+{
+    ZSTD_CCtx_params* params;
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+    params = (ZSTD_CCtx_params*)ZSTD_calloc(
+            sizeof(ZSTD_CCtx_params), customMem);
+    if (!params) { return NULL; }
+    params->customMem = customMem;
+    params->compressionLevel = ZSTD_CLEVEL_DEFAULT;
+    params->fParams.contentSizeFlag = 1;
+    return params;
+}
+
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{
+    return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{
+    if (params == NULL) { return 0; }
+    ZSTD_free(params, params->customMem);
+    return 0;
+}
+
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{
+    return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
+}
+
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
+    if (!cctxParams) { return ERROR(GENERIC); }
+    memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->compressionLevel = compressionLevel;
+    cctxParams->fParams.contentSizeFlag = 1;
+    return 0;
+}
+
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{
+    if (!cctxParams) { return ERROR(GENERIC); }
+    CHECK_F( ZSTD_checkCParams(params.cParams) );
+    memset(cctxParams, 0, sizeof(*cctxParams));
+    cctxParams->cParams = params.cParams;
+    cctxParams->fParams = params.fParams;
+    cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* should not matter, as all cParams are presumed properly defined */
+    assert(!ZSTD_checkCParams(params.cParams));
+    return 0;
+}
+
+/* ZSTD_assignParamsToCCtxParams() :
+ * params is presumed valid at this stage */
+static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams(
+        ZSTD_CCtx_params cctxParams, ZSTD_parameters params)
+{
+    ZSTD_CCtx_params ret = cctxParams;
+    ret.cParams = params.cParams;
+    ret.fParams = params.fParams;
+    ret.compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* should not matter, as all cParams are presumed properly defined */
+    assert(!ZSTD_checkCParams(params.cParams));
+    return ret;
+}
+
+#define CLAMPCHECK(val,min,max) {            \
+    if (((val)<(min)) | ((val)>(max))) {     \
+        return ERROR(parameter_outOfBound);  \
+}   }
+
+
+static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+{
+    switch(param)
+    {
+    case ZSTD_p_compressionLevel:
+    case ZSTD_p_hashLog:
+    case ZSTD_p_chainLog:
+    case ZSTD_p_searchLog:
+    case ZSTD_p_minMatch:
+    case ZSTD_p_targetLength:
+    case ZSTD_p_compressionStrategy:
+    case ZSTD_p_compressLiterals:
+        return 1;
+
+    case ZSTD_p_format:
+    case ZSTD_p_windowLog:
+    case ZSTD_p_contentSizeFlag:
+    case ZSTD_p_checksumFlag:
+    case ZSTD_p_dictIDFlag:
+    case ZSTD_p_forceMaxWindow :
+    case ZSTD_p_nbWorkers:
+    case ZSTD_p_jobSize:
+    case ZSTD_p_overlapSizeLog:
+    case ZSTD_p_enableLongDistanceMatching:
+    case ZSTD_p_ldmHashLog:
+    case ZSTD_p_ldmMinMatch:
+    case ZSTD_p_ldmBucketSizeLog:
+    case ZSTD_p_ldmHashEveryLog:
+    default:
+        return 0;
+    }
+}
+
+size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParameter (%u, %u)", (U32)param, value);
+    if (cctx->streamStage != zcss_init) {
+        if (ZSTD_isUpdateAuthorized(param)) {
+            cctx->cParamsChanged = 1;
+        } else {
+            return ERROR(stage_wrong);
+    }   }
+
+    switch(param)
+    {
+    case ZSTD_p_format :
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_compressionLevel:
+        if (cctx->cdict) return ERROR(stage_wrong);
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_windowLog:
+    case ZSTD_p_hashLog:
+    case ZSTD_p_chainLog:
+    case ZSTD_p_searchLog:
+    case ZSTD_p_minMatch:
+    case ZSTD_p_targetLength:
+    case ZSTD_p_compressionStrategy:
+        if (cctx->cdict) return ERROR(stage_wrong);
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_compressLiterals:
+    case ZSTD_p_contentSizeFlag:
+    case ZSTD_p_checksumFlag:
+    case ZSTD_p_dictIDFlag:
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_forceMaxWindow :  /* Force back-references to remain < windowSize,
+                                   * even when referencing into Dictionary content.
+                                   * default : 0 when using a CDict, 1 when using a Prefix */
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_nbWorkers:
+        if ((value>0) && cctx->staticSize) {
+            return ERROR(parameter_unsupported);  /* MT not compatible with static alloc */
+        }
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_jobSize:
+    case ZSTD_p_overlapSizeLog:
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    case ZSTD_p_enableLongDistanceMatching:
+    case ZSTD_p_ldmHashLog:
+    case ZSTD_p_ldmMinMatch:
+    case ZSTD_p_ldmBucketSizeLog:
+    case ZSTD_p_ldmHashEveryLog:
+        if (cctx->cdict) return ERROR(stage_wrong);
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
+    default: return ERROR(parameter_unsupported);
+    }
+}
+
+size_t ZSTD_CCtxParam_setParameter(
+        ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned value)
+{
+    DEBUGLOG(4, "ZSTD_CCtxParam_setParameter (%u, %u)", (U32)param, value);
+    switch(param)
+    {
+    case ZSTD_p_format :
+        if (value > (unsigned)ZSTD_f_zstd1_magicless)
+            return ERROR(parameter_unsupported);
+        CCtxParams->format = (ZSTD_format_e)value;
+        return (size_t)CCtxParams->format;
+
+    case ZSTD_p_compressionLevel : {
+        int cLevel = (int)value;  /* cast expected to restore negative sign */
+        if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
+        if (cLevel) {  /* 0 : does not change current level */
+            CCtxParams->disableLiteralCompression = (cLevel<0);  /* negative levels disable huffman */
+            CCtxParams->compressionLevel = cLevel;
+        }
+        if (CCtxParams->compressionLevel >= 0) return CCtxParams->compressionLevel;
+        return 0;  /* return type (size_t) cannot represent negative values */
+    }
+
+    case ZSTD_p_windowLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+        CCtxParams->cParams.windowLog = value;
+        return CCtxParams->cParams.windowLog;
+
+    case ZSTD_p_hashLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+        CCtxParams->cParams.hashLog = value;
+        return CCtxParams->cParams.hashLog;
+
+    case ZSTD_p_chainLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+        CCtxParams->cParams.chainLog = value;
+        return CCtxParams->cParams.chainLog;
+
+    case ZSTD_p_searchLog :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+        CCtxParams->cParams.searchLog = value;
+        return value;
+
+    case ZSTD_p_minMatch :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+        CCtxParams->cParams.searchLength = value;
+        return CCtxParams->cParams.searchLength;
+
+    case ZSTD_p_targetLength :
+        /* all values are valid. 0 => use default */
+        CCtxParams->cParams.targetLength = value;
+        return CCtxParams->cParams.targetLength;
+
+    case ZSTD_p_compressionStrategy :
+        if (value>0)   /* 0 => use default */
+            CLAMPCHECK(value, (unsigned)ZSTD_fast, (unsigned)ZSTD_btultra);
+        CCtxParams->cParams.strategy = (ZSTD_strategy)value;
+        return (size_t)CCtxParams->cParams.strategy;
+
+    case ZSTD_p_compressLiterals:
+        CCtxParams->disableLiteralCompression = !value;
+        return !CCtxParams->disableLiteralCompression;
+
+    case ZSTD_p_contentSizeFlag :
+        /* Content size written in frame header _when known_ (default:1) */
+        DEBUGLOG(4, "set content size flag = %u", (value>0));
+        CCtxParams->fParams.contentSizeFlag = value > 0;
+        return CCtxParams->fParams.contentSizeFlag;
+
+    case ZSTD_p_checksumFlag :
+        /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+        CCtxParams->fParams.checksumFlag = value > 0;
+        return CCtxParams->fParams.checksumFlag;
+
+    case ZSTD_p_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+        DEBUGLOG(4, "set dictIDFlag = %u", (value>0));
+        CCtxParams->fParams.noDictIDFlag = !value;
+        return !CCtxParams->fParams.noDictIDFlag;
+
+    case ZSTD_p_forceMaxWindow :
+        CCtxParams->forceWindow = (value > 0);
+        return CCtxParams->forceWindow;
+
+    case ZSTD_p_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        if (value>0) return ERROR(parameter_unsupported);
+        return 0;
+#else
+        return ZSTDMT_CCtxParam_setNbWorkers(CCtxParams, value);
+#endif
+
+    case ZSTD_p_jobSize :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_jobSize, value);
+#endif
+
+    case ZSTD_p_overlapSizeLog :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        return ZSTDMT_CCtxParam_setMTCtxParameter(CCtxParams, ZSTDMT_p_overlapSectionLog, value);
+#endif
+
+    case ZSTD_p_enableLongDistanceMatching :
+        CCtxParams->ldmParams.enableLdm = (value>0);
+        return CCtxParams->ldmParams.enableLdm;
+
+    case ZSTD_p_ldmHashLog :
+        if (value>0)   /* 0 ==> auto */
+            CLAMPCHECK(value, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+        CCtxParams->ldmParams.hashLog = value;
+        return CCtxParams->ldmParams.hashLog;
+
+    case ZSTD_p_ldmMinMatch :
+        if (value>0)   /* 0 ==> default */
+            CLAMPCHECK(value, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX);
+        CCtxParams->ldmParams.minMatchLength = value;
+        return CCtxParams->ldmParams.minMatchLength;
+
+    case ZSTD_p_ldmBucketSizeLog :
+        if (value > ZSTD_LDM_BUCKETSIZELOG_MAX)
+            return ERROR(parameter_outOfBound);
+        CCtxParams->ldmParams.bucketSizeLog = value;
+        return CCtxParams->ldmParams.bucketSizeLog;
+
+    case ZSTD_p_ldmHashEveryLog :
+        if (value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+            return ERROR(parameter_outOfBound);
+        CCtxParams->ldmParams.hashEveryLog = value;
+        return CCtxParams->ldmParams.hashEveryLog;
+
+    default: return ERROR(parameter_unsupported);
+    }
+}
+
+/** ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  just applies `params` into `cctx`
+ *  no action is performed, parameters are merely stored.
+ *  If ZSTDMT is enabled, parameters are pushed to cctx->mtctx.
+ *    This is possible even if a compression is ongoing.
+ *    In which case, new parameters will be applied on the fly, starting with next compression job.
+ */
+size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    if (cctx->cdict) return ERROR(stage_wrong);
+
+    cctx->requestedParams = *params;
+    return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+    return 0;
+}
+
+size_t ZSTD_CCtx_loadDictionary_advanced(
+        ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    if (cctx->staticSize) return ERROR(memory_allocation);  /* no malloc for static CCtx */
+    DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+    ZSTD_freeCDict(cctx->cdictLocal);  /* in case one already exists */
+    if (dict==NULL || dictSize==0) {   /* no dictionary mode */
+        cctx->cdictLocal = NULL;
+        cctx->cdict = NULL;
+    } else {
+        ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(&cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, dictSize);
+        cctx->cdictLocal = ZSTD_createCDict_advanced(
+                                dict, dictSize,
+                                dictLoadMethod, dictContentType,
+                                cParams, cctx->customMem);
+        cctx->cdict = cctx->cdictLocal;
+        if (cctx->cdictLocal == NULL)
+            return ERROR(memory_allocation);
+    }
+    return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+      ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_CCtx_loadDictionary_advanced(
+            cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_CCtx_loadDictionary_advanced(
+            cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+
+size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    cctx->cdict = cdict;
+    memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));  /* exclusive */
+    return 0;
+}
+
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{
+    return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+size_t ZSTD_CCtx_refPrefix_advanced(
+        ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
+    cctx->cdict = NULL;   /* prefix discards any prior cdict */
+    cctx->prefixDict.dict = prefix;
+    cctx->prefixDict.dictSize = prefixSize;
+    cctx->prefixDict.dictContentType = dictContentType;
+    return 0;
+}
+
+static void ZSTD_startNewCompression(ZSTD_CCtx* cctx)
+{
+    cctx->streamStage = zcss_init;
+    cctx->pledgedSrcSizePlusOne = 0;
+}
+
+/*! ZSTD_CCtx_reset() :
+ *  Also dumps dictionary */
+void ZSTD_CCtx_reset(ZSTD_CCtx* cctx)
+{
+    ZSTD_startNewCompression(cctx);
+    cctx->cdict = NULL;
+}
+
+/** ZSTD_checkCParams() :
+    control CParam values remain within authorized range.
+    @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+    CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+    CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN)
+        return ERROR(parameter_unsupported);
+    if ((U32)(cParams.strategy) > (U32)ZSTD_btultra)
+        return ERROR(parameter_unsupported);
+    return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  make CParam values within valid range.
+ *  @return : valid CParams */
+static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+#   define CLAMP(val,min,max) {      \
+        if (val<min) val=min;        \
+        else if (val>max) val=max;   \
+    }
+    CLAMP(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMP(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+    CLAMP(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMP(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    CLAMP(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+    if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN) cParams.targetLength = ZSTD_TARGETLENGTH_MIN;
+    if ((U32)(cParams.strategy) > (U32)ZSTD_btultra) cParams.strategy = ZSTD_btultra;
+    return cParams;
+}
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+    return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams_internal() :
+    optimize `cPar` for a given input (`srcSize` and `dictSize`).
+    mostly downsizing to reduce memory consumption and initialization latency.
+    Both `srcSize` and `dictSize` are optional (use 0 if unknown).
+    Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that condition. */
+ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+{
+    static const U64 minSrcSize = 513; /* (1<<9) + 1 */
+    static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+    assert(ZSTD_checkCParams(cPar)==0);
+
+    if (dictSize && (srcSize+1<2) /* srcSize unknown */ )
+        srcSize = minSrcSize;  /* presumed small when there is a dictionary */
+    else if (srcSize == 0)
+        srcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* 0 == unknown : presumed large */
+
+    /* resize windowLog if input is small enough, to use less memory */
+    if ( (srcSize < maxWindowResize)
+      && (dictSize < maxWindowResize) )  {
+        U32 const tSize = (U32)(srcSize + dictSize);
+        static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+        U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
+                            ZSTD_highbit32(tSize-1) + 1;
+        if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+    }
+    if (cPar.hashLog > cPar.windowLog) cPar.hashLog = cPar.windowLog;
+    {   U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+        if (cycleLog > cPar.windowLog)
+            cPar.chainLog -= (cycleLog - cPar.windowLog);
+    }
+
+    if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+        cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* required for frame header */
+
+    return cPar;
+}
+
+ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+{
+    cPar = ZSTD_clampCParams(cPar);
+    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize);
+}
+
+static size_t ZSTD_sizeof_matchState(ZSTD_compressionParameters const* cParams, const U32 forCCtx)
+{
+    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    U32    const hashLog3 = (forCCtx && cParams->searchLength==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+    size_t const optPotentialSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits)) * sizeof(U32)
+                          + (ZSTD_OPT_NUM+1) * (sizeof(ZSTD_match_t)+sizeof(ZSTD_optimal_t));
+    size_t const optSpace = (forCCtx && ((cParams->strategy == ZSTD_btopt) ||
+                                         (cParams->strategy == ZSTD_btultra)))
+                                ? optPotentialSpace
+                                : 0;
+    DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+                (U32)chainSize, (U32)hSize, (U32)h3Size);
+    return tableSpace + optSpace;
+}
+
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    /* Estimate CCtx size is supported for single-threaded compression only. */
+    if (params->nbWorkers > 0) { return ERROR(GENERIC); }
+    {   ZSTD_compressionParameters const cParams =
+                ZSTD_getCParamsFromCCtxParams(params, 0, 0);
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        U32    const divider = (cParams.searchLength==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const entropySpace = HUF_WORKSPACE_SIZE;
+        size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
+        size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
+
+        size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams);
+        size_t const ldmSeqSpace = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq);
+
+        size_t const neededSpace = entropySpace + blockStateSpace + tokenSpace +
+                                   matchStateSize + ldmSpace + ldmSeqSpace;
+
+        DEBUGLOG(5, "sizeof(ZSTD_CCtx) : %u", (U32)sizeof(ZSTD_CCtx));
+        DEBUGLOG(5, "estimate workSpace : %u", (U32)neededSpace);
+        return sizeof(ZSTD_CCtx) + neededSpace;
+    }
+}
+
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+    return ZSTD_estimateCCtxSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    return ZSTD_estimateCCtxSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+    int level;
+    size_t memBudget = 0;
+    for (level=1; level<=compressionLevel; level++) {
+        size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+    if (params->nbWorkers > 0) { return ERROR(GENERIC); }
+    {   size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << params->cParams.windowLog);
+        size_t const inBuffSize = ((size_t)1 << params->cParams.windowLog) + blockSize;
+        size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
+        size_t const streamingSize = inBuffSize + outBuffSize;
+
+        return CCtxSize + streamingSize;
+    }
+}
+
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+    return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) {
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    return ZSTD_estimateCStreamSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCStreamSize(int compressionLevel) {
+    int level;
+    size_t memBudget = 0;
+    for (level=1; level<=compressionLevel; level++) {
+        size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
+        if (newMB > memBudget) memBudget = newMB;
+    }
+    return memBudget;
+}
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads (non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_getFrameProgression(cctx->mtctx);
+    }
+#endif
+    {   ZSTD_frameProgression fp;
+        size_t const buffered = (cctx->inBuff == NULL) ? 0 :
+                                cctx->inBuffPos - cctx->inToCompress;
+        if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress);
+        assert(buffered <= ZSTD_BLOCKSIZE_MAX);
+        fp.ingested = cctx->consumedSrcSize + buffered;
+        fp.consumed = cctx->consumedSrcSize;
+        fp.produced = cctx->producedCSize;
+        return fp;
+}   }
+
+
+static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1,
+                                  ZSTD_compressionParameters cParams2)
+{
+    return (cParams1.hashLog  == cParams2.hashLog)
+         & (cParams1.chainLog == cParams2.chainLog)
+         & (cParams1.strategy == cParams2.strategy)   /* opt parser space */
+         & ((cParams1.searchLength==3) == (cParams2.searchLength==3));  /* hashlog3 space */
+}
+
+/** The parameters are equivalent if ldm is not enabled in both sets or
+ *  all the parameters are equivalent. */
+static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1,
+                                    ldmParams_t ldmParams2)
+{
+    return (!ldmParams1.enableLdm && !ldmParams2.enableLdm) ||
+           (ldmParams1.enableLdm == ldmParams2.enableLdm &&
+            ldmParams1.hashLog == ldmParams2.hashLog &&
+            ldmParams1.bucketSizeLog == ldmParams2.bucketSizeLog &&
+            ldmParams1.minMatchLength == ldmParams2.minMatchLength &&
+            ldmParams1.hashEveryLog == ldmParams2.hashEveryLog);
+}
+
+typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e;
+
+/* ZSTD_sufficientBuff() :
+ * check internal buffers exist for streaming if buffPol == ZSTDb_buffered .
+ * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+                            ZSTD_buffered_policy_e buffPol2,
+                            ZSTD_compressionParameters cParams2,
+                            U64 pledgedSrcSize)
+{
+    size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
+    size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
+                (U32)windowSize2, cParams2.windowLog);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
+                (U32)blockSize2, (U32)blockSize1);
+    return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */
+         & (neededBufferSize2 <= bufferSize1);
+}
+
+/** Equivalence for resetCCtx purposes */
+static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
+                                 ZSTD_CCtx_params params2,
+                                 size_t buffSize1, size_t blockSize1,
+                                 ZSTD_buffered_policy_e buffPol2,
+                                 U64 pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
+    return ZSTD_equivalentCParams(params1.cParams, params2.cParams) &&
+           ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) &&
+           ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize);
+}
+
+static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+{
+    int i;
+    for (i = 0; i < ZSTD_REP_NUM; ++i)
+        bs->rep[i] = repStartValue[i];
+    bs->entropy.hufCTable_repeatMode = HUF_repeat_none;
+    bs->entropy.offcode_repeatMode = FSE_repeat_none;
+    bs->entropy.matchlength_repeatMode = FSE_repeat_none;
+    bs->entropy.litlength_repeatMode = FSE_repeat_none;
+}
+
+/*! ZSTD_invalidateMatchState()
+ * Invalidate all the matches in the match finder tables.
+ * Requires nextSrc and base to be set (can be NULL).
+ */
+static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
+{
+    ZSTD_window_clear(&ms->window);
+
+    ms->nextToUpdate = ms->window.dictLimit + 1;
+    ms->loadedDictEnd = 0;
+    ms->opt.litLengthSum = 0;  /* force reset of btopt stats */
+}
+
+/*! ZSTD_continueCCtx() :
+ *  reuse CCtx without reset (note : requires no dictionary) */
+static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_CCtx_params params, U64 pledgedSrcSize)
+{
+    size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+    DEBUGLOG(4, "ZSTD_continueCCtx: re-use context in place");
+
+    cctx->blockSize = blockSize;   /* previous block size could be different even for same windowLog, due to pledgedSrcSize */
+    cctx->appliedParams = params;
+    cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+    cctx->consumedSrcSize = 0;
+    cctx->producedCSize = 0;
+    if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+        cctx->appliedParams.fParams.contentSizeFlag = 0;
+    DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+        (U32)pledgedSrcSize, cctx->appliedParams.fParams.contentSizeFlag);
+    cctx->stage = ZSTDcs_init;
+    cctx->dictID = 0;
+    if (params.ldmParams.enableLdm)
+        ZSTD_window_clear(&cctx->ldmState.window);
+    ZSTD_referenceExternalSequences(cctx, NULL, 0);
+    ZSTD_invalidateMatchState(&cctx->blockState.matchState);
+    ZSTD_reset_compressedBlockState(cctx->blockState.prevCBlock);
+    XXH64_reset(&cctx->xxhState, 0);
+    return 0;
+}
+
+typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset } ZSTD_compResetPolicy_e;
+
+static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compressionParameters const* cParams, ZSTD_compResetPolicy_e const crp, U32 const forCCtx)
+{
+    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    size_t const hSize = ((size_t)1) << cParams->hashLog;
+    U32    const hashLog3 = (forCCtx && cParams->searchLength==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+    size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+
+    assert(((size_t)ptr & 3) == 0);
+
+    ms->hashLog3 = hashLog3;
+    memset(&ms->window, 0, sizeof(ms->window));
+    ZSTD_invalidateMatchState(ms);
+
+    /* opt parser space */
+    if (forCCtx && ((cParams->strategy == ZSTD_btopt) | (cParams->strategy == ZSTD_btultra))) {
+        DEBUGLOG(4, "reserving optimal parser space");
+        ms->opt.litFreq = (U32*)ptr;
+        ms->opt.litLengthFreq = ms->opt.litFreq + (1<<Litbits);
+        ms->opt.matchLengthFreq = ms->opt.litLengthFreq + (MaxLL+1);
+        ms->opt.offCodeFreq = ms->opt.matchLengthFreq + (MaxML+1);
+        ptr = ms->opt.offCodeFreq + (MaxOff+1);
+        ms->opt.matchTable = (ZSTD_match_t*)ptr;
+        ptr = ms->opt.matchTable + ZSTD_OPT_NUM+1;
+        ms->opt.priceTable = (ZSTD_optimal_t*)ptr;
+        ptr = ms->opt.priceTable + ZSTD_OPT_NUM+1;
+    }
+
+    /* table Space */
+    DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_noMemset);
+    assert(((size_t)ptr & 3) == 0);  /* ensure ptr is properly aligned */
+    if (crp!=ZSTDcrp_noMemset) memset(ptr, 0, tableSpace);   /* reset tables only */
+    ms->hashTable = (U32*)(ptr);
+    ms->chainTable = ms->hashTable + hSize;
+    ms->hashTable3 = ms->chainTable + chainSize;
+    ptr = ms->hashTable3 + h3Size;
+
+    assert(((size_t)ptr & 3) == 0);
+    return ptr;
+}
+
+/*! ZSTD_resetCCtx_internal() :
+    note : `params` are assumed fully validated at this stage */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+                                      ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                                      ZSTD_compResetPolicy_e const crp,
+                                      ZSTD_buffered_policy_e const zbuff)
+{
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
+                (U32)pledgedSrcSize, params.cParams.windowLog);
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+
+    if (crp == ZSTDcrp_continue) {
+        if (ZSTD_equivalentParams(zc->appliedParams, params,
+                                zc->inBuffSize, zc->blockSize,
+                                zbuff, pledgedSrcSize)) {
+            DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%u)",
+                        zc->appliedParams.cParams.windowLog, (U32)zc->blockSize);
+            return ZSTD_continueCCtx(zc, params, pledgedSrcSize);
+    }   }
+    DEBUGLOG(4, "ZSTD_equivalentParams()==0 -> reset CCtx");
+
+    if (params.ldmParams.enableLdm) {
+        /* Adjust long distance matching parameters */
+        params.ldmParams.windowLog = params.cParams.windowLog;
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashEveryLog < 32);
+        zc->ldmState.hashPower =
+                ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
+    }
+
+    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+        U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
+        size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
+        size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
+        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
+        void* ptr;
+
+        /* Check if workSpace is large enough, alloc a new one if needed */
+        {   size_t const entropySpace = HUF_WORKSPACE_SIZE;
+            size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
+            size_t const bufferSpace = buffInSize + buffOutSize;
+            size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams);
+            size_t const ldmSeqSpace = maxNbLdmSeq * sizeof(rawSeq);
+
+            size_t const neededSpace = entropySpace + blockStateSpace + ldmSpace +
+                                       ldmSeqSpace + matchStateSize + tokenSpace +
+                                       bufferSpace;
+            DEBUGLOG(4, "Need %uKB workspace, including %uKB for match state, and %uKB for buffers",
+                        (U32)(neededSpace>>10), (U32)(matchStateSize>>10), (U32)(bufferSpace>>10));
+            DEBUGLOG(4, "windowSize: %u - blockSize: %u", (U32)windowSize, (U32)blockSize);
+
+            if (zc->workSpaceSize < neededSpace) {  /* too small : resize */
+                DEBUGLOG(4, "Need to update workSpaceSize from %uK to %uK",
+                            (unsigned)(zc->workSpaceSize>>10),
+                            (unsigned)(neededSpace>>10));
+                /* static cctx : no resize, error out */
+                if (zc->staticSize) return ERROR(memory_allocation);
+
+                zc->workSpaceSize = 0;
+                ZSTD_free(zc->workSpace, zc->customMem);
+                zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
+                if (zc->workSpace == NULL) return ERROR(memory_allocation);
+                zc->workSpaceSize = neededSpace;
+                ptr = zc->workSpace;
+
+                /* Statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+                assert(((size_t)zc->workSpace & 3) == 0);   /* ensure correct alignment */
+                assert(zc->workSpaceSize >= 2 * sizeof(ZSTD_compressedBlockState_t));
+                zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)zc->workSpace;
+                zc->blockState.nextCBlock = zc->blockState.prevCBlock + 1;
+                ptr = zc->blockState.nextCBlock + 1;
+                zc->entropyWorkspace = (U32*)ptr;
+        }   }
+
+        /* init params */
+        zc->appliedParams = params;
+        zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+        zc->consumedSrcSize = 0;
+        zc->producedCSize = 0;
+        if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+            zc->appliedParams.fParams.contentSizeFlag = 0;
+        DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+            (U32)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+        zc->blockSize = blockSize;
+
+        XXH64_reset(&zc->xxhState, 0);
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+
+        ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+        ptr = zc->entropyWorkspace + HUF_WORKSPACE_SIZE_U32;
+
+        /* ldm hash table */
+        /* initialize bucketOffsets table later for pointer alignment */
+        if (params.ldmParams.enableLdm) {
+            size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog;
+            memset(ptr, 0, ldmHSize * sizeof(ldmEntry_t));
+            assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */
+            zc->ldmState.hashTable = (ldmEntry_t*)ptr;
+            ptr = zc->ldmState.hashTable + ldmHSize;
+            zc->ldmSequences = (rawSeq*)ptr;
+            ptr = zc->ldmSequences + maxNbLdmSeq;
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            memset(&zc->ldmState.window, 0, sizeof(zc->ldmState.window));
+        }
+        assert(((size_t)ptr & 3) == 0); /* ensure ptr is properly aligned */
+
+        ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);
+
+        /* sequences storage */
+        zc->seqStore.sequencesStart = (seqDef*)ptr;
+        ptr = zc->seqStore.sequencesStart + maxNbSeq;
+        zc->seqStore.llCode = (BYTE*) ptr;
+        zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
+        zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
+        zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
+        ptr = zc->seqStore.litStart + blockSize;
+
+        /* ldm bucketOffsets table */
+        if (params.ldmParams.enableLdm) {
+            size_t const ldmBucketSize =
+                  ((size_t)1) << (params.ldmParams.hashLog -
+                                  params.ldmParams.bucketSizeLog);
+            memset(ptr, 0, ldmBucketSize);
+            zc->ldmState.bucketOffsets = (BYTE*)ptr;
+            ptr = zc->ldmState.bucketOffsets + ldmBucketSize;
+            ZSTD_window_clear(&zc->ldmState.window);
+        }
+        ZSTD_referenceExternalSequences(zc, NULL, 0);
+
+        /* buffers */
+        zc->inBuffSize = buffInSize;
+        zc->inBuff = (char*)ptr;
+        zc->outBuffSize = buffOutSize;
+        zc->outBuff = zc->inBuff + buffInSize;
+
+        return 0;
+    }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+    int i;
+    for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0;
+    assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            unsigned windowLog,
+                            ZSTD_frameParameters fParams,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    {   ZSTD_CCtx_params params = cctx->requestedParams;
+        /* Copy only compression parameters related to tables. */
+        params.cParams = cdict->cParams;
+        if (windowLog) params.cParams.windowLog = windowLog;
+        params.fParams = fParams;
+        ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                ZSTDcrp_noMemset, zbuff);
+        assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog);
+    }
+
+    /* copy tables */
+    {   size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog);
+        size_t const hSize =  (size_t)1 << cdict->cParams.hashLog;
+        size_t const tableSpace = (chainSize + hSize) * sizeof(U32);
+        assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize);
+        assert((U32*)cdict->matchState.chainTable == (U32*)cdict->matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize);
+        memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
+    }
+    /* Zero the hashTable3, since the cdict never fills it */
+    {   size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3;
+        assert(cdict->matchState.hashLog3 == 0);
+        memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+    }
+
+    /* copy dictionary offsets */
+    {
+        ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+    cctx->dictID = cdict->dictID;
+
+    /* copy block state */
+    memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  The "context", in this case, refers to the hash and chain tables,
+ *  entropy tables, and dictionary references.
+ * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx.
+ * @return : 0, or an error code */
+static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+                            const ZSTD_CCtx* srcCCtx,
+                            ZSTD_frameParameters fParams,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
+    if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong);
+
+    memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+    {   ZSTD_CCtx_params params = dstCCtx->requestedParams;
+        /* Copy only compression parameters related to tables. */
+        params.cParams = srcCCtx->appliedParams.cParams;
+        params.fParams = fParams;
+        ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
+                                ZSTDcrp_noMemset, zbuff);
+        assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
+        assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
+        assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
+        assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
+        assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
+    }
+
+    /* copy tables */
+    {   size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+        size_t const hSize =  (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
+        size_t const h3Size = (size_t)1 << srcCCtx->blockState.matchState.hashLog3;
+        size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+        assert((U32*)dstCCtx->blockState.matchState.chainTable == (U32*)dstCCtx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)dstCCtx->blockState.matchState.hashTable3 == (U32*)dstCCtx->blockState.matchState.chainTable + chainSize);
+        memcpy(dstCCtx->blockState.matchState.hashTable, srcCCtx->blockState.matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
+    }
+
+    /* copy dictionary offsets */
+    {
+        ZSTD_matchState_t const* srcMatchState = &srcCCtx->blockState.matchState;
+        ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
+        dstMatchState->window       = srcMatchState->window;
+        dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+        dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
+        dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+    }
+    dstCCtx->dictID = srcCCtx->dictID;
+
+    /* copy block state */
+    memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
+
+    return 0;
+}
+
+/*! ZSTD_copyCCtx() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  pledgedSrcSize==0 means "unknown".
+*   @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+    ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0);
+    ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1);
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+    fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN);
+
+    return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx,
+                                fParams, pledgedSrcSize,
+                                zbuff);
+}
+
+
+#define ZSTD_ROWSIZE 16
+/*! ZSTD_reduceTable() :
+ *  reduce table indexes by `reducerValue`, or squash to zero.
+ *  PreserveMark preserves "unsorted mark" for btlazy2 strategy.
+ *  It must be set to a clear 0/1 value, to remove branch during inlining.
+ *  Presume table size is a multiple of ZSTD_ROWSIZE
+ *  to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
+{
+    int const nbRows = (int)size / ZSTD_ROWSIZE;
+    int cellNb = 0;
+    int rowNb;
+    assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be casted to int */
+    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+        int column;
+        for (column=0; column<ZSTD_ROWSIZE; column++) {
+            if (preserveMark) {
+                U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
+                table[cellNb] += adder;
+            }
+            if (table[cellNb] < reducerValue) table[cellNb] = 0;
+            else table[cellNb] -= reducerValue;
+            cellNb++;
+    }   }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 0);
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
+{
+    ZSTD_reduceTable_internal(table, size, reducerValue, 1);
+}
+
+/*! ZSTD_reduceIndex() :
+*   rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
+{
+    ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+    {   U32 const hSize = (U32)1 << zc->appliedParams.cParams.hashLog;
+        ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
+    }
+
+    if (zc->appliedParams.cParams.strategy != ZSTD_fast) {
+        U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog;
+        if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2)
+            ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+        else
+            ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+    }
+
+    if (ms->hashLog3) {
+        U32 const h3Size = (U32)1 << ms->hashLog3;
+        ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue);
+    }
+}
+
+
+/*-*******************************************************
+*  Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    if (srcSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+    MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);
+    return ZSTD_blockHeaderSize+srcSize;
+}
+
+
+static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    if (srcSize + flSize > dstCapacity) return ERROR(dstSize_tooSmall);
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    memcpy(ostart + flSize, src, srcSize);
+    return srcSize + flSize;
+}
+
+static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ostart[flSize] = *(const BYTE*)src;
+    return flSize+1;
+}
+
+
+static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; }
+
+static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy,
+                                     ZSTD_entropyCTables_t* nextEntropy,
+                                     ZSTD_strategy strategy, int disableLiteralCompression,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32* workspace, const int bmi2)
+{
+    size_t const minGain = ZSTD_minGain(srcSize);
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i)",
+                disableLiteralCompression);
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    nextEntropy->hufCTable_repeatMode = prevEntropy->hufCTable_repeatMode;
+    memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable,
+           sizeof(prevEntropy->hufCTable));
+
+    if (disableLiteralCompression)
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    /* small ? don't even attempt compression (speed opt) */
+#   define COMPRESS_LITERALS_SIZE_MIN 63
+    {   size_t const minLitSize = (prevEntropy->hufCTable_repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+
+    if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
+    {   HUF_repeat repeat = prevEntropy->hufCTable_repeatMode;
+        int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+        cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2)
+                                : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2);
+        if (repeat != HUF_repeat_none) {
+            /* reused the existing table */
+            hType = set_repeat;
+        }
+    }
+
+    if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) {
+        memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable));
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+    if (cLitSize==1) {
+        memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable));
+        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }
+
+    if (hType == set_compressed) {
+        /* using a newly constructed table */
+        nextEntropy->hufCTable_repeatMode = HUF_repeat_check;
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    return lhSize+cLitSize;
+}
+
+
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    BYTE* const llCodeTable = seqStorePtr->llCode;
+    BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    U32 u;
+    for (u=0; u<nbSeq; u++) {
+        U32 const llv = sequences[u].litLength;
+        U32 const mlv = sequences[u].matchLength;
+        llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
+        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+        mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+    }
+    if (seqStorePtr->longLengthID==1)
+        llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+    if (seqStorePtr->longLengthID==2)
+        mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
+
+typedef enum {
+    ZSTD_defaultDisallowed = 0,
+    ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+
+MEM_STATIC
+symbolEncodingType_e ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, size_t const mostFrequent, size_t nbSeq,
+        U32 defaultNormLog, ZSTD_defaultPolicy_e const isDefaultAllowed)
+{
+#define MIN_SEQ_FOR_DYNAMIC_FSE   64
+#define MAX_SEQ_FOR_STATIC_FSE  1000
+    ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
+    if ((mostFrequent == nbSeq) && (!isDefaultAllowed || nbSeq > 2)) {
+        DEBUGLOG(5, "Selected set_rle");
+        /* Prefer set_basic over set_rle when there are 2 or less symbols,
+         * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+         * If basic encoding isn't possible, always choose RLE.
+         */
+        *repeatMode = FSE_repeat_check;
+        return set_rle;
+    }
+    if ( isDefaultAllowed
+      && (*repeatMode == FSE_repeat_valid) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        DEBUGLOG(5, "Selected set_repeat");
+        return set_repeat;
+    }
+    if ( isDefaultAllowed
+      && ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (defaultNormLog-1)))) ) {
+        DEBUGLOG(5, "Selected set_basic");
+        /* The format allows default tables to be repeated, but it isn't useful.
+         * When using simple heuristics to select encoding type, we don't want
+         * to confuse these tables with dictionaries. When running more careful
+         * analysis, we don't need to waste time checking both repeating tables
+         * and default tables.
+         */
+        *repeatMode = FSE_repeat_none;
+        return set_basic;
+    }
+    DEBUGLOG(5, "Selected set_compressed");
+    *repeatMode = FSE_repeat_check;
+    return set_compressed;
+}
+
+MEM_STATIC
+size_t ZSTD_buildCTable(void* dst, size_t dstCapacity,
+        FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+        U32* count, U32 max,
+        BYTE const* codeTable, size_t nbSeq,
+        S16 const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+        FSE_CTable const* prevCTable, size_t prevCTableSize,
+        void* workspace, size_t workspaceSize)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE const* const oend = op + dstCapacity;
+
+    switch (type) {
+    case set_rle:
+        *op = codeTable[0];
+        CHECK_F(FSE_buildCTable_rle(nextCTable, (BYTE)max));
+        return 1;
+    case set_repeat:
+        memcpy(nextCTable, prevCTable, prevCTableSize);
+        return 0;
+    case set_basic:
+        CHECK_F(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, workspace, workspaceSize));  /* note : could be pre-calculated */
+        return 0;
+    case set_compressed: {
+        S16 norm[MaxSeq + 1];
+        size_t nbSeq_1 = nbSeq;
+        const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+        if (count[codeTable[nbSeq-1]] > 1) {
+            count[codeTable[nbSeq-1]]--;
+            nbSeq_1--;
+        }
+        assert(nbSeq_1 > 1);
+        CHECK_F(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max));
+        {   size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog);   /* overflow protected */
+            if (FSE_isError(NCountSize)) return NCountSize;
+            CHECK_F(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, workspace, workspaceSize));
+            return NCountSize;
+        }
+    }
+    default: return assert(0), ERROR(GENERIC);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_encodeSequences_body(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    BIT_CStream_t blockStream;
+    FSE_CState_t  stateMatchLength;
+    FSE_CState_t  stateOffsetBits;
+    FSE_CState_t  stateLitLength;
+
+    CHECK_E(BIT_initCStream(&blockStream, dst, dstCapacity), dstSize_tooSmall); /* not enough space remaining */
+
+    /* first symbols */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+                    ofBits - extraBits);
+    } else {
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        sequences[n].litLength,
+                        sequences[n].matchLength + MINMATCH,
+                        sequences[n].offset);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {
+                int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offset, ofBits);     /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        if (streamSize==0) return ERROR(dstSize_tooSmall);   /* not enough space */
+        return streamSize;
+    }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+                                         CTable_MatchLength, mlCodeTable,
+                                         CTable_OffsetBits, ofCodeTable,
+                                         CTable_LitLength, llCodeTable,
+                                         sequences, nbSeq, longOffsets);
+    }
+#endif
+    (void)bmi2;
+    return ZSTD_encodeSequences_default(dst, dstCapacity,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq, longOffsets);
+}
+
+MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
+                              ZSTD_entropyCTables_t const* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                              ZSTD_CCtx_params const* cctxParams,
+                              void* dst, size_t dstCapacity, U32* workspace,
+                              const int bmi2)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    U32 count[MaxSeq+1];
+    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    BYTE* seqHead;
+
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+
+    /* Compress literals */
+    {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const litSize = seqStorePtr->lit - literals;
+        size_t const cSize = ZSTD_compressLiterals(
+                                    prevEntropy, nextEntropy,
+                                    cctxParams->cParams.strategy, cctxParams->disableLiteralCompression,
+                                    op, dstCapacity,
+                                    literals, litSize,
+                                    workspace, bmi2);
+        if (ZSTD_isError(cSize))
+          return cSize;
+        assert(cSize <= dstCapacity);
+        op += cSize;
+    }
+
+    /* Sequences Header */
+    if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/) return ERROR(dstSize_tooSmall);
+    if (nbSeq < 0x7F)
+        *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (nbSeq==0) {
+      memcpy(nextEntropy->litlengthCTable, prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable));
+      nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+      memcpy(nextEntropy->offcodeCTable, prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable));
+      nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+      memcpy(nextEntropy->matchlengthCTable, prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable));
+      nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+      return op - ostart;
+    }
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;
+
+    /* convert length/distances into codes */
+    ZSTD_seqToCodes(seqStorePtr);
+    /* build CTable for Literal Lengths */
+    {   U32 max = MaxLL;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);
+        DEBUGLOG(5, "Building LL table");
+        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+        LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, mostFrequent, nbSeq, LL_defaultNormLog, ZSTD_defaultAllowed);
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+                    count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                    prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable),
+                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            op += countSize;
+    }   }
+    /* build CTable for Offsets */
+    {   U32 max = MaxOff;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);
+        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+        DEBUGLOG(5, "Building OF table");
+        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+        Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, mostFrequent, nbSeq, OF_defaultNormLog, defaultPolicy);
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+                    count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                    prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable),
+                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            op += countSize;
+    }   }
+    /* build CTable for MatchLengths */
+    {   U32 max = MaxML;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);
+        DEBUGLOG(5, "Building ML table");
+        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+        MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, mostFrequent, nbSeq, ML_defaultNormLog, ZSTD_defaultAllowed);
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+                    count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+                    prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable),
+                    workspace, HUF_WORKSPACE_SIZE);
+            if (ZSTD_isError(countSize)) return countSize;
+            op += countSize;
+    }   }
+
+    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, oend - op,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        if (ZSTD_isError(bitstreamSize)) return bitstreamSize;
+        op += bitstreamSize;
+    }
+
+    return op - ostart;
+}
+
+MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
+                              ZSTD_entropyCTables_t const* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                              ZSTD_CCtx_params const* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              size_t srcSize, U32* workspace, int bmi2)
+{
+    size_t const cSize = ZSTD_compressSequences_internal(
+            seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity,
+            workspace, bmi2);
+    /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
+     * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
+     */
+    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+        return 0;  /* block not compressed */
+    if (ZSTD_isError(cSize)) return cSize;
+
+    /* Check compressibility */
+    {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize);  /* note : fixed formula, maybe should depend on compression level, or strategy */
+        if (cSize >= maxCSize) return 0;  /* block not compressed */
+    }
+
+    /* We check that dictionaries have offset codes available for the first
+     * block. After the first block, the offcode table might not have large
+     * enough codes to represent the offsets in the data.
+     */
+    if (nextEntropy->offcode_repeatMode == FSE_repeat_valid)
+        nextEntropy->offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
+}
+
+/* ZSTD_selectBlockCompressor() :
+ * Not static, but internal use only (used by long distance matcher)
+ * assumption : strat is a valid strategy */
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
+{
+    static const ZSTD_blockCompressor blockCompressor[2][(unsigned)ZSTD_btultra+1] = {
+        { ZSTD_compressBlock_fast  /* default for 0 */,
+          ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy,
+          ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2,
+          ZSTD_compressBlock_btopt, ZSTD_compressBlock_btultra },
+        { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
+          ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict,
+          ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict,
+          ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btultra_extDict }
+    };
+    ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
+
+    assert((U32)strat >= (U32)ZSTD_fast);
+    assert((U32)strat <= (U32)ZSTD_btultra);
+    return blockCompressor[extDict!=0][(U32)strat];
+}
+
+static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
+                                   const BYTE* anchor, size_t lastLLSize)
+{
+    memcpy(seqStorePtr->lit, anchor, lastLLSize);
+    seqStorePtr->lit += lastLLSize;
+}
+
+static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+{
+    ssPtr->lit = ssPtr->litStart;
+    ssPtr->sequences = ssPtr->sequencesStart;
+    ssPtr->longLengthID = 0;
+}
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+                                        void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{
+    ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+    DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+                (U32)dstCapacity, ms->window.dictLimit, ms->nextToUpdate);
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength);
+        return 0;   /* don't even attempt compression below a certain srcSize */
+    }
+    ZSTD_resetSeqStore(&(zc->seqStore));
+
+    /* limited update after a very long match */
+    {   const BYTE* const base = ms->window.base;
+        const BYTE* const istart = (const BYTE*)src;
+        const U32 current = (U32)(istart-base);
+        if (current > ms->nextToUpdate + 384)
+            ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384));
+    }
+
+    /* select and store sequences */
+    {   U32 const extDict = ZSTD_window_hasExtDict(ms->window);
+        size_t lastLLSize;
+        {   int i;
+            for (i = 0; i < ZSTD_REP_NUM; ++i)
+                zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
+        }
+        if (zc->externSeqStore.pos < zc->externSeqStore.size) {
+            assert(!zc->appliedParams.ldmParams.enableLdm);
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&zc->externSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       &zc->appliedParams.cParams,
+                                       src, srcSize, extDict);
+            assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
+        } else if (zc->appliedParams.ldmParams.enableLdm) {
+            rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
+
+            ldmSeqStore.seq = zc->ldmSequences;
+            ldmSeqStore.capacity = zc->maxNbLdmSequences;
+            /* Updates ldmSeqStore.size */
+            CHECK_F(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
+                                               &zc->appliedParams.ldmParams,
+                                               src, srcSize));
+            /* Updates ldmSeqStore.pos */
+            lastLLSize =
+                ZSTD_ldm_blockCompress(&ldmSeqStore,
+                                       ms, &zc->seqStore,
+                                       zc->blockState.nextCBlock->rep,
+                                       &zc->appliedParams.cParams,
+                                       src, srcSize, extDict);
+            assert(ldmSeqStore.pos == ldmSeqStore.size);
+        } else {   /* not long range mode */
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, extDict);
+            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize);
+        }
+        {   const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
+            ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
+    }   }
+
+    /* encode sequences and literals */
+    {   size_t const cSize = ZSTD_compressSequences(&zc->seqStore,
+                                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+                                &zc->appliedParams,
+                                dst, dstCapacity,
+                                srcSize, zc->entropyWorkspace, zc->bmi2);
+        if (ZSTD_isError(cSize) || cSize == 0) return cSize;
+        /* confirm repcodes and entropy tables */
+        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+            zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+            zc->blockState.nextCBlock = tmp;
+        }
+        return cSize;
+    }
+}
+
+
+/*! ZSTD_compress_frameChunk() :
+*   Compress a chunk of data into one or multiple blocks.
+*   All blocks will be terminated, all input will be consumed.
+*   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+*   Frame is supposed already started (header already produced)
+*   @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32 lastFrameChunk)
+{
+    size_t blockSize = cctx->blockSize;
+    size_t remaining = srcSize;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
+    assert(cctx->appliedParams.cParams.windowLog <= 31);
+
+    DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (U32)blockSize);
+    if (cctx->appliedParams.fParams.checksumFlag && srcSize)
+        XXH64_update(&cctx->xxhState, src, srcSize);
+
+    while (remaining) {
+        ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+        U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+
+        if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE)
+            return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
+        if (remaining < blockSize) blockSize = remaining;
+
+        if (ZSTD_window_needOverflowCorrection(ms->window, ip + blockSize)) {
+            U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
+            U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
+            ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+
+            ZSTD_reduceIndex(cctx, correction);
+            if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
+            else ms->nextToUpdate -= correction;
+            ms->loadedDictEnd = 0;
+        }
+        ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd);
+        if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
+
+        {   size_t cSize = ZSTD_compressBlock_internal(cctx,
+                                op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+                                ip, blockSize);
+            if (ZSTD_isError(cSize)) return cSize;
+
+            if (cSize == 0) {  /* block is not compressible */
+                U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3);
+                if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+                MEM_writeLE32(op, cBlockHeader24);   /* 4th byte will be overwritten */
+                memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
+                cSize = ZSTD_blockHeaderSize + blockSize;
+            } else {
+                U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+                MEM_writeLE24(op, cBlockHeader24);
+                cSize += ZSTD_blockHeaderSize;
+            }
+
+            ip += blockSize;
+            assert(remaining >= blockSize);
+            remaining -= blockSize;
+            op += cSize;
+            assert(dstCapacity >= cSize);
+            dstCapacity -= cSize;
+            DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
+                        (U32)cSize);
+    }   }
+
+    if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;
+    return op-ostart;
+}
+
+
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
+                                    ZSTD_CCtx_params params, U64 pledgedSrcSize, U32 dictID)
+{   BYTE* const op = (BYTE*)dst;
+    U32   const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const dictIDSizeCode = params.fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength;   /* 0-3 */
+    U32   const checksumFlag = params.fParams.checksumFlag>0;
+    U32   const windowSize = (U32)1 << params.cParams.windowLog;
+    U32   const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+    BYTE  const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+    U32   const fcsCode = params.fParams.contentSizeFlag ?
+                     (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0;  /* 0-3 */
+    BYTE  const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
+    size_t pos=0;
+
+    if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall);
+    DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
+                !params.fParams.noDictIDFlag, dictID,  dictIDSizeCode);
+
+    if (params.format == ZSTD_f_zstd1) {
+        MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+        pos = 4;
+    }
+    op[pos++] = frameHeaderDecriptionByte;
+    if (!singleSegment) op[pos++] = windowLogByte;
+    switch(dictIDSizeCode)
+    {
+        default:  assert(0); /* impossible */
+        case 0 : break;
+        case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+        case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+        case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+    }
+    switch(fcsCode)
+    {
+        default:  assert(0); /* impossible */
+        case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
+        case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;
+        case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+        case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+    }
+    return pos;
+}
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapcity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+    if (dstCapacity < ZSTD_blockHeaderSize) return ERROR(dstSize_tooSmall);
+    {   U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1);  /* 0 size */
+        MEM_writeLE24(dst, cBlockHeader24);
+        return ZSTD_blockHeaderSize;
+    }
+}
+
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{
+    if (cctx->stage != ZSTDcs_init)
+        return ERROR(stage_wrong);
+    if (cctx->appliedParams.ldmParams.enableLdm)
+        return ERROR(parameter_unsupported);
+    cctx->externSeqStore.seq = seq;
+    cctx->externSeqStore.size = nbSeq;
+    cctx->externSeqStore.capacity = nbSeq;
+    cctx->externSeqStore.pos = 0;
+    return 0;
+}
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                               U32 frame, U32 lastFrameChunk)
+{
+    ZSTD_matchState_t* ms = &cctx->blockState.matchState;
+    size_t fhSize = 0;
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+                cctx->stage, (U32)srcSize);
+    if (cctx->stage==ZSTDcs_created) return ERROR(stage_wrong);   /* missing init (ZSTD_compressBegin) */
+
+    if (frame && (cctx->stage==ZSTDcs_init)) {
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams,
+                                       cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        dst = (char*)dst + fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (!srcSize) return fhSize;  /* do not generate an empty block if no input */
+
+    if (!ZSTD_window_update(&ms->window, src, srcSize)) {
+        ms->nextToUpdate = ms->window.dictLimit;
+    }
+    if (cctx->appliedParams.ldmParams.enableLdm)
+        ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+
+    DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize);
+    {   size_t const cSize = frame ?
+                             ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+                             ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize);
+        if (ZSTD_isError(cSize)) return cSize;
+        cctx->consumedSrcSize += srcSize;
+        cctx->producedCSize += (cSize + fhSize);
+        if (cctx->appliedParams.fParams.contentSizeFlag) {  /* control src size */
+            if (cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne) {
+                DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize >= %u",
+                    (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize);
+                return ERROR(srcSize_wrong);
+            }
+        }
+        return cSize + fhSize;
+    }
+}
+
+size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (U32)srcSize);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
+}
+
+
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
+    assert(!ZSTD_checkCParams(cParams));
+    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+}
+
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    if (srcSize > blockSizeMax) return ERROR(srcSize_wrong);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ *  @return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    ZSTD_compressionParameters const* cParams = &params->cParams;
+
+    ZSTD_window_update(&ms->window, src, srcSize);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+
+    if (srcSize <= HASH_READ_SIZE) return 0;
+
+    switch(params->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, cParams, iend);
+        break;
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable(ms, cParams, iend);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE);
+        break;
+
+    case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend);
+        break;
+
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    ms->nextToUpdate = (U32)(iend - ms->window.base);
+    return 0;
+}
+
+
+/* Dictionaries that assign zero probability to symbols that show up causes problems
+   when FSE encoding.  Refuse dictionaries that assign zero probability to symbols
+   that we may encounter during compression.
+   NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+    U32 s;
+    if (dictMaxSymbolValue < maxSymbolValue) return ERROR(dictionary_corrupted);
+    for (s = 0; s <= maxSymbolValue; ++s) {
+        if (normalizedCounter[s] == 0) return ERROR(dictionary_corrupted);
+    }
+    return 0;
+}
+
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : dictID, or an error code
+ *  assumptions : magic number supposed already checked
+ *                dictSize supposed > 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, void* workspace)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
+    size_t dictID;
+
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+
+    dictPtr += 4;   /* skip magic number */
+    dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr);
+    dictPtr += 4;
+
+    {   unsigned maxSymbolValue = 255;
+        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.hufCTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr);
+        if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);
+        if (maxSymbolValue < 255) return ERROR(dictionary_corrupted);
+        dictPtr += hufHeaderSize;
+    }
+
+    {   unsigned offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        /* Every match length code must have non-zero probability */
+        CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        /* Every literal length code must have non-zero probability */
+        CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE),
+                 dictionary_corrupted);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    bs->rep[0] = MEM_readLE32(dictPtr+0);
+    bs->rep[1] = MEM_readLE32(dictPtr+4);
+    bs->rep[2] = MEM_readLE32(dictPtr+8);
+    dictPtr += 12;
+
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+        }
+        /* All offset values <= dictContentSize + 128 KB must be representable */
+        CHECK_F (ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)));
+        /* All repCodes must be <= dictContentSize and != 0*/
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                if (bs->rep[u] == 0) return ERROR(dictionary_corrupted);
+                if (bs->rep[u] > dictContentSize) return ERROR(dictionary_corrupted);
+        }   }
+
+        bs->entropy.hufCTable_repeatMode = HUF_repeat_valid;
+        bs->entropy.offcode_repeatMode = FSE_repeat_valid;
+        bs->entropy.matchlength_repeatMode = FSE_repeat_valid;
+        bs->entropy.litlength_repeatMode = FSE_repeat_valid;
+        CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize));
+        return dictID;
+    }
+}
+
+/** ZSTD_compress_insertDictionary() :
+*   @return : dictID, or an error code */
+static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms,
+                                             ZSTD_CCtx_params const* params,
+                                       const void* dict, size_t dictSize,
+                                             ZSTD_dictContentType_e dictContentType,
+                                             void* workspace)
+{
+    DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+    if ((dict==NULL) || (dictSize<=8)) return 0;
+
+    ZSTD_reset_compressedBlockState(bs);
+
+    /* dict restricted modes */
+    if (dictContentType == ZSTD_dct_rawContent)
+        return ZSTD_loadDictionaryContent(ms, params, dict, dictSize);
+
+    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
+        if (dictContentType == ZSTD_dct_auto) {
+            DEBUGLOG(4, "raw content dictionary detected");
+            return ZSTD_loadDictionaryContent(ms, params, dict, dictSize);
+        }
+        if (dictContentType == ZSTD_dct_fullDict)
+            return ERROR(dictionary_wrong);
+        assert(0);   /* impossible */
+    }
+
+    /* dict as full zstd dictionary */
+    return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, workspace);
+}
+
+/*! ZSTD_compressBegin_internal() :
+ * @return : 0, or an error code */
+size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                             ZSTD_dictContentType_e dictContentType,
+                             const ZSTD_CDict* cdict,
+                             ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                             ZSTD_buffered_policy_e zbuff)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog);
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    if (cdict && cdict->dictContentSize>0) {
+        cctx->requestedParams = params;
+        return ZSTD_resetCCtx_usingCDict(cctx, cdict, params.cParams.windowLog,
+                                         params.fParams, pledgedSrcSize, zbuff);
+    }
+
+    CHECK_F( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                     ZSTDcrp_continue, zbuff) );
+    {
+        size_t const dictID = ZSTD_compress_insertDictionary(
+                cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+                &params, dict, dictSize, dictContentType, cctx->entropyWorkspace);
+        if (ZSTD_isError(dictID)) return dictID;
+        assert(dictID <= (size_t)(U32)-1);
+        cctx->dictID = (U32)dictID;
+    }
+    return 0;
+}
+
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params,
+                                    unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params.cParams.windowLog);
+    /* compression parameters verification and optimization */
+    CHECK_F( ZSTD_checkCParams(params.cParams) );
+    return ZSTD_compressBegin_internal(cctx,
+                                       dict, dictSize, dictContentType,
+                                       cdict,
+                                       params, pledgedSrcSize,
+                                       ZSTDb_not_buffered);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+*   @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params const cctxParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    return ZSTD_compressBegin_advanced_internal(cctx,
+                                            dict, dictSize, ZSTD_dct_auto,
+                                            NULL /*cdict*/,
+                                            cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+    ZSTD_CCtx_params const cctxParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (U32)dictSize);
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL,
+                                       cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
+}
+
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+*   Ends a frame.
+*   @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    size_t fhSize = 0;
+
+    DEBUGLOG(4, "ZSTD_writeEpilogue");
+    if (cctx->stage == ZSTDcs_created) return ERROR(stage_wrong);  /* init missing */
+
+    /* special case : empty frame */
+    if (cctx->stage == ZSTDcs_init) {
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->appliedParams, 0, 0);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        op += fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (cctx->stage != ZSTDcs_ending) {
+        /* write one last empty block, make it the "last" block */
+        U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+        if (dstCapacity<4) return ERROR(dstSize_tooSmall);
+        MEM_writeLE32(op, cBlockHeader24);
+        op += ZSTD_blockHeaderSize;
+        dstCapacity -= ZSTD_blockHeaderSize;
+    }
+
+    if (cctx->appliedParams.fParams.checksumFlag) {
+        U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+        if (dstCapacity<4) return ERROR(dstSize_tooSmall);
+        DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", checksum);
+        MEM_writeLE32(op, checksum);
+        op += 4;
+    }
+
+    cctx->stage = ZSTDcs_created;  /* return to "created but no init" status */
+    return op-ostart;
+}
+
+size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize)
+{
+    size_t endResult;
+    size_t const cSize = ZSTD_compressContinue_internal(cctx,
+                                dst, dstCapacity, src, srcSize,
+                                1 /* frame mode */, 1 /* last chunk */);
+    if (ZSTD_isError(cSize)) return cSize;
+    endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
+    if (ZSTD_isError(endResult)) return endResult;
+    if (cctx->appliedParams.fParams.contentSizeFlag) {  /* control src size */
+        DEBUGLOG(4, "end of frame : controlling src size");
+        if (cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1) {
+            DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize = %u",
+                (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize);
+            return ERROR(srcSize_wrong);
+    }   }
+    return cSize + endResult;
+}
+
+
+static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{
+    ZSTD_CCtx_params const cctxParams =
+            ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    DEBUGLOG(4, "ZSTD_compress_internal");
+    return ZSTD_compress_advanced_internal(cctx,
+                                          dst, dstCapacity,
+                                          src, srcSize,
+                                          dict, dictSize,
+                                          cctxParams);
+}
+
+size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_compress_advanced");
+    CHECK_F(ZSTD_checkCParams(params.cParams));
+    return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+}
+
+/* Internal */
+size_t ZSTD_compress_advanced_internal(
+        ZSTD_CCtx* cctx,
+        void* dst, size_t dstCapacity,
+        const void* src, size_t srcSize,
+        const void* dict,size_t dictSize,
+        ZSTD_CCtx_params params)
+{
+    DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)",
+                (U32)srcSize);
+    CHECK_F( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL,
+                                         params, srcSize, ZSTDb_not_buffered) );
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize,
+                               const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize ? srcSize : 1, dict ? dictSize : 0);
+    ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
+    assert(params.fParams.contentSizeFlag == 1);
+    ZSTD_CCtxParam_setParameter(&cctxParams, ZSTD_p_compressLiterals, compressionLevel>=0);
+    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, cctxParams);
+}
+
+size_t ZSTD_compressCCtx (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+{
+    DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (U32)srcSize);
+    return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
+}
+
+size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+{
+    size_t result;
+    ZSTD_CCtx ctxBody;
+    memset(&ctxBody, 0, sizeof(ctxBody));
+    ctxBody.customMem = ZSTD_defaultCMem;
+    result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
+    ZSTD_free(ctxBody.workSpace, ZSTD_defaultCMem);  /* can't free ctxBody itself, as it's on stack; free only heap content */
+    return result;
+}
+
+
+/* =====  Dictionary API  ===== */
+
+/*! ZSTD_estimateCDictSize_advanced() :
+ *  Estimate amount of memory that will be needed to create a dictionary with following arguments */
+size_t ZSTD_estimateCDictSize_advanced(
+        size_t dictSize, ZSTD_compressionParameters cParams,
+        ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (U32)sizeof(ZSTD_CDict));
+    return sizeof(ZSTD_CDict) + HUF_WORKSPACE_SIZE + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0)
+           + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
+}
+
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+    if (cdict==NULL) return 0;   /* support sizeof on NULL */
+    DEBUGLOG(5, "sizeof(*cdict) : %u", (U32)sizeof(*cdict));
+    return cdict->workspaceSize + (cdict->dictBuffer ? cdict->dictContentSize : 0) + sizeof(*cdict);
+}
+
+static size_t ZSTD_initCDict_internal(
+                    ZSTD_CDict* cdict,
+              const void* dictBuffer, size_t dictSize,
+                    ZSTD_dictLoadMethod_e dictLoadMethod,
+                    ZSTD_dictContentType_e dictContentType,
+                    ZSTD_compressionParameters cParams)
+{
+    DEBUGLOG(3, "ZSTD_initCDict_internal, dictContentType %u", (U32)dictContentType);
+    assert(!ZSTD_checkCParams(cParams));
+    cdict->cParams = cParams;
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
+        cdict->dictBuffer = NULL;
+        cdict->dictContent = dictBuffer;
+    } else {
+        void* const internalBuffer = ZSTD_malloc(dictSize, cdict->customMem);
+        cdict->dictBuffer = internalBuffer;
+        cdict->dictContent = internalBuffer;
+        if (!internalBuffer) return ERROR(memory_allocation);
+        memcpy(internalBuffer, dictBuffer, dictSize);
+    }
+    cdict->dictContentSize = dictSize;
+
+    /* Reset the state to no dictionary */
+    ZSTD_reset_compressedBlockState(&cdict->cBlockState);
+    {   void* const end = ZSTD_reset_matchState(
+                &cdict->matchState,
+                (U32*)cdict->workspace + HUF_WORKSPACE_SIZE_U32,
+                &cParams, ZSTDcrp_continue, /* forCCtx */ 0);
+        assert(end == (char*)cdict->workspace + cdict->workspaceSize);
+        (void)end;
+    }
+    /* (Maybe) load the dictionary
+     * Skips loading the dictionary if it is <= 8 bytes.
+     */
+    {   ZSTD_CCtx_params params;
+        memset(&params, 0, sizeof(params));
+        params.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+        params.fParams.contentSizeFlag = 1;
+        params.cParams = cParams;
+        {   size_t const dictID = ZSTD_compress_insertDictionary(
+                    &cdict->cBlockState, &cdict->matchState, &params,
+                    cdict->dictContent, cdict->dictContentSize,
+                    dictContentType, cdict->workspace);
+            if (ZSTD_isError(dictID)) return dictID;
+            assert(dictID <= (size_t)(U32)-1);
+            cdict->dictID = (U32)dictID;
+        }
+    }
+
+    return 0;
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
+{
+    DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (U32)dictContentType);
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+    {   ZSTD_CDict* const cdict = (ZSTD_CDict*)ZSTD_malloc(sizeof(ZSTD_CDict), customMem);
+        size_t const workspaceSize = HUF_WORKSPACE_SIZE + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+        void* const workspace = ZSTD_malloc(workspaceSize, customMem);
+
+        if (!cdict || !workspace) {
+            ZSTD_free(cdict, customMem);
+            ZSTD_free(workspace, customMem);
+            return NULL;
+        }
+        cdict->customMem = customMem;
+        cdict->workspace = workspace;
+        cdict->workspaceSize = workspaceSize;
+        if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                        dictBuffer, dictSize,
+                                        dictLoadMethod, dictContentType,
+                                        cParams) )) {
+            ZSTD_freeCDict(cdict);
+            return NULL;
+        }
+
+        return cdict;
+    }
+}
+
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize,
+                                     ZSTD_dlm_byCopy, ZSTD_dct_auto,
+                                     cParams, ZSTD_defaultCMem);
+}
+
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize,
+                                     ZSTD_dlm_byRef, ZSTD_dct_auto,
+                                     cParams, ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+    if (cdict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = cdict->customMem;
+        ZSTD_free(cdict->workspace, cMem);
+        ZSTD_free(cdict->dictBuffer, cMem);
+        ZSTD_free(cdict, cMem);
+        return 0;
+    }
+}
+
+/*! ZSTD_initStaticCDict_advanced() :
+ *  Generate a digested dictionary in provided memory area.
+ *  workspace: The memory area to emplace the dictionary into.
+ *             Provided pointer must 8-bytes aligned.
+ *             It must outlive dictionary usage.
+ *  workspaceSize: Use ZSTD_estimateCDictSize()
+ *                 to determine how large workspace must be.
+ *  cParams : use ZSTD_getCParams() to transform a compression level
+ *            into its relevants cParams.
+ * @return : pointer to ZSTD_CDict*, or NULL if error (size too small)
+ *  Note : there is no corresponding "free" function.
+ *         Since workspace was allocated externally, it must be freed externally.
+ */
+const ZSTD_CDict* ZSTD_initStaticCDict(
+                                 void* workspace, size_t workspaceSize,
+                           const void* dict, size_t dictSize,
+                                 ZSTD_dictLoadMethod_e dictLoadMethod,
+                                 ZSTD_dictContentType_e dictContentType,
+                                 ZSTD_compressionParameters cParams)
+{
+    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+    size_t const neededSize = sizeof(ZSTD_CDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize)
+                            + HUF_WORKSPACE_SIZE + matchStateSize;
+    ZSTD_CDict* const cdict = (ZSTD_CDict*) workspace;
+    void* ptr;
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+    DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
+        (U32)workspaceSize, (U32)neededSize, (U32)(workspaceSize < neededSize));
+    if (workspaceSize < neededSize) return NULL;
+
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        memcpy(cdict+1, dict, dictSize);
+        dict = cdict+1;
+        ptr = (char*)workspace + sizeof(ZSTD_CDict) + dictSize;
+    } else {
+        ptr = cdict+1;
+    }
+    cdict->workspace = ptr;
+    cdict->workspaceSize = HUF_WORKSPACE_SIZE + matchStateSize;
+
+    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType,
+                                              cParams) ))
+        return NULL;
+
+    return cdict;
+}
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
+{
+    assert(cdict != NULL);
+    return cdict->cParams;
+}
+
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced");
+    if (cdict==NULL) return ERROR(dictionary_wrong);
+    {   ZSTD_CCtx_params params = cctx->requestedParams;
+        params.cParams = ZSTD_getCParamsFromCDict(cdict);
+        /* Increase window log to fit the entire dictionary and source if the
+         * source size is known. Limit the increase to 19, which is the
+         * window log for compression level 1 with the largest source size.
+         */
+        if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+            U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19);
+            U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1;
+            params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog);
+        }
+        params.fParams = fParams;
+        return ZSTD_compressBegin_internal(cctx,
+                                           NULL, 0, ZSTD_dct_auto,
+                                           cdict,
+                                           params, pledgedSrcSize,
+                                           ZSTDb_not_buffered);
+    }
+}
+
+/* ZSTD_compressBegin_usingCDict() :
+ * pledgedSrcSize=0 means "unknown"
+ * if pledgedSrcSize>0, it will enable contentSizeFlag */
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag);
+    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, 0);
+}
+
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+    CHECK_F (ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize));   /* will check if cdict != NULL */
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression parameters are decided at CDict creation time
+ *  while frame parameters are hardcoded */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
+
+
+
+/* ******************************************************************
+*  Streaming
+********************************************************************/
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{
+    DEBUGLOG(3, "ZSTD_createCStream");
+    return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{
+    return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{   /* CStream and CCtx are now same object */
+    return ZSTD_createCCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{
+    return ZSTD_freeCCtx(zcs);   /* same object */
+}
+
+
+
+/*======   Initialization   ======*/
+
+size_t ZSTD_CStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX; }
+
+size_t ZSTD_CStreamOutSize(void)
+{
+    return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
+                    const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
+                    const ZSTD_CDict* const cdict,
+                    ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_resetCStream_internal (disableLiteralCompression=%i)",
+                params.disableLiteralCompression);
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    CHECK_F( ZSTD_compressBegin_internal(cctx,
+                                         dict, dictSize, dictContentType,
+                                         cdict,
+                                         params, pledgedSrcSize,
+                                         ZSTDb_buffered) );
+
+    cctx->inToCompress = 0;
+    cctx->inBuffPos = 0;
+    cctx->inBuffTarget = cctx->blockSize
+                      + (cctx->blockSize == pledgedSrcSize);   /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */
+    cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;
+    cctx->streamStage = zcss_load;
+    cctx->frameEnded = 0;
+    return 0;   /* ready to go */
+}
+
+/* ZSTD_resetCStream():
+ * pledgedSrcSize == 0 means "unknown" */
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params params = zcs->requestedParams;
+    DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize);
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+    params.fParams.contentSizeFlag = 1;
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, 0);
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
+}
+
+/*! ZSTD_initCStream_internal() :
+ *  Note : for lib/compress only. Used by zstdmt_compress.c.
+ *  Assumption 1 : params are valid
+ *  Assumption 2 : either dict, or cdict, is defined, not both */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                    const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_internal");
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+
+    if (dict && dictSize >= 8) {
+        DEBUGLOG(4, "loading dictionary of size %u", (U32)dictSize);
+        if (zcs->staticSize) {   /* static CCtx : never uses malloc */
+            /* incompatible with internal cdict creation */
+            return ERROR(memory_allocation);
+        }
+        ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                            ZSTD_dlm_byCopy, ZSTD_dct_auto,
+                                            params.cParams, zcs->customMem);
+        zcs->cdict = zcs->cdictLocal;
+        if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        if (cdict) {
+            params.cParams = ZSTD_getCParamsFromCDict(cdict);  /* cParams are enforced from cdict; it includes windowLog */
+        }
+        ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = NULL;
+        zcs->cdict = cdict;
+    }
+
+    return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
+}
+
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                                            const ZSTD_CDict* cdict,
+                                            ZSTD_frameParameters fParams,
+                                            unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
+    if (!cdict) return ERROR(dictionary_wrong); /* cannot handle NULL cdict (does not know what to do) */
+    {   ZSTD_CCtx_params params = zcs->requestedParams;
+        params.cParams = ZSTD_getCParamsFromCDict(cdict);
+        params.fParams = fParams;
+        return ZSTD_initCStream_internal(zcs,
+                                NULL, 0, cdict,
+                                params, pledgedSrcSize);
+    }
+}
+
+/* note : cdict must outlive compression session */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 0 /* contentSizeFlag */, 0 /* checksum */, 0 /* hideDictID */ };
+    DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
+    return ZSTD_initCStream_usingCDict_advanced(zcs, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);  /* note : will check that cdict != NULL */
+}
+
+
+/* ZSTD_initCStream_advanced() :
+ * pledgedSrcSize must be exact.
+ * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * dict is loaded with default parameters ZSTD_dm_auto and ZSTD_dlm_byCopy. */
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTD_initCStream_advanced: pledgedSrcSize=%u, flag=%u",
+                (U32)pledgedSrcSize, params.fParams.contentSizeFlag);
+    CHECK_F( ZSTD_checkCParams(params.cParams) );
+    if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */
+    {   ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+        return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize);
+    }
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    ZSTD_CCtx_params const cctxParams =
+            ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
+{
+    U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;  /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
+    ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{
+    DEBUGLOG(4, "ZSTD_initCStream");
+    return ZSTD_initCStream_srcSize(zcs, compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+/*======   Compression   ======*/
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity,
+                           const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    if (length) memcpy(dst, src, length);
+    return length;
+}
+
+/** ZSTD_compressStream_generic():
+ *  internal function for all *compressStream*() variants and *compress_generic()
+ *  non-static, because can be called from zstdmt_compress.c
+ * @return : hint size for next input */
+size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                                   ZSTD_outBuffer* output,
+                                   ZSTD_inBuffer* input,
+                                   ZSTD_EndDirective const flushMode)
+{
+    const char* const istart = (const char*)input->src;
+    const char* const iend = istart + input->size;
+    const char* ip = istart + input->pos;
+    char* const ostart = (char*)output->dst;
+    char* const oend = ostart + output->size;
+    char* op = ostart + output->pos;
+    U32 someMoreWork = 1;
+
+    /* check expectations */
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (U32)flushMode);
+    assert(zcs->inBuff != NULL);
+    assert(zcs->inBuffSize > 0);
+    assert(zcs->outBuff !=  NULL);
+    assert(zcs->outBuffSize > 0);
+    assert(output->pos <= output->size);
+    assert(input->pos <= input->size);
+
+    while (someMoreWork) {
+        switch(zcs->streamStage)
+        {
+        case zcss_init:
+            /* call ZSTD_initCStream() first ! */
+            return ERROR(init_missing);
+
+        case zcss_load:
+            if ( (flushMode == ZSTD_e_end)
+              && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip))  /* enough dstCapacity */
+              && (zcs->inBuffPos == 0) ) {
+                /* shortcut to compression pass directly into output buffer */
+                size_t const cSize = ZSTD_compressEnd(zcs,
+                                                op, oend-op, ip, iend-ip);
+                DEBUGLOG(4, "ZSTD_compressEnd : %u", (U32)cSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                ip = iend;
+                op += cSize;
+                zcs->frameEnded = 1;
+                ZSTD_startNewCompression(zcs);
+                someMoreWork = 0; break;
+            }
+            /* complete loading into inBuffer */
+            {   size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+                size_t const loaded = ZSTD_limitCopy(
+                                        zcs->inBuff + zcs->inBuffPos, toLoad,
+                                        ip, iend-ip);
+                zcs->inBuffPos += loaded;
+                ip += loaded;
+                if ( (flushMode == ZSTD_e_continue)
+                  && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+                    /* not enough input to fill full block : stop here */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (zcs->inBuffPos == zcs->inToCompress) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
+            }
+            /* compress current block (note : this stage cannot be stopped in the middle) */
+            DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+            {   void* cDst;
+                size_t cSize;
+                size_t const iSize = zcs->inBuffPos - zcs->inToCompress;
+                size_t oSize = oend-op;
+                unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+                if (oSize >= ZSTD_compressBound(iSize))
+                    cDst = op;   /* compress into output buffer, to skip flush stage */
+                else
+                    cDst = zcs->outBuff, oSize = zcs->outBuffSize;
+                cSize = lastBlock ?
+                        ZSTD_compressEnd(zcs, cDst, oSize,
+                                    zcs->inBuff + zcs->inToCompress, iSize) :
+                        ZSTD_compressContinue(zcs, cDst, oSize,
+                                    zcs->inBuff + zcs->inToCompress, iSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                zcs->frameEnded = lastBlock;
+                /* prepare next block */
+                zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+                if (zcs->inBuffTarget > zcs->inBuffSize)
+                    zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;
+                DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
+                         (U32)zcs->inBuffTarget, (U32)zcs->inBuffSize);
+                if (!lastBlock)
+                    assert(zcs->inBuffTarget <= zcs->inBuffSize);
+                zcs->inToCompress = zcs->inBuffPos;
+                if (cDst == op) {  /* no need to flush */
+                    op += cSize;
+                    if (zcs->frameEnded) {
+                        DEBUGLOG(5, "Frame completed directly in outBuffer");
+                        someMoreWork = 0;
+                        ZSTD_startNewCompression(zcs);
+                    }
+                    break;
+                }
+                zcs->outBuffContentSize = cSize;
+                zcs->outBuffFlushedSize = 0;
+                zcs->streamStage = zcss_flush; /* pass-through to flush stage */
+            }
+	    /* fall-through */
+        case zcss_flush:
+            DEBUGLOG(5, "flush stage");
+            {   size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+                size_t const flushed = ZSTD_limitCopy(op, oend-op,
+                            zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+                DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
+                            (U32)toFlush, (U32)(oend-op), (U32)flushed);
+                op += flushed;
+                zcs->outBuffFlushedSize += flushed;
+                if (toFlush!=flushed) {
+                    /* flush not fully completed, presumably because dst is too small */
+                    assert(op==oend);
+                    someMoreWork = 0;
+                    break;
+                }
+                zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+                if (zcs->frameEnded) {
+                    DEBUGLOG(5, "Frame completed on flush");
+                    someMoreWork = 0;
+                    ZSTD_startNewCompression(zcs);
+                    break;
+                }
+                zcs->streamStage = zcss_load;
+                break;
+            }
+
+        default: /* impossible */
+            assert(0);
+        }
+    }
+
+    input->pos = ip - istart;
+    output->pos = op - ostart;
+    if (zcs->frameEnded) return 0;
+    {   size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos;
+        if (hintInSize==0) hintInSize = zcs->blockSize;
+        return hintInSize;
+    }
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    /* check conditions */
+    if (output->pos > output->size) return ERROR(GENERIC);
+    if (input->pos  > input->size)  return ERROR(GENERIC);
+
+    return ZSTD_compressStream_generic(zcs, output, input, ZSTD_e_continue);
+}
+
+
+size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
+                              ZSTD_outBuffer* output,
+                              ZSTD_inBuffer* input,
+                              ZSTD_EndDirective endOp)
+{
+    DEBUGLOG(5, "ZSTD_compress_generic, endOp=%u ", (U32)endOp);
+    /* check conditions */
+    if (output->pos > output->size) return ERROR(GENERIC);
+    if (input->pos  > input->size)  return ERROR(GENERIC);
+    assert(cctx!=NULL);
+
+    /* transparent initialization stage */
+    if (cctx->streamStage == zcss_init) {
+        ZSTD_CCtx_params params = cctx->requestedParams;
+        ZSTD_prefixDict const prefixDict = cctx->prefixDict;
+        memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));  /* single usage */
+        assert(prefixDict.dict==NULL || cctx->cdict==NULL);   /* only one can be set */
+        DEBUGLOG(4, "ZSTD_compress_generic : transparent init stage");
+        if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1;  /* auto-fix pledgedSrcSize */
+        params.cParams = ZSTD_getCParamsFromCCtxParams(
+                &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
+
+#ifdef ZSTD_MULTITHREAD
+        if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
+            params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
+        }
+        if (params.nbWorkers > 0) {
+            /* mt context creation */
+            if (cctx->mtctx == NULL || (params.nbWorkers != ZSTDMT_getNbWorkers(cctx->mtctx))) {
+                DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbWorkers=%u",
+                            params.nbWorkers);
+                if (cctx->mtctx != NULL)
+                    DEBUGLOG(4, "ZSTD_compress_generic: previous nbWorkers was %u",
+                                ZSTDMT_getNbWorkers(cctx->mtctx));
+                ZSTDMT_freeCCtx(cctx->mtctx);
+                cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbWorkers, cctx->customMem);
+                if (cctx->mtctx == NULL) return ERROR(memory_allocation);
+            }
+            /* mt compression */
+            DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
+            CHECK_F( ZSTDMT_initCStream_internal(
+                        cctx->mtctx,
+                        prefixDict.dict, prefixDict.dictSize, ZSTD_dct_rawContent,
+                        cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) );
+            cctx->streamStage = zcss_load;
+            cctx->appliedParams.nbWorkers = params.nbWorkers;
+        } else
+#endif
+        {   CHECK_F( ZSTD_resetCStream_internal(cctx,
+                            prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+                            cctx->cdict,
+                            params, cctx->pledgedSrcSizePlusOne-1) );
+            assert(cctx->streamStage == zcss_load);
+            assert(cctx->appliedParams.nbWorkers == 0);
+    }   }
+
+    /* compression stage */
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        if (cctx->cParamsChanged) {
+            ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
+            cctx->cParamsChanged = 0;
+        }
+        {   size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
+            if ( ZSTD_isError(flushMin)
+              || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
+                ZSTD_startNewCompression(cctx);
+            }
+            return flushMin;
+    }   }
+#endif
+    CHECK_F( ZSTD_compressStream_generic(cctx, output, input, endOp) );
+    DEBUGLOG(5, "completed ZSTD_compress_generic");
+    return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_compress_generic_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp)
+{
+    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+    size_t const cErr = ZSTD_compress_generic(cctx, &output, &input, endOp);
+    *dstPos = output.pos;
+    *srcPos = input.pos;
+    return cErr;
+}
+
+
+/*======   Finalize   ======*/
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    ZSTD_inBuffer input = { NULL, 0, 0 };
+    if (output->pos > output->size) return ERROR(GENERIC);
+    CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_flush) );
+    return zcs->outBuffContentSize - zcs->outBuffFlushedSize;  /* remaining to flush */
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    ZSTD_inBuffer input = { NULL, 0, 0 };
+    if (output->pos > output->size) return ERROR(GENERIC);
+    CHECK_F( ZSTD_compressStream_generic(zcs, output, &input, ZSTD_e_end) );
+    {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
+        size_t const checksumSize = zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4;
+        size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize + lastBlockSize + checksumSize;
+        DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (U32)toFlush);
+        return toFlush;
+    }
+}
+
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL     22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{   /* "default" - guarantees a monotonically increasing memory budget */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
+    { 19, 13, 14,  1,  7,  1, ZSTD_fast    },  /* level  1 */
+    { 19, 15, 16,  1,  6,  1, ZSTD_fast    },  /* level  2 */
+    { 20, 16, 17,  1,  5,  8, ZSTD_dfast   },  /* level  3 */
+    { 20, 17, 18,  1,  5,  8, ZSTD_dfast   },  /* level  4 */
+    { 20, 17, 18,  2,  5, 16, ZSTD_greedy  },  /* level  5 */
+    { 21, 17, 19,  2,  5, 16, ZSTD_lazy    },  /* level  6 */
+    { 21, 18, 19,  3,  5, 16, ZSTD_lazy    },  /* level  7 */
+    { 21, 18, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 21, 19, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 21, 19, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 22, 20, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 20, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */
+    { 22, 21, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
+    { 22, 21, 22,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
+    { 22, 22, 22,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 21, 22,  4,  5, 48, ZSTD_btopt   },  /* level 16 */
+    { 23, 22, 22,  4,  4, 48, ZSTD_btopt   },  /* level 17 */
+    { 23, 22, 22,  5,  3, 64, ZSTD_btopt   },  /* level 18 */
+    { 23, 23, 22,  7,  3,128, ZSTD_btopt   },  /* level 19 */
+    { 25, 25, 23,  7,  3,128, ZSTD_btultra },  /* level 20 */
+    { 26, 26, 24,  7,  3,256, ZSTD_btultra },  /* level 21 */
+    { 27, 27, 25,  9,  3,512, ZSTD_btultra },  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 18, 13, 14,  1,  6,  1, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 13,  1,  5,  8, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 15,  1,  5,  8, ZSTD_dfast   },  /* level  3 */
+    { 18, 15, 17,  1,  5,  8, ZSTD_greedy  },  /* level  4.*/
+    { 18, 16, 17,  4,  5,  8, ZSTD_greedy  },  /* level  5.*/
+    { 18, 16, 17,  3,  5,  8, ZSTD_lazy    },  /* level  6.*/
+    { 18, 17, 17,  4,  4,  8, ZSTD_lazy    },  /* level  7 */
+    { 18, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 11.*/
+    { 18, 18, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 19, 17,  7,  4,  8, ZSTD_btlazy2 },  /* level 13 */
+    { 18, 18, 18,  4,  4, 16, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 18,  4,  3, 16, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 18,  6,  3, 32, ZSTD_btopt   },  /* level 16.*/
+    { 18, 19, 18,  8,  3, 64, ZSTD_btopt   },  /* level 17.*/
+    { 18, 19, 18,  9,  3,128, ZSTD_btopt   },  /* level 18.*/
+    { 18, 19, 18, 10,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 18, 19, 18, 11,  3,512, ZSTD_btultra },  /* level 20.*/
+    { 18, 19, 18, 12,  3,512, ZSTD_btultra },  /* level 21.*/
+    { 18, 19, 18, 13,  3,512, ZSTD_btultra },  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* level  0 - not used */
+    { 17, 12, 13,  1,  6,  1, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 16,  1,  5,  1, ZSTD_fast    },  /* level  2 */
+    { 17, 16, 16,  2,  5,  8, ZSTD_dfast   },  /* level  3 */
+    { 17, 13, 15,  3,  4,  8, ZSTD_greedy  },  /* level  4 */
+    { 17, 15, 17,  4,  4,  8, ZSTD_greedy  },  /* level  5 */
+    { 17, 16, 17,  3,  4,  8, ZSTD_lazy    },  /* level  6 */
+    { 17, 15, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  7,  4,  8, ZSTD_lazy2   },  /* level 11 */
+    { 17, 17, 17,  8,  4,  8, ZSTD_lazy2   },  /* level 12 */
+    { 17, 18, 17,  6,  4,  8, ZSTD_btlazy2 },  /* level 13.*/
+    { 17, 17, 17,  7,  3,  8, ZSTD_btopt   },  /* level 14.*/
+    { 17, 17, 17,  7,  3, 16, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  7,  3, 32, ZSTD_btopt   },  /* level 16.*/
+    { 17, 18, 17,  7,  3, 64, ZSTD_btopt   },  /* level 17.*/
+    { 17, 18, 17,  7,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 17, 18, 17,  9,  3,256, ZSTD_btultra },  /* level 20.*/
+    { 17, 18, 17, 10,  3,256, ZSTD_btultra },  /* level 21.*/
+    { 17, 18, 17, 11,  3,512, ZSTD_btultra },  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 14, 14, 14,  1,  6,  1, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 14,  1,  4,  1, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 14,  1,  4,  6, ZSTD_dfast   },  /* level  3.*/
+    { 14, 14, 14,  4,  4,  6, ZSTD_greedy  },  /* level  4.*/
+    { 14, 14, 14,  3,  4,  6, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  6, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  5,  4,  6, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  6,  4,  6, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  6,  4,  6, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  3,  3,  6, ZSTD_btopt   },  /* level 10.*/
+    { 14, 15, 14,  6,  3,  8, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  6,  3, 16, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  6,  3, 24, ZSTD_btopt   },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 48, ZSTD_btopt   },  /* level 14.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
+    { 14, 15, 15,  6,  3, 96, ZSTD_btopt   },  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btopt   },  /* level 17.*/
+    { 14, 15, 15,  6,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btultra },  /* level 20.*/
+    { 14, 15, 15,  9,  3,256, ZSTD_btultra },  /* level 21.*/
+    { 14, 15, 15, 10,  3,256, ZSTD_btultra },  /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams() :
+*  @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+*   Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+    size_t const addedSize = srcSizeHint ? 0 : 500;
+    U64 const rSize = srcSizeHint+dictSize ? srcSizeHint+dictSize+addedSize : (U64)-1;
+    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);   /* intentional underflow for srcSizeHint == 0 */
+    int row = compressionLevel;
+    DEBUGLOG(5, "ZSTD_getCParams (cLevel=%i)", compressionLevel);
+    if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT;   /* 0 == default */
+    if (compressionLevel < 0) row = 0;   /* entry 0 is baseline for fast mode */
+    if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+    {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+        if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel);   /* acceleration factor */
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); }
+
+}
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`).
+*   All fields of `ZSTD_frameParameters` are set to default (0) */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+    ZSTD_parameters params;
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSizeHint, dictSize);
+    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+    memset(&params, 0, sizeof(params));
+    params.cParams = cParams;
+    params.fParams.contentSizeFlag = 1;
+    return params;
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_internal.h b/vendor/github.com/DataDog/zstd/zstd_compress_internal.h
new file mode 100644
index 0000000..81f12ca
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_compress_internal.h

@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This header contains definitions
+ * that shall **only** be used by modules within lib/compress.
+ */
+
+#ifndef ZSTD_COMPRESS_H
+#define ZSTD_COMPRESS_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "zstd_internal.h"
+#ifdef ZSTD_MULTITHREAD
+#  include "zstdmt_compress.h"
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Constants
+***************************************/
+#define kSearchStrength      8
+#define HASH_READ_SIZE       8
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
+                                       It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+                                       It's not a big deal though : candidate will just be sorted again.
+                                       Additionnally, candidate position 1 will be lost.
+                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;
+
+typedef struct ZSTD_prefixDict_s {
+    const void* dict;
+    size_t dictSize;
+    ZSTD_dictContentType_e dictContentType;
+} ZSTD_prefixDict;
+
+typedef struct {
+    U32 hufCTable[HUF_CTABLE_SIZE_U32(255)];
+    FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+    FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+    HUF_repeat hufCTable_repeatMode;
+    FSE_repeat offcode_repeatMode;
+    FSE_repeat matchlength_repeatMode;
+    FSE_repeat litlength_repeatMode;
+} ZSTD_entropyCTables_t;
+
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+    int price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+typedef struct {
+    /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+    U32* litFreq;               /* table of literals statistics, of size 256 */
+    U32* litLengthFreq;         /* table of litLength statistics, of size (MaxLL+1) */
+    U32* matchLengthFreq;       /* table of matchLength statistics, of size (MaxML+1) */
+    U32* offCodeFreq;           /* table of offCode statistics, of size (MaxOff+1) */
+    ZSTD_match_t* matchTable;   /* list of found matches, of size ZSTD_OPT_NUM+1 */
+    ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+
+    U32  litSum;                 /* nb of literals */
+    U32  litLengthSum;           /* nb of litLength codes */
+    U32  matchLengthSum;         /* nb of matchLength codes */
+    U32  offCodeSum;             /* nb of offset codes */
+    /* begin updated by ZSTD_setLog2Prices */
+    U32  log2litSum;             /* pow2 to compare log2(litfreq) to */
+    U32  log2litLengthSum;       /* pow2 to compare log2(llfreq) to */
+    U32  log2matchLengthSum;     /* pow2 to compare log2(mlfreq) to */
+    U32  log2offCodeSum;         /* pow2 to compare log2(offreq) to */
+    /* end : updated by ZSTD_setLog2Prices */
+    U32  staticPrices;           /* prices follow a pre-defined cost structure, statistics are irrelevant */
+} optState_t;
+
+typedef struct {
+  ZSTD_entropyCTables_t entropy;
+  U32 rep[ZSTD_REP_NUM];
+} ZSTD_compressedBlockState_t;
+
+typedef struct {
+    BYTE const* nextSrc;    /* next block here to continue on current prefix */
+    BYTE const* base;       /* All regular indexes relative to this position */
+    BYTE const* dictBase;   /* extDict indexes relative to this position */
+    U32 dictLimit;          /* below that point, need extDict */
+    U32 lowLimit;           /* below that point, no more data */
+} ZSTD_window_t;
+
+typedef struct {
+    ZSTD_window_t window;      /* State for window round buffer management */
+    U32 loadedDictEnd;         /* index of end of dictionary */
+    U32 nextToUpdate;          /* index from which to continue table update */
+    U32 nextToUpdate3;         /* index from which to continue table update */
+    U32 hashLog3;              /* dispatch table : larger == faster, more memory */
+    U32* hashTable;
+    U32* hashTable3;
+    U32* chainTable;
+    optState_t opt;         /* optimal parser state */
+} ZSTD_matchState_t;
+
+typedef struct {
+    ZSTD_compressedBlockState_t* prevCBlock;
+    ZSTD_compressedBlockState_t* nextCBlock;
+    ZSTD_matchState_t matchState;
+} ZSTD_blockState_t;
+
+typedef struct {
+    U32 offset;
+    U32 checksum;
+} ldmEntry_t;
+
+typedef struct {
+    ZSTD_window_t window;   /* State for the window round buffer management */
+    ldmEntry_t* hashTable;
+    BYTE* bucketOffsets;    /* Next position in bucket to insert entry */
+    U64 hashPower;          /* Used to compute the rolling hash.
+                             * Depends on ldmParams.minMatchLength */
+} ldmState_t;
+
+typedef struct {
+    U32 enableLdm;          /* 1 if enable long distance matching */
+    U32 hashLog;            /* Log size of hashTable */
+    U32 bucketSizeLog;      /* Log bucket size for collision resolution, at most 8 */
+    U32 minMatchLength;     /* Minimum match length */
+    U32 hashEveryLog;       /* Log number of entries to skip */
+    U32 windowLog;          /* Window log for the LDM */
+} ldmParams_t;
+
+typedef struct {
+    U32 offset;
+    U32 litLength;
+    U32 matchLength;
+} rawSeq;
+
+typedef struct {
+  rawSeq* seq;     /* The start of the sequences */
+  size_t pos;      /* The position where reading stopped. <= size. */
+  size_t size;     /* The number of sequences. <= capacity. */
+  size_t capacity; /* The capacity of the `seq` pointer */
+} rawSeqStore_t;
+
+struct ZSTD_CCtx_params_s {
+    ZSTD_format_e format;
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+
+    int compressionLevel;
+    int disableLiteralCompression;
+    int forceWindow;           /* force back-references to respect limit of
+                                * 1<<wLog, even for dictionary */
+
+    /* Multithreading: used to pass parameters to mtctx */
+    unsigned nbWorkers;
+    unsigned jobSize;
+    unsigned overlapSizeLog;
+
+    /* Long distance matching parameters */
+    ldmParams_t ldmParams;
+
+    /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+    ZSTD_customMem customMem;
+};  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+struct ZSTD_CCtx_s {
+    ZSTD_compressionStage_e stage;
+    int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+    int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+    ZSTD_CCtx_params requestedParams;
+    ZSTD_CCtx_params appliedParams;
+    U32   dictID;
+    void* workSpace;
+    size_t workSpaceSize;
+    size_t blockSize;
+    unsigned long long pledgedSrcSizePlusOne;  /* this way, 0 (default) == unknown */
+    unsigned long long consumedSrcSize;
+    unsigned long long producedCSize;
+    XXH64_state_t xxhState;
+    ZSTD_customMem customMem;
+    size_t staticSize;
+
+    seqStore_t seqStore;      /* sequences storage ptrs */
+    ldmState_t ldmState;      /* long distance matching state */
+    rawSeq* ldmSequences;     /* Storage for the ldm output sequences */
+    size_t maxNbLdmSequences;
+    rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+    ZSTD_blockState_t blockState;
+    U32* entropyWorkspace;  /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+
+    /* streaming */
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inToCompress;
+    size_t inBuffPos;
+    size_t inBuffTarget;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outBuffContentSize;
+    size_t outBuffFlushedSize;
+    ZSTD_cStreamStage streamStage;
+    U32    frameEnded;
+
+    /* Dictionary */
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+    ZSTD_prefixDict prefixDict;   /* single-usage dictionary */
+
+    /* Multi-threading */
+#ifdef ZSTD_MULTITHREAD
+    ZSTDMT_CCtx* mtctx;
+#endif
+};
+
+
+typedef size_t (*ZSTD_blockCompressor) (
+        ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict);
+
+
+MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
+{
+    static const BYTE LL_Code[64] = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                       8,  9, 10, 11, 12, 13, 14, 15,
+                                      16, 16, 17, 17, 18, 18, 19, 19,
+                                      20, 20, 20, 20, 21, 21, 21, 21,
+                                      22, 22, 22, 22, 22, 22, 22, 22,
+                                      23, 23, 23, 23, 23, 23, 23, 23,
+                                      24, 24, 24, 24, 24, 24, 24, 24,
+                                      24, 24, 24, 24, 24, 24, 24, 24 };
+    static const U32 LL_deltaCode = 19;
+    return (litLength > 63) ? ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+}
+
+/* ZSTD_MLcode() :
+ * note : mlBase = matchLength - MINMATCH;
+ *        because it's the format it's stored in seqStore->sequences */
+MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
+{
+    static const BYTE ML_Code[128] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                                      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                      32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+                                      38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+                                      40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+                                      41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+                                      42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+                                      42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+    static const U32 ML_deltaCode = 36;
+    return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
+}
+
+/*! ZSTD_storeSeq() :
+ *  Store a sequence (literal length, literals, offset code and match length code) into seqStore_t.
+ *  `offsetCode` : distance to match + 3 (values 1-3 are repCodes).
+ *  `mlBase` : matchLength - MINMATCH
+*/
+MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t mlBase)
+{
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 6)
+    static const BYTE* g_start = NULL;
+    if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
+    {   U32 const pos = (U32)((const BYTE*)literals - g_start);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%3u bytes at dist.code%7u",
+               pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
+    }
+#endif
+    /* copy Literals */
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+    seqStorePtr->lit += litLength;
+
+    /* literal Length */
+    if (litLength>0xFFFF) {
+        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+        seqStorePtr->longLengthID = 1;
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    }
+    seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+    /* match offset */
+    seqStorePtr->sequences[0].offset = offsetCode + 1;
+
+    /* match Length */
+    if (mlBase>0xFFFF) {
+        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+        seqStorePtr->longLengthID = 2;
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    }
+    seqStorePtr->sequences[0].matchLength = (U16)mlBase;
+
+    seqStorePtr->sequences++;
+}
+
+
+/*-*************************************
+*  Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes (size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 4)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+                                                     0, 3, 1, 3, 1, 4, 2, 7,
+                                                     0, 2, 3, 6, 1, 5, 3, 5,
+                                                     1, 3, 4, 4, 2, 5, 6, 7,
+                                                     7, 0, 1, 2, 3, 3, 4, 6,
+                                                     2, 6, 5, 5, 3, 4, 5, 6,
+                                                     7, 1, 2, 4, 6, 4, 4, 5,
+                                                     7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+                                                     3, 2, 2, 1, 3, 2, 0, 1,
+                                                     3, 3, 1, 2, 2, 2, 2, 0,
+                                                     3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 4)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+          if (diff) return ZSTD_NbCommonBytes(diff); }
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+            pIn += ZSTD_NbCommonBytes(diff);
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+ *  can count match length with `ip` & `match` in 2 different segments.
+ *  convention : on reaching mEnd, match count continue starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+    size_t const matchLength = ZSTD_count(ip, match, vEnd);
+    if (match + matchLength != mEnd) return matchLength;
+    return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
+}
+
+
+/*-*************************************
+ *  Hashes
+ ***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    switch(mls)
+    {
+    default:
+    case 4: return ZSTD_hash4Ptr(p, hBits);
+    case 5: return ZSTD_hash5Ptr(p, hBits);
+    case 6: return ZSTD_hash6Ptr(p, hBits);
+    case 7: return ZSTD_hash7Ptr(p, hBits);
+    case 8: return ZSTD_hash8Ptr(p, hBits);
+    }
+}
+
+/*-*************************************
+*  Round buffer management
+***************************************/
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX                                                     \
+    ( ((U32)-1)                  /* Maximum ending current index */            \
+    - ZSTD_CURRENT_MAX)          /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+    size_t const endT = (size_t)(window->nextSrc - window->base);
+    U32 const end = (U32)endT;
+
+    window->lowLimit = end;
+    window->dictLimit = end;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+    return window.lowLimit < window.dictLimit;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  void const* srcEnd)
+{
+    U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+    return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleMask = (1U << cycleLog) - 1;
+    U32 const current = (U32)((BYTE const*)src - window->base);
+    U32 const newCurrent = (current & cycleMask) + maxDist;
+    U32 const correction = current - newCurrent;
+    assert((maxDist & cycleMask) == 0);
+    assert(current > newCurrent);
+    /* Loose bound, should be around 1<<29 (see above) */
+    assert(correction > 1<<28);
+
+    window->base += correction;
+    window->dictBase += correction;
+    window->lowLimit -= correction;
+    window->dictLimit -= correction;
+
+    DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+             window->lowLimit);
+    return correction;
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Updates lowLimit so that:
+ *    (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ * This allows a simple check that index >= lowLimit to see if index is valid.
+ * This must be called before a block compression call, with srcEnd as the block
+ * source end.
+ * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit.
+ * This is because dictionaries are allowed to be referenced as long as the last
+ * byte of the dictionary is in the window, but once they are out of range,
+ * they cannot be referenced. If loadedDictEndPtr is NULL, we use
+ * loadedDictEnd == 0.
+ */
+MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+                                           void const* srcEnd, U32 maxDist,
+                                           U32* loadedDictEndPtr)
+{
+    U32 const current = (U32)((BYTE const*)srcEnd - window->base);
+    U32 loadedDictEnd = loadedDictEndPtr != NULL ? *loadedDictEndPtr : 0;
+    if (current > maxDist + loadedDictEnd) {
+        U32 const newLowLimit = current - maxDist;
+        if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;
+        if (window->dictLimit < window->lowLimit) {
+            DEBUGLOG(5, "Update dictLimit from %u to %u", window->dictLimit,
+                     window->lowLimit);
+            window->dictLimit = window->lowLimit;
+        }
+        if (loadedDictEndPtr)
+            *loadedDictEndPtr = 0;
+    }
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+                                  void const* src, size_t srcSize)
+{
+    BYTE const* const ip = (BYTE const*)src;
+    U32 contiguous = 1;
+    /* Check if blocks follow each other */
+    if (src != window->nextSrc) {
+        /* not contiguous */
+        size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u",
+                 window->dictLimit);
+        window->lowLimit = window->dictLimit;
+        assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
+        window->dictLimit = (U32)distanceFromBase;
+        window->dictBase = window->base;
+        window->base = ip - distanceFromBase;
+        // ms->nextToUpdate = window->dictLimit;
+        if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit;   /* too small extDict */
+        contiguous = 0;
+    }
+    window->nextSrc = ip + srcSize;
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ( (ip+srcSize > window->dictBase + window->lowLimit)
+       & (ip < window->dictBase + window->dictLimit)) {
+        ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
+        window->lowLimit = lowLimitMax;
+    }
+    return contiguous;
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints, 
+ * LDM and manually set compression parameters.
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
+/*! ZSTD_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                     const ZSTD_CDict* cdict,
+                     ZSTD_CCtx_params  params, unsigned long long pledgedSrcSize);
+
+/*! ZSTD_compressStream_generic() :
+ *  Private use only. To be called from zstdmt_compress.c in single-thread mode. */
+size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                                   ZSTD_outBuffer* output,
+                                   ZSTD_inBuffer* input,
+                                   ZSTD_EndDirective const flushMode);
+
+/*! ZSTD_getCParamsFromCDict() :
+ *  as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params,
+                                    unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict,size_t dictSize,
+                                 ZSTD_CCtx_params params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapcity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+
+#endif /* ZSTD_COMPRESS_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_decompress.c b/vendor/github.com/DataDog/zstd/zstd_decompress.c
new file mode 100644
index 0000000..3ec6a1c
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_decompress.c

@@ -0,0 +1,3003 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() allocates its context,
+ * on stack (0), or into heap (1, default; requires malloc()).
+ * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+*  LEGACY_SUPPORT :
+*  if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+ *  MAXWINDOWSIZE_DEFAULT :
+ *  maximum window size accepted by DStream __by default__.
+ *  Frames requiring more memory will be rejected.
+ *  It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
+ */
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
+#endif
+
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include <string.h>      /* memcpy, memmove, memset */
+#include "cpu.h"
+#include "mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_internal.h"
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#  include "zstd_legacy.h"
+#endif
+
+
+/*-*************************************
+*  Errors
+***************************************/
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError
+#define HUF_isError  ERR_isError
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+
+typedef struct {
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
+
+typedef struct {
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+struct ZSTD_DCtx_s
+{
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    const void* previousDstEnd;   /* detect continuity */
+    const void* base;             /* start of current segment */
+    const void* vBase;            /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    U32 dictID;
+    ZSTD_format_e format;
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+    /* streaming */
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+    U32 hostageByte;
+
+    /* workspace */
+    BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support sizeof NULL */
+    return sizeof(*dctx)
+           + ZSTD_sizeof_DDict(dctx->ddictLocal)
+           + dctx->inBuffSize + dctx->outBuffSize;
+}
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
+
+
+static size_t ZSTD_startingInputLength(ZSTD_format_e format)
+{
+    size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ?
+                    ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize :
+                    ZSTD_frameHeaderSize_prefix;
+    ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);
+    /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
+    assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) );
+    return startingInputLength;
+}
+
+static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+{
+    dctx->format = ZSTD_f_zstd1;  /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */
+    dctx->staticSize  = 0;
+    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+    dctx->ddict       = NULL;
+    dctx->ddictLocal  = NULL;
+    dctx->inBuff      = NULL;
+    dctx->inBuffSize  = 0;
+    dctx->outBuffSize = 0;
+    dctx->streamStage = zdss_init;
+    dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+}
+
+ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)
+{
+    ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace;
+
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+    if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL;  /* minimum size */
+
+    ZSTD_initDCtx_internal(dctx);
+    dctx->staticSize = workspaceSize;
+    dctx->inBuff = (char*)(dctx+1);
+    return dctx;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+    {   ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem);
+        if (!dctx) return NULL;
+        dctx->customMem = customMem;
+        dctx->legacyContext = NULL;
+        dctx->previousLegacyVersion = 0;
+        ZSTD_initDCtx_internal(dctx);
+        return dctx;
+    }
+}
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    DEBUGLOG(3, "ZSTD_createDCtx");
+    return ZSTD_createDCtx_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support free on NULL */
+    if (dctx->staticSize) return ERROR(memory_allocation);   /* not compatible with static DCtx */
+    {   ZSTD_customMem const cMem = dctx->customMem;
+        ZSTD_freeDDict(dctx->ddictLocal);
+        dctx->ddictLocal = NULL;
+        ZSTD_free(dctx->inBuff, cMem);
+        dctx->inBuff = NULL;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (dctx->legacyContext)
+            ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
+#endif
+        ZSTD_free(dctx, cMem);
+        return 0;
+    }
+}
+
+/* no longer useful */
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+    size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);
+    memcpy(dstDCtx, srcDCtx, toCopy);  /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+ *   Frame header decoding
+ ***************************************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+    if (size < ZSTD_frameIdSize) return 0;
+    {   U32 const magic = MEM_readLE32(buffer);
+        if (magic == ZSTD_MAGICNUMBER) return 1;
+        if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
+    }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+    return 0;
+}
+
+/** ZSTD_frameHeaderSize_internal() :
+ *  srcSize must be large enough to reach header size fields.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
+ * @return : size of the Frame Header
+ *           or an error code, which can be tested with ZSTD_isError() */
+static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    size_t const minInputSize = ZSTD_startingInputLength(format);
+    if (srcSize < minInputSize) return ERROR(srcSize_wrong);
+
+    {   BYTE const fhd = ((const BYTE*)src)[minInputSize-1];
+        U32 const dictID= fhd & 3;
+        U32 const singleSegment = (fhd >> 5) & 1;
+        U32 const fcsId = fhd >> 6;
+        return minInputSize + !singleSegment
+             + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId]
+             + (singleSegment && !fcsId);
+    }
+}
+
+/** ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header */
+size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{
+    return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameHeader_internal() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t const minInputSize = ZSTD_startingInputLength(format);
+
+    if (srcSize < minInputSize) return minInputSize;
+
+    if ( (format != ZSTD_f_zstd1_magicless)
+      && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+            /* skippable frame */
+            if (srcSize < ZSTD_skippableHeaderSize)
+                return ZSTD_skippableHeaderSize; /* magic number + frame length */
+            memset(zfhPtr, 0, sizeof(*zfhPtr));
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);
+            zfhPtr->frameType = ZSTD_skippableFrame;
+            return 0;
+        }
+        return ERROR(prefix_unknown);
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    {   size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
+        if (srcSize < fhsize) return fhsize;
+        zfhPtr->headerSize = (U32)fhsize;
+    }
+
+    {   BYTE const fhdByte = ip[minInputSize-1];
+        size_t pos = minInputSize;
+        U32 const dictIDSizeCode = fhdByte&3;
+        U32 const checksumFlag = (fhdByte>>2)&1;
+        U32 const singleSegment = (fhdByte>>5)&1;
+        U32 const fcsID = fhdByte>>6;
+        U64 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;
+        if ((fhdByte & 0x08) != 0)
+            return ERROR(frameParameter_unsupported); /* reserved bits, must be zero */
+
+        if (!singleSegment) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+            if (windowLog > ZSTD_WINDOWLOG_MAX)
+                return ERROR(frameParameter_windowTooLarge);
+            windowSize = (1ULL << windowLog);
+            windowSize += (windowSize >> 3) * (wlByte&7);
+        }
+        switch(dictIDSizeCode)
+        {
+            default: assert(0);  /* impossible */
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)
+        {
+            default: assert(0);  /* impossible */
+            case 0 : if (singleSegment) frameContentSize = ip[pos]; break;
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (singleSegment) windowSize = frameContentSize;
+
+        zfhPtr->frameType = ZSTD_frame;
+        zfhPtr->frameContentSize = frameContentSize;
+        zfhPtr->windowSize = windowSize;
+        zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+        zfhPtr->dictID = dictID;
+        zfhPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
+
+/** ZSTD_getFrameHeader() :
+ *  decode Frame Header, or require larger `srcSize`.
+ *  note : this function does not consume input, it only reads it.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
+{
+    return ZSTD_getFrameHeader_internal(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameContentSize() :
+ *  compatible with legacy mode
+ * @return : decompressed size of the single frame pointed to be `src` if known, otherwise
+ *         - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *         - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(src, srcSize)) {
+        unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize);
+        return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret;
+    }
+#endif
+    {   ZSTD_frameHeader zfh;
+        if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0)
+            return ZSTD_CONTENTSIZE_ERROR;
+        if (zfh.frameType == ZSTD_skippableFrame) {
+            return 0;
+        } else {
+            return zfh.frameContentSize;
+    }   }
+}
+
+/** ZSTD_findDecompressedSize() :
+ *  compatible with legacy mode
+ *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ *      skippable frames
+ *  @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long totalDstSize = 0;
+
+    while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+        U32 const magicNumber = MEM_readLE32(src);
+
+        if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+            size_t skippableSize;
+            if (srcSize < ZSTD_skippableHeaderSize)
+                return ERROR(srcSize_wrong);
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+                          + ZSTD_skippableHeaderSize;
+            if (srcSize < skippableSize) {
+                return ZSTD_CONTENTSIZE_ERROR;
+            }
+
+            src = (const BYTE *)src + skippableSize;
+            srcSize -= skippableSize;
+            continue;
+        }
+
+        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+
+            /* check for overflow */
+            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+            totalDstSize += ret;
+        }
+        {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+            if (ZSTD_isError(frameSrcSize)) {
+                return ZSTD_CONTENTSIZE_ERROR;
+            }
+
+            src = (const BYTE *)src + frameSrcSize;
+            srcSize -= frameSrcSize;
+        }
+    }  /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+    if (srcSize) return ZSTD_CONTENTSIZE_ERROR;
+
+    return totalDstSize;
+}
+
+/** ZSTD_getDecompressedSize() :
+*   compatible with legacy mode
+*   @return : decompressed size if known, 0 otherwise
+              note : 0 can mean any of the following :
+                   - frame content is empty
+                   - decompressed size field is not present in frame header
+                   - frame header unknown / not supported
+                   - frame header not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+    ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);
+    return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+*   `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+    size_t const result = ZSTD_getFrameHeader_internal(&(dctx->fParams), src, headerSize, dctx->format);
+    if (ZSTD_isError(result)) return result;    /* invalid header */
+    if (result>0) return ERROR(srcSize_wrong);  /* headerSize too small */
+    if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID))
+        return ERROR(dictionary_wrong);
+    if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);
+    return 0;
+}
+
+
+/*-*************************************************************
+ *   Block decoding
+ ***************************************************************/
+
+/*! ZSTD_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr)
+{
+    if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    {   U32 const cBlockHeader = MEM_readLE24(src);
+        U32 const cSize = cBlockHeader >> 3;
+        bpPtr->lastBlock = cBlockHeader & 1;
+        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+        bpPtr->origSize = cSize;   /* only useful for RLE */
+        if (bpPtr->blockType == bt_rle) return 1;
+        if (bpPtr->blockType == bt_reserved) return ERROR(corruption_detected);
+        return cSize;
+    }
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+                          const void* src, size_t srcSize)
+{
+    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                               size_t regenSize)
+{
+    if (srcSize != 1) return ERROR(srcSize_wrong);
+    if (regenSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, *(const BYTE*)src, regenSize);
+    return regenSize;
+}
+
+/*! ZSTD_decodeLiteralsBlock() :
+ * @return : nb of bytes read from src (< srcSize )
+ *  note : symbol not declared but exposed for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    {   const BYTE* const istart = (const BYTE*) src;
+        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+        switch(litEncType)
+        {
+        case set_repeat:
+            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
+            /* fall-through */
+        case set_compressed:
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            {   size_t lhSize, litSize, litCSize;
+                U32 singleStream=0;
+                U32 const lhlCode = (istart[0] >> 2) & 3;
+                U32 const lhc = MEM_readLE32(istart);
+                switch(lhlCode)
+                {
+                case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    /* 2 - 2 - 10 - 10 */
+                    singleStream = !lhlCode;
+                    lhSize = 3;
+                    litSize  = (lhc >> 4) & 0x3FF;
+                    litCSize = (lhc >> 14) & 0x3FF;
+                    break;
+                case 2:
+                    /* 2 - 2 - 14 - 14 */
+                    lhSize = 4;
+                    litSize  = (lhc >> 4) & 0x3FFF;
+                    litCSize = lhc >> 18;
+                    break;
+                case 3:
+                    /* 2 - 2 - 18 - 18 */
+                    lhSize = 5;
+                    litSize  = (lhc >> 4) & 0x3FFFF;
+                    litCSize = (lhc >> 22) + (istart[4] << 10);
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+                if (HUF_isError((litEncType==set_repeat) ?
+                                    ( singleStream ?
+                                        HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
+                                        HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
+                                    ( singleStream ?
+                                        HUF_decompress1X2_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                        HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                    return ERROR(corruption_detected);
+
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                dctx->litEntropy = 1;
+                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return litCSize + lhSize;
+            }
+
+        case set_basic:
+            {   size_t litSize, lhSize;
+                U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    break;
+                }
+
+                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                    if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                    memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                    dctx->litPtr = dctx->litBuffer;
+                    dctx->litSize = litSize;
+                    memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                    return lhSize+litSize;
+                }
+                /* direct reference into compressed stream */
+                dctx->litPtr = istart+lhSize;
+                dctx->litSize = litSize;
+                return lhSize+litSize;
+            }
+
+        case set_rle:
+            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                size_t litSize, lhSize;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                return lhSize+1;
+            }
+        default:
+            return ERROR(corruption_detected);   /* impossible */
+        }
+    }
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with following method :
+ * - start from default distributions, present in /lib/common/zstd_internal.h
+ * - generate tables normally, using ZSTD_buildFSETable()
+ * - printout the content of tables
+ * - pretify output, report below, test with fuzzer to ensure it's correct */
+
+/* Default FSE distribution table for Literal Lengths */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+     /* nextState, nbAddBits, nbBits, baseVal */
+     {  0,  0,  4,    0},  { 16,  0,  4,    0},
+     { 32,  0,  5,    1},  {  0,  0,  5,    3},
+     {  0,  0,  5,    4},  {  0,  0,  5,    6},
+     {  0,  0,  5,    7},  {  0,  0,  5,    9},
+     {  0,  0,  5,   10},  {  0,  0,  5,   12},
+     {  0,  0,  6,   14},  {  0,  1,  5,   16},
+     {  0,  1,  5,   20},  {  0,  1,  5,   22},
+     {  0,  2,  5,   28},  {  0,  3,  5,   32},
+     {  0,  4,  5,   48},  { 32,  6,  5,   64},
+     {  0,  7,  5,  128},  {  0,  8,  6,  256},
+     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
+     { 32,  0,  4,    0},  {  0,  0,  4,    1},
+     {  0,  0,  5,    2},  { 32,  0,  5,    4},
+     {  0,  0,  5,    5},  { 32,  0,  5,    7},
+     {  0,  0,  5,    8},  { 32,  0,  5,   10},
+     {  0,  0,  5,   11},  {  0,  0,  6,   13},
+     { 32,  1,  5,   16},  {  0,  1,  5,   18},
+     { 32,  1,  5,   22},  {  0,  2,  5,   24},
+     { 32,  3,  5,   32},  {  0,  3,  5,   40},
+     {  0,  6,  4,   64},  { 16,  6,  4,   64},
+     { 32,  7,  5,  128},  {  0,  9,  6,  512},
+     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
+     { 16,  0,  4,    1},  { 32,  0,  5,    2},
+     { 32,  0,  5,    3},  { 32,  0,  5,    5},
+     { 32,  0,  5,    6},  { 32,  0,  5,    8},
+     { 32,  0,  5,    9},  { 32,  0,  5,   11},
+     { 32,  0,  5,   12},  {  0,  0,  6,   15},
+     { 32,  1,  5,   18},  { 32,  1,  5,   20},
+     { 32,  2,  5,   24},  { 32,  2,  5,   28},
+     { 32,  3,  5,   40},  { 32,  4,  5,   48},
+     {  0, 16,  6,65536},  {  0, 15,  6,32768},
+     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
+};   /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  5,    0},     {  0,  6,  4,   61},
+    {  0,  9,  5,  509},     {  0, 15,  5,32765},
+    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
+    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
+    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
+    {  0,  5,  5,   29},     {  0,  8,  4,  253},
+    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
+    {  0,  2,  5,    1},     { 16,  7,  4,  125},
+    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
+    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
+    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
+    {  0, 19,  5,524285},    {  0,  1,  5,    1},
+    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
+    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
+    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
+    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
+};   /* OF_defaultDTable */
+
+
+/* Default FSE distribution table for Match Lengths */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
+    /* nextState, nbAddBits, nbBits, baseVal */
+    {  0,  0,  6,    3},  {  0,  0,  4,    4},
+    { 32,  0,  5,    5},  {  0,  0,  5,    6},
+    {  0,  0,  5,    8},  {  0,  0,  5,    9},
+    {  0,  0,  5,   11},  {  0,  0,  6,   13},
+    {  0,  0,  6,   16},  {  0,  0,  6,   19},
+    {  0,  0,  6,   22},  {  0,  0,  6,   25},
+    {  0,  0,  6,   28},  {  0,  0,  6,   31},
+    {  0,  0,  6,   34},  {  0,  1,  6,   37},
+    {  0,  1,  6,   41},  {  0,  2,  6,   47},
+    {  0,  3,  6,   59},  {  0,  4,  6,   83},
+    {  0,  7,  6,  131},  {  0,  9,  6,  515},
+    { 16,  0,  4,    4},  {  0,  0,  4,    5},
+    { 32,  0,  5,    6},  {  0,  0,  5,    7},
+    { 32,  0,  5,    9},  {  0,  0,  5,   10},
+    {  0,  0,  6,   12},  {  0,  0,  6,   15},
+    {  0,  0,  6,   18},  {  0,  0,  6,   21},
+    {  0,  0,  6,   24},  {  0,  0,  6,   27},
+    {  0,  0,  6,   30},  {  0,  0,  6,   33},
+    {  0,  1,  6,   35},  {  0,  1,  6,   39},
+    {  0,  2,  6,   43},  {  0,  3,  6,   51},
+    {  0,  4,  6,   67},  {  0,  5,  6,   99},
+    {  0,  8,  6,  259},  { 32,  0,  4,    4},
+    { 48,  0,  4,    4},  { 16,  0,  4,    5},
+    { 32,  0,  5,    7},  { 32,  0,  5,    8},
+    { 32,  0,  5,   10},  { 32,  0,  5,   11},
+    {  0,  0,  6,   14},  {  0,  0,  6,   17},
+    {  0,  0,  6,   20},  {  0,  0,  6,   23},
+    {  0,  0,  6,   26},  {  0,  0,  6,   29},
+    {  0,  0,  6,   32},  {  0, 16,  6,65539},
+    {  0, 15,  6,32771},  {  0, 14,  6,16387},
+    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
+    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
+};   /* ML_defaultDTable */
+
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+{
+    void* ptr = dt;
+    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
+    ZSTD_seqSymbol* const cell = dt + 1;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->nbBits = 0;
+    cell->nextState = 0;
+    assert(nbAddBits < 255);
+    cell->nbAdditionalBits = (BYTE)nbAddBits;
+    cell->baseValue = baseValue;
+}
+
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off) */
+static void
+ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+    const short* normalizedCounter, unsigned maxSymbolValue,
+    const U32* baseValue, const U32* nbAdditionalBits,
+    unsigned tableLog)
+{
+    ZSTD_seqSymbol* const tableDecode = dt+1;
+    U16 symbolNext[MaxSeq+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    assert(maxSymbolValue <= MaxSeq);
+    assert(tableLog <= MaxFSELog);
+
+    /* Init, lay down lowprob symbols */
+    {   ZSTD_seqSymbol_header DTableH;
+        DTableH.tableLog = tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].baseValue = s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].baseValue = s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            U32 const symbol = tableDecode[u].baseValue;
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+            assert(nbAdditionalBits[symbol] < 255);
+            tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+            tableDecode[u].baseValue = baseValue[symbol];
+    }   }
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ * @return : nb bytes read from src,
+ *           or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+                                 symbolEncodingType_e type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const U32* baseValue, const U32* nbAdditionalBits,
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+{
+    switch(type)
+    {
+    case set_rle :
+        if (!srcSize) return ERROR(srcSize_wrong);
+        if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        {   U32 const symbol = *(const BYTE*)src;
+            U32 const baseline = baseValue[symbol];
+            U32 const nbBits = nbAdditionalBits[symbol];
+            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
+        }
+        *DTablePtr = DTableSpace;
+        return 1;
+    case set_basic :
+        *DTablePtr = defaultTable;
+        return 0;
+    case set_repeat:
+        if (!flagRepeatTable) return ERROR(corruption_detected);
+        return 0;
+    case set_compressed :
+        {   U32 tableLog;
+            S16 norm[MaxSeq+1];
+            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+            if (FSE_isError(headerSize)) return ERROR(corruption_detected);
+            if (tableLog > maxLog) return ERROR(corruption_detected);
+            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+            *DTablePtr = DTableSpace;
+            return headerSize;
+        }
+    default :   /* impossible */
+        assert(0);
+        return ERROR(GENERIC);
+    }
+}
+
+static const U32 LL_base[MaxLL+1] = {
+                 0,    1,    2,     3,     4,     5,     6,      7,
+                 8,    9,   10,    11,    12,    13,    14,     15,
+                16,   18,   20,    22,    24,    28,    32,     40,
+                48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {
+                     0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {
+                     3,  4,  5,    6,     7,     8,     9,    10,
+                    11, 12, 13,   14,    15,    16,    17,    18,
+                    19, 20, 21,   22,    23,    24,    25,    26,
+                    27, 28, 29,   30,    31,    32,    33,    34,
+                    35, 37, 39,   41,    43,    47,    51,    59,
+                    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                    0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {
+                if (ip+2 > iend) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+            } else {
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
+    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+        ip++;
+
+        /* Build DTables */
+        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+                                                      LLtype, MaxLL, LLFSELog,
+                                                      ip, iend-ip,
+                                                      LL_base, LL_bits,
+                                                      LL_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
+            ip += llhSize;
+        }
+
+        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+                                                      OFtype, MaxOff, OffFSELog,
+                                                      ip, iend-ip,
+                                                      OF_base, OF_bits,
+                                                      OF_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
+            ip += ofhSize;
+        }
+
+        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+                                                      MLtype, MaxML, MLFSELog,
+                                                      ip, iend-ip,
+                                                      ML_base, ML_bits,
+                                                      ML_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
+            ip += mlhSize;
+        }
+    }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+    const BYTE* match;
+} seq_t;
+
+typedef struct {
+    size_t state;
+    const ZSTD_seqSymbol* table;
+} ZSTD_fseState;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    ZSTD_fseState stateLL;
+    ZSTD_fseState stateOffb;
+    ZSTD_fseState stateML;
+    size_t prevOffset[ZSTD_REP_NUM];
+    const BYTE* prefixStart;
+    const BYTE* dictEnd;
+    size_t pos;
+} seqState_t;
+
+
+FORCE_NOINLINE
+size_t ZSTD_execSequenceLast7(BYTE* op,
+                              BYTE* const oend, seq_t sequence,
+                              const BYTE** litPtr, const BYTE* const litLimit,
+                              const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd <= oend_w) return ERROR(GENERIC);   /* Precondition */
+
+    /* copy literals */
+    if (op < oend_w) {
+        ZSTD_wildcopy(op, *litPtr, oend_w - op);
+        *litPtr += oend_w - op;
+        op = oend_w;
+    }
+    while (op < oLitEnd) *op++ = *(*litPtr)++;
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+    }   }
+    while (op < oMatchEnd) *op++ = *match++;
+    return sequenceLength;
+}
+
+
+HINT_INLINE
+size_t ZSTD_execSequence(BYTE* op,
+                         BYTE* const oend, seq_t sequence,
+                         const BYTE** litPtr, const BYTE* const litLimit,
+                         const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix -> go into extDict */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd + (match - base);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+HINT_INLINE
+size_t ZSTD_execSequenceLong(BYTE* op,
+                             BYTE* const oend, seq_t sequence,
+                             const BYTE** litPtr, const BYTE* const litLimit,
+                             const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = sequence.match;
+
+    /* check */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);  /* note : op <= oLitEnd <= oend_w == oend - 8 */
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - dictStart)) return ERROR(corruption_detected);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    assert(op <= oend_w);
+    assert(sequence.matchLength >= MINMATCH);
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+static void
+ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
+{
+    const void* ptr = dt;
+    const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
+                (U32)DStatePtr->state, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
+{
+    ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.nextState + lowBits;
+}
+
+/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+ * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * bits before reloading. This value is the maximum number of bytes we read
+ * after reloading when we are decoding long offets.
+ */
+#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
+    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
+        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
+        : 0)
+
+typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
+
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+{
+    seq_t seq;
+    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
+    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
+    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
+    U32 const totalBits = llBits+mlBits+ofBits;
+    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
+    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
+    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofBits)
+            offset = 0;
+        else {
+            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+            assert(ofBits <= MaxOff);
+            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                BIT_reloadDStream(&seqState->DStream);
+                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
+            } else {
+                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+            }
+        }
+
+        if (ofBits <= 1) {
+            offset += (llBase==0);
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {  /* offset == 0 */
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = mlBase
+                    + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+        BIT_reloadDStream(&seqState->DStream);
+    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+        BIT_reloadDStream(&seqState->DStream);
+    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+    seq.litLength = llBase
+                  + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0);    /* <=  16 bits */
+    if (MEM_32bits())
+        BIT_reloadDStream(&seqState->DStream);
+
+    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+    /* ANS state update */
+    ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences");
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            {   seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences: after decode loop, remaining nbSeq : %i", nbSeq);
+        if (nbSeq) return ERROR(corruption_detected);
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                           const ZSTD_longOffset_e isLongOffset)
+{
+    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+
+
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
+{
+    seq_t seq;
+    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
+    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
+    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
+    U32 const totalBits = llBits+mlBits+ofBits;
+    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
+    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
+    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofBits)
+            offset = 0;
+        else {
+            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+            assert(ofBits <= MaxOff);
+            if (MEM_32bits() && longOffsets) {
+                U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
+                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
+                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+            } else {
+                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+            }
+        }
+
+        if (ofBits <= 1) {
+            offset += (llBase==0);
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+        BIT_reloadDStream(&seqState->DStream);
+    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+        BIT_reloadDStream(&seqState->DStream);
+    /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
+    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+    seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
+    if (MEM_32bits())
+        BIT_reloadDStream(&seqState->DStream);
+
+    {   size_t const pos = seqState->pos + seq.litLength;
+        const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
+        seq.match = matchBase + pos - seq.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+                                                    * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
+        seqState->pos = pos + seq.matchLength;
+    }
+
+    /* ANS state update */
+    ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize, int nbSeq,
+                         const ZSTD_longOffset_e isLongOffset)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*) (dctx->base);
+    const BYTE* const dictStart = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Regen sequences */
+    if (nbSeq) {
+#define STORED_SEQS 4
+#define STOSEQ_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS 4
+        seq_t sequences[STORED_SEQS];
+        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+        seqState_t seqState;
+        int seqNb;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        seqState.prefixStart = prefixStart;
+        seqState.pos = (size_t)(op-prefixStart);
+        seqState.dictEnd = dictEnd;
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        /* prepare in advance */
+        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+        }
+        if (seqNb<seqAdvance) return ERROR(corruption_detected);
+
+        /* decode and decompress */
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            PREFETCH(sequence.match);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+            sequences[seqNb&STOSEQ_MASK] = sequence;
+            op += oneSeqSize;
+        }
+        if (seqNb<nbSeq) return ERROR(corruption_detected);
+
+        /* finish queue */
+        seqNb -= seqAdvance;
+        for ( ; seqNb<nbSeq ; seqNb++) {
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STOSEQ_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+#undef STORED_SEQS
+#undef STOSEQ_MASK
+#undef ADVANCED_SEQS
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                           const ZSTD_longOffset_e isLongOffset)
+{
+    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                           const ZSTD_longOffset_e isLongOffset)
+{
+    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                           const ZSTD_longOffset_e isLongOffset)
+{
+    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+#endif
+
+typedef size_t (*ZSTD_decompressSequences_t)(
+    ZSTD_DCtx *dctx, void *dst, size_t maxDstSize,
+    const void *seqStart, size_t seqSize, int nbSeq,
+    const ZSTD_longOffset_e isLongOffset);
+
+static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                const void* seqStart, size_t seqSize, int nbSeq,
+                                const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequences");
+#if DYNAMIC_BMI2
+    if (dctx->bmi2) {
+        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    }
+#endif
+  return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+                                void* dst, size_t maxDstSize,
+                                const void* seqStart, size_t seqSize, int nbSeq,
+                                const ZSTD_longOffset_e isLongOffset)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+#if DYNAMIC_BMI2
+    if (dctx->bmi2) {
+        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+    }
+#endif
+  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+}
+
+/* ZSTD_getLongOffsetsShare() :
+ * condition : offTable must be valid
+ * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
+ *           compared to maximum possible of (1<<OffFSELog) */
+static unsigned
+ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+{
+    const void* ptr = offTable;
+    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+    const ZSTD_seqSymbol* table = offTable + 1;
+    U32 const max = 1 << tableLog;
+    U32 u, total = 0;
+    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+    assert(max <= (1 << OffFSELog));  /* max not too large */
+    for (u=0; u<max; u++) {
+        if (table[u].nbAdditionalBits > 22) total += 1;
+    }
+
+    assert(tableLog <= OffFSELog);
+    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
+
+    return total;
+}
+
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize, const int frame)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+    /* isLongOffset must be true if there are long offsets.
+     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
+     * We don't expect that to be the case in 64-bit mode.
+     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     */
+    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
+    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+
+    if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals section */
+    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+        if (ZSTD_isError(litCSize)) return litCSize;
+        ip += litCSize;
+        srcSize -= litCSize;
+    }
+
+    /* Build Decoding Tables */
+    {   int nbSeq;
+        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+        if (ZSTD_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+        srcSize -= seqHSize;
+
+        if ( (!frame || dctx->fParams.windowSize > (1<<24))
+          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
+            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+            if (shareLongOffsets >= minShare)
+                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+        }
+
+        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+    }
+}
+
+
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t dSize;
+    ZSTD_checkContinuity(dctx, dst);
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+    dctx->previousDstEnd = (char*)dst + dSize;
+    return dSize;
+}
+
+
+/** ZSTD_insertBlock() :
+    insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    ZSTD_checkContinuity(dctx, blockStart);
+    dctx->previousDstEnd = (const char*)blockStart + blockSize;
+    return blockSize;
+}
+
+
+static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+    if (length > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, byte, length);
+    return length;
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ *  compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ *  `srcSize` must be at least as large as the frame contained
+ *  @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(src, srcSize))
+        return ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+#endif
+    if ( (srcSize >= ZSTD_skippableHeaderSize)
+      && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+    } else {
+        const BYTE* ip = (const BYTE*)src;
+        const BYTE* const ipstart = ip;
+        size_t remainingSize = srcSize;
+        ZSTD_frameHeader zfh;
+
+        /* Extract Frame Header */
+        {   size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+            if (ZSTD_isError(ret)) return ret;
+            if (ret > 0) return ERROR(srcSize_wrong);
+        }
+
+        ip += zfh.headerSize;
+        remainingSize -= zfh.headerSize;
+
+        /* Loop on each block */
+        while (1) {
+            blockProperties_t blockProperties;
+            size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+            if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+                return ERROR(srcSize_wrong);
+
+            ip += ZSTD_blockHeaderSize + cBlockSize;
+            remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+
+            if (blockProperties.lastBlock) break;
+        }
+
+        if (zfh.checksumFlag) {   /* Final frame content checksum */
+            if (remainingSize < 4) return ERROR(srcSize_wrong);
+            ip += 4;
+            remainingSize -= 4;
+        }
+
+        return ip - ipstart;
+    }
+}
+
+/*! ZSTD_decompressFrame() :
+*   @dctx must be properly initialized */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+                                   void* dst, size_t dstCapacity,
+                             const void** srcPtr, size_t *srcSizePtr)
+{
+    const BYTE* ip = (const BYTE*)(*srcPtr);
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t remainingSize = *srcSizePtr;
+
+    /* check */
+    if (remainingSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize)
+        return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (remainingSize < frameHeaderSize+ZSTD_blockHeaderSize)
+            return ERROR(srcSize_wrong);
+        CHECK_F( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) );
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            decodedSize = ZSTD_generateNxBytes(op, oend-op, *ip, blockProperties.origSize);
+            break;
+        case bt_reserved :
+        default:
+            return ERROR(corruption_detected);
+        }
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        if (dctx->fParams.checksumFlag)
+            XXH64_update(&dctx->xxhState, op, decodedSize);
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+        if (blockProperties.lastBlock) break;
+    }
+
+    if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+        if ((U64)(op-ostart) != dctx->fParams.frameContentSize) {
+            return ERROR(corruption_detected);
+    }   }
+    if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+        U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
+        U32 checkRead;
+        if (remainingSize<4) return ERROR(checksum_wrong);
+        checkRead = MEM_readLE32(ip);
+        if (checkRead != checkCalc) return ERROR(checksum_wrong);
+        ip += 4;
+        remainingSize -= 4;
+    }
+
+    /* Allow caller to get size read */
+    *srcPtr = ip;
+    *srcSizePtr = remainingSize;
+    return op-ostart;
+}
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void* dict, size_t dictSize,
+                                  const ZSTD_DDict* ddict)
+{
+    void* const dststart = dst;
+    assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
+
+    if (ddict) {
+        dict = ZSTD_DDictDictContent(ddict);
+        dictSize = ZSTD_DDictDictSize(ddict);
+    }
+
+    while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+        U32 magicNumber;
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (ZSTD_isLegacy(src, srcSize)) {
+            size_t decodedSize;
+            size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+            if (ZSTD_isError(frameSize)) return frameSize;
+            /* legacy support is not compatible with static dctx */
+            if (dctx->staticSize) return ERROR(memory_allocation);
+
+            decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+
+            dst = (BYTE*)dst + decodedSize;
+            dstCapacity -= decodedSize;
+
+            src = (const BYTE*)src + frameSize;
+            srcSize -= frameSize;
+
+            continue;
+        }
+#endif
+
+        magicNumber = MEM_readLE32(src);
+        DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+                    (U32)magicNumber, (U32)ZSTD_MAGICNUMBER);
+        if (magicNumber != ZSTD_MAGICNUMBER) {
+            if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+                size_t skippableSize;
+                if (srcSize < ZSTD_skippableHeaderSize)
+                    return ERROR(srcSize_wrong);
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                              + ZSTD_skippableHeaderSize;
+                if (srcSize < skippableSize) return ERROR(srcSize_wrong);
+
+                src = (const BYTE *)src + skippableSize;
+                srcSize -= skippableSize;
+                continue;
+            }
+            return ERROR(prefix_unknown);
+        }
+
+        if (ddict) {
+            /* we were called from ZSTD_decompress_usingDDict */
+            CHECK_F(ZSTD_decompressBegin_usingDDict(dctx, ddict));
+        } else {
+            /* this will initialize correctly with no dict if dict == NULL, so
+             * use this in all cases but ddict */
+            CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize));
+        }
+        ZSTD_checkContinuity(dctx, dst);
+
+        {   const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+                                                    &src, &srcSize);
+            if (ZSTD_isError(res)) return res;
+            /* no need to bound check, ZSTD_decompressFrame already has */
+            dst = (BYTE*)dst + res;
+            dstCapacity -= res;
+        }
+    }  /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+    if (srcSize) return ERROR(srcSize_wrong); /* input not entirely consumed */
+
+    return (BYTE*)dst - (BYTE*)dststart;
+}
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                           const void* src, size_t srcSize,
+                           const void* dict, size_t dictSize)
+{
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
+}
+
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0);
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)
+    size_t regenSize;
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+    ZSTD_freeDCtx(dctx);
+    return regenSize;
+#else   /* stack mode */
+    ZSTD_DCtx dctx;
+    return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+
+/*-**************************************
+*   Advanced Streaming Decompression API
+*   Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+    switch(dctx->stage)
+    {
+    default:   /* should not happen */
+        assert(0);
+    case ZSTDds_getFrameHeaderSize:
+    case ZSTDds_decodeFrameHeader:
+        return ZSTDnit_frameHeader;
+    case ZSTDds_decodeBlockHeader:
+        return ZSTDnit_blockHeader;
+    case ZSTDds_decompressBlock:
+        return ZSTDnit_block;
+    case ZSTDds_decompressLastBlock:
+        return ZSTDnit_lastBlock;
+    case ZSTDds_checkChecksum:
+        return ZSTDnit_checksum;
+    case ZSTDds_decodeSkippableHeader:
+    case ZSTDds_skipFrame:
+        return ZSTDnit_skippableFrame;
+    }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }
+
+/** ZSTD_decompressContinue() :
+ *  srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ *  @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity)
+ *            or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (U32)srcSize);
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);  /* not allowed */
+    if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
+
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        assert(src != NULL);
+        if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
+            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
+                memcpy(dctx->headerBuffer, src, srcSize);
+                dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
+                dctx->stage = ZSTDds_decodeSkippableHeader;
+                return 0;
+        }   }
+        dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+        if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, srcSize);
+        dctx->expected = dctx->headerSize - srcSize;
+        dctx->stage = ZSTDds_decodeFrameHeader;
+        return 0;
+
+    case ZSTDds_decodeFrameHeader:
+        assert(src != NULL);
+        memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
+        CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize));
+        dctx->expected = ZSTD_blockHeaderSize;
+        dctx->stage = ZSTDds_decodeBlockHeader;
+        return 0;
+
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+            dctx->expected = cBlockSize;
+            dctx->bType = bp.blockType;
+            dctx->rleSize = bp.origSize;
+            if (cBlockSize) {
+                dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+                return 0;
+            }
+            /* empty block */
+            if (bp.lastBlock) {
+                if (dctx->fParams.checksumFlag) {
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0; /* end of frame */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->expected = ZSTD_blockHeaderSize;  /* jump to next header */
+                dctx->stage = ZSTDds_decodeBlockHeader;
+            }
+            return 0;
+        }
+
+    case ZSTDds_decompressLastBlock:
+    case ZSTDds_decompressBlock:
+        DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize);
+                break;
+            case bt_reserved :   /* should never happen */
+            default:
+                return ERROR(corruption_detected);
+            }
+            if (ZSTD_isError(rSize)) return rSize;
+            DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (U32)rSize);
+            dctx->decodedSize += rSize;
+            if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+
+            if (dctx->stage == ZSTDds_decompressLastBlock) {   /* end of frame */
+                DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (U32)dctx->decodedSize);
+                if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+                    if (dctx->decodedSize != dctx->fParams.frameContentSize) {
+                        return ERROR(corruption_detected);
+                }   }
+                if (dctx->fParams.checksumFlag) {  /* another round for frame checksum */
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0;   /* ends here */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->stage = ZSTDds_decodeBlockHeader;
+                dctx->expected = ZSTD_blockHeaderSize;
+                dctx->previousDstEnd = (char*)dst + rSize;
+            }
+            return rSize;
+        }
+
+    case ZSTDds_checkChecksum:
+        assert(srcSize == 4);  /* guaranteed by dctx->expected */
+        {   U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
+            U32 const check32 = MEM_readLE32(src);
+            DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", h32, check32);
+            if (check32 != h32) return ERROR(checksum_wrong);
+            dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+
+    case ZSTDds_decodeSkippableHeader:
+        assert(src != NULL);
+        assert(srcSize <= ZSTD_skippableHeaderSize);
+        memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->stage = ZSTDds_skipFrame;
+        return 0;
+
+    case ZSTDds_skipFrame:
+        dctx->expected = 0;
+        dctx->stage = ZSTDds_getFrameHeaderSize;
+        return 0;
+
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+    return 0;
+}
+
+/* ZSTD_loadEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary
+ * @return : size of entropy tables read */
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    dictPtr += 8;   /* skip header = magic + dictID */
+
+
+    {   size_t const hSize = HUF_readDTableX4_wksp(
+            entropy->hufTable, dictPtr, dictEnd - dictPtr,
+            entropy->workspace, sizeof(entropy->workspace));
+        if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hSize;
+    }
+
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue = MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->OFTable,
+                            offcodeNCount, offcodeMaxValue,
+                            OF_base, OF_bits,
+                            offcodeLog);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->MLTable,
+                            matchlengthNCount, matchlengthMaxValue,
+                            ML_base, ML_bits,
+                            matchlengthLog);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        ZSTD_buildFSETable(entropy->LLTable,
+                            litlengthNCount, litlengthMaxValue,
+                            LL_base, LL_bits,
+                            litlengthLog);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    {   int i;
+        size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));
+        for (i=0; i<3; i++) {
+            U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+            if (rep==0 || rep >= dictContentSize) return ERROR(dictionary_corrupted);
+            entropy->rep[i] = rep;
+    }   }
+
+    return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize);
+    {   U32 const magic = MEM_readLE32(dict);
+        if (magic != ZSTD_MAGIC_DICTIONARY) {
+            return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
+    }   }
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+
+    /* load entropy tables */
+    {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
+        if (ZSTD_isError(eSize)) return ERROR(dictionary_corrupted);
+        dict = (const char*)dict + eSize;
+        dictSize -= eSize;
+    }
+    dctx->litEntropy = dctx->fseEntropy = 1;
+
+    /* reference dictionary content */
+    return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+/* Note : this function cannot fail */
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{
+    assert(dctx != NULL);
+    dctx->expected = ZSTD_startingInputLength(dctx->format);  /* dctx->format must be properly set */
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->decodedSize = 0;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->litEntropy = dctx->fseEntropy = 0;
+    dctx->dictID = 0;
+    ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+    memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
+    dctx->LLTptr = dctx->entropy.LLTable;
+    dctx->MLTptr = dctx->entropy.MLTable;
+    dctx->OFTptr = dctx->entropy.OFTable;
+    dctx->HUFptr = dctx->entropy.hufTable;
+    return 0;
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    CHECK_F( ZSTD_decompressBegin(dctx) );
+    if (dict && dictSize)
+        CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted);
+    return 0;
+}
+
+
+/* ======   ZSTD_DDict   ====== */
+
+struct ZSTD_DDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictSize;
+    ZSTD_entropyDTables_t entropy;
+    U32 dictID;
+    U32 entropyPresent;
+    ZSTD_customMem cMem;
+};  /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
+{
+    return ddict->dictContent;
+}
+
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
+{
+    return ddict->dictSize;
+}
+
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+{
+    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
+    if (ddict) {   /* support begin on NULL */
+        dstDCtx->dictID = ddict->dictID;
+        dstDCtx->base = ddict->dictContent;
+        dstDCtx->vBase = ddict->dictContent;
+        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+        if (ddict->entropyPresent) {
+            dstDCtx->litEntropy = 1;
+            dstDCtx->fseEntropy = 1;
+            dstDCtx->LLTptr = ddict->entropy.LLTable;
+            dstDCtx->MLTptr = ddict->entropy.MLTable;
+            dstDCtx->OFTptr = ddict->entropy.OFTable;
+            dstDCtx->HUFptr = ddict->entropy.hufTable;
+            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
+            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
+            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+        } else {
+            dstDCtx->litEntropy = 0;
+            dstDCtx->fseEntropy = 0;
+        }
+    }
+    return 0;
+}
+
+static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+{
+    ddict->dictID = 0;
+    ddict->entropyPresent = 0;
+    if (dictContentType == ZSTD_dct_rawContent) return 0;
+
+    if (ddict->dictSize < 8) {
+        if (dictContentType == ZSTD_dct_fullDict)
+            return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+        return 0;   /* pure content mode */
+    }
+    {   U32 const magic = MEM_readLE32(ddict->dictContent);
+        if (magic != ZSTD_MAGIC_DICTIONARY) {
+            if (dictContentType == ZSTD_dct_fullDict)
+                return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+            return 0;   /* pure content mode */
+        }
+    }
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+
+    /* load entropy tables */
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    ddict->entropyPresent = 1;
+    return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType)
+{
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
+        ddict->dictBuffer = NULL;
+        ddict->dictContent = dict;
+    } else {
+        void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
+        ddict->dictBuffer = internalBuffer;
+        ddict->dictContent = internalBuffer;
+        if (!internalBuffer) return ERROR(memory_allocation);
+        memcpy(internalBuffer, dict, dictSize);
+    }
+    ddict->dictSize = dictSize;
+    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+
+    /* parse dictionary content */
+    CHECK_F( ZSTD_loadEntropy_inDDict(ddict, dictContentType) );
+
+    return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_customMem customMem)
+{
+    if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+    {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+        if (!ddict) return NULL;
+        ddict->cMem = customMem;
+
+        if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) {
+            ZSTD_freeDDict(ddict);
+            return NULL;
+        }
+
+        return ddict;
+    }
+}
+
+/*! ZSTD_createDDict() :
+*   Create a digested dictionary, to start decompression without startup delay.
+*   `dict` content is copied inside DDict.
+*   Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, to start decompression without startup delay.
+ *  Dictionary content is simply referenced, it will be accessed during decompression.
+ *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+                                void* workspace, size_t workspaceSize,
+                                const void* dict, size_t dictSize,
+                                ZSTD_dictLoadMethod_e dictLoadMethod,
+                                ZSTD_dictContentType_e dictContentType)
+{
+    size_t const neededSpace =
+            sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace;
+    assert(workspace != NULL);
+    assert(dict != NULL);
+    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
+    if (workspaceSize < neededSpace) return NULL;
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        memcpy(ddict+1, dict, dictSize);  /* local copy */
+        dict = ddict+1;
+    }
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) ))
+        return NULL;
+    return ddict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = ddict->cMem;
+        ZSTD_free(ddict->dictBuffer, cMem);
+        ZSTD_free(ddict, cMem);
+        return 0;
+    }
+}
+
+/*! ZSTD_estimateDDictSize() :
+ *  Estimate amount of memory that will be needed to create a dictionary for decompression.
+ *  Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+    if (dictSize < 8) return 0;
+    if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
+    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;
+    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompresse frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could for one of the following reasons :
+ *  - The frame does not require a dictionary (most common case).
+ *  - The frame was built with dictID intentionally removed.
+ *    Needed dictionary is a hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, frame header could not be decoded.
+ *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use
+ *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
+    if (ZSTD_isError(hError)) return 0;
+    return zfp.dictID;
+}
+
+
+/*! ZSTD_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_DDict* ddict)
+{
+    /* pass content and size in case legacy frames are encountered */
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+                                     NULL, 0,
+                                     ddict);
+}
+
+
+/*=====================================
+*   Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+    DEBUGLOG(3, "ZSTD_createDStream");
+    return ZSTD_createDStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+    return ZSTD_initStaticDCtx(workspace, workspaceSize);
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+    return ZSTD_freeDCtx(zds);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    ZSTD_freeDDict(dctx->ddictLocal);
+    if (dict && dictSize >= 8) {
+        dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+        if (dctx->ddictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        dctx->ddictLocal = NULL;
+    }
+    dctx->ddict = dctx->ddictLocal;
+    return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+    return ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType);
+}
+
+size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize)
+{
+    return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+
+/* ZSTD_initDStream_usingDict() :
+ * return : expected size, aka ZSTD_frameHeaderSize_prefix.
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+    DEBUGLOG(4, "ZSTD_initDStream_usingDict");
+    zds->streamStage = zdss_init;
+    CHECK_F( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) );
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+/* note : this variant can't fail */
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+    DEBUGLOG(4, "ZSTD_initDStream");
+    return ZSTD_initDStream_usingDict(zds, NULL, 0);
+}
+
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->ddict = ddict;
+    return 0;
+}
+
+/* ZSTD_initDStream_usingDDict() :
+ * ddict will just be referenced, and must outlive decompression session
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+{
+    size_t const initResult = ZSTD_initDStream(dctx);
+    dctx->ddict = ddict;
+    return initResult;
+}
+
+/* ZSTD_resetDStream() :
+ * return : expected size, aka ZSTD_frameHeaderSize_prefix.
+ * this function cannot fail */
+size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+{
+    DEBUGLOG(4, "ZSTD_resetDStream");
+    dctx->streamStage = zdss_loadHeader;
+    dctx->lhSize = dctx->inPos = dctx->outStart = dctx->outEnd = 0;
+    dctx->legacyVersion = 0;
+    dctx->hostageByte = 0;
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx,
+                                ZSTD_DStreamParameter_e paramType, unsigned paramValue)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    switch(paramType)
+    {
+        default : return ERROR(parameter_unsupported);
+        case DStream_p_maxWindowSize :
+            DEBUGLOG(4, "setting maxWindowSize = %u KB", paramValue >> 10);
+            dctx->maxWindowSize = paramValue ? paramValue : (U32)(-1);
+            break;
+    }
+    return 0;
+}
+
+size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->maxWindowSize = maxWindowSize;
+    return 0;
+}
+
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format)
+{
+    DEBUGLOG(4, "ZSTD_DCtx_setFormat : %u", (unsigned)format);
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->format = format;
+    return 0;
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+{
+    return ZSTD_sizeof_DCtx(dctx);
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+    size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+    unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2);
+    unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
+    size_t const minRBSize = (size_t) neededSize;
+    if ((unsigned long long)minRBSize != neededSize) return ERROR(frameParameter_windowTooLarge);
+    return minRBSize;
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{
+    size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+    size_t const inBuffSize = blockSize;  /* no block can be larger */
+    size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+    return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{
+    U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX;   /* note : should be user-selectable */
+    ZSTD_frameHeader zfh;
+    size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
+    if (ZSTD_isError(err)) return err;
+    if (err>0) return ERROR(srcSize_wrong);
+    if (zfh.windowSize > windowSizeMax)
+        return ERROR(frameParameter_windowTooLarge);
+    return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
+}
+
+
+/* *****   Decompression   ***** */
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    const char* const istart = (const char*)(input->src) + input->pos;
+    const char* const iend = (const char*)(input->src) + input->size;
+    const char* ip = istart;
+    char* const ostart = (char*)(output->dst) + output->pos;
+    char* const oend = (char*)(output->dst) + output->size;
+    char* op = ostart;
+    U32 someMoreWork = 1;
+
+    DEBUGLOG(5, "ZSTD_decompressStream");
+    if (input->pos > input->size) {  /* forbidden */
+        DEBUGLOG(5, "in: pos: %u   vs size: %u",
+                    (U32)input->pos, (U32)input->size);
+        return ERROR(srcSize_wrong);
+    }
+    if (output->pos > output->size) {  /* forbidden */
+        DEBUGLOG(5, "out: pos: %u   vs size: %u",
+                    (U32)output->pos, (U32)output->size);
+        return ERROR(dstSize_tooSmall);
+    }
+    DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
+
+    while (someMoreWork) {
+        switch(zds->streamStage)
+        {
+        case zdss_init :
+            DEBUGLOG(5, "stage zdss_init => transparent reset ");
+            ZSTD_resetDStream(zds);   /* transparent reset on starting decoding a new frame */
+            /* fall-through */
+
+        case zdss_loadHeader :
+            DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+            if (zds->legacyVersion) {
+                /* legacy support is incompatible with static dctx */
+                if (zds->staticSize) return ERROR(memory_allocation);
+                {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+                    if (hint==0) zds->streamStage = zdss_init;
+                    return hint;
+            }   }
+#endif
+            {   size_t const hSize = ZSTD_getFrameHeader_internal(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
+                DEBUGLOG(5, "header size : %u", (U32)hSize);
+                if (ZSTD_isError(hSize)) {
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+                    U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+                    if (legacyVersion) {
+                        const void* const dict = zds->ddict ? zds->ddict->dictContent : NULL;
+                        size_t const dictSize = zds->ddict ? zds->ddict->dictSize : 0;
+                        DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
+                        /* legacy support is incompatible with static dctx */
+                        if (zds->staticSize) return ERROR(memory_allocation);
+                        CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext,
+                                    zds->previousLegacyVersion, legacyVersion,
+                                    dict, dictSize));
+                        zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+                        {   size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
+                            if (hint==0) zds->streamStage = zdss_init;   /* or stay in stage zdss_loadHeader */
+                            return hint;
+                    }   }
+#endif
+                    return hSize;   /* error */
+                }
+                if (hSize != 0) {   /* need more input */
+                    size_t const toLoad = hSize - zds->lhSize;   /* if hSize!=0, hSize > zds->lhSize */
+                    size_t const remainingInput = (size_t)(iend-ip);
+                    assert(iend >= ip);
+                    if (toLoad > remainingInput) {   /* not enough input to load full header */
+                        if (remainingInput > 0) {
+                            memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
+                            zds->lhSize += remainingInput;
+                        }
+                        input->pos = input->size;
+                        return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    assert(ip != NULL);
+                    memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* check for single-pass mode opportunity */
+            if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */
+                && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+                size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart);
+                if (cSize <= (size_t)(iend-istart)) {
+                    /* shortcut : using single-pass mode */
+                    size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, zds->ddict);
+                    if (ZSTD_isError(decompressedSize)) return decompressedSize;
+                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+                    ip = istart + cSize;
+                    op += decompressedSize;
+                    zds->expected = 0;
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+            }   }
+
+            /* Consume header (see ZSTDds_decodeFrameHeader) */
+            DEBUGLOG(4, "Consume header");
+            CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
+
+            if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->stage = ZSTDds_skipFrame;
+            } else {
+                CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
+                zds->expected = ZSTD_blockHeaderSize;
+                zds->stage = ZSTDds_decodeBlockHeader;
+            }
+
+            /* control buffer memory usage */
+            DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
+                        (U32)(zds->fParams.windowSize >>10),
+                        (U32)(zds->maxWindowSize >> 10) );
+            zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+            if (zds->fParams.windowSize > zds->maxWindowSize) return ERROR(frameParameter_windowTooLarge);
+
+            /* Adapt buffer sizes to frame header instructions */
+            {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+                size_t const neededOutBuffSize = ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize);
+                if ((zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize)) {
+                    size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
+                    DEBUGLOG(4, "inBuff  : from %u to %u",
+                                (U32)zds->inBuffSize, (U32)neededInBuffSize);
+                    DEBUGLOG(4, "outBuff : from %u to %u",
+                                (U32)zds->outBuffSize, (U32)neededOutBuffSize);
+                    if (zds->staticSize) {  /* static DCtx */
+                        DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
+                        assert(zds->staticSize >= sizeof(ZSTD_DCtx));  /* controlled at init */
+                        if (bufferSize > zds->staticSize - sizeof(ZSTD_DCtx))
+                            return ERROR(memory_allocation);
+                    } else {
+                        ZSTD_free(zds->inBuff, zds->customMem);
+                        zds->inBuffSize = 0;
+                        zds->outBuffSize = 0;
+                        zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem);
+                        if (zds->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    zds->inBuffSize = neededInBuffSize;
+                    zds->outBuff = zds->inBuff + zds->inBuffSize;
+                    zds->outBuffSize = neededOutBuffSize;
+            }   }
+            zds->streamStage = zdss_read;
+            /* fall-through */
+
+        case zdss_read:
+            DEBUGLOG(5, "stage zdss_read");
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+                DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
+                if (neededInSize==0) {  /* end of frame */
+                    zds->streamStage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    int const isSkipFrame = ZSTD_isSkipFrame(zds);
+                    size_t const decodedSize = ZSTD_decompressContinue(zds,
+                        zds->outBuff + zds->outStart, (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart),
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize && !isSkipFrame) break;   /* this was just a header */
+                    zds->outEnd = zds->outStart + decodedSize;
+                    zds->streamStage = zdss_flush;
+                    break;
+            }   }
+            if (ip==iend) { someMoreWork = 0; break; }   /* no more input */
+            zds->streamStage = zdss_load;
+            /* fall-through */
+
+        case zdss_load:
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+                size_t const toLoad = neededInSize - zds->inPos;
+                int const isSkipFrame = ZSTD_isSkipFrame(zds);
+                size_t loadedSize;
+                if (isSkipFrame) {
+                    loadedSize = MIN(toLoad, (size_t)(iend-ip));
+                } else {
+                    if (toLoad > zds->inBuffSize - zds->inPos) return ERROR(corruption_detected);   /* should never happen */
+                    loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip);
+                }
+                ip += loadedSize;
+                zds->inPos += loadedSize;
+                if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {   size_t const decodedSize = ZSTD_decompressContinue(zds,
+                        zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart,
+                        zds->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zds->inPos = 0;   /* input is consumed */
+                    if (!decodedSize && !isSkipFrame) { zds->streamStage = zdss_read; break; }   /* this was just a header */
+                    zds->outEnd = zds->outStart +  decodedSize;
+            }   }
+            zds->streamStage = zdss_flush;
+            /* fall-through */
+
+        case zdss_flush:
+            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+                size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize);
+                op += flushedSize;
+                zds->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {  /* flush completed */
+                    zds->streamStage = zdss_read;
+                    if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+                                (int)(zds->outBuffSize - zds->outStart),
+                                (U32)zds->fParams.blockSizeMax);
+                        zds->outStart = zds->outEnd = 0;
+                    }
+                    break;
+            }   }
+            /* cannot complete flush */
+            someMoreWork = 0;
+            break;
+
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    input->pos += (size_t)(ip-istart);
+    output->pos += (size_t)(op-ostart);
+    {   size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+        if (!nextSrcSizeHint) {   /* frame fully decoded */
+            if (zds->outEnd == zds->outStart) {  /* output fully flushed */
+                if (zds->hostageByte) {
+                    if (input->pos >= input->size) {
+                        /* can't release hostage (not present) */
+                        zds->streamStage = zdss_read;
+                        return 1;
+                    }
+                    input->pos++;  /* release hostage */
+                }   /* zds->hostageByte */
+                return 0;
+            }  /* zds->outEnd == zds->outStart */
+            if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+                input->pos--;   /* note : pos > 0, otherwise, impossible to finish reading last block */
+                zds->hostageByte=1;
+            }
+            return 1;
+        }  /* nextSrcSizeHint==0 */
+        nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block);   /* preload header of next block */
+        assert(zds->inPos <= nextSrcSizeHint);
+        nextSrcSizeHint -= zds->inPos;   /* part already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    return ZSTD_decompressStream(dctx, output, input);
+}
+
+size_t ZSTD_decompress_generic_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos)
+{
+    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+    size_t const cErr = ZSTD_decompress_generic(dctx, &output, &input);
+    *dstPos = output.pos;
+    *srcPos = input.pos;
+    return cErr;
+}
+
+void ZSTD_DCtx_reset(ZSTD_DCtx* dctx)
+{
+    (void)ZSTD_initDStream(dctx);
+    dctx->format = ZSTD_f_zstd1;
+    dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_double_fast.c b/vendor/github.com/DataDog/zstd/zstd_double_fast.c
new file mode 100644
index 0000000..86e6b39
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_double_fast.c

@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_double_fast.h"
+
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end)
+{
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32  const mls = cParams->searchLength;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const current = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0)
+                hashSmall[smHash] = current + i;
+            if (i == 0 || hashLarge[lgHash] == 0)
+                hashLarge[lgHash] = current + i;
+        }
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32 lowestIndex = ms->window.dictLimit;
+    const BYTE* const lowest = base + lowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved = 0;
+
+    /* init */
+    ip += (ip==lowest);
+    {   U32 const maxRep = (U32)(ip-lowest);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+        size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndexL = hashLong[h2];
+        U32 const matchIndexS = hashSmall[h];
+        const BYTE* matchLong = base + matchIndexL;
+        const BYTE* match = base + matchIndexS;
+        hashLong[h2] = hashSmall[h] = current;   /* update hash tables */
+
+        assert(offset_1 <= current);   /* supposed guaranteed by construction */
+        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+            /* favor repcode */
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            U32 offset;
+            if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
+                mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+                offset = (U32)(ip-matchLong);
+                while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+            } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
+                size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndexL3 = hashLong[hl3];
+                const BYTE* matchL3 = base + matchIndexL3;
+                hashLong[hl3] = current + 1;
+                if ( (matchIndexL3 > lowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) {
+                    mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+                    ip++;
+                    offset = (U32)(ip-matchL3);
+                    while (((ip>anchor) & (matchL3>lowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                } else {
+                    mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+                    offset = (U32)(ip-match);
+                    while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+                }
+            } else {
+                ip += ((ip-anchor) >> kSearchStrength) + 1;
+                continue;
+            }
+
+            offset_2 = offset_1;
+            offset_1 = offset;
+
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }
+
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
+                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved;
+    rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    const U32 mls = cParams->searchLength;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+    }
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{
+    U32* const hashLong = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = ms->window.lowLimit;
+    const BYTE* const dictStart = dictBase + lowestIndex;
+    const U32   dictLimit = ms->window.dictLimit;
+    const BYTE* const lowPrefixPtr = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+        const U32 matchIndex = hashSmall[hSmall];
+        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+
+        const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+        const U32 matchLongIndex = hashLong[hLong];
+        const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base;
+        const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+        const U32 current = (U32)(ip-base);
+        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
+        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
+        const BYTE* repMatch = repBase + repIndex;
+        size_t mLength;
+        hashSmall[hSmall] = hashLong[hLong] = current;   /* update hash table */
+
+        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
+           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+                const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8;
+                offset = current - matchLongIndex;
+                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* const match3Base = matchIndex3 < dictLimit ? dictBase : base;
+                const BYTE* match3 = match3Base + matchIndex3;
+                U32 offset;
+                hashLong[h3] = current + 1;
+                if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend;
+                    const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8;
+                    ip++;
+                    offset = current+1 - matchIndex3;
+                    while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+                    const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
+                    offset = current - matchIndex;
+                    while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                }
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else {
+                ip += ((ip-anchor) >> kSearchStrength) + 1;
+                continue;
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2;
+            hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    U32 const mls = cParams->searchLength;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_double_fast.h b/vendor/github.com/DataDog/zstd/zstd_double_fast.h
new file mode 100644
index 0000000..6d80b27
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_double_fast.h

@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "mem.h"      /* U32 */
+#include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end);
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_errors.h b/vendor/github.com/DataDog/zstd/zstd_errors.h
new file mode 100644
index 0000000..57533f2
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_errors.h

@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDERRORLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */

diff --git a/vendor/github.com/DataDog/zstd/zstd_fast.c b/vendor/github.com/DataDog/zstd/zstd_fast.c
new file mode 100644
index 0000000..df4d28b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_fast.c

@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_fast.h"
+
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end)
+{
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog;
+    U32  const mls = cParams->searchLength;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const current = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const hash = ZSTD_hashPtr(ip + i, hBits, mls);
+            if (i == 0 || hashTable[hash] == 0)
+                hashTable[hash] = current + i;
+        }
+    }
+}
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 const stepSize, U32 const mls)
+{
+    U32* const hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = ms->window.dictLimit;
+    const BYTE* const lowest = base + lowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved = 0;
+
+    /* init */
+    ip += (ip==lowest);
+    {   U32 const maxRep = (U32)(ip-lowest);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndex = hashTable[h];
+        const BYTE* match = base + matchIndex;
+        hashTable[h] = current;   /* update hash table */
+
+        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ( (matchIndex <= lowestIndex)
+              || (MEM_read32(match) != MEM_read32(ip)) ) {
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+                continue;
+            }
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            {   U32 const offset = (U32)(ip-match);
+                while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }   }
+
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; }  /* swap offset_2 <=> offset_1 */
+                hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved;
+    rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    U32 const hlog = cParams->hashLog;
+    U32 const mls = cParams->searchLength;
+    U32 const stepSize = cParams->targetLength;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
+    case 5 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
+    case 6 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
+    case 7 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
+    }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 const stepSize, U32 const mls)
+{
+    U32* hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = ms->window.lowLimit;
+    const BYTE* const dictStart = dictBase + lowestIndex;
+    const U32   dictLimit = ms->window.dictLimit;
+    const BYTE* const lowPrefixPtr = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+        const U32 matchIndex = hashTable[h];
+        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+        const U32 current = (U32)(ip-base);
+        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
+        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
+        const BYTE* repMatch = repBase + repIndex;
+        size_t mLength;
+        hashTable[h] = current;   /* update hash table */
+
+        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
+           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ( (matchIndex < lowestIndex) ||
+                 (MEM_read32(match) != MEM_read32(ip)) ) {
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+                continue;
+            }
+            {   const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
+                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                offset = current - matchIndex;
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    U32 const hlog = cParams->hashLog;
+    U32 const mls = cParams->searchLength;
+    U32 const stepSize = cParams->targetLength;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
+    case 5 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
+    case 6 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
+    case 7 :
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_fast.h b/vendor/github.com/DataDog/zstd/zstd_fast.h
new file mode 100644
index 0000000..f0438ad
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_fast.h

@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FAST_H
+#define ZSTD_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "mem.h"      /* U32 */
+#include "zstd_compress_internal.h"
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end);
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_internal.h b/vendor/github.com/DataDog/zstd/zstd_internal.h
new file mode 100644
index 0000000..65c08a8
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_internal.h

@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "compiler.h"
+#include "mem.h"
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY  /* XXH64_state_t */
+#endif
+#include "xxhash.h"                /* XXH_reset, update, digest */
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+#  include <assert.h>
+#else
+#  ifndef assert
+#    define assert(condition) ((void)0)
+#  endif
+#endif
+
+#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
+
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+#  include <stdio.h>
+extern int g_debuglog_enable;
+/* recommended values for ZSTD_DEBUG display levels :
+ * 1 : no display, enables assert() only
+ * 2 : reserved for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (*very* verbose) */
+#  define RAWLOG(l, ...) {                                      \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                    \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+                    fprintf(stderr, " \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; }  /* check and Forward error code */
+#define CHECK_E(f, e) { size_t const errcod = f; if (ERR_isError(errcod)) return ERROR(e); }  /* check and send Error code */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+#define ZSTD_WINDOWLOG_DEFAULTMAX 27 /* Default maximum allowed window log */
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4
+static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2,
+                                             2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2,
+                                             2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2,
+                                             2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[DefaultMaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2,
+                                                     2, 1, 1, 1, 1, 1, 1, 1,
+                                                     1, 1, 1, 1, 1, 1, 1, 1,
+                                                    -1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy() :
+ *  custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd)   /* should be faster for decoding, but strangely, not verified on all platform */
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = (BYTE*)dstEnd;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+/*-*******************************************
+*  Private declarations
+*********************************************/
+typedef struct seqDef_s {
+    U32 offset;
+    U16 litLength;
+    U16 matchLength;
+} seqDef;
+
+typedef struct {
+    seqDef* sequencesStart;
+    seqDef* sequences;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* llCode;
+    BYTE* mlCode;
+    BYTE* ofCode;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+} seqStore_t;
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+/* custom memory allocation functions */
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void* ptr, ZSTD_customMem customMem);
+
+
+MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    {
+#   if defined(_MSC_VER)   /* Visual */
+        unsigned long r=0;
+        _BitScanReverse(&r, val);
+        return (unsigned)r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
+        return 31 - __builtin_clz(val);
+#   else   /* Software version */
+        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+        U32 v = val;
+        v |= v >> 1;
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
+#   endif
+    }
+}
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */

diff --git a/vendor/github.com/DataDog/zstd/zstd_lazy.c b/vendor/github.com/DataDog/zstd/zstd_lazy.c
new file mode 100644
index 0000000..9f15812
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_lazy.c

@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_lazy.h"
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+
+void ZSTD_updateDUBT(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend,
+                U32 mls)
+{
+    U32* const hashTable = ms->hashTable;
+    U32  const hashLog = cParams->hashLog;
+
+    U32* const bt = ms->chainTable;
+    U32  const btLog  = cParams->chainLog - 1;
+    U32  const btMask = (1 << btLog) - 1;
+
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    if (idx != target)
+        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+                    idx, target, ms->window.dictLimit);
+    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
+    (void)iend;
+
+    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
+    for ( ; idx < target ; idx++) {
+        size_t const h  = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
+        U32    const matchIndex = hashTable[h];
+
+        U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
+        U32*   const sortMarkPtr  = nextCandidatePtr + 1;
+
+        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
+        hashTable[h] = idx;   /* Update Hash Table */
+        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
+        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
+    }
+    ms->nextToUpdate = target;
+}
+
+
+/** ZSTD_insertDUBT1() :
+ *  sort one already inserted but unsorted position
+ *  assumption : current >= btlow == (current - btmask)
+ *  doesn't fail */
+static void ZSTD_insertDUBT1(
+                 ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                 U32 current, const BYTE* inputEnd,
+                 U32 nbCompares, U32 btLow, int extDict)
+{
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
+    const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 matchIndex = *smallerPtr;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = ms->window.lowLimit;
+
+    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+                current, dictLimit, windowLow);
+    assert(current >= btLow);
+    assert(ip < iend);   /* condition for ZSTD_count */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < current);
+
+        if ( (!extDict)
+          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
+          || (current < dictLimit) /* both in extDict */) {
+            const BYTE* const mBase = !extDict || ((matchIndex+matchLength) >= dictLimit) ? base : dictBase;
+            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
+                 || (current < dictLimit) );
+            match = mBase + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+                    current, matchIndex, (U32)matchLength);
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+                        matchIndex, btLow, nextPtr[1]);
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+                        matchIndex, btLow, nextPtr[0]);
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+}
+
+
+static size_t ZSTD_DUBT_findBestMatch (
+                            ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                            const BYTE* const ip, const BYTE* const iend,
+                            size_t* offsetPtr,
+                            U32 const mls,
+                            U32 const extDict)
+{
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32          matchIndex  = hashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    U32    const current = (U32)(ip-base);
+    U32    const windowLow = ms->window.lowLimit;
+
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32    const btLow = (btMask >= current) ? 0 : current - btMask;
+    U32    const unsortLimit = MAX(btLow, windowLow);
+
+    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
+    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+    U32          nbCompares = 1U << cParams->searchLog;
+    U32          nbCandidates = nbCompares;
+    U32          previousCandidate = 0;
+
+    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+    assert(ip <= iend-8);   /* required for h calculation */
+
+    /* reach end of unsorted candidates list */
+    while ( (matchIndex > unsortLimit)
+         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+         && (nbCandidates > 1) ) {
+        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+                    matchIndex);
+        *unsortedMark = previousCandidate;
+        previousCandidate = matchIndex;
+        matchIndex = *nextCandidate;
+        nextCandidate = bt + 2*(matchIndex&btMask);
+        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+        nbCandidates --;
+    }
+
+    if ( (matchIndex > unsortLimit)
+      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+                    matchIndex);
+        *nextCandidate = *unsortedMark = 0;   /* nullify next candidate if it's still unsorted (note : simplification, detrimental to compression ratio, beneficial for speed) */
+    }
+
+    /* batch sort stacked candidates */
+    matchIndex = previousCandidate;
+    while (matchIndex) {  /* will end on matchIndex == 0 */
+        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
+        ZSTD_insertDUBT1(ms, cParams, matchIndex, iend,
+                         nbCandidates, unsortLimit, extDict);
+        matchIndex = nextCandidateIdx;
+        nbCandidates++;
+    }
+
+    /* find longest match */
+    {   size_t commonLengthSmaller=0, commonLengthLarger=0;
+        const BYTE* const dictBase = ms->window.dictBase;
+        const U32 dictLimit = ms->window.dictLimit;
+        const BYTE* const dictEnd = dictBase + dictLimit;
+        const BYTE* const prefixStart = base + dictLimit;
+        U32* smallerPtr = bt + 2*(current&btMask);
+        U32* largerPtr  = bt + 2*(current&btMask) + 1;
+        U32 matchEndIdx = current+8+1;
+        U32 dummy32;   /* to be nullified at the end */
+        size_t bestLength = 0;
+
+        matchIndex  = hashTable[h];
+        hashTable[h] = current;   /* Update Hash Table */
+
+        while (nbCompares-- && (matchIndex > windowLow)) {
+            U32* const nextPtr = bt + 2*(matchIndex & btMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match;
+
+            if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+                match = base + matchIndex;
+                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+            } else {
+                match = dictBase + matchIndex;
+                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+                if (matchIndex+matchLength >= dictLimit)
+                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+            }
+
+            if (matchLength > bestLength) {
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (match[matchLength] < ip[matchLength]) {
+                /* match is smaller than current */
+                *smallerPtr = matchIndex;             /* update smaller idx */
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                *largerPtr = matchIndex;
+                commonLengthLarger = matchLength;
+                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                largerPtr = nextPtr;
+                matchIndex = nextPtr[0];
+        }   }
+
+        *smallerPtr = *largerPtr = 0;
+
+        assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
+        if (bestLength >= MINMATCH) {
+            U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                        current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+        }
+        return bestLength;
+    }
+}
+
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls /* template */)
+{
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 0);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(cParams->searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4);
+    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5);
+    case 7 :
+    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6);
+    }
+}
+
+
+/** Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch_extDict (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls)
+{
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch_extDict");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 1);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(cParams->searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 4);
+    case 5 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 5);
+    case 7 :
+    case 6 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 6);
+    }
+}
+
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & mask]
+
+/* Update chains up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, U32 const mls)
+{
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    U32* const chainTable = ms->chainTable;
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    while(idx < target) { /* catch up */
+        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+        hashTable[h] = idx;
+        idx++;
+    }
+
+    ms->nextToUpdate = target;
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip)
+{
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength);
+}
+
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_HcFindBestMatch_generic (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls, const U32 extDict)
+{
+    U32* const chainTable = ms->chainTable;
+    const U32 chainSize = (1 << cParams->chainLog);
+    const U32 chainMask = chainSize-1;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 lowLimit = ms->window.lowLimit;
+    const U32 current = (U32)(ip-base);
+    const U32 minChain = current > chainSize ? current - chainSize : 0;
+    U32 nbAttempts = 1U << cParams->searchLog;
+    size_t ml=4-1;
+
+    /* HC4 match finder */
+    U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+
+    for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+        size_t currentMl=0;
+        if ((!extDict) || matchIndex >= dictLimit) {
+            const BYTE* const match = base + matchIndex;
+            if (match[ml] == ip[ml])   /* potentially better */
+                currentMl = ZSTD_count(ip, match, iLimit);
+        } else {
+            const BYTE* const match = dictBase + matchIndex;
+            assert(match+4 <= dictEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+        }
+
+        if (matchIndex <= minChain) break;
+        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+    }
+
+    return ml;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(cParams->searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 0);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 0);
+    case 7 :
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 0);
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* const offsetPtr)
+{
+    switch(cParams->searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 1);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 1);
+    case 7 :
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 1);
+    }
+}
+
+
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base + ms->window.dictLimit;
+
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+    searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
+    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+
+    /* init */
+    ip += (ip==base);
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    {   U32 const maxRep = (U32)(ip-base);
+        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+    }
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+
+        /* check repCode */
+        if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) {
+            /* repcode : we take it */
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                if ((mlRep >= 4) && (gain2 > gain1))
+                    matchLength = mlRep, offset = 0, start = ip;
+            }
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const ml2 = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(ml2 * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((ml2 >= 4) && (gain2 > gain1))
+                        matchLength = ml2, offset = 0, start = ip;
+                }
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* NOTE:
+         * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+         * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
+         * overflows the pointer, which is undefined behavior.
+         */
+        /* catch up */
+        if (offset) {
+            while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base))
+                 && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
+                { start--; matchLength++; }
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        while ( ((ip <= ilimit) & (offset_2>0))
+             && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+            /* store sequence */
+            matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+            offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+            ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+            ip += matchLength;
+            anchor = ip;
+            continue;   /* faster when present ... (?) */
+    }   }
+
+    /* Save reps for next block */
+    rep[0] = offset_1 ? offset_1 : savedOffset;
+    rep[1] = offset_2 ? offset_2 : savedOffset;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+}
+
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+}
+
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+}
+
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_lazy_extDict_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32 dictLimit = ms->window.dictLimit;
+    const U32 lowestIndex = ms->window.lowLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+    const BYTE* const dictStart  = dictBase + lowestIndex;
+
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+
+    /* init */
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    ip += (ip == prefixStart);
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+        U32 current = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 repIndex = (U32)(current+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))   /* intentional overflow */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                if (depth==0) goto _storeSequence;
+        }   }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+         if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            current++;
+            /* check repCode */
+            if (offset) {
+                const U32 repIndex = (U32)(current - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                    int const gain2 = (int)(repLength * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((repLength >= 4) && (gain2 > gain1))
+                        matchLength = repLength, offset = 0, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                current++;
+                /* check repCode */
+                if (offset) {
+                    const U32 repIndex = (U32)(current - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                        int const gain2 = (int)(repLength * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((repLength >= 4) && (gain2 > gain1))
+                            matchLength = repLength, offset = 0, start = ip;
+                }   }
+
+                /* search match, depth 2 */
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* catch up */
+        if (offset) {
+            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        while (ip <= ilimit) {
+            const U32 repIndex = (U32)((ip-base) - offset_2);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+    }   }
+
+    /* Save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+}
+
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_lazy.h b/vendor/github.com/DataDog/zstd/zstd_lazy.h
new file mode 100644
index 0000000..bda064f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_lazy.h

@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+U32 ZSTD_insertAndFindFirstIndex(
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */
+
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_ldm.c b/vendor/github.com/DataDog/zstd/zstd_ldm.c
new file mode 100644
index 0000000..bffd8a3
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_ldm.c

@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+
+#include "zstd_ldm.h"
+
+#include "zstd_fast.h"          /* ZSTD_fillHashTable() */
+#include "zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */
+
+#define LDM_BUCKET_SIZE_LOG 3
+#define LDM_MIN_MATCH_LENGTH 64
+#define LDM_HASH_RLOG 7
+#define LDM_HASH_CHAR_OFFSET 10
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams)
+{
+    U32 const windowLog = cParams->windowLog;
+    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+    if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+    if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+    if (cParams->strategy >= ZSTD_btopt) {
+      /* Get out of the way of the optimal parser */
+      U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength);
+      assert(minMatch >= ZSTD_LDM_MINMATCH_MIN);
+      assert(minMatch <= ZSTD_LDM_MINMATCH_MAX);
+      params->minMatchLength = minMatch;
+    }
+    if (params->hashLog == 0) {
+        params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG);
+        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+    }
+    if (params->hashEveryLog == 0) {
+        params->hashEveryLog =
+                windowLog < params->hashLog ? 0 : windowLog - params->hashLog;
+    }
+    params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
+}
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+    size_t const ldmHSize = ((size_t)1) << params.hashLog;
+    size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
+    size_t const ldmBucketSize =
+        ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
+    size_t const totalSize = ldmBucketSize + ldmHSize * sizeof(ldmEntry_t);
+    return params.enableLdm ? totalSize : 0;
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+    return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
+}
+
+/** ZSTD_ldm_getSmallHash() :
+ *  numBits should be <= 32
+ *  If numBits==0, returns 0.
+ *  @return : the most significant numBits of value. */
+static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
+{
+    assert(numBits <= 32);
+    return numBits == 0 ? 0 : (U32)(value >> (64 - numBits));
+}
+
+/** ZSTD_ldm_getChecksum() :
+ *  numBitsToDiscard should be <= 32
+ *  @return : the next most significant 32 bits after numBitsToDiscard */
+static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
+{
+    assert(numBitsToDiscard <= 32);
+    return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF;
+}
+
+/** ZSTD_ldm_getTag() ;
+ *  Given the hash, returns the most significant numTagBits bits
+ *  after (32 + hbits) bits.
+ *
+ *  If there are not enough bits remaining, return the last
+ *  numTagBits bits. */
+static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
+{
+    assert(numTagBits < 32 && hbits <= 32);
+    if (32 - hbits < numTagBits) {
+        return hash & (((U32)1 << numTagBits) - 1);
+    } else {
+        return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1);
+    }
+}
+
+/** ZSTD_ldm_getBucket() :
+ *  Returns a pointer to the start of the bucket associated with hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+    return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
+}
+
+/** ZSTD_ldm_insertEntry() :
+ *  Insert the entry with corresponding hash into the hash table */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+                                 size_t const hash, const ldmEntry_t entry,
+                                 ldmParams_t const ldmParams)
+{
+    BYTE* const bucketOffsets = ldmState->bucketOffsets;
+    *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry;
+    bucketOffsets[hash]++;
+    bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1;
+}
+
+/** ZSTD_ldm_makeEntryAndInsertByTag() :
+ *
+ *  Gets the small hash, checksum, and tag from the rollingHash.
+ *
+ *  If the tag matches (1 << ldmParams.hashEveryLog)-1, then
+ *  creates an ldmEntry from the offset, and inserts it into the hash table.
+ *
+ *  hBits is the length of the small hash, which is the most significant hBits
+ *  of rollingHash. The checksum is the next 32 most significant bits, followed
+ *  by ldmParams.hashEveryLog bits that make up the tag. */
+static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
+                                             U64 const rollingHash,
+                                             U32 const hBits,
+                                             U32 const offset,
+                                             ldmParams_t const ldmParams)
+{
+    U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog);
+    U32 const tagMask = ((U32)1 << ldmParams.hashEveryLog) - 1;
+    if (tag == tagMask) {
+        U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits);
+        U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+        ldmEntry_t entry;
+        entry.offset = offset;
+        entry.checksum = checksum;
+        ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams);
+    }
+}
+
+/** ZSTD_ldm_getRollingHash() :
+ *  Get a 64-bit hash using the first len bytes from buf.
+ *
+ *  Giving bytes s = s_1, s_2, ... s_k, the hash is defined to be
+ *  H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0)
+ *
+ *  where the constant a is defined to be prime8bytes.
+ *
+ *  The implementation adds an offset to each byte, so
+ *  H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ... */
+static U64 ZSTD_ldm_getRollingHash(const BYTE* buf, U32 len)
+{
+    U64 ret = 0;
+    U32 i;
+    for (i = 0; i < len; i++) {
+        ret *= prime8bytes;
+        ret += buf[i] + LDM_HASH_CHAR_OFFSET;
+    }
+    return ret;
+}
+
+/** ZSTD_ldm_ipow() :
+ *  Return base^exp. */
+static U64 ZSTD_ldm_ipow(U64 base, U64 exp)
+{
+    U64 ret = 1;
+    while (exp) {
+        if (exp & 1) { ret *= base; }
+        exp >>= 1;
+        base *= base;
+    }
+    return ret;
+}
+
+U64 ZSTD_ldm_getHashPower(U32 minMatchLength) {
+    DEBUGLOG(4, "ZSTD_ldm_getHashPower: mml=%u", minMatchLength);
+    assert(minMatchLength >= ZSTD_LDM_MINMATCH_MIN);
+    return ZSTD_ldm_ipow(prime8bytes, minMatchLength - 1);
+}
+
+/** ZSTD_ldm_updateHash() :
+ *  Updates hash by removing toRemove and adding toAdd. */
+static U64 ZSTD_ldm_updateHash(U64 hash, BYTE toRemove, BYTE toAdd, U64 hashPower)
+{
+    hash -= ((toRemove + LDM_HASH_CHAR_OFFSET) * hashPower);
+    hash *= prime8bytes;
+    hash += toAdd + LDM_HASH_CHAR_OFFSET;
+    return hash;
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ *  Returns the number of bytes that match backwards before pIn and pMatch.
+ *
+ *  We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+            const BYTE* pIn, const BYTE* pAnchor,
+            const BYTE* pMatch, const BYTE* pBase)
+{
+    size_t matchLength = 0;
+    while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
+        pIn--;
+        pMatch--;
+        matchLength++;
+    }
+    return matchLength;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
+ *  This is similar to ZSTD_loadDictionaryContent.
+ *
+ *  The tables for the other strategies are filled within their
+ *  block compressors. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      ZSTD_compressionParameters const* cParams,
+                                      void const* end)
+{
+    const BYTE* const iend = (const BYTE*)end;
+
+    switch(cParams->strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, cParams, iend);
+        ms->nextToUpdate = (U32)(iend - ms->window.base);
+        break;
+
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable(ms, cParams, iend);
+        ms->nextToUpdate = (U32)(iend - ms->window.base);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+        break;
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    return 0;
+}
+
+/** ZSTD_ldm_fillLdmHashTable() :
+ *
+ *  Fills hashTable from (lastHashed + 1) to iend (non-inclusive).
+ *  lastHash is the rolling hash that corresponds to lastHashed.
+ *
+ *  Returns the rolling hash corresponding to position iend-1. */
+static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
+                                     U64 lastHash, const BYTE* lastHashed,
+                                     const BYTE* iend, const BYTE* base,
+                                     U32 hBits, ldmParams_t const ldmParams)
+{
+    U64 rollingHash = lastHash;
+    const BYTE* cur = lastHashed + 1;
+
+    while (cur < iend) {
+        rollingHash = ZSTD_ldm_updateHash(rollingHash, cur[-1],
+                                          cur[ldmParams.minMatchLength-1],
+                                          state->hashPower);
+        ZSTD_ldm_makeEntryAndInsertByTag(state,
+                                         rollingHash, hBits,
+                                         (U32)(cur - base), ldmParams);
+        ++cur;
+    }
+    return rollingHash;
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ *  Sets cctx->nextToUpdate to a position corresponding closer to anchor
+ *  if it is far way
+ *  (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+    U32 const current = (U32)(anchor - ms->window.base);
+    if (current > ms->nextToUpdate + 1024) {
+        ms->nextToUpdate =
+            current - MIN(512, current - ms->nextToUpdate - 1024);
+    }
+}
+
+static size_t ZSTD_ldm_generateSequences_internal(
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U64 const hashPower = ldmState->hashPower;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    U32 const ldmBucketSize = 1U << params->bucketSizeLog;
+    U32 const hashEveryLog = params->hashEveryLog;
+    U32 const ldmTagMask = (1U << params->hashEveryLog) - 1;
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+    BYTE const* const lowPrefixPtr = base + dictLimit;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);
+    /* Input positions */
+    BYTE const* anchor = istart;
+    BYTE const* ip = istart;
+    /* Rolling hash */
+    BYTE const* lastHashed = NULL;
+    U64 rollingHash = 0;
+
+    while (ip <= ilimit) {
+        size_t mLength;
+        U32 const current = (U32)(ip - base);
+        size_t forwardMatchLength = 0, backwardMatchLength = 0;
+        ldmEntry_t* bestEntry = NULL;
+        if (ip != istart) {
+            rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0],
+                                              lastHashed[minMatchLength],
+                                              hashPower);
+        } else {
+            rollingHash = ZSTD_ldm_getRollingHash(ip, minMatchLength);
+        }
+        lastHashed = ip;
+
+        /* Do not insert and do not look for a match */
+        if (ZSTD_ldm_getTag(rollingHash, hBits, hashEveryLog) != ldmTagMask) {
+           ip++;
+           continue;
+        }
+
+        /* Get the best entry and compute the match lengths */
+        {
+            ldmEntry_t* const bucket =
+                ZSTD_ldm_getBucket(ldmState,
+                                   ZSTD_ldm_getSmallHash(rollingHash, hBits),
+                                   *params);
+            ldmEntry_t* cur;
+            size_t bestMatchLength = 0;
+            U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+
+            for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
+                size_t curForwardMatchLength, curBackwardMatchLength,
+                       curTotalMatchLength;
+                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
+                    continue;
+                }
+                if (extDict) {
+                    BYTE const* const curMatchBase =
+                        cur->offset < dictLimit ? dictBase : base;
+                    BYTE const* const pMatch = curMatchBase + cur->offset;
+                    BYTE const* const matchEnd =
+                        cur->offset < dictLimit ? dictEnd : iend;
+                    BYTE const* const lowMatchPtr =
+                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+
+                    curForwardMatchLength = ZSTD_count_2segments(
+                                                ip, pMatch, iend,
+                                                matchEnd, lowPrefixPtr);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowMatchPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
+                } else { /* !extDict */
+                    BYTE const* const pMatch = base + cur->offset;
+                    curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowPrefixPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
+                }
+
+                if (curTotalMatchLength > bestMatchLength) {
+                    bestMatchLength = curTotalMatchLength;
+                    forwardMatchLength = curForwardMatchLength;
+                    backwardMatchLength = curBackwardMatchLength;
+                    bestEntry = cur;
+                }
+            }
+        }
+
+        /* No match found -- continue searching */
+        if (bestEntry == NULL) {
+            ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
+                                             hBits, current,
+                                             *params);
+            ip++;
+            continue;
+        }
+
+        /* Match found */
+        mLength = forwardMatchLength + backwardMatchLength;
+        ip -= backwardMatchLength;
+
+        {
+            /* Store the sequence:
+             * ip = current - backwardMatchLength
+             * The match is at (bestEntry->offset - backwardMatchLength)
+             */
+            U32 const matchIndex = bestEntry->offset;
+            U32 const offset = current - matchIndex;
+            rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+            /* Out of sequence storage */
+            if (rawSeqStore->size == rawSeqStore->capacity)
+                return ERROR(dstSize_tooSmall);
+            seq->litLength = (U32)(ip - anchor);
+            seq->matchLength = (U32)mLength;
+            seq->offset = offset;
+            rawSeqStore->size++;
+        }
+
+        /* Insert the current entry into the hash table */
+        ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
+                                         (U32)(lastHashed - base),
+                                         *params);
+
+        assert(ip + backwardMatchLength == lastHashed);
+
+        /* Fill the hash table from lastHashed+1 to ip+mLength*/
+        /* Heuristic: don't need to fill the entire table at end of block */
+        if (ip + mLength <= ilimit) {
+            rollingHash = ZSTD_ldm_fillLdmHashTable(
+                              ldmState, rollingHash, lastHashed,
+                              ip + mLength, base, hBits, *params);
+            lastHashed = ip + mLength - 1;
+        }
+        ip += mLength;
+        anchor = ip;
+    }
+    return iend - anchor;
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ *  reduce table indexes by `reducerValue` */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+                                 U32 const reducerValue)
+{
+    U32 u;
+    for (u = 0; u < size; u++) {
+        if (table[u].offset < reducerValue) table[u].offset = 0;
+        else table[u].offset -= reducerValue;
+    }
+}
+
+size_t ZSTD_ldm_generateSequences(
+        ldmState_t* ldmState, rawSeqStore_t* sequences,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    U32 const maxDist = 1U << params->windowLog;
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    size_t const kMaxChunkSize = 1 << 20;
+    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
+    size_t chunk;
+    size_t leftoverSize = 0;
+
+    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+    /* Check that ZSTD_window_update() has been called for this chunk prior
+     * to passing it to this function.
+     */
+    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+    /* The input could be very large (in zstdmt), so it must be broken up into
+     * chunks to enforce the maximmum distance and handle overflow correction.
+     */
+    assert(sequences->pos <= sequences->size);
+    assert(sequences->size <= sequences->capacity);
+    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+        size_t const remaining = (size_t)(iend - chunkStart);
+        BYTE const *const chunkEnd =
+            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+        size_t const chunkSize = chunkEnd - chunkStart;
+        size_t newLeftoverSize;
+        size_t const prevSize = sequences->size;
+
+        assert(chunkStart < iend);
+        /* 1. Perform overflow correction if necessary. */
+        if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
+            U32 const ldmHSize = 1U << params->hashLog;
+            U32 const correction = ZSTD_window_correctOverflow(
+                &ldmState->window, /* cycleLog */ 0, maxDist, src);
+            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
+        }
+        /* 2. We enforce the maximum offset allowed.
+         *
+         * kMaxChunkSize should be small enough that we don't lose too much of
+         * the window through early invalidation.
+         * TODO: * Test the chunk size.
+         *       * Try invalidation after the sequence generation and test the
+         *         the offset against maxDist directly.
+         */
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL);
+        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+            ldmState, sequences, params, chunkStart, chunkSize);
+        if (ZSTD_isError(newLeftoverSize))
+            return newLeftoverSize;
+        /* 4. We add the leftover literals from previous iterations to the first
+         *    newly generated sequence, or add the `newLeftoverSize` if none are
+         *    generated.
+         */
+        /* Prepend the leftover literals from the last call */
+        if (prevSize < sequences->size) {
+            sequences->seq[prevSize].litLength += (U32)leftoverSize;
+            leftoverSize = newLeftoverSize;
+        } else {
+            assert(newLeftoverSize == chunkSize);
+            leftoverSize += chunkSize;
+        }
+    }
+    return 0;
+}
+
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+    while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+        if (srcSize <= seq->litLength) {
+            /* Skip past srcSize literals */
+            seq->litLength -= (U32)srcSize;
+            return;
+        }
+        srcSize -= seq->litLength;
+        seq->litLength = 0;
+        if (srcSize < seq->matchLength) {
+            /* Skip past the first srcSize of the match */
+            seq->matchLength -= (U32)srcSize;
+            if (seq->matchLength < minMatch) {
+                /* The match is too short, omit it */
+                if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+                    seq[1].litLength += seq[0].matchLength;
+                }
+                rawSeqStore->pos++;
+            }
+            return;
+        }
+        srcSize -= seq->matchLength;
+        seq->matchLength = 0;
+        rawSeqStore->pos++;
+    }
+}
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+                                 U32 const remaining, U32 const minMatch)
+{
+    rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
+    assert(sequence.offset > 0);
+    /* Likely: No partial sequence */
+    if (remaining >= sequence.litLength + sequence.matchLength) {
+        rawSeqStore->pos++;
+        return sequence;
+    }
+    /* Cut the sequence short (offset == 0 ==> rest is literals). */
+    if (remaining <= sequence.litLength) {
+        sequence.offset = 0;
+    } else if (remaining < sequence.litLength + sequence.matchLength) {
+        sequence.matchLength = remaining - sequence.litLength;
+        if (sequence.matchLength < minMatch) {
+            sequence.offset = 0;
+        }
+    }
+    /* Skip past `remaining` bytes for the future sequences. */
+    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+    return sequence;
+}
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+    int const extDict)
+{
+    unsigned const minMatch = cParams->searchLength;
+    ZSTD_blockCompressor const blockCompressor =
+        ZSTD_selectBlockCompressor(cParams->strategy, extDict);
+    BYTE const* const base = ms->window.base;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    /* Input positions */
+    BYTE const* ip = istart;
+
+    assert(rawSeqStore->pos <= rawSeqStore->size);
+    assert(rawSeqStore->size <= rawSeqStore->capacity);
+    /* Loop through each sequence and apply the block compressor to the lits */
+    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+        /* maybeSplitSequence updates rawSeqStore->pos */
+        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                   (U32)(iend - ip), minMatch);
+        int i;
+        /* End signal */
+        if (sequence.offset == 0)
+            break;
+
+        assert(sequence.offset <= (1U << cParams->windowLog));
+        assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+        /* Fill tables for block compressor */
+        ZSTD_ldm_limitTableUpdate(ms, ip);
+        ZSTD_ldm_fillFastTables(ms, cParams, ip);
+        /* Run the block compressor */
+        {
+            size_t const newLitLength =
+                blockCompressor(ms, seqStore, rep, cParams, ip,
+                                sequence.litLength);
+            ip += sequence.litLength;
+            ms->nextToUpdate = (U32)(ip - base);
+            /* Update the repcodes */
+            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+                rep[i] = rep[i-1];
+            rep[0] = sequence.offset;
+            /* Store the sequence */
+            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength,
+                          sequence.offset + ZSTD_REP_MOVE,
+                          sequence.matchLength - MINMATCH);
+            ip += sequence.matchLength;
+        }
+    }
+    /* Fill the tables for the block compressor */
+    ZSTD_ldm_limitTableUpdate(ms, ip);
+    ZSTD_ldm_fillFastTables(ms, cParams, ip);
+    /* Compress the last literals */
+    {
+        size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams,
+                                                    ip, iend - ip);
+        ms->nextToUpdate = (U32)(iend - base);
+        return lastLiterals;
+    }
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_ldm.h b/vendor/github.com/DataDog/zstd/zstd_ldm.h
new file mode 100644
index 0000000..0c3789f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_ldm.h

@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+
+#ifndef ZSTD_LDM_H
+#define ZSTD_LDM_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"   /* ldmParams_t, U32 */
+#include "zstd.h"   /* ZSTD_CCtx, size_t */
+
+/*-*************************************
+*  Long distance matching
+***************************************/
+
+#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_DEFAULTMAX
+
+/**
+ * ZSTD_ldm_generateSequences():
+ *
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
+ *
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ *       sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+            ldmState_t* ldms, rawSeqStore_t* sequences,
+            ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_compressionParameters const* cParams,
+            void const* src, size_t srcSize,
+            int const extDict);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data with is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+    U32 const minMatch);
+
+
+/** ZSTD_ldm_getTableSize() :
+ *  Estimate the space needed for long distance matching tables or 0 if LDM is
+ *  disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getSeqSpace() :
+ *  Return an upper bound on the number of sequences that can be produced by
+ *  the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
+
+/** ZSTD_ldm_getTableSize() :
+ *  Return prime8bytes^(minMatchLength-1) */
+U64 ZSTD_ldm_getHashPower(U32 minMatchLength);
+
+/** ZSTD_ldm_adjustParameters() :
+ *  If the params->hashEveryLog is not set, set it to its default value based on
+ *  windowLog and params->hashLog.
+ *
+ *  Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
+ *  params->hashLog if it is not).
+ *
+ *  Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_legacy.h b/vendor/github.com/DataDog/zstd/zstd_legacy.h
new file mode 100644
index 0000000..5893cb9
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_legacy.h

@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LEGACY_H
+#define ZSTD_LEGACY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include "mem.h"            /* MEM_STATIC */
+#include "error_private.h"  /* ERROR */
+#include "zstd.h"           /* ZSTD_inBuffer, ZSTD_outBuffer */
+
+#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0)
+#  undef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 8
+#endif
+
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+#  include "zstd_v01.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+#  include "zstd_v02.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+#  include "zstd_v03.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+#  include "zstd_v04.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+#  include "zstd_v05.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+#  include "zstd_v06.h"
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+#  include "zstd_v07.h"
+#endif
+
+/** ZSTD_isLegacy() :
+    @return : > 0 if supported by legacy decoder. 0 otherwise.
+              return value is the version.
+*/
+MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize)
+{
+    U32 magicNumberLE;
+    if (srcSize<4) return 0;
+    magicNumberLE = MEM_readLE32(src);
+    switch(magicNumberLE)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case ZSTDv01_magicNumberLE:return 1;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case ZSTDv02_magicNumber : return 2;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case ZSTDv03_magicNumber : return 3;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case ZSTDv04_magicNumber : return 4;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case ZSTDv05_MAGICNUMBER : return 5;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case ZSTDv06_MAGICNUMBER : return 6;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case ZSTDv07_MAGICNUMBER : return 7;
+#endif
+        default : return 0;
+    }
+}
+
+
+MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize)
+{
+    U32 const version = ZSTD_isLegacy(src, srcSize);
+    if (version < 5) return 0;  /* no decompressed size in frame header, or not a legacy format */
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+    if (version==5) {
+        ZSTDv05_parameters fParams;
+        size_t const frResult = ZSTDv05_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;
+        return fParams.srcSize;
+    }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+    if (version==6) {
+        ZSTDv06_frameParams fParams;
+        size_t const frResult = ZSTDv06_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;
+        return fParams.frameContentSize;
+    }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+    if (version==7) {
+        ZSTDv07_frameParams fParams;
+        size_t const frResult = ZSTDv07_getFrameParams(&fParams, src, srcSize);
+        if (frResult != 0) return 0;
+        return fParams.frameContentSize;
+    }
+#endif
+    return 0;   /* should not be possible */
+}
+
+
+MEM_STATIC size_t ZSTD_decompressLegacy(
+                     void* dst, size_t dstCapacity,
+               const void* src, size_t compressedSize,
+               const void* dict,size_t dictSize)
+{
+    U32 const version = ZSTD_isLegacy(src, compressedSize);
+    (void)dst; (void)dstCapacity; (void)dict; (void)dictSize;  /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
+    switch(version)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            {   size_t result;
+                ZSTDv05_DCtx* const zd = ZSTDv05_createDCtx();
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv05_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv05_freeDCtx(zd);
+                return result;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            {   size_t result;
+                ZSTDv06_DCtx* const zd = ZSTDv06_createDCtx();
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv06_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv06_freeDCtx(zd);
+                return result;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            {   size_t result;
+                ZSTDv07_DCtx* const zd = ZSTDv07_createDCtx();
+                if (zd==NULL) return ERROR(memory_allocation);
+                result = ZSTDv07_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+                ZSTDv07_freeDCtx(zd);
+                return result;
+            }
+#endif
+        default :
+            return ERROR(prefix_unknown);
+    }
+}
+
+MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src,
+                                             size_t compressedSize)
+{
+    U32 const version = ZSTD_isLegacy(src, compressedSize);
+    switch(version)
+    {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+        case 1 :
+            return ZSTDv01_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+        case 2 :
+            return ZSTDv02_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+        case 3 :
+            return ZSTDv03_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            return ZSTDv04_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            return ZSTDv05_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            return ZSTDv06_findFrameCompressedSize(src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            return ZSTDv07_findFrameCompressedSize(src, compressedSize);
+#endif
+        default :
+            return ERROR(prefix_unknown);
+    }
+}
+
+MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
+{
+    switch(version)
+    {
+        default :
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)legacyContext;
+            return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext);
+#endif
+    }
+}
+
+
+MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
+                                        const void* dict, size_t dictSize)
+{
+    DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
+    if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
+    switch(newVersion)
+    {
+        default :
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)dict; (void)dictSize;
+            return 0;
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+        {
+            ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv04_decompressInit(dctx);
+            ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+        {
+            ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+        {
+            ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+        {
+            ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext;
+            if (dctx==NULL) return ERROR(memory_allocation);
+            ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize);
+            *legacyContext = dctx;
+            return 0;
+        }
+#endif
+    }
+}
+
+
+
+MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
+                                              ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
+    switch(version)
+    {
+        default :
+        case 1 :
+        case 2 :
+        case 3 :
+            (void)legacyContext; (void)output; (void)input;
+            return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+        case 4 :
+            {
+                ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+        case 5 :
+            {
+                ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+        case 6 :
+            {
+                ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+        case 7 :
+            {
+                ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext;
+                const void* src = (const char*)input->src + input->pos;
+                size_t readSize = input->size - input->pos;
+                void* dst = (char*)output->dst + output->pos;
+                size_t decodedSize = output->size - output->pos;
+                size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+                output->pos += decodedSize;
+                input->pos += readSize;
+                return hintSize;
+            }
+#endif
+    }
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_LEGACY_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_opt.c b/vendor/github.com/DataDog/zstd/zstd_opt.c
new file mode 100644
index 0000000..f63f0c5
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_opt.c

@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "zstd_compress_internal.h"
+#include "zstd_opt.h"
+
+
+#define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */
+#define ZSTD_FREQ_DIV       4   /* log factor when using previous stats to init next stats */
+#define ZSTD_MAX_PRICE     (1<<30)
+
+
+/*-*************************************
+*  Price functions for optimal parser
+***************************************/
+static void ZSTD_setLog2Prices(optState_t* optPtr)
+{
+    optPtr->log2litSum = ZSTD_highbit32(optPtr->litSum+1);
+    optPtr->log2litLengthSum = ZSTD_highbit32(optPtr->litLengthSum+1);
+    optPtr->log2matchLengthSum = ZSTD_highbit32(optPtr->matchLengthSum+1);
+    optPtr->log2offCodeSum = ZSTD_highbit32(optPtr->offCodeSum+1);
+}
+
+
+static void ZSTD_rescaleFreqs(optState_t* const optPtr,
+                              const BYTE* const src, size_t const srcSize)
+{
+    optPtr->staticPrices = 0;
+
+    if (optPtr->litLengthSum == 0) {  /* first init */
+        unsigned u;
+        if (srcSize <= 1024) optPtr->staticPrices = 1;
+
+        assert(optPtr->litFreq!=NULL);
+        for (u=0; u<=MaxLit; u++)
+            optPtr->litFreq[u] = 0;
+        for (u=0; u<srcSize; u++)
+            optPtr->litFreq[src[u]]++;
+        optPtr->litSum = 0;
+        for (u=0; u<=MaxLit; u++) {
+            optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> ZSTD_FREQ_DIV);
+            optPtr->litSum += optPtr->litFreq[u];
+        }
+
+        for (u=0; u<=MaxLL; u++)
+            optPtr->litLengthFreq[u] = 1;
+        optPtr->litLengthSum = MaxLL+1;
+        for (u=0; u<=MaxML; u++)
+            optPtr->matchLengthFreq[u] = 1;
+        optPtr->matchLengthSum = MaxML+1;
+        for (u=0; u<=MaxOff; u++)
+            optPtr->offCodeFreq[u] = 1;
+        optPtr->offCodeSum = (MaxOff+1);
+
+    } else {
+        unsigned u;
+
+        optPtr->litSum = 0;
+        for (u=0; u<=MaxLit; u++) {
+            optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1));
+            optPtr->litSum += optPtr->litFreq[u];
+        }
+        optPtr->litLengthSum = 0;
+        for (u=0; u<=MaxLL; u++) {
+            optPtr->litLengthFreq[u] = 1 + (optPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1));
+            optPtr->litLengthSum += optPtr->litLengthFreq[u];
+        }
+        optPtr->matchLengthSum = 0;
+        for (u=0; u<=MaxML; u++) {
+            optPtr->matchLengthFreq[u] = 1 + (optPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV);
+            optPtr->matchLengthSum += optPtr->matchLengthFreq[u];
+        }
+        optPtr->offCodeSum = 0;
+        for (u=0; u<=MaxOff; u++) {
+            optPtr->offCodeFreq[u] = 1 + (optPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV);
+            optPtr->offCodeSum += optPtr->offCodeFreq[u];
+        }
+    }
+
+    ZSTD_setLog2Prices(optPtr);
+}
+
+
+/* ZSTD_rawLiteralsCost() :
+ * cost of literals (only) in given segment (which length can be null)
+ * does not include cost of literalLength symbol */
+static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+                                const optState_t* const optPtr)
+{
+    if (optPtr->staticPrices) return (litLength*6);  /* 6 bit per literal - no statistic used */
+    if (litLength == 0) return 0;
+
+    /* literals */
+    {   U32 u;
+        U32 cost = litLength * optPtr->log2litSum;
+        for (u=0; u < litLength; u++)
+            cost -= ZSTD_highbit32(optPtr->litFreq[literals[u]]+1);
+        return cost;
+    }
+}
+
+/* ZSTD_litLengthPrice() :
+ * cost of literalLength symbol */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr)
+{
+    if (optPtr->staticPrices) return ZSTD_highbit32((U32)litLength+1);
+
+    /* literal Length */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        U32 const price = LL_bits[llCode] + optPtr->log2litLengthSum - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1);
+        return price;
+    }
+}
+
+/* ZSTD_litLengthPrice() :
+ * cost of the literal part of a sequence,
+ * including literals themselves, and literalLength symbol */
+static U32 ZSTD_fullLiteralsCost(const BYTE* const literals, U32 const litLength,
+                                 const optState_t* const optPtr)
+{
+    return ZSTD_rawLiteralsCost(literals, litLength, optPtr)
+         + ZSTD_litLengthPrice(litLength, optPtr);
+}
+
+/* ZSTD_litLengthContribution() :
+ * @return ( cost(litlength) - cost(0) )
+ * this value can then be added to rawLiteralsCost()
+ * to provide a cost which is directly comparable to a match ending at same position */
+static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr)
+{
+    if (optPtr->staticPrices) return ZSTD_highbit32(litLength+1);
+
+    /* literal Length */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        int const contribution = LL_bits[llCode]
+                        + ZSTD_highbit32(optPtr->litLengthFreq[0]+1)
+                        - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1);
+#if 1
+        return contribution;
+#else
+        return MAX(0, contribution); /* sometimes better, sometimes not ... */
+#endif
+    }
+}
+
+/* ZSTD_literalsContribution() :
+ * creates a fake cost for the literals part of a sequence
+ * which can be compared to the ending cost of a match
+ * should a new match start at this position */
+static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength,
+                                     const optState_t* const optPtr)
+{
+    int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr)
+                           + ZSTD_litLengthContribution(litLength, optPtr);
+    return contribution;
+}
+
+/* ZSTD_getMatchPrice() :
+ * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_getMatchPrice(
+                                    U32 const offset, U32 const matchLength,
+                                    const optState_t* const optPtr,
+                                    int const optLevel)
+{
+    U32 price;
+    U32 const offCode = ZSTD_highbit32(offset+1);
+    U32 const mlBase = matchLength - MINMATCH;
+    assert(matchLength >= MINMATCH);
+
+    if (optPtr->staticPrices)  /* fixed scheme, do not use statistics */
+        return ZSTD_highbit32((U32)mlBase+1) + 16 + offCode;
+
+    price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1);
+    if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */
+
+    /* match Length */
+    {   U32 const mlCode = ZSTD_MLcode(mlBase);
+        price += ML_bits[mlCode] + optPtr->log2matchLengthSum - ZSTD_highbit32(optPtr->matchLengthFreq[mlCode]+1);
+    }
+
+    DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
+    return price;
+}
+
+static void ZSTD_updateStats(optState_t* const optPtr,
+                             U32 litLength, const BYTE* literals,
+                             U32 offsetCode, U32 matchLength)
+{
+    /* literals */
+    {   U32 u;
+        for (u=0; u < litLength; u++)
+            optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+        optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+    }
+
+    /* literal Length */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        optPtr->litLengthFreq[llCode]++;
+        optPtr->litLengthSum++;
+    }
+
+    /* match offset code (0-2=>repCode; 3+=>offset+2) */
+    {   U32 const offCode = ZSTD_highbit32(offsetCode+1);
+        assert(offCode <= MaxOff);
+        optPtr->offCodeFreq[offCode]++;
+        optPtr->offCodeSum++;
+    }
+
+    /* match Length */
+    {   U32 const mlBase = matchLength - MINMATCH;
+        U32 const mlCode = ZSTD_MLcode(mlBase);
+        optPtr->matchLengthFreq[mlCode]++;
+        optPtr->matchLengthSum++;
+    }
+}
+
+
+/* ZSTD_readMINMATCH() :
+ * function safe only for comparisons
+ * assumption : memPtr must be at least 4 bytes before end of buffer */
+MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+{
+    switch (length)
+    {
+    default :
+    case 4 : return MEM_read32(memPtr);
+    case 3 : if (MEM_isLittleEndian())
+                return MEM_read32(memPtr)<<8;
+             else
+                return MEM_read32(memPtr)>>8;
+    }
+}
+
+
+/* Update hashTable3 up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, const BYTE* const ip)
+{
+    U32* const hashTable3 = ms->hashTable3;
+    U32 const hashLog3 = ms->hashLog3;
+    const BYTE* const base = ms->window.base;
+    U32 idx = ms->nextToUpdate3;
+    U32 const target = ms->nextToUpdate3 = (U32)(ip - base);
+    size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+    assert(hashLog3 > 0);
+
+    while(idx < target) {
+        hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
+        idx++;
+    }
+
+    return hashTable3[hash3];
+}
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ *  ip : assumed <= iend-8 .
+ * @return : nb of positions added */
+static U32 ZSTD_insertBt1(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                U32 const mls, U32 const extDict)
+{
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32 matchIndex = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    const U32 current = (U32)(ip-base);
+    const U32 btLow = btMask >= current ? 0 : current - btMask;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = ms->window.lowLimit;
+    U32 matchEndIdx = current+8+1;
+    size_t bestLength = 8;
+    U32 nbCompares = 1U << cParams->searchLog;
+#ifdef ZSTD_C_PREDICT
+    U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+    U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+    predictedSmall += (predictedSmall>0);
+    predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+    DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
+
+    assert(ip <= iend-8);   /* required for h calculation */
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < current);
+
+#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
+            *smallerPtr = matchIndex;
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+            continue;
+        }
+        if (matchIndex == predictedLarge) {
+            *largerPtr = matchIndex;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+            continue;
+        }
+#endif
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+        }
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization */
+    assert(matchEndIdx > current + 8);
+    return matchEndIdx - (current + 8);
+}
+
+FORCE_INLINE_TEMPLATE
+void ZSTD_updateTree_internal(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                const U32 mls, const U32 extDict)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (extDict:%u)",
+                idx, target, extDict);
+
+    while(idx < target)
+        idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, extDict);
+    ms->nextToUpdate = target;
+}
+
+void ZSTD_updateTree(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend)
+{
+    ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, 0 /*extDict*/);
+}
+
+FORCE_INLINE_TEMPLATE
+U32 ZSTD_insertBtAndGetAllMatches (
+                    ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                    const BYTE* const ip, const BYTE* const iLimit, int const extDict,
+                    U32 rep[ZSTD_REP_NUM], U32 const ll0,
+                    ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */)
+{
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    const BYTE* const base = ms->window.base;
+    U32 const current = (U32)(ip-base);
+    U32 const hashLog = cParams->hashLog;
+    U32 const minMatch = (mls==3) ? 3 : 4;
+    U32* const hashTable = ms->hashTable;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32 matchIndex  = hashTable[h];
+    U32* const bt   = ms->chainTable;
+    U32 const btLog = cParams->chainLog - 1;
+    U32 const btMask= (1U << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const dictBase = ms->window.dictBase;
+    U32 const dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    U32 const btLow = btMask >= current ? 0 : current - btMask;
+    U32 const windowLow = ms->window.lowLimit;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = bt + 2*(current&btMask) + 1;
+    U32 matchEndIdx = current+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
+    U32 dummy32;   /* to be nullified at the end */
+    U32 mnum = 0;
+    U32 nbCompares = 1U << cParams->searchLog;
+
+    size_t bestLength = lengthToBeat-1;
+    DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches");
+
+    /* check repCode */
+    {   U32 const lastR = ZSTD_REP_NUM + ll0;
+        U32 repCode;
+        for (repCode = ll0; repCode < lastR; repCode++) {
+            U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+            U32 const repIndex = current - repOffset;
+            U32 repLen = 0;
+            assert(current >= dictLimit);
+            if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) {  /* equivalent to `current > repIndex >= dictLimit` */
+                if (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch)) {
+                    repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
+                }
+            } else {  /* repIndex < dictLimit || repIndex >= current */
+                const BYTE* const repMatch = dictBase + repIndex;
+                assert(current >= windowLow);
+                if ( extDict /* this case only valid in extDict mode */
+                  && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow)  /* equivalent to `current > repIndex >= windowLow` */
+                     & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+            }   }
+            /* save longer solution */
+            if (repLen > bestLength) {
+                DEBUGLOG(8, "found rep-match %u of length %u",
+                            repCode - ll0, (U32)repLen);
+                bestLength = repLen;
+                matches[mnum].off = repCode - ll0;
+                matches[mnum].len = (U32)repLen;
+                mnum++;
+                if ( (repLen > sufficient_len)
+                   | (ip+repLen == iLimit) ) {  /* best possible */
+                    return mnum;
+    }   }   }   }
+
+    /* HC3 match finder */
+    if ((mls == 3) /*static*/ && (bestLength < mls)) {
+        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip);
+        if ((matchIndex3 > windowLow)
+          & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
+            size_t mlen;
+            if ((!extDict) /*static*/ || (matchIndex3 >= dictLimit)) {
+                const BYTE* const match = base + matchIndex3;
+                mlen = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex3;
+                mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
+            }
+
+            /* save best solution */
+            if (mlen >= mls /* == 3 > bestLength */) {
+                DEBUGLOG(8, "found small match with hlog3, of length %u",
+                            (U32)mlen);
+                bestLength = mlen;
+                assert(current > matchIndex3);
+                assert(mnum==0);  /* no prior solution */
+                matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
+                matches[0].len = (U32)mlen;
+                mnum = 1;
+                if ( (mlen > sufficient_len) |
+                     (ip+mlen == iLimit) ) {  /* best possible length */
+                    ms->nextToUpdate = current+1;  /* skip insertion */
+                    return 1;
+    }   }   }   }
+
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match;
+        assert(current > matchIndex);
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* prepare for match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            DEBUGLOG(8, "found match of length %u at distance %u",
+                    (U32)matchLength, current - matchIndex);
+            assert(matchEndIdx > matchIndex);
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+            bestLength = matchLength;
+            matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+            matches[mnum].len = (U32)matchLength;
+            mnum++;
+            if (matchLength > ZSTD_OPT_NUM) break;
+            if (ip+matchLength == iLimit) {  /* equal : no way to know if inf or sup */
+                break;   /* drop, to preserve bt consistency (miss a little bit of compression) */
+            }
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new candidate => larger than match, which was smaller than current */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous, closer to current */
+        } else {
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+
+    assert(matchEndIdx > current+8);
+    ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
+    return mnum;
+}
+
+
+FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* const iHighLimit, int const extDict,
+                        U32 rep[ZSTD_REP_NUM], U32 const ll0,
+                        ZSTD_match_t* matches, U32 const lengthToBeat)
+{
+    U32 const matchLengthSearch = cParams->searchLength;
+    DEBUGLOG(7, "ZSTD_BtGetAllMatches");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, extDict);
+    switch(matchLengthSearch)
+    {
+    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 3);
+    default :
+    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 4);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 5);
+    case 7 :
+    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 6);
+    }
+}
+
+
+/*-*******************************
+*  Optimal parser
+*********************************/
+typedef struct repcodes_s {
+    U32 rep[3];
+} repcodes_t;
+
+repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+{
+    repcodes_t newReps;
+    if (offset >= ZSTD_REP_NUM) {  /* full offset */
+        newReps.rep[2] = rep[1];
+        newReps.rep[1] = rep[0];
+        newReps.rep[0] = offset - ZSTD_REP_MOVE;
+    } else {   /* repcode */
+        U32 const repCode = offset + ll0;
+        if (repCode > 0) {  /* note : if repCode==0, no change */
+            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+            newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+            newReps.rep[1] = rep[0];
+            newReps.rep[0] = currentOffset;
+        } else {   /* repCode == 0 */
+            memcpy(&newReps, rep, sizeof(newReps));
+        }
+    }
+    return newReps;
+}
+
+
+typedef struct {
+    const BYTE* anchor;
+    U32 litlen;
+    U32 rawLitCost;
+} cachedLiteralPrice_t;
+
+static U32 ZSTD_rawLiteralsCost_cached(
+                            cachedLiteralPrice_t* const cachedLitPrice,
+                            const BYTE* const anchor, U32 const litlen,
+                            const optState_t* const optStatePtr)
+{
+    U32 startCost;
+    U32 remainingLength;
+    const BYTE* startPosition;
+
+    if (anchor == cachedLitPrice->anchor) {
+        startCost = cachedLitPrice->rawLitCost;
+        startPosition = anchor + cachedLitPrice->litlen;
+        assert(litlen >= cachedLitPrice->litlen);
+        remainingLength = litlen - cachedLitPrice->litlen;
+    } else {
+        startCost = 0;
+        startPosition = anchor;
+        remainingLength = litlen;
+    }
+
+    {   U32 const rawLitCost = startCost + ZSTD_rawLiteralsCost(startPosition, remainingLength, optStatePtr);
+        cachedLitPrice->anchor = anchor;
+        cachedLitPrice->litlen = litlen;
+        cachedLitPrice->rawLitCost = rawLitCost;
+        return rawLitCost;
+    }
+}
+
+static U32 ZSTD_fullLiteralsCost_cached(
+                            cachedLiteralPrice_t* const cachedLitPrice,
+                            const BYTE* const anchor, U32 const litlen,
+                            const optState_t* const optStatePtr)
+{
+    return ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr)
+         + ZSTD_litLengthPrice(litlen, optStatePtr);
+}
+
+static int ZSTD_literalsContribution_cached(
+                            cachedLiteralPrice_t* const cachedLitPrice,
+                            const BYTE* const anchor, U32 const litlen,
+                            const optState_t* const optStatePtr)
+{
+    int const contribution = ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr)
+                           + ZSTD_litLengthContribution(litlen, optStatePtr);
+    return contribution;
+}
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore,
+                                      U32 rep[ZSTD_REP_NUM],
+                                      ZSTD_compressionParameters const* cParams,
+                                      const void* src, size_t srcSize,
+                                      const int optLevel, const int extDict)
+{
+    optState_t* const optStatePtr = &ms->opt;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4;
+
+    ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+    ZSTD_match_t* const matches = optStatePtr->matchTable;
+    cachedLiteralPrice_t cachedLitPrice;
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_opt_generic");
+    ms->nextToUpdate3 = ms->nextToUpdate;
+    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize);
+    ip += (ip==prefixStart);
+    memset(&cachedLitPrice, 0, sizeof(cachedLitPrice));
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, last_pos = 0;
+        U32 best_mlen, best_off;
+
+        /* find first match */
+        {   U32 const litlen = (U32)(ip - anchor);
+            U32 const ll0 = !litlen;
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, extDict, rep, ll0, matches, minMatch);
+            if (!nbMatches) { ip++; continue; }
+
+            /* initialize opt[0] */
+            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+            opt[0].mlen = 1;
+            opt[0].litlen = litlen;
+
+            /* large match -> immediate encoding */
+            {   U32 const maxML = matches[nbMatches-1].len;
+                DEBUGLOG(7, "found %u matches of maxLength=%u and offset=%u at cPos=%u => start new serie",
+                            nbMatches, maxML, matches[nbMatches-1].off, (U32)(ip-prefixStart));
+
+                if (maxML > sufficient_len) {
+                    best_mlen = maxML;
+                    best_off = matches[nbMatches-1].off;
+                    DEBUGLOG(7, "large match (%u>%u), immediate encoding",
+                                best_mlen, sufficient_len);
+                    cur = 0;
+                    last_pos = 1;
+                    goto _shortestPath;
+            }   }
+
+            /* set prices for first matches starting position == 0 */
+            {   U32 const literalsPrice = ZSTD_fullLiteralsCost_cached(&cachedLitPrice, anchor, litlen, optStatePtr);
+                U32 pos;
+                U32 matchNb;
+                for (pos = 0; pos < minMatch; pos++) {
+                    opt[pos].mlen = 1;
+                    opt[pos].price = ZSTD_MAX_PRICE;
+                }
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offset = matches[matchNb].off;
+                    U32 const end = matches[matchNb].len;
+                    repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0);
+                    for ( ; pos <= end ; pos++ ) {
+                        U32 const matchPrice = literalsPrice + ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+                        DEBUGLOG(7, "rPos:%u => set initial price : %u",
+                                    pos, matchPrice);
+                        opt[pos].mlen = pos;
+                        opt[pos].off = offset;
+                        opt[pos].litlen = litlen;
+                        opt[pos].price = matchPrice;
+                        memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
+                }   }
+                last_pos = pos-1;
+            }
+        }
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+            const BYTE* const inr = ip + cur;
+            assert(cur < ZSTD_OPT_NUM);
+
+            /* Fix current position with one literal if cheaper */
+            {   U32 const litlen = (opt[cur-1].mlen == 1) ? opt[cur-1].litlen + 1 : 1;
+                int price;  /* note : contribution can be negative */
+                if (cur > litlen) {
+                    price = opt[cur - litlen].price + ZSTD_literalsContribution(inr-litlen, litlen, optStatePtr);
+                } else {
+                    price = ZSTD_literalsContribution_cached(&cachedLitPrice, anchor, litlen, optStatePtr);
+                }
+                assert(price < 1000000000); /* overflow check */
+                if (price <= opt[cur].price) {
+                    DEBUGLOG(7, "rPos:%u : better price (%u<%u) using literal",
+                                cur, price, opt[cur].price);
+                    opt[cur].mlen = 1;
+                    opt[cur].off = 0;
+                    opt[cur].litlen = litlen;
+                    opt[cur].price = price;
+                    memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep));
+            }   }
+
+            /* last match must start at a minimum distance of 8 from oend */
+            if (inr > ilimit) continue;
+
+            if (cur == last_pos) break;
+
+             if ( (optLevel==0) /*static*/
+               && (opt[cur+1].price <= opt[cur].price) )
+                continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+
+            {   U32 const ll0 = (opt[cur].mlen != 1);
+                U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
+                U32 const previousPrice = (cur > litlen) ? opt[cur-litlen].price : 0;
+                U32 const basePrice = previousPrice + ZSTD_fullLiteralsCost(inr-litlen, litlen, optStatePtr);
+                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, extDict, opt[cur].rep, ll0, matches, minMatch);
+                U32 matchNb;
+                if (!nbMatches) continue;
+
+                {   U32 const maxML = matches[nbMatches-1].len;
+                    DEBUGLOG(7, "rPos:%u, found %u matches, of maxLength=%u",
+                                cur, nbMatches, maxML);
+
+                    if ( (maxML > sufficient_len)
+                       | (cur + maxML >= ZSTD_OPT_NUM) ) {
+                        best_mlen = maxML;
+                        best_off = matches[nbMatches-1].off;
+                        last_pos = cur + 1;
+                        goto _shortestPath;
+                    }
+                }
+
+                /* set prices using matches found at position == cur */
+                for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+                    U32 const offset = matches[matchNb].off;
+                    repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0);
+                    U32 const lastML = matches[matchNb].len;
+                    U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+                    U32 mlen;
+
+                    DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u",
+                                matchNb, matches[matchNb].off, lastML, litlen);
+
+                    for (mlen = lastML; mlen >= startML; mlen--) {
+                        U32 const pos = cur + mlen;
+                        int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+
+                        if ((pos > last_pos) || (price < opt[pos].price)) {
+                            DEBUGLOG(7, "rPos:%u => new better price (%u<%u)",
+                                        pos, price, opt[pos].price);
+                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }
+                            opt[pos].mlen = mlen;
+                            opt[pos].off = offset;
+                            opt[pos].litlen = litlen;
+                            opt[pos].price = price;
+                            memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
+                        } else {
+                            if (optLevel==0) break;  /* gets ~+10% speed for about -0.01 ratio loss */
+                        }
+            }   }   }
+        }  /* for (cur = 1; cur <= last_pos; cur++) */
+
+        best_mlen = opt[last_pos].mlen;
+        best_off = opt[last_pos].off;
+        cur = last_pos - best_mlen;
+
+_shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
+        assert(opt[0].mlen == 1);
+
+        /* reverse traversal */
+        DEBUGLOG(7, "start reverse traversal (last_pos:%u, cur:%u)",
+                    last_pos, cur);
+        {   U32 selectedMatchLength = best_mlen;
+            U32 selectedOffset = best_off;
+            U32 pos = cur;
+            while (1) {
+                U32 const mlen = opt[pos].mlen;
+                U32 const off = opt[pos].off;
+                opt[pos].mlen = selectedMatchLength;
+                opt[pos].off = selectedOffset;
+                selectedMatchLength = mlen;
+                selectedOffset = off;
+                if (mlen > pos) break;
+                pos -= mlen;
+        }   }
+
+        /* save sequences */
+        {   U32 pos;
+            for (pos=0; pos < last_pos; ) {
+                U32 const llen = (U32)(ip - anchor);
+                U32 const mlen = opt[pos].mlen;
+                U32 const offset = opt[pos].off;
+                if (mlen == 1) { ip++; pos++; continue; }  /* literal position => move on */
+                pos += mlen; ip += mlen;
+
+                /* repcodes update : like ZSTD_updateRep(), but update in place */
+                if (offset >= ZSTD_REP_NUM) {  /* full offset */
+                    rep[2] = rep[1];
+                    rep[1] = rep[0];
+                    rep[0] = offset - ZSTD_REP_MOVE;
+                } else {   /* repcode */
+                    U32 const repCode = offset + (llen==0);
+                    if (repCode) {  /* note : if repCode==0, no change */
+                        U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+                        if (repCode >= 2) rep[2] = rep[1];
+                        rep[1] = rep[0];
+                        rep[0] = currentOffset;
+                    }
+                }
+
+                ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen);
+                ZSTD_storeSeq(seqStore, llen, anchor, offset, mlen-MINMATCH);
+                anchor = ip;
+        }   }
+        ZSTD_setLog2Prices(optStatePtr);
+    }   /* while (ip < ilimit) */
+
+    /* Return the last literals size */
+    return iend - anchor;
+}
+
+
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
+}
+
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
+}
+
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
+}
+
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_opt.h b/vendor/github.com/DataDog/zstd/zstd_opt.h
new file mode 100644
index 0000000..b8dc389
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_opt.h

@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "zstd_compress_internal.h"
+
+void ZSTD_updateTree(
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip, const BYTE* iend);  /* used in ZSTD_loadDictionaryContent() */
+
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_stream.go b/vendor/github.com/DataDog/zstd/zstd_stream.go
new file mode 100644
index 0000000..28dfd3e
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_stream.go

@@ -0,0 +1,269 @@
+package zstd
+
+/*
+#define ZSTD_STATIC_LINKING_ONLY
+#define ZBUFF_DISABLE_DEPRECATE_WARNINGS
+#include "zstd.h"
+#include "zbuff.h"
+*/
+import "C"
+import (
+	"fmt"
+	"io"
+	"unsafe"
+)
+
+// Writer is an io.WriteCloser that zstd-compresses its input.
+type Writer struct {
+	CompressionLevel int
+
+	ctx              *C.ZSTD_CCtx
+	dict             []byte
+	dstBuffer        []byte
+	firstError       error
+	underlyingWriter io.Writer
+}
+
+func resize(in []byte, newSize int) []byte {
+	if in == nil {
+		return make([]byte, newSize)
+	}
+	if newSize <= cap(in) {
+		return in[:newSize]
+	}
+	toAdd := newSize - len(in)
+	return append(in, make([]byte, toAdd)...)
+}
+
+// NewWriter creates a new Writer with default compression options.  Writes to
+// the writer will be written in compressed form to w.
+func NewWriter(w io.Writer) *Writer {
+	return NewWriterLevelDict(w, DefaultCompression, nil)
+}
+
+// NewWriterLevel is like NewWriter but specifies the compression level instead
+// of assuming default compression.
+//
+// The level can be DefaultCompression or any integer value between BestSpeed
+// and BestCompression inclusive.
+func NewWriterLevel(w io.Writer, level int) *Writer {
+	return NewWriterLevelDict(w, level, nil)
+
+}
+
+// NewWriterLevelDict is like NewWriterLevel but specifies a dictionary to
+// compress with.  If the dictionary is empty or nil it is ignored. The dictionary
+// should not be modified until the writer is closed.
+func NewWriterLevelDict(w io.Writer, level int, dict []byte) *Writer {
+	var err error
+	ctx := C.ZSTD_createCCtx()
+
+	if dict == nil {
+		err = getError(int(C.ZSTD_compressBegin(ctx,
+			C.int(level))))
+	} else {
+		err = getError(int(C.ZSTD_compressBegin_usingDict(
+			ctx,
+			unsafe.Pointer(&dict[0]),
+			C.size_t(len(dict)),
+			C.int(level))))
+	}
+
+	return &Writer{
+		CompressionLevel: level,
+		ctx:              ctx,
+		dict:             dict,
+		dstBuffer:        make([]byte, CompressBound(1024)),
+		firstError:       err,
+		underlyingWriter: w,
+	}
+}
+
+// Write writes a compressed form of p to the underlying io.Writer.
+func (w *Writer) Write(p []byte) (int, error) {
+	if w.firstError != nil {
+		return 0, w.firstError
+	}
+	if len(p) == 0 {
+		return 0, nil
+	}
+	// Check if dstBuffer is enough
+	if len(w.dstBuffer) < CompressBound(len(p)) {
+		w.dstBuffer = make([]byte, CompressBound(len(p)))
+	}
+
+	retCode := C.ZSTD_compressContinue(
+		w.ctx,
+		unsafe.Pointer(&w.dstBuffer[0]),
+		C.size_t(len(w.dstBuffer)),
+		unsafe.Pointer(&p[0]),
+		C.size_t(len(p)))
+
+	if err := getError(int(retCode)); err != nil {
+		return 0, err
+	}
+	written := int(retCode)
+
+	// Write to underlying buffer
+	_, err := w.underlyingWriter.Write(w.dstBuffer[:written])
+
+	// Same behaviour as zlib, we can't know how much data we wrote, only
+	// if there was an error
+	if err != nil {
+		return 0, err
+	}
+	return len(p), err
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying
+// io.Writer and freeing objects, but does not close the underlying io.Writer.
+func (w *Writer) Close() error {
+	retCode := C.ZSTD_compressEnd(
+		w.ctx,
+		unsafe.Pointer(&w.dstBuffer[0]),
+		C.size_t(len(w.dstBuffer)),
+		unsafe.Pointer(nil),
+		C.size_t(0))
+
+	if err := getError(int(retCode)); err != nil {
+		return err
+	}
+	written := int(retCode)
+	retCode = C.ZSTD_freeCCtx(w.ctx) // Safely close buffer before writing the end
+
+	if err := getError(int(retCode)); err != nil {
+		return err
+	}
+
+	_, err := w.underlyingWriter.Write(w.dstBuffer[:written])
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// reader is an io.ReadCloser that decompresses when read from.
+type reader struct {
+	ctx                 *C.ZBUFF_DCtx
+	compressionBuffer   []byte
+	compressionLeft     int
+	decompressionBuffer []byte
+	decompOff           int
+	decompSize          int
+	dict                []byte
+	firstError          error
+	recommendedSrcSize  int
+	underlyingReader    io.Reader
+}
+
+// NewReader creates a new io.ReadCloser.  Reads from the returned ReadCloser
+// read and decompress data from r.  It is the caller's responsibility to call
+// Close on the ReadCloser when done.  If this is not done, underlying objects
+// in the zstd library will not be freed.
+func NewReader(r io.Reader) io.ReadCloser {
+	return NewReaderDict(r, nil)
+}
+
+// NewReaderDict is like NewReader but uses a preset dictionary.  NewReaderDict
+// ignores the dictionary if it is nil.
+func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
+	var err error
+	ctx := C.ZBUFF_createDCtx()
+	if len(dict) == 0 {
+		err = getError(int(C.ZBUFF_decompressInit(ctx)))
+	} else {
+		err = getError(int(C.ZBUFF_decompressInitDictionary(
+			ctx,
+			unsafe.Pointer(&dict[0]),
+			C.size_t(len(dict)))))
+	}
+	cSize := int(C.ZBUFF_recommendedDInSize())
+	dSize := int(C.ZBUFF_recommendedDOutSize())
+	if cSize <= 0 {
+		panic(fmt.Errorf("ZBUFF_recommendedDInSize() returned invalid size: %v", cSize))
+	}
+	if dSize <= 0 {
+		panic(fmt.Errorf("ZBUFF_recommendedDOutSize() returned invalid size: %v", dSize))
+	}
+
+	compressionBuffer := make([]byte, cSize)
+	decompressionBuffer := make([]byte, dSize)
+	return &reader{
+		ctx:                 ctx,
+		dict:                dict,
+		compressionBuffer:   compressionBuffer,
+		decompressionBuffer: decompressionBuffer,
+		firstError:          err,
+		recommendedSrcSize:  cSize,
+		underlyingReader:    r,
+	}
+}
+
+// Close frees the allocated C objects
+func (r *reader) Close() error {
+	return getError(int(C.ZBUFF_freeDCtx(r.ctx)))
+}
+
+func (r *reader) Read(p []byte) (int, error) {
+
+	// If we already have enough bytes, return
+	if r.decompSize-r.decompOff >= len(p) {
+		copy(p, r.decompressionBuffer[r.decompOff:])
+		r.decompOff += len(p)
+		return len(p), nil
+	}
+
+	copy(p, r.decompressionBuffer[r.decompOff:r.decompSize])
+	got := r.decompSize - r.decompOff
+	r.decompSize = 0
+	r.decompOff = 0
+
+	for got < len(p) {
+		// Populate src
+		src := r.compressionBuffer
+		reader := r.underlyingReader
+		n, err := io.ReadFull(reader, src[r.compressionLeft:])
+		if err == io.EOF && r.compressionLeft == 0 {
+			return got, io.EOF
+		} else if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
+			return 0, fmt.Errorf("failed to read from underlying reader: %s", err)
+		}
+		src = src[:r.compressionLeft+n]
+
+		// C code
+		cSrcSize := C.size_t(len(src))
+		cDstSize := C.size_t(len(r.decompressionBuffer))
+		retCode := int(C.ZBUFF_decompressContinue(
+			r.ctx,
+			unsafe.Pointer(&r.decompressionBuffer[0]),
+			&cDstSize,
+			unsafe.Pointer(&src[0]),
+			&cSrcSize))
+
+		if err = getError(retCode); err != nil {
+			return 0, fmt.Errorf("failed to decompress: %s", err)
+		}
+
+		// Put everything in buffer
+		if int(cSrcSize) < len(src) {
+			left := src[int(cSrcSize):]
+			copy(r.compressionBuffer, left)
+		}
+		r.compressionLeft = len(src) - int(cSrcSize)
+		r.decompSize = int(cDstSize)
+		r.decompOff = copy(p[got:], r.decompressionBuffer[:r.decompSize])
+		got += r.decompOff
+
+		// Resize buffers
+		nsize := retCode // Hint for next src buffer size
+		if nsize <= 0 {
+			// Reset to recommended size
+			nsize = r.recommendedSrcSize
+		}
+		if nsize < r.compressionLeft {
+			nsize = r.compressionLeft
+		}
+		r.compressionBuffer = resize(r.compressionBuffer, nsize)
+	}
+	return got, nil
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_v01.c b/vendor/github.com/DataDog/zstd/zstd_v01.c
new file mode 100644
index 0000000..ae1cb2c
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v01.c

@@ -0,0 +1,2127 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v01.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+/* You can statically allocate Huff0 DTable as a table of unsigned short using below macro */
+#define HUF_DTABLE_SIZE_U16(maxTableLog)   (1 + (1<<maxTableLog))
+#define HUF_CREATE_STATIC_DTABLE(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE_U16(maxTableLog)] = { maxTableLog }
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define FSE_LIST_ERRORS(ITEM) \
+        ITEM(FSE_OK_NoError) ITEM(FSE_ERROR_GENERIC) \
+        ITEM(FSE_ERROR_tableLog_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooLarge) ITEM(FSE_ERROR_maxSymbolValue_tooSmall) \
+        ITEM(FSE_ERROR_dstSize_tooSmall) ITEM(FSE_ERROR_srcSize_wrong)\
+        ITEM(FSE_ERROR_corruptionDetected) \
+        ITEM(FSE_ERROR_maxCode)
+
+#define FSE_GENERATE_ENUM(ENUM) ENUM,
+typedef enum { FSE_LIST_ERRORS(FSE_GENERATE_ENUM) } FSE_errorCodes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+
+/******************************************
+*  FSE symbol compression API
+******************************************/
+/*
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   You will want to enable link-time-optimization to ensure these functions are properly inlined in your binary.
+   Visual seems to do it automatically.
+   For gcc or clang, you'll need to add -flto flag at compilation and linking stages.
+   If none of these solutions is applicable, include "fse.c" directly.
+*/
+
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+typedef struct
+{
+    size_t bitContainer;
+    int    bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} FSE_CStream_t;
+
+typedef struct
+{
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} FSE_DStream_t;
+
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+typedef enum { FSE_DStream_unfinished = 0,
+               FSE_DStream_endOfBuffer = 1,
+               FSE_DStream_completed = 2,
+               FSE_DStream_tooFar = 3 } FSE_DStream_status;  /* result of FSE_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... ?! */
+
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+#ifndef MEM_ACCESS_MODULE
+#define MEM_ACCESS_MODULE
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+typedef  uint8_t BYTE;
+typedef uint16_t U16;
+typedef  int16_t S16;
+typedef uint32_t U32;
+typedef  int32_t S32;
+typedef uint64_t U64;
+typedef  int64_t S64;
+#else
+typedef unsigned char       BYTE;
+typedef unsigned short      U16;
+typedef   signed short      S16;
+typedef unsigned int        U32;
+typedef   signed int        S32;
+typedef unsigned long long  U64;
+typedef   signed long long  S64;
+#endif
+
+#endif   /* MEM_ACCESS_MODULE */
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* FSE_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef FSE_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define FSE_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define FSE_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+
+static unsigned FSE_32bits(void)
+{
+    return sizeof(void*)==4;
+}
+
+static unsigned FSE_isLittleEndian(void)
+{
+    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==2)
+
+static U16 FSE_read16(const void* memPtr) { return *(const U16*) memPtr; }
+static U32 FSE_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+static U16 FSE_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static U32 FSE_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static U64 FSE_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+#endif // FSE_FORCE_MEMORY_ACCESS
+
+static U16 FSE_readLE16(const void* memPtr)
+{
+    if (FSE_isLittleEndian())
+        return FSE_read16(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+static U32 FSE_readLE32(const void* memPtr)
+{
+    if (FSE_isLittleEndian())
+        return FSE_read32(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+
+static U64 FSE_readLE64(const void* memPtr)
+{
+    if (FSE_isLittleEndian())
+        return FSE_read64(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+static size_t FSE_readLEST(const void* memPtr)
+{
+    if (FSE_32bits())
+        return (size_t)FSE_readLE32(memPtr);
+    else
+        return (size_t)FSE_readLE64(memPtr);
+}
+
+
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef struct
+{
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+/****************************************************************
+*  Internal functions
+****************************************************************/
+FORCE_INLINE unsigned FSE_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (GCC_VERSION >= 304)   /* GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+
+static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
+
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+static size_t FSE_buildDTable
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)(ptr) + 1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_maxSymbolValue_tooLarge;
+    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge;
+
+    /* Init, lay down lowprob symbols */
+    DTableH[0].tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return (size_t)-FSE_ERROR_GENERIC;   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - FSE_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH->fastMode = (U16)noLarge;
+    return 0;
+}
+
+
+/******************************************
+*  FSE byte symbol
+******************************************/
+#ifndef FSE_COMMONDEFS_ONLY
+
+static unsigned FSE_isError(size_t code) { return (code > (size_t)(-FSE_ERROR_maxCode)); }
+
+static short FSE_abs(short a)
+{
+    return a<0? -a : a;
+}
+
+
+/****************************************************************
+*  Header bitstream management
+****************************************************************/
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return (size_t)-FSE_ERROR_srcSize_wrong;
+    bitStream = FSE_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = FSE_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall;
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = FSE_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = FSE_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+
+/* FSE_initDStream
+ * Initialize a FSE_DStream_t.
+ * srcBuffer must point at the beginning of an FSE block.
+ * The function result is the size of the FSE_block (== srcSize).
+ * If srcSize is too small, the function will return an errorCode;
+ */
+static size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    if (srcSize >=  sizeof(size_t))
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
+    }
+    else
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+            default:;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+
+/*!FSE_lookBits
+ * Provides next n bits from the bitContainer.
+ * bitContainer is not modified (bits are still present for next read/look)
+ * On 32-bits, maxNbBits==25
+ * On 64-bits, maxNbBits==57
+ * return : value extracted.
+ */
+static size_t FSE_lookBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+static size_t FSE_lookBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+static void FSE_skipBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+
+/*!FSE_readBits
+ * Read next n bits from the bitContainer.
+ * On 32-bits, don't read more than maxNbBits==25
+ * On 64-bits, don't read more than maxNbBits==57
+ * Use the fast variant *only* if n >= 1.
+ * return : value extracted.
+ */
+static size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = FSE_lookBits(bitD, nbBits);
+    FSE_skipBits(bitD, nbBits);
+    return value;
+}
+
+static size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{
+    size_t value = FSE_lookBitsFast(bitD, nbBits);
+    FSE_skipBits(bitD, nbBits);
+    return value;
+}
+
+static unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return FSE_DStream_tooFar;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        return FSE_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_endOfBuffer;
+        return FSE_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        U32 result = FSE_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = FSE_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+
+static void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = FSE_readBits(bitD, DTableH->tableLog);
+    FSE_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+static BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = FSE_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+static BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = FSE_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/* FSE_endOfDStream
+   Tells if bitD has reached end of bitStream or not */
+
+static unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
+{
+    return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8));
+}
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    FSE_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (FSE_reloadDStream(&bitD)==FSE_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (FSE_reloadDStream(&bitD) > FSE_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : FSE_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly FSE_DStream_completed */
+    while (1)
+    {
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));   /* memcpy() into local variable, to avoid strict aliasing warning */
+
+    /* select fast mode (static) */
+    if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+/* *******************************************************
+*  Huff0 : Huffman block compression
+*********************************************************/
+#define HUF_MAX_SYMBOL_VALUE 255
+#define HUF_DEFAULT_TABLELOG  12       /* used by default, when not specified */
+#define HUF_MAX_TABLELOG  12           /* max possible tableLog; for allocation purpose; can be modified */
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+typedef struct HUF_CElt_s {
+  U16  val;
+  BYTE nbBits;
+} HUF_CElt ;
+
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* *******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct {
+    BYTE byte;
+    BYTE nbBits;
+} HUF_DElt;
+
+static size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];  /* large enough for values from 0 to 16 */
+    U32 weightTotal;
+    U32 maxBits;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;
+    HUF_DElt* const dt = (HUF_DElt*)ptr;
+
+    if (!srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+    iSize = ip[0];
+
+    FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* should not be necessary, but some analyzer complain ... */
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, sizeof(huffWeight));
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+        oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize);   /* max 255 values decoded, last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankVal, 0, sizeof(rankVal));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_corruptionDetected;
+        rankVal[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;
+    }
+    if (weightTotal == 0) return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    maxBits = FSE_highbit32(weightTotal) + 1;
+    if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_tableLog_tooLarge;   /* DTable is too small */
+    DTable[0] = (U16)maxBits;
+    {
+        U32 total = 1 << maxBits;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << FSE_highbit32(rest);
+        U32 lastWeight = FSE_highbit32(rest) + 1;
+        if (verif != rest) return (size_t)-FSE_ERROR_corruptionDetected;    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankVal[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankVal[1] < 2) || (rankVal[1] & 1)) return (size_t)-FSE_ERROR_corruptionDetected;   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=maxBits; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<=oSize; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DElt D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(maxBits + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize+1;
+}
+
+
+static BYTE HUF_decodeSymbol(FSE_DStream_t* Dstream, const HUF_DElt* dt, const U32 dtLog)
+{
+        const size_t val = FSE_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        FSE_skipBits(Dstream, dt[val].nbBits);
+        return c;
+}
+
+static size_t HUF_decompress_usingDTable(   /* -3% slower when non static */
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-15;
+
+    const void* ptr = DTable;
+    const HUF_DElt* const dt = (const HUF_DElt*)(ptr)+1;
+    const U32 dtLog = DTable[0];
+    size_t errorCode;
+    U32 reloadStatus;
+
+    /* Init */
+
+    const U16* jumpTable = (const U16*)cSrc;
+    const size_t length1 = FSE_readLE16(jumpTable);
+    const size_t length2 = FSE_readLE16(jumpTable+1);
+    const size_t length3 = FSE_readLE16(jumpTable+2);
+    const size_t length4 = cSrcSize - 6 - length1 - length2 - length3;   // check coherency !!
+    const char* const start1 = (const char*)(cSrc) + 6;
+    const char* const start2 = start1 + length1;
+    const char* const start3 = start2 + length2;
+    const char* const start4 = start3 + length3;
+    FSE_DStream_t bitD1, bitD2, bitD3, bitD4;
+
+    if (length1+length2+length3+6 >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    errorCode = FSE_initDStream(&bitD1, start1, length1);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD2, start2, length2);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD3, start3, length3);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD4, start4, length4);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    reloadStatus=FSE_reloadDStream(&bitD2);
+
+    /* 16 symbols per loop */
+    for ( ; (reloadStatus<FSE_DStream_completed) && (op<olimit);  /* D2-3-4 are supposed to be synchronized and finish together */
+        op+=16, reloadStatus = FSE_reloadDStream(&bitD2) | FSE_reloadDStream(&bitD3) | FSE_reloadDStream(&bitD4), FSE_reloadDStream(&bitD1))
+    {
+#define HUF_DECODE_SYMBOL_0(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog);
+
+#define HUF_DECODE_SYMBOL_1(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits() && (HUF_MAX_TABLELOG>12)) FSE_reloadDStream(&Dstream)
+
+#define HUF_DECODE_SYMBOL_2(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits()) FSE_reloadDStream(&Dstream)
+
+        HUF_DECODE_SYMBOL_1( 0, bitD1);
+        HUF_DECODE_SYMBOL_1( 1, bitD2);
+        HUF_DECODE_SYMBOL_1( 2, bitD3);
+        HUF_DECODE_SYMBOL_1( 3, bitD4);
+        HUF_DECODE_SYMBOL_2( 4, bitD1);
+        HUF_DECODE_SYMBOL_2( 5, bitD2);
+        HUF_DECODE_SYMBOL_2( 6, bitD3);
+        HUF_DECODE_SYMBOL_2( 7, bitD4);
+        HUF_DECODE_SYMBOL_1( 8, bitD1);
+        HUF_DECODE_SYMBOL_1( 9, bitD2);
+        HUF_DECODE_SYMBOL_1(10, bitD3);
+        HUF_DECODE_SYMBOL_1(11, bitD4);
+        HUF_DECODE_SYMBOL_0(12, bitD1);
+        HUF_DECODE_SYMBOL_0(13, bitD2);
+        HUF_DECODE_SYMBOL_0(14, bitD3);
+        HUF_DECODE_SYMBOL_0(15, bitD4);
+    }
+
+    if (reloadStatus!=FSE_DStream_completed)   /* not complete : some bitStream might be FSE_DStream_unfinished */
+        return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* tail */
+    {
+        // bitTail = bitD1;   // *much* slower : -20% !??!
+        FSE_DStream_t bitTail;
+        bitTail.ptr = bitD1.ptr;
+        bitTail.bitsConsumed = bitD1.bitsConsumed;
+        bitTail.bitContainer = bitD1.bitContainer;   // required in case of FSE_DStream_endOfBuffer
+        bitTail.start = start1;
+        for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op<omax) ; op++)
+        {
+            HUF_DECODE_SYMBOL_0(0, bitTail);
+        }
+
+        if (FSE_endOfDStream(&bitTail))
+            return op-ostart;
+    }
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+static size_t HUF_decompress (void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLE(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUF_readDTable (DTable, cSrc, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUF_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, DTable);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/****************************************************************
+*  Tuning parameters
+*****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect */
+#define ZSTD_MEMORY_USAGE 17
+
+
+/**************************************
+   CPU Feature Detection
+**************************************/
+/*
+ * Automated efficient unaligned memory access detection
+ * Based on known hardware architectures
+ * This list will be updated thanks to feedbacks
+ */
+#if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \
+    || defined(__ARM_FEATURE_UNALIGNED) \
+    || defined(__i386__) || defined(__x86_64__) \
+    || defined(_M_IX86) || defined(_M_X64) \
+    || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__) \
+    || (defined(_M_ARM) && (_M_ARM >= 7))
+#  define ZSTD_UNALIGNED_ACCESS 1
+#else
+#  define ZSTD_UNALIGNED_ACCESS 0
+#endif
+
+
+/********************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/********************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+#ifndef MEM_ACCESS_MODULE
+#define MEM_ACCESS_MODULE
+/********************************************************
+*  Basic Types
+*********************************************************/
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+typedef  uint8_t BYTE;
+typedef uint16_t U16;
+typedef  int16_t S16;
+typedef uint32_t U32;
+typedef  int32_t S32;
+typedef uint64_t U64;
+#else
+typedef unsigned char       BYTE;
+typedef unsigned short      U16;
+typedef   signed short      S16;
+typedef unsigned int        U32;
+typedef   signed int        S32;
+typedef unsigned long long  U64;
+#endif
+
+#endif   /* MEM_ACCESS_MODULE */
+
+
+/********************************************************
+*  Constants
+*********************************************************/
+static const U32 ZSTD_magicNumber = 0xFD2FB51E;   /* 3rd version : seqNb header */
+
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
+#define HASH_TABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+#define KNUTH 2654435761
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+#define WORKPLACESIZE (BLOCKSIZE*3)
+#define MINMATCH 4
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits )-1)
+#define MaxLL  ((1<<LLbits )-1)
+#define MaxOff ((1<<Offbits)-1)
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/********************************************************
+*  Memory operations
+*********************************************************/
+static unsigned ZSTD_32bits(void) { return sizeof(void*)==4; }
+
+static unsigned ZSTD_isLittleEndian(void)
+{
+    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+static U16    ZSTD_read16(const void* p) { U16 r; memcpy(&r, p, sizeof(r)); return r; }
+
+static U32    ZSTD_read32(const void* p) { U32 r; memcpy(&r, p, sizeof(r)); return r; }
+
+static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s)    { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    while (op < oend) COPY8(op, ip);
+}
+
+static U16 ZSTD_readLE16(const void* memPtr)
+{
+    if (ZSTD_isLittleEndian()) return ZSTD_read16(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)((U16)p[0] + ((U16)p[1]<<8));
+    }
+}
+
+
+static U32 ZSTD_readLE32(const void* memPtr)
+{
+    if (ZSTD_isLittleEndian())
+        return ZSTD_read32(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+static U32 ZSTD_readBE32(const void* memPtr)
+{
+    const BYTE* p = (const BYTE*)memPtr;
+    return (U32)(((U32)p[0]<<24) + ((U32)p[1]<<16) + ((U32)p[2]<<8) + ((U32)p[3]<<0));
+}
+
+
+/**************************************
+*  Local structures
+***************************************/
+typedef struct ZSTD_Cctx_s ZSTD_Cctx;
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+typedef struct ZSTD_Cctx_s
+{
+    const BYTE* base;
+    U32 current;
+    U32 nextUpdate;
+    seqStore_t seqStore;
+#ifdef __AVX2__
+    __m256i hashTable[HASH_TABLESIZE>>3];
+#else
+    U32 hashTable[HASH_TABLESIZE];
+#endif
+    BYTE buffer[WORKPLACESIZE];
+} cctxi_t;
+
+
+
+
+/**************************************
+*  Error Management
+**************************************/
+/* published entry point */
+unsigned ZSTDv01_isError(size_t code) { return ERR_isError(code); }
+
+
+/**************************************
+*  Tool functions
+**************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    1    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  3    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+/**************************************************************
+*   Decompression code
+**************************************************************/
+
+size_t ZSTDv01_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    BYTE headerFlags;
+    U32 cSize;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    headerFlags = *in;
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+
+    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+static size_t ZSTD_decompressLiterals(void* ctx,
+                                      void* dst, size_t maxDstSize,
+                                const void* src, size_t srcSize)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + maxDstSize;
+    const BYTE* ip = (const BYTE*)src;
+    size_t errorCode;
+    size_t litSize;
+
+    /* check : minimum 2, for litSize, +1, for content */
+    if (srcSize <= 3) return ERROR(corruption_detected);
+
+    litSize = ip[1] + (ip[0]<<8);
+    litSize += ((ip[-3] >> 3) & 7) << 16;   // mmmmh....
+    op = oend - litSize;
+
+    (void)ctx;
+    if (litSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2);
+    if (FSE_isError(errorCode)) return ERROR(GENERIC);
+    return litSize;
+}
+
+
+size_t ZSTDv01_decodeLiteralsBlock(void* ctx,
+                                void* dst, size_t maxDstSize,
+                          const BYTE** litStart, size_t* litSize,
+                          const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    blockProperties_t litbp;
+
+    size_t litcSize = ZSTDv01_getcBlockSize(src, srcSize, &litbp);
+    if (ZSTDv01_isError(litcSize)) return litcSize;
+    if (litcSize > srcSize - ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    ip += ZSTD_blockHeaderSize;
+
+    switch(litbp.blockType)
+    {
+    case bt_raw:
+        *litStart = ip;
+        ip += litcSize;
+        *litSize = litcSize;
+        break;
+    case bt_rle:
+        {
+            size_t rleSize = litbp.origSize;
+            if (rleSize>maxDstSize) return ERROR(dstSize_tooSmall);
+            if (!srcSize) return ERROR(srcSize_wrong);
+            memset(oend - rleSize, *ip, rleSize);
+            *litStart = oend - rleSize;
+            *litSize = rleSize;
+            ip++;
+            break;
+        }
+    case bt_compressed:
+        {
+            size_t decodedLitSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize);
+            if (ZSTDv01_isError(decodedLitSize)) return decodedLitSize;
+            *litStart = oend - decodedLitSize;
+            *litSize = decodedLitSize;
+            ip += litcSize;
+            break;
+        }
+    case bt_end:
+    default:
+        return ERROR(GENERIC);
+    }
+
+    return ip-istart;
+}
+
+
+size_t ZSTDv01_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = ZSTD_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++); break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
+
+typedef struct {
+    FSE_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+    const BYTE* dumpsEnd;
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        U32 add = dumps<de ? *dumps++ : 0;
+        if (add < 255) litLength += add;
+        else
+        {
+            if (dumps<=(de-3))
+            {
+                litLength = ZSTD_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+                dumps += 3;
+            }
+        }
+    }
+
+    /* Offset */
+    {
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = ((size_t)1 << (nbBits & ((sizeof(offset)*8)-1))) + FSE_readBits(&(seqState->DStream), nbBits);
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        U32 add = dumps<de ? *dumps++ : 0;
+        if (add < 255) matchLength += add;
+        else
+        {
+            if (dumps<=(de-3))
+            {
+                matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+                dumps += 3;
+            }
+        }
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
+    const BYTE* const ostart = op;
+    const size_t litLength = sequence.litLength;
+    BYTE* const endMatch = op + litLength + sequence.matchLength;    /* risk : address space overflow (32-bits) */
+    const BYTE* const litEnd = *litPtr + litLength;
+
+    /* check */
+    if (endMatch > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);
+    if (sequence.matchLength > (size_t)(*litPtr-op))  return ERROR(dstSize_tooSmall);    /* overwrite literal segment */
+
+    /* copy Literals */
+    if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8))
+        memmove(op, *litPtr, litLength);   /* overwrite risk */
+    else
+        ZSTD_wildcopy(op, *litPtr, litLength);
+    op += litLength;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* check : last match must be at a minimum distance of 8 from end of dest buffer */
+    if (oend-op < 8) return ERROR(dstSize_tooSmall);
+
+    /* copy Match */
+    {
+        const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12);
+        const BYTE* match = op - sequence.offset;            /* possible underflow at op - offset ? */
+        size_t qutt = 12;
+        U64 saved[2];
+
+        /* check */
+        if (match < base) return ERROR(corruption_detected);
+        if (sequence.offset > (size_t)base) return ERROR(corruption_detected);
+
+        /* save beginning of literal sequence, in case of write overlap */
+        if (overlapRisk)
+        {
+            if ((endMatch + qutt) > oend) qutt = oend-endMatch;
+            memcpy(saved, endMatch, qutt);
+        }
+
+        if (sequence.offset < 8)
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        } else { ZSTD_copy8(op, match); }
+        op += 8; match += 8;
+
+        if (endMatch > oend-(16-MINMATCH))
+        {
+            if (op < oend-8)
+            {
+                ZSTD_wildcopy(op, match, (oend-8) - op);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<endMatch) *op++ = *match++;
+        }
+        else
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+
+        /* restore, in case of overlap */
+        if (overlapRisk) memcpy(endMatch, saved, qutt);
+    }
+
+    return endMatch-ostart;
+}
+
+typedef struct ZSTDv01_Dctx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    void* previousDstEnd;
+    void* base;
+    size_t expected;
+    blockType_t bType;
+    U32 phase;
+} dctx_t;
+
+
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize,
+                         const BYTE* litStart, size_t litSize)
+{
+    dctx_t* dctx = (dctx_t*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = litStart;
+    const BYTE* const litEnd = litStart + litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTDv01_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTDv01_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 1;
+        errorCode = FSE_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (FSE_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (FSE_reloadDStream(&(seqState.DStream)) <= FSE_DStream_completed) && (nbSeq>0) ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTDv01_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !FSE_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed, srcSize is trusted */
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* litPtr = NULL;
+    size_t litSize = 0;
+    size_t errorCode;
+
+    /* Decode literals sub-block */
+    errorCode = ZSTDv01_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, &litSize, src, srcSize);
+    if (ZSTDv01_isError(errorCode)) return errorCode;
+    ip += errorCode;
+    srcSize -= errorCode;
+
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, srcSize, litPtr, litSize);
+}
+
+
+size_t ZSTDv01_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    size_t errorCode=0;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = ZSTD_readBE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t blockSize = ZSTDv01_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv01_isError(blockSize)) return blockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (blockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            errorCode = ZSTD_decompressBlock(ctx, op, oend-op, ip, blockSize);
+            break;
+        case bt_raw :
+            errorCode = ZSTD_copyUncompressedBlock(op, oend-op, ip, blockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        if (blockSize == 0) break;   /* bt_end */
+
+        if (ZSTDv01_isError(errorCode)) return errorCode;
+        op += errorCode;
+        ip += blockSize;
+        remainingSize -= blockSize;
+    }
+
+    return op-ostart;
+}
+
+size_t ZSTDv01_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    dctx_t ctx;
+    ctx.base = dst;
+    return ZSTDv01_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
+}
+
+size_t ZSTDv01_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = ZSTD_readBE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t blockSize = ZSTDv01_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTDv01_isError(blockSize)) return blockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (blockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (blockSize == 0) break;   /* bt_end */
+
+        ip += blockSize;
+        remainingSize -= blockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize;
+    dctx->phase = 0;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    return 0;
+}
+
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void)
+{
+    ZSTDv01_Dctx* dctx = (ZSTDv01_Dctx*)malloc(sizeof(ZSTDv01_Dctx));
+    if (dctx==NULL) return NULL;
+    ZSTDv01_resetDCtx(dctx);
+    return dctx;
+}
+
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx)
+{
+    return ((dctx_t*)dctx)->expected;
+}
+
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    dctx_t* ctx = (dctx_t*)dctx;
+
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = ZSTD_readBE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTDv01_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTDv01_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)
+        {
+            ctx->expected = 0;
+            ctx->phase = 0;
+        }
+        else
+        {
+            ctx->expected = blockSize;
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
+        return rSize;
+    }
+
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_v01.h b/vendor/github.com/DataDog/zstd/zstd_v01.h
new file mode 100644
index 0000000..42f0897
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v01.h

@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V01_H_28739879432
+#define ZSTD_V01_H_28739879432
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv01_getFrameSrcSize() : get the source length of a ZSTD frame compliant with v0.1.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error
+*/
+unsigned ZSTDv01_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx;
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void);
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx);
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv01_magicNumber   0xFD2FB51E   /* Big Endian version */
+#define ZSTDv01_magicNumberLE 0x1EB52FFD   /* Little Endian version */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V01_H_28739879432 */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v02.c b/vendor/github.com/DataDog/zstd/zstd_v02.c
new file mode 100644
index 0000000..8bc0ece
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v02.c

@@ -0,0 +1,3483 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v02.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif // MEM_FORCE_MEMORY_ACCESS
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian())
+    {
+        MEM_write16(memPtr, val);
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(size_t))   /* normal case */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+    }
+    else
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+            default:;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBitsFast(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   Error codes and messages
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#elif defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define PREFIX(name) ZSTD_error_##name
+
+#define ERROR(name) (size_t)-PREFIX(name)
+
+#define ERROR_LIST(ITEM) \
+        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
+        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
+        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
+        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
+        ITEM(PREFIX(maxCode))
+
+#define ERROR_GENERATE_ENUM(ENUM) ENUM,
+typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+#define ERROR_CONVERTTOSTRING(STRING) #STRING,
+#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)
+static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) };
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    static const char* codeError = "Unspecified error code";
+    if (ERR_isError(code)) return ERR_strings[-(int)(code)];
+    return codeError;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/*
+Constructor and Destructor of type FSE_CTable
+    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/******************************************
+*  FSE advanced API
+******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+
+/******************************************
+*  FSE symbol decompression API
+******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+
+/******************************************
+*  FSE unsafe API
+******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/******************************************
+*  Implementation of inline functions
+******************************************/
+
+/* decompression */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Static allocation macros
+******************************************/
+/* Huff0 buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of Huff0's DTable */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/******************************************
+*  Advanced functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* quad-symbols decoder */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/*
+    zstd - standard compression library
+    Header File
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Version
+***************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    2    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type */
+
+#if defined (__cplusplus)
+}
+#endif
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Streaming functions
+***************************************/
+
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+
+/*
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTD_magicNumber 0xFD2FB522   /* v0.2 (current)*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+#define FSE_DECODE_TYPE FSE_decode_t
+
+static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
+
+static size_t FSE_buildDTable
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* ptr = dt+1;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr;
+    FSE_DTableHeader DTableH;
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));   /* memcpy(), to avoid strict aliasing warnings */
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+static unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSE_abs(short a)
+{
+    return (short)(a<0 ? -a : a);
+}
+
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1;   /* because dt is unsigned */
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+
+    /* select fast mode (static) */
+    if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/******************************************
+*  Helper functions
+******************************************/
+static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*********************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable
+    @huffWeight : destination buffer
+    @return : size read from `src`
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)ptr;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+        const size_t val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        BIT_skipBits(Dstream, dt[val].nbBits);
+        return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+    {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUF_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUF_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }
+}
+
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+    void* ptr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        {if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else
+    {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+        {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        }
+    }
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
+    {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/**********************************/
+/* quad-symbol decoding           */
+/**********************************/
+typedef struct { BYTE nbBits; BYTE nbBytes; } HUF_DDescX6;
+typedef union { BYTE byte[4]; U32 sequence; } HUF_DSeqX6;
+
+/* recursive, up to level 3; may benefit from <template>-like strategy to nest each level inline */
+static void HUF_fillDTableX6LevelN(HUF_DDescX6* DDescription, HUF_DSeqX6* DSequence, int sizeLog,
+                           const rankVal_t rankValOrigin, const U32 consumed, const int minWeight, const U32 maxWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, const U32* rankStart,
+                           const U32 nbBitsBaseline, HUF_DSeqX6 baseSeq, HUF_DDescX6 DDesc)
+{
+    const int scaleLog = nbBitsBaseline - sizeLog;   /* note : targetLog >= (nbBitsBaseline-1), hence scaleLog <= 1 */
+    const int minBits  = nbBitsBaseline - maxWeight;
+    const U32 level = DDesc.nbBytes;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 symbolStartPos, s;
+
+    /* local rankVal, will be modified */
+    memcpy(rankVal, rankValOrigin[consumed], sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i;
+        const U32 skipSize = rankVal[minWeight];
+        for (i = 0; i < skipSize; i++)
+        {
+            DSequence[i] = baseSeq;
+            DDescription[i] = DDesc;
+        }
+    }
+
+    /* fill DTable */
+    DDesc.nbBytes++;
+    symbolStartPos = rankStart[minWeight];
+    for (s=symbolStartPos; s<sortedListSize; s++)
+    {
+        const BYTE symbol = sortedSymbols[s].symbol;
+        const U32  weight = sortedSymbols[s].weight;   /* >= 1 (sorted) */
+        const int  nbBits = nbBitsBaseline - weight;   /* >= 1 (by construction) */
+        const int  totalBits = consumed+nbBits;
+        const U32  start  = rankVal[weight];
+        const U32  length = 1 << (sizeLog-nbBits);
+        baseSeq.byte[level] = symbol;
+        DDesc.nbBits = (BYTE)totalBits;
+
+        if ((level<3) && (sizeLog-totalBits >= minBits))   /* enough room for another symbol */
+        {
+            int nextMinWeight = totalBits + scaleLog;
+            if (nextMinWeight < 1) nextMinWeight = 1;
+            HUF_fillDTableX6LevelN(DDescription+start, DSequence+start, sizeLog-nbBits,
+                           rankValOrigin, totalBits, nextMinWeight, maxWeight,
+                           sortedSymbols, sortedListSize, rankStart,
+                           nbBitsBaseline, baseSeq, DDesc);   /* recursive (max : level 3) */
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            for (i = start; i < end; i++)
+            {
+                DDescription[i] = DDesc;
+                DSequence[i] = baseSeq;
+            }
+        }
+        rankVal[weight] += length;
+    }
+}
+
+
+/* note : same preparation as X4 */
+static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    rankVal_t rankVal;
+    const U32 memLog = DTable[0];
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+
+    /* fill tables */
+    {
+        void* ptr = DTable+1;
+        HUF_DDescX6* DDescription = (HUF_DDescX6*)(ptr);
+        void* dSeqStart = DTable + 1 + ((size_t)1<<(memLog-1));
+        HUF_DSeqX6* DSequence = (HUF_DSeqX6*)(dSeqStart);
+        HUF_DSeqX6 DSeq;
+        HUF_DDescX6 DDesc;
+        DSeq.sequence = 0;
+        DDesc.nbBits = 0;
+        DDesc.nbBytes = 0;
+        HUF_fillDTableX6LevelN(DDescription, DSequence, memLog,
+                       (const U32 (*)[HUF_ABSOLUTEMAX_TABLELOG + 1])rankVal, 0, 1, maxW,
+                       sortedSymbol, sizeOfSort, rankStart0,
+                       tableLog+1, DSeq, DDesc);
+    }
+
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX6(void* op, BIT_DStream_t* DStream, const HUF_DDescX6* dd, const HUF_DSeqX6* ds, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, ds+val, sizeof(HUF_DSeqX6));
+    BIT_skipBits(DStream, dd[val].nbBits);
+    return dd[val].nbBytes;
+}
+
+static U32 HUF_decodeLastSymbolsX6(void* op, const U32 maxL, BIT_DStream_t* DStream,
+                                  const HUF_DDescX6* dd, const HUF_DSeqX6* ds, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    U32 length = dd[val].nbBytes;
+    if (length <= maxL)
+    {
+        memcpy(op, ds+val, length);
+        BIT_skipBits(DStream, dd[val].nbBits);
+        return length;
+    }
+    memcpy(op, ds+val, maxL);
+    if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+    {
+        BIT_skipBits(DStream, dd[val].nbBits);
+        if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+            DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }
+    return maxL;
+}
+
+
+#define HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX6(ptr, DStreamPtr, dd, ds, dtLog)
+
+#define HUF_DECODE_SYMBOLX6_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX6_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX6_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX6(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const U32* DTable, const U32 dtLog)
+{
+    const void* ddPtr = DTable+1;
+    const HUF_DDescX6* dd = (const HUF_DDescX6*)(ddPtr);
+    const void* dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+    const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(dsPtr);
+    BYTE* const pStart = p;
+
+    /* up to 16 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-16))
+    {
+        HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
+    }
+
+    /* closer to the end, up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);
+
+    while (p <= pEnd-4)
+        HUF_DECODE_SYMBOLX6_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    while (p < pEnd)
+        p += HUF_decodeLastSymbolsX6(p, (U32)(pEnd-p), bitDPtr, dd, ds, dtLog);
+
+    return p-pStart;
+}
+
+
+
+static size_t HUF_decompress4X6_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const U32 dtLog = DTable[0];
+        const void* ddPtr = DTable+1;
+        const HUF_DDescX6* dd = (const HUF_DDescX6*)(ddPtr);
+        const void* dsPtr = DTable + 1 + ((size_t)1<<(dtLog-1));
+        const HUF_DSeqX6* ds = (const HUF_DSeqX6*)(dsPtr);
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-64 symbols per loop (4-16 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (op3 <= opStart4) && (endSignal==BIT_DStream_unfinished) && (op4<=(oend-16)) ; )
+        {
+            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX6_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX6_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX6_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX6_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX6(op1, &bitD1, opStart2, DTable, dtLog);
+        HUF_decodeStreamX6(op2, &bitD2, opStart3, DTable, dtLog);
+        HUF_decodeStreamX6(op3, &bitD3, opStart4, DTable, dtLog);
+        HUF_decodeStreamX6(op4, &bitD4, oend,     DTable, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX6(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX6 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUF_decompress4X6_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUF_decompress4X2, HUF_decompress4X4, HUF_decompress4X6 };
+    /* estimate decompression time */
+    U32 Q;
+    const U32 D256 = (U32)(dstSize >> 8);
+    U32 Dtime[3];
+    U32 algoNb = 0;
+    int n;
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* decoder timing evaluation */
+    Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    for (n=0; n<3; n++)
+        Dtime[n] = algoTime[Q][n].tableTime + (algoTime[Q][n].decode256Time * D256);
+
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3; /* advantage to algorithms using less memory, for cache eviction */
+
+    if (Dtime[1] < Dtime[0]) algoNb = 1;
+    if (Dtime[2] < Dtime[algoNb]) algoNb = 2;
+
+    return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+
+    //return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUF_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+    //return HUF_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+*  MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*/
+#define ZSTD_MEMORY_USAGE 17
+
+/*!
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0, fastest), or in memory heap (1, requires malloc())
+ * Note that compression context is fairly large, as a consequence heap memory is recommended.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif /* ZSTD_HEAPMODE */
+
+/*!
+*  LEGACY_SUPPORT :
+*  decompressor can decode older formats (starting from Zstd 0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 1
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/* *******************************************************
+*  Constants
+*********************************************************/
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
+#define HASH_TABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+#define KNUTH 2654435761
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define WORKPLACESIZE (BLOCKSIZE*3)
+#define MINMATCH 4
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits )-1)
+#define MaxLL  ((1<<LLbits )-1)
+#define MaxOff   31
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do COPY8(op, ip) while (op < oend);
+}
+
+
+/* **************************************
+*  Local structures
+****************************************/
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTD_isError
+*   tells if a return value is an error code */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+struct ZSTD_DCtx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    void* previousDstEnd;
+    void* base;
+    size_t expected;
+    blockType_t bType;
+    U32 phase;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+};   /* typedef'd to ZSTD_Dctx within "zstd_static.h" */
+
+
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    BYTE headerFlags;
+    U32 cSize;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    headerFlags = *in;
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+
+    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/** ZSTD_decompressLiterals
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    const size_t litSize = (MEM_readLE32(src) & 0x1FFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+    const size_t litCSize = (MEM_readLE32(ip+2) & 0xFFFFFF) >> 5;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+
+    if (litSize > *maxDstSizePtr) return ERROR(corruption_detected);
+    if (litCSize + 5 > srcSize) return ERROR(corruption_detected);
+
+    if (HUF_isError(HUF_decompress(dst, litSize, ip+5, litCSize))) return ERROR(corruption_detected);
+
+    *maxDstSizePtr = litSize;
+    return litCSize + 5;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    @return : nb of bytes read from src (< srcSize )*/
+static size_t ZSTD_decodeLiteralsBlock(void* ctx,
+                          const void* src, size_t srcSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* const istart = (const BYTE* const)src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    default:
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;
+        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    }
+}
+
+
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+    const BYTE* dumpsEnd;
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else
+        {
+            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const size_t offsetPrefix[MaxOff+1] = {  /* note : size_t faster than U32 */
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else
+        {
+            matchLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
+    const BYTE* const ostart = op;
+    BYTE* const oLitEnd = op + sequence.litLength;
+    BYTE* const oMatchEnd = op + sequence.litLength + sequence.matchLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+
+    /* checks */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* overRead beyond lit buffer */
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    {
+        const BYTE* match = op - sequence.offset;
+
+        /* check */
+        if (sequence.offset > (size_t)op) return ERROR(corruption_detected);   /* address space overflow test (this test seems kept by clang optimizer) */
+        //if (match > op) return ERROR(corruption_detected);   /* address space overflow test (is clang optimizer removing this test ?) */
+        if (match < base) return ERROR(corruption_detected);
+
+        /* close range match, overlap */
+        if (sequence.offset < 8)
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        }
+        else
+        {
+            ZSTD_copy8(op, match);
+        }
+        op += 8; match += 8;
+
+        if (oMatchEnd > oend-(16-MINMATCH))
+        {
+            if (op < oend_8)
+            {
+                ZSTD_wildcopy(op, match, oend_8 - op);
+                match += oend_8 - op;
+                op = oend_8;
+            }
+            while (op < oMatchEnd) *op++ = *match++;
+        }
+        else
+        {
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+        }
+    }
+
+    return oMatchEnd - ostart;
+}
+
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 1;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (nbSeq>0) ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    /* Decode literals sub-block */
+    size_t litCSize = ZSTD_decodeLiteralsBlock(ctx, src, srcSize);
+    if (ZSTD_isError(litCSize)) return litCSize;
+    ip += litCSize;
+    srcSize -= litCSize;
+
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, srcSize);
+}
+
+
+static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyUncompressedBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+static size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    ZSTD_DCtx ctx;
+    ctx.base = dst;
+    return ZSTD_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
+}
+
+static size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize;
+    dctx->phase = 0;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    return 0;
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTD_resetDCtx(dctx);
+    return dctx;
+}
+
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = MEM_readLE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTD_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)
+        {
+            ctx->expected = 0;
+            ctx->phase = 0;
+        }
+        else
+        {
+            ctx->expected = blockSize;
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
+        return rSize;
+    }
+
+}
+
+
+/* wrapper layer */
+
+unsigned ZSTDv02_isError(size_t code)
+{
+    return ZSTD_isError(code);
+}
+
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize)
+{
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+}
+
+size_t ZSTDv02_findFrameCompressedSize(const void *src, size_t compressedSize)
+{
+    return ZSTD_findFrameCompressedSize(src, compressedSize);
+}
+
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void)
+{
+    return (ZSTDv02_Dctx*)ZSTD_createDCtx();
+}
+
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_v02.h b/vendor/github.com/DataDog/zstd/zstd_v02.h
new file mode 100644
index 0000000..0dde7a6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v02.h

@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V02_H_4174539423
+#define ZSTD_V02_H_4174539423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv02_getFrameSrcSize() : get the source length of a ZSTD frame compliant with v0.2.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv02_isError())
+*/
+size_t ZSTDv02_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error
+*/
+unsigned ZSTDv02_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx;
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void);
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx);
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv02_magicNumber 0xFD2FB522   /* v0.2 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V02_H_4174539423 */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v03.c b/vendor/github.com/DataDog/zstd/zstd_v03.c
new file mode 100644
index 0000000..54445af
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v03.c

@@ -0,0 +1,3124 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include "zstd_v03.h"
+#include "error_private.h"
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+
+#endif // MEM_FORCE_MEMORY_ACCESS
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian())
+    {
+        MEM_write16(memPtr, val);
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(size_t))   /* normal case */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+    }
+    else
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+            default:;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBitsFast(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   Error codes and messages
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#elif defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/******************************************
+*  Error Management
+******************************************/
+#define PREFIX(name) ZSTD_error_##name
+
+#define ERROR(name) (size_t)-PREFIX(name)
+
+#define ERROR_LIST(ITEM) \
+        ITEM(PREFIX(No_Error)) ITEM(PREFIX(GENERIC)) \
+        ITEM(PREFIX(dstSize_tooSmall)) ITEM(PREFIX(srcSize_wrong)) \
+        ITEM(PREFIX(prefix_unknown)) ITEM(PREFIX(corruption_detected)) \
+        ITEM(PREFIX(tableLog_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooLarge)) ITEM(PREFIX(maxSymbolValue_tooSmall)) \
+        ITEM(PREFIX(maxCode))
+
+#define ERROR_GENERATE_ENUM(ENUM) ENUM,
+typedef enum { ERROR_LIST(ERROR_GENERATE_ENUM) } ERR_codes;  /* enum is exposed, to detect & handle specific errors; compare function result to -enum value */
+
+#define ERROR_CONVERTTOSTRING(STRING) #STRING,
+#define ERROR_GENERATE_STRING(EXPR) ERROR_CONVERTTOSTRING(EXPR)
+static const char* ERR_strings[] = { ERROR_LIST(ERROR_GENERATE_STRING) };
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    static const char* codeError = "Unspecified error code";
+    if (ERR_isError(code)) return ERR_strings[-(int)(code)];
+    return codeError;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/*
+Constructor and Destructor of type FSE_CTable
+    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/******************************************
+*  Static allocation
+******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/******************************************
+*  FSE advanced API
+******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+
+/******************************************
+*  FSE symbol decompression API
+******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+
+/******************************************
+*  FSE unsafe API
+******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/******************************************
+*  Implementation of inline functions
+******************************************/
+
+/* decompression */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Static allocation macros
+******************************************/
+/* Huff0 buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of Huff0's DTable */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/******************************************
+*  Advanced functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/*
+    zstd - standard compression library
+    Header File
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Version
+***************************************/
+#define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
+#define ZSTD_VERSION_MINOR    2    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;   /* incomplete type */
+
+#if defined (__cplusplus)
+}
+#endif
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Streaming functions
+***************************************/
+
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+
+/*
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTD_magicNumber 0xFD2FB523   /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/****************************************************************
+*  Tuning parameters
+****************************************************************/
+/* MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/* FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/****************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/****************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+
+/****************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+#define FSE_DECODE_TYPE FSE_decode_t
+
+static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
+
+static size_t FSE_buildDTable
+(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* ptr = dt+1;
+    FSE_DTableHeader DTableH;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*)ptr;
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+static unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSE_abs(short a)
+{
+    return a<0 ? -a : a;
+}
+
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const cell = (FSE_decode_t*)(ptr) + 1;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)(ptr) + 1;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+
+    /* select fast mode (static) */
+    if (DTableH.fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/****************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+/****************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+/****************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/******************************************
+*  Helper functions
+******************************************/
+static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*********************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable
+    @huffWeight : destination buffer
+    @return : size read from `src`
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* ptr = DTable+1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)(ptr);
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+        const size_t val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        BIT_skipBits(Dstream, dt[val].nbBits);
+        return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+    {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUF_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUF_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }
+}
+
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize = ip[0];
+    void* ptr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)ptr) + 1;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else
+    {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+        {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        }
+    }
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
+    {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+
+        const void* ptr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)ptr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUF_decompress4X2, HUF_decompress4X4, NULL };
+    /* estimate decompression time */
+    U32 Q;
+    const U32 D256 = (U32)(dstSize >> 8);
+    U32 Dtime[3];
+    U32 algoNb = 0;
+    int n;
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* decoder timing evaluation */
+    Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    for (n=0; n<3; n++)
+        Dtime[n] = algoTime[Q][n].tableTime + (algoTime[Q][n].decode256Time * D256);
+
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3; /* advantage to algorithms using less memory, for cache eviction */
+
+    if (Dtime[1] < Dtime[0]) algoNb = 1;
+
+    return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+
+    //return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUF_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+    //return HUF_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+*  MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*/
+#define ZSTD_MEMORY_USAGE 17
+
+/*!
+ * HEAPMODE :
+ * Select how default compression functions will allocate memory for their hash table,
+ * in memory stack (0, fastest), or in memory heap (1, requires malloc())
+ * Note that compression context is fairly large, as a consequence heap memory is recommended.
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif /* ZSTD_HEAPMODE */
+
+/*!
+*  LEGACY_SUPPORT :
+*  decompressor can decode older formats (starting from Zstd 0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 1
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef __AVX2__
+#  include <immintrin.h>   /* AVX2 intrinsics */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#else
+#  define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#endif
+
+
+/* *******************************************************
+*  Constants
+*********************************************************/
+#define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
+#define HASH_TABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASH_TABLESIZE - 1)
+
+#define KNUTH 2654435761
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define WORKPLACESIZE (BLOCKSIZE*3)
+#define MINMATCH 4
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits )-1)
+#define MaxLL  ((1<<LLbits )-1)
+#define MaxOff   31
+#define LitFSELog  11
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define LITERAL_NOENTROPY 63
+#define COMMAND_NOENTROPY 7   /* to remove */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize = 4;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void   ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+static void   ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do COPY8(op, ip) while (op < oend);
+}
+
+
+/* **************************************
+*  Local structures
+****************************************/
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+} seqStore_t;
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTD_isError
+*   tells if a return value is an error code */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+struct ZSTD_DCtx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    void* previousDstEnd;
+    void* base;
+    size_t expected;
+    blockType_t bType;
+    U32 phase;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+};   /* typedef'd to ZSTD_Dctx within "zstd_static.h" */
+
+
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    BYTE headerFlags;
+    U32 cSize;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    headerFlags = *in;
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+
+    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/** ZSTD_decompressLiterals
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    const size_t litSize = (MEM_readLE32(src) & 0x1FFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+    const size_t litCSize = (MEM_readLE32(ip+2) & 0xFFFFFF) >> 5;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+
+    if (litSize > *maxDstSizePtr) return ERROR(corruption_detected);
+    if (litCSize + 5 > srcSize) return ERROR(corruption_detected);
+
+    if (HUF_isError(HUF_decompress(dst, litSize, ip+5, litCSize))) return ERROR(corruption_detected);
+
+    *maxDstSizePtr = litSize;
+    return litCSize + 5;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    @return : nb of bytes read from src (< srcSize )*/
+static size_t ZSTD_decodeLiteralsBlock(void* ctx,
+                          const void* src, size_t srcSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* const istart = (const BYTE* const)src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    default:
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;
+        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    }
+}
+
+
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+    const BYTE* dumpsEnd;
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else
+        {
+            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const size_t offsetPrefix[MaxOff+1] = {  /* note : size_t faster than U32 */
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else
+        {
+            matchLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+        if (dumps >= de) dumps = de-1;   /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+                                seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
+    const BYTE* const ostart = op;
+    BYTE* const oLitEnd = op + sequence.litLength;
+    BYTE* const oMatchEnd = op + sequence.litLength + sequence.matchLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+
+    /* checks */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* overRead beyond lit buffer */
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    {
+        const BYTE* match = op - sequence.offset;
+
+        /* check */
+        if (sequence.offset > (size_t)op) return ERROR(corruption_detected);   /* address space overflow test (this test seems kept by clang optimizer) */
+        //if (match > op) return ERROR(corruption_detected);   /* address space overflow test (is clang optimizer removing this test ?) */
+        if (match < base) return ERROR(corruption_detected);
+
+        /* close range match, overlap */
+        if (sequence.offset < 8)
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        }
+        else
+        {
+            ZSTD_copy8(op, match);
+        }
+        op += 8; match += 8;
+
+        if (oMatchEnd > oend-(16-MINMATCH))
+        {
+            if (op < oend_8)
+            {
+                ZSTD_wildcopy(op, match, oend_8 - op);
+                match += oend_8 - op;
+                op = oend_8;
+            }
+            while (op < oMatchEnd) *op++ = *match++;
+        }
+        else
+        {
+            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+        }
+    }
+
+    return oMatchEnd - ostart;
+}
+
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    BYTE* const base = (BYTE*) (dctx->base);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = sequence.offset = 4;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (nbSeq>0) ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* requested too much : data is corrupted */
+        if (nbSeq<0) return ERROR(corruption_detected);   /* requested too many sequences : data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    /* Decode literals sub-block */
+    size_t litCSize = ZSTD_decodeLiteralsBlock(ctx, src, srcSize);
+    if (ZSTD_isError(litCSize)) return litCSize;
+    ip += litCSize;
+    srcSize -= litCSize;
+
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, srcSize);
+}
+
+
+static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyUncompressedBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+static size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    ZSTD_DCtx ctx;
+    ctx.base = dst;
+    return ZSTD_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
+}
+
+static size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    U32 magicNumber;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+
+/*******************************
+*  Streaming Decompression API
+*******************************/
+
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize;
+    dctx->phase = 0;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    return 0;
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTD_resetDCtx(dctx);
+    return dctx;
+}
+
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    if (dst != ctx->previousDstEnd)  /* not contiguous */
+        ctx->base = dst;
+
+    /* Decompress : frame header */
+    if (ctx->phase == 0)
+    {
+        /* Check frame magic header */
+        U32 magicNumber = MEM_readLE32(src);
+        if (magicNumber != ZSTD_magicNumber) return ERROR(prefix_unknown);
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        return 0;
+    }
+
+    /* Decompress : block header */
+    if (ctx->phase == 1)
+    {
+        blockProperties_t bp;
+        size_t blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+        if (ZSTD_isError(blockSize)) return blockSize;
+        if (bp.blockType == bt_end)
+        {
+            ctx->expected = 0;
+            ctx->phase = 0;
+        }
+        else
+        {
+            ctx->expected = blockSize;
+            ctx->bType = bp.blockType;
+            ctx->phase = 2;
+        }
+
+        return 0;
+    }
+
+    /* Decompress : block content */
+    {
+        size_t rSize;
+        switch(ctx->bType)
+        {
+        case bt_compressed:
+            rSize = ZSTD_decompressBlock(ctx, dst, maxDstSize, src, srcSize);
+            break;
+        case bt_raw :
+            rSize = ZSTD_copyUncompressedBlock(dst, maxDstSize, src, srcSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet handled */
+            break;
+        case bt_end :   /* should never happen (filtered at phase 1) */
+            rSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);
+        }
+        ctx->phase = 1;
+        ctx->expected = ZSTD_blockHeaderSize;
+        ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
+        return rSize;
+    }
+
+}
+
+
+/* wrapper layer */
+
+unsigned ZSTDv03_isError(size_t code)
+{
+    return ZSTD_isError(code);
+}
+
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize)
+{
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+}
+
+size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    return ZSTD_findFrameCompressedSize(src, srcSize);
+}
+
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void)
+{
+    return (ZSTDv03_Dctx*)ZSTD_createDCtx();
+}
+
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+}
+
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_v03.h b/vendor/github.com/DataDog/zstd/zstd_v03.h
new file mode 100644
index 0000000..b4449e2
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v03.h

@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V03_H_298734209782
+#define ZSTD_V03_H_298734209782
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv03_getFrameSrcSize() : get the source length of a ZSTD frame compliant with v0.3.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv03_isError())
+*/
+size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+    /**
+ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error
+*/
+unsigned ZSTDv03_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx;
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void);
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_decompressDCtx(void* ctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+/* *************************************
+*  Streaming functions
+***************************************/
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx);
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv03_magicNumber 0xFD2FB523   /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V03_H_298734209782 */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v04.c b/vendor/github.com/DataDog/zstd/zstd_v04.c
new file mode 100644
index 0000000..fb6d1d4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v04.c

@@ -0,0 +1,3677 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include "zstd_v04.h"
+#include "error_private.h"
+
+
+/* ******************************************************************
+   mem.h
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/******************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/******************************************
+*  Compiler-specific
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/****************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+#  include <assert.h>
+#else
+#  ifndef assert
+#    define assert(condition) ((void)0)
+#  endif
+#endif
+
+#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
+
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+#  include <stdio.h>
+extern int g_debuglog_enable;
+/* recommended values for ZSTD_DEBUG display levels :
+ * 1 : no display, enables assert() only
+ * 2 : reserved for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (*very* verbose) */
+#  define RAWLOG(l, ...) {                                      \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                    \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+                    fprintf(stderr, " \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
+/****************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets generating assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif // MEM_FORCE_MEMORY_ACCESS
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian())
+    {
+        MEM_write16(memPtr, val);
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+*/
+#ifndef ZSTD_STATIC_H
+#define ZSTD_STATIC_H
+
+/* The objects defined into this file shall be considered experimental.
+ * They are not considered stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risks of future changes.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Types
+***************************************/
+#define ZSTD_WINDOWLOG_MAX 26
+#define ZSTD_WINDOWLOG_MIN 18
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 11
+#define ZSTD_CONTENTLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CONTENTLOG_MIN 4
+#define ZSTD_HASHLOG_MAX 28
+#define ZSTD_HASHLOG_MIN 4
+#define ZSTD_SEARCHLOG_MAX (ZSTD_CONTENTLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_SEARCHLENGTH_MAX 7
+#define ZSTD_SEARCHLENGTH_MIN 4
+
+/** from faster to stronger */
+typedef enum { ZSTD_fast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2 } ZSTD_strategy;
+
+typedef struct
+{
+    U64 srcSize;       /* optional : tells how much bytes are present in the frame. Use 0 if not known. */
+    U32 windowLog;     /* largest match distance : larger == more compression, more memory needed during decompression */
+    U32 contentLog;    /* full search segment : larger == more compression, slower, more memory (useless for fast) */
+    U32 hashLog;       /* dispatch table : larger == more memory, faster */
+    U32 searchLog;     /* nb of searches : larger == more compression, slower */
+    U32 searchLength;  /* size of matches : larger == faster decompression, sometimes less compression */
+    ZSTD_strategy strategy;
+} ZSTD_parameters;
+
+typedef ZSTDv04_Dctx ZSTD_DCtx;
+
+/* *************************************
+*  Advanced functions
+***************************************/
+/** ZSTD_decompress_usingDict
+*   Same as ZSTD_decompressDCtx, using a Dictionary content as prefix
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTD_decompressDCtx() */
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                             void* dst, size_t maxDstSize,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/* **************************************
+*  Streaming functions (direct mode)
+****************************************/
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx);
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize);
+static void   ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* src, size_t srcSize);
+
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+
+/**
+  Streaming decompression, bufferless mode
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times. Use ZSTD_resetDCtx() to return to fresh status.
+
+  First operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  This function doesn't consume its input. It needs enough input data to properly decode the frame header.
+  Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
+  Result : 0 when successful, it means the ZSTD_parameters structure has been filled.
+           >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTD_isError() (For example, if it's not a ZSTD header)
+
+  Then, you can optionally insert a dictionary.
+  This operation must mimic the compressor behavior, otherwise decompression will fail or be corrupted.
+
+  Then it's possible to start decompression.
+  Use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+#endif  /* ZSTD_STATIC_H */
+
+
+/*
+    zstd_internal - common functions to include
+    Header File for include
+*/
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/* *************************************
+*  Common constants
+***************************************/
+#define ZSTD_MAGICNUMBER 0xFD2FB524   /* v0.4 */
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+static const size_t ZSTD_blockHeaderSize = 3;
+static const size_t ZSTD_frameHeaderSize_min = 5;
+#define ZSTD_frameHeaderSize_max 5         /* define, for static allocation */
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define IS_RAW BIT0
+#define IS_RLE BIT1
+
+#define MINMATCH 4
+#define REPCODE_STARTVALUE 4
+
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxML  ((1<<MLbits) - 1)
+#define MaxLL  ((1<<LLbits) - 1)
+#define MaxOff ((1<<Offbits)- 1)
+#define MLFSELog   10
+#define LLFSELog   10
+#define OffFSELog   9
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define MIN_SEQUENCES_SIZE (2 /*seqNb*/ + 2 /*dumps*/ + 3 /*seqTables*/ + 1 /*bitStream*/)
+#define MIN_CBLOCK_SIZE (3 /*litCSize*/ + MIN_SEQUENCES_SIZE)
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+
+/* ******************************************
+*  Shared functions to include for inlining
+********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy : custom version of memcpy(), can copy up to 7-8 bytes too many */
+static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file
+****************************************************************** */
+#ifndef FSE_H
+#define FSE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/* *****************************************
+*  FSE simple functions
+******************************************/
+static size_t FSE_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSE_isError()
+
+    ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/* *****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+
+
+
+/* *****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*!
+FSE_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSE_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+static  size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*!
+Constructor and Destructor of type FSE_DTable
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*!
+FSE_buildDTable():
+   Builds 'dt', which must be already allocated, using FSE_createDTable()
+   return : 0,
+            or an errorCode, which can be tested using FSE_isError() */
+static size_t FSE_buildDTable ( FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*!
+FSE_decompress_usingDTable():
+   Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt'
+   into 'dst' which must be already allocated.
+   return : size of regenerated data (necessarily <= maxDstSize)
+            or an errorCode, which can be tested using FSE_isError() */
+static  size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable().
+'cSrcSize' must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=maxDstSize).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_H */
+
+
+/* ******************************************************************
+   bitstream
+   Part of NewGen Entropy library
+   header file (to include)
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/**********************************************
+*  bitStream decompression API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+
+
+/******************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/****************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+/**********************************************************
+* bitStream decoding
+**********************************************************/
+
+/*!BIT_initDStream
+*  Initialize a BIT_DStream_t.
+*  @bitD : a pointer to an already allocated BIT_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(size_t))   /* normal case */
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+    }
+    else
+    {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BIT_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*!BIT_readBitsFast :
+*  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BIT_lookBitsFast(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
+    {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)
+    {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+
+
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSE_STATIC_H
+#define FSE_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/* build a fake FSE_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/* build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+/* decompression */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    DStatePtr->state = BIT_readBits(bitD, DTableH.tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_STATIC_H */
+
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSE_MAX_MEMORY_USAGE 14
+#define FSE_DEFAULT_MEMORY_USAGE 13
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSE_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+
+/*-**************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+static U32 FSE_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
+
+
+static size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_DTableHeader DTableH;
+    void* const tdPtr = dt+1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSE_tableStep(tableSize);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (normalizedCounter[s]==-1)
+        {
+            tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        }
+        else
+        {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+        }
+    }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++)
+        {
+            tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }
+    }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++)
+        {
+            FSE_FUNCTION_TYPE symbol = (FSE_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+        }
+    }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+/******************************************
+*  FSE helper functions
+******************************************/
+static unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+
+
+/****************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSE_abs(short a)
+{
+    return a<0 ? -a : a;
+}
+
+static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)
+        {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)
+            {
+                n0+=24;
+                if (ip < iend-5)
+                {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+static size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1)
+    {
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (BIT_reloadDStream(&bitD)>BIT_DStream_completed) || (op==omax) || (BIT_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BIT_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+static size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    FSE_DTableHeader DTableH;
+    U32 fastMode;
+
+    memcpy(&DTableH, dt, sizeof(DTableH));
+    fastMode = DTableH.fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_H
+#define HUFF0_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependency
+******************************************/
+#include <stddef.h>    /* size_t */
+
+
+/* ****************************************
+*  Huff0 simple functions
+******************************************/
+static size_t HUF_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+HUF_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    'dstSize' must be the exact size of original (uncompressed) data.
+    Note : in contrast with FSE, HUF_decompress can regenerate RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUF_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+static unsigned    HUF_isError(size_t code);        /* tells if a return value is an error code */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFF0_H */
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef HUFF0_STATIC_H
+#define HUFF0_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Static allocation macros
+******************************************/
+/* static allocation of Huff0's DTable */
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))  /* nb Cells; use unsigned short for X2, unsigned int for X4 */
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUF_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUF_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+/* ****************************************
+*  Huff0 detailed API
+******************************************/
+/*!
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. build Huffman table from save, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+
+*/
+static size_t HUF_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+static size_t HUF_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+static size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+static size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUFF0_STATIC_H */
+
+
+
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_DEFAULT_TABLELOG  HUF_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUF_MAX_SYMBOL_VALUE 255
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+static unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+
+/*-*******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUF_readStats
+    Read compact Huffman tree, saved by HUF_writeCTable
+    @huffWeight : destination buffer
+    @return : size read from `src`
+*/
+static size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BIT_highbit32(weightTotal) + 1;
+    if (tableLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BIT_highbit32(rest);
+        U32 lastWeight = BIT_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/**************************/
+/* single-symbol decoding */
+/**************************/
+
+static size_t HUF_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof DTable, as allocated, from used size of DTable, in case of DTable re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+        const size_t val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        BIT_skipBits(Dstream, dt[val].nbBits);
+        return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4))
+    {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+
+static size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX2* const dt = ((const HUF_DEltX2*)dtPtr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUF_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUF_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/***************************/
+/* double-symbols decoding */
+/***************************/
+
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1)
+    {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)   /* note : sortedSymbols already skipped */
+    {
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }
+}
+
+typedef U32 rankVal_t[HUF_ABSOLUTEMAX_TABLELOG][HUF_ABSOLUTEMAX_TABLELOG + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++)
+    {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits)   /* enough room for a second symbol */
+        {
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        }
+        else
+        {
+            U32 i;
+            const U32 end = start + length;
+            HUF_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUF_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUF_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUF_DEltX4* const dt = ((HUF_DEltX4*)dtPtr) + 1;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUF_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--)
+        { if (!maxW) return ERROR(GENERIC); }  /* necessarily finds a solution before maxW==0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++)
+        {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++)
+        {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++)
+        {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++)
+            {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+            }
+        }
+    }
+
+    HUF_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else
+    {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8))
+        {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+        }
+    }
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_MAX_TABLELOG<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd-7))
+    {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+static size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUF_DEltX4* const dt = ((const HUF_DEltX4*)dtPtr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BIT_initDStream(&bitD1, istart1, length1);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD2, istart2, length2);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD3, istart3, length3);
+        if (HUF_isError(errorCode)) return errorCode;
+        errorCode = BIT_initDStream(&bitD4, istart4, length4);
+        if (HUF_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; )
+        {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/**********************************/
+/* Generic decompression selector */
+/**********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUF_decompress4X2, HUF_decompress4X4, NULL };
+    /* estimate decompression time */
+    U32 Q;
+    const U32 D256 = (U32)(dstSize >> 8);
+    U32 Dtime[3];
+    U32 algoNb = 0;
+    int n;
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* decoder timing evaluation */
+    Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    for (n=0; n<3; n++)
+        Dtime[n] = algoTime[Q][n].tableTime + (algoTime[Q][n].decode256Time * D256);
+
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3; /* advantage to algorithms using less memory, for cache eviction */
+
+    if (Dtime[1] < Dtime[0]) algoNb = 1;
+
+    return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+
+    //return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUF_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+    //return HUF_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
+}
+
+
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
+
+
+/*
+    zstd - decompression module fo v0.4 legacy format
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif
+
+
+/* *******************************************************
+*  Includes
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug : printf */
+
+
+/* *******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/* *************************************
+*  Local types
+***************************************/
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/* *************************************
+*  Error Management
+***************************************/
+
+/*! ZSTD_isError
+*   tells if a return value is an error code */
+static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+
+/* *************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock } ZSTD_dStage;
+
+struct ZSTDv04_Dctx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;
+    size_t headerSize;
+    ZSTD_parameters params;
+    blockType_t bType;
+    ZSTD_dStage stage;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + 8 /* margin for wildcopy */];
+    BYTE headerBuffer[ZSTD_frameHeaderSize_max];
+};  /* typedef'd to ZSTD_DCtx within "zstd_static.h" */
+
+static size_t ZSTD_resetDCtx(ZSTD_DCtx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize_min;
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    return 0;
+}
+
+static ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    ZSTD_DCtx* dctx = (ZSTD_DCtx*)malloc(sizeof(ZSTD_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTD_resetDCtx(dctx);
+    return dctx;
+}
+
+static size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    free(dctx);
+    return 0;
+}
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+/** ZSTD_decodeFrameHeader_Part1
+*   decode the 1st part of the Frame Header, which tells Frame Header size.
+*   srcSize must be == ZSTD_frameHeaderSize_min
+*   @return : the full size of the Frame Header */
+static size_t ZSTD_decodeFrameHeader_Part1(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    zc->headerSize = ZSTD_frameHeaderSize_min;
+    return zc->headerSize;
+}
+
+
+static size_t ZSTD_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize < ZSTD_frameHeaderSize_min) return ZSTD_frameHeaderSize_max;
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    memset(params, 0, sizeof(*params));
+    params->windowLog = (((const BYTE*)src)[4] & 15) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+    if ((((const BYTE*)src)[4] >> 4) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits */
+    return 0;
+}
+
+/** ZSTD_decodeFrameHeader_Part2
+*   decode the full Frame Header
+*   srcSize must be the size provided by ZSTD_decodeFrameHeader_Part1
+*   @return : 0, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader_Part2(ZSTD_DCtx* zc, const void* src, size_t srcSize)
+{
+    size_t result;
+    if (srcSize != zc->headerSize) return ERROR(srcSize_wrong);
+    result = ZSTD_getFrameParams(&(zc->params), src, srcSize);
+    if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupported);
+    return result;
+}
+
+
+static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    BYTE headerFlags;
+    U32 cSize;
+
+    if (srcSize < 3) return ERROR(srcSize_wrong);
+
+    headerFlags = *in;
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+
+    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/** ZSTD_decompressLiterals
+    @return : nb of bytes read from src, or an error code*/
+static size_t ZSTD_decompressLiterals(void* dst, size_t* maxDstSizePtr,
+                                const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    const size_t litSize = (MEM_readLE32(src) & 0x1FFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+    const size_t litCSize = (MEM_readLE32(ip+2) & 0xFFFFFF) >> 5;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+
+    if (litSize > *maxDstSizePtr) return ERROR(corruption_detected);
+    if (litCSize + 5 > srcSize) return ERROR(corruption_detected);
+
+    if (HUF_isError(HUF_decompress(dst, litSize, ip+5, litCSize))) return ERROR(corruption_detected);
+
+    *maxDstSizePtr = litSize;
+    return litCSize + 5;
+}
+
+
+/** ZSTD_decodeLiteralsBlock
+    @return : nb of bytes read from src (< srcSize ) */
+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(*istart & 3)
+    {
+    /* compressed */
+    case 0:
+        {
+            size_t litSize = BLOCKSIZE;
+            const size_t readSize = ZSTD_decompressLiterals(dctx->litBuffer, &litSize, src, srcSize);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, 8);
+            return readSize;   /* works if it's an error too */
+        }
+    case IS_RAW:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
+            {
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+3;
+            dctx->litSize = litSize;
+            return litSize+3;        }
+    case IS_RLE:
+        {
+            const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[3], litSize + 8);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return 4;
+        }
+    default:
+        return ERROR(corruption_detected);   /* forbidden nominal case */
+    }
+}
+
+
+static size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
+                         const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < 5) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = MEM_readLE16(ip); ip+=2;
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2)
+    {
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    }
+    else
+    {
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL >= MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case bt_rle :
+            LLlog = 0;
+            FSE_buildDTable_rle(DTableLL, *ip++); break;
+        case bt_raw :
+            LLlog = LLbits;
+            FSE_buildDTable_raw(DTableLL, LLbits); break;
+        default :
+            {   U32 max = MaxLL;
+                headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case bt_rle :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case bt_raw :
+            Offlog = Offbits;
+            FSE_buildDTable_raw(DTableOffb, Offbits); break;
+        default :
+            {   U32 max = MaxOff;
+                headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case bt_rle :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSE_buildDTable_rle(DTableML, *ip++); break;
+        case bt_raw :
+            MLlog = MLbits;
+            FSE_buildDTable_raw(DTableML, MLbits); break;
+        default :
+            {   U32 max = MaxML;
+                headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSE_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSELog) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSE_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+    const BYTE* dumpsEnd;
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    if (litLength == MaxLL) {
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else {
+            litLength = dumps[0] + (dumps[1]<<8) + (dumps[2]<<16);
+            dumps += 3;
+        }
+        if (dumps > de) { litLength = MaxLL+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {   static const U32 offsetPrefix[MaxOff+1] = {
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));   /* <= maxOff, by table construction */
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BIT_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BIT_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* cmove */
+        if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML) {
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else {
+            matchLength = dumps[0] + (dumps[1]<<8) + (dumps[2]<<16);
+            dumps += 3;
+        }
+        if (dumps > de) { matchLength = MaxML+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+    static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    const size_t sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTD_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base))
+    {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd)
+        {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {
+            size_t length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+        }
+    }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        const int sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH))
+    {
+        if (op < oend_8)
+        {
+            ZSTD_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    }
+    else
+    {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+static size_t ZSTD_decompressSequences(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, iend-ip);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = 4;
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = 4;
+        errorCode = BIT_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; )
+        {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if ( !BIT_endOfDStream(&(seqState.DStream)) ) return ERROR(corruption_detected);   /* DStream should be entirely and exactly consumed; otherwise data is corrupted */
+
+        /* last literal segment */
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (litPtr > litEnd) return ERROR(corruption_detected);
+            if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+            if (op != litPtr) memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
+
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd)   /* not contiguous */
+    {
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    /* Decode literals sub-block */
+    size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTD_isError(litCSize)) return litCSize;
+    ip += litCSize;
+    srcSize -= litCSize;
+
+    return ZSTD_decompressSequences(dctx, dst, maxDstSize, ip, srcSize);
+}
+
+
+static size_t ZSTD_decompress_usingDict(ZSTD_DCtx* ctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* init */
+    ZSTD_resetDCtx(ctx);
+    if (dict)
+    {
+        ZSTD_decompress_insertDictionary(ctx, dict, dictSize);
+        ctx->dictEnd = ctx->previousDstEnd;
+        ctx->vBase = (const char*)dst - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
+        ctx->base = dst;
+    }
+    else
+    {
+        ctx->vBase = ctx->base = ctx->dictEnd = dst;
+    }
+
+    /* Frame Header */
+    {
+        size_t frameHeaderSize;
+        if (srcSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+        frameHeaderSize = ZSTD_decodeFrameHeader_Part2(ctx, src, frameHeaderSize);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock_internal(ctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+static size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) return ERROR(prefix_unknown);
+    ip += ZSTD_frameHeaderSize_min; remainingSize -= ZSTD_frameHeaderSize_min;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/* ******************************
+*  Streaming Decompression API
+********************************/
+static size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != ctx->expected) return ERROR(srcSize_wrong);
+    ZSTD_checkContinuity(ctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (ctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        /* get frame header size */
+        if (srcSize != ZSTD_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        ctx->headerSize = ZSTD_decodeFrameHeader_Part1(ctx, src, ZSTD_frameHeaderSize_min);
+        if (ZSTD_isError(ctx->headerSize)) return ctx->headerSize;
+        memcpy(ctx->headerBuffer, src, ZSTD_frameHeaderSize_min);
+        if (ctx->headerSize > ZSTD_frameHeaderSize_min) return ERROR(GENERIC);   /* impossible */
+        ctx->expected = 0;   /* not necessary to copy more */
+        /* fallthrough */
+    case ZSTDds_decodeFrameHeader:
+        /* get frame header */
+        {   size_t const result = ZSTD_decodeFrameHeader_Part2(ctx, ctx->headerBuffer, ctx->headerSize);
+            if (ZSTD_isError(result)) return result;
+            ctx->expected = ZSTD_blockHeaderSize;
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        /* Decode block header */
+        {   blockProperties_t bp;
+            size_t const blockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(blockSize)) return blockSize;
+            if (bp.blockType == bt_end)
+            {
+                ctx->expected = 0;
+                ctx->stage = ZSTDds_getFrameHeaderSize;
+            }
+            else
+            {
+                ctx->expected = blockSize;
+                ctx->bType = bp.blockType;
+                ctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {
+            /* Decompress : block content */
+            size_t rSize;
+            switch(ctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTD_decompressBlock_internal(ctx, dst, maxDstSize, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, maxDstSize, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);
+            }
+            ctx->stage = ZSTDds_decodeBlockHeader;
+            ctx->expected = ZSTD_blockHeaderSize;
+            ctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* dict, size_t dictSize)
+{
+    ctx->dictEnd = ctx->previousDstEnd;
+    ctx->vBase = (const char*)dict - ((const char*)(ctx->previousDstEnd) - (const char*)(ctx->base));
+    ctx->base = dict;
+    ctx->previousDstEnd = (const char*)dict + dictSize;
+}
+
+
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stdlib.h>
+
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFF_DCtx object is required to track streaming operation.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation.
+*  ZBUFF_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst .
+*  return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  output : 128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+
+typedef enum { ZBUFFds_init, ZBUFFds_readHeader, ZBUFFds_loadHeader, ZBUFFds_decodeHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFF_dStage;
+
+/* *** Resource management *** */
+
+#define ZSTD_frameHeaderSize_max 5   /* too magical, should come from reference */
+struct ZBUFFv04_DCtx_s {
+    ZSTD_DCtx* zc;
+    ZSTD_parameters params;
+    char* inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    char* outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t hPos;
+    const char* dict;
+    size_t dictSize;
+    ZBUFF_dStage stage;
+    unsigned char headerBuffer[ZSTD_frameHeaderSize_max];
+};   /* typedef'd to ZBUFF_DCtx within "zstd_buffered.h" */
+
+typedef ZBUFFv04_DCtx ZBUFF_DCtx;
+
+
+static ZBUFF_DCtx* ZBUFF_createDCtx(void)
+{
+    ZBUFF_DCtx* zbc = (ZBUFF_DCtx*)malloc(sizeof(ZBUFF_DCtx));
+    if (zbc==NULL) return NULL;
+    memset(zbc, 0, sizeof(*zbc));
+    zbc->zc = ZSTD_createDCtx();
+    zbc->stage = ZBUFFds_init;
+    return zbc;
+}
+
+static size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbc)
+{
+    if (zbc==NULL) return 0;   /* support free on null */
+    ZSTD_freeDCtx(zbc->zc);
+    free(zbc->inBuff);
+    free(zbc->outBuff);
+    free(zbc);
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+static size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbc)
+{
+    zbc->stage = ZBUFFds_readHeader;
+    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = zbc->dictSize = 0;
+    return ZSTD_resetDCtx(zbc->zc);
+}
+
+
+static size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, size_t srcSize)
+{
+    zbc->dict = (const char*)src;
+    zbc->dictSize = srcSize;
+    return 0;
+}
+
+static size_t ZBUFF_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    size_t length = MIN(maxDstSize, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+/* *** Decompression *** */
+
+static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* ip = istart;
+    const char* const iend = istart + *srcSizePtr;
+    char* const ostart = (char*)dst;
+    char* op = ostart;
+    char* const oend = ostart + *maxDstSizePtr;
+    U32 notDone = 1;
+
+    DEBUGLOG(5, "ZBUFF_decompressContinue");
+    while (notDone)
+    {
+        switch(zbc->stage)
+        {
+
+        case ZBUFFds_init :
+            DEBUGLOG(5, "ZBUFF_decompressContinue: stage==ZBUFFds_init => ERROR(init_missing)");
+            return ERROR(init_missing);
+
+        case ZBUFFds_readHeader :
+            /* read header from src */
+            {   size_t const headerSize = ZSTD_getFrameParams(&(zbc->params), src, *srcSizePtr);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
+                    zbc->hPos += *srcSizePtr;
+                    *maxDstSizePtr = 0;
+                    zbc->stage = ZBUFFds_loadHeader;
+                    return headerSize - zbc->hPos;
+                }
+                zbc->stage = ZBUFFds_decodeHeader;
+                break;
+            }
+
+        case ZBUFFds_loadHeader:
+            /* complete header from src */
+            {   size_t headerSize = ZBUFF_limitCopy(
+                    zbc->headerBuffer + zbc->hPos, ZSTD_frameHeaderSize_max - zbc->hPos,
+                    src, *srcSizePtr);
+                zbc->hPos += headerSize;
+                ip += headerSize;
+                headerSize = ZSTD_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
+                if (ZSTD_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    *maxDstSizePtr = 0;
+                    return headerSize - zbc->hPos;
+            }   }
+            /* intentional fallthrough */
+
+        case ZBUFFds_decodeHeader:
+                /* apply header to create / resize buffers */
+                {   size_t const neededOutSize = (size_t)1 << zbc->params.windowLog;
+                    size_t const neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
+                    if (zbc->inBuffSize < neededInSize) {
+                        free(zbc->inBuff);
+                        zbc->inBuffSize = neededInSize;
+                        zbc->inBuff = (char*)malloc(neededInSize);
+                        if (zbc->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    if (zbc->outBuffSize < neededOutSize) {
+                        free(zbc->outBuff);
+                        zbc->outBuffSize = neededOutSize;
+                        zbc->outBuff = (char*)malloc(neededOutSize);
+                        if (zbc->outBuff == NULL) return ERROR(memory_allocation);
+                }   }
+                if (zbc->dictSize)
+                    ZSTD_decompress_insertDictionary(zbc->zc, zbc->dict, zbc->dictSize);
+                if (zbc->hPos) {
+                    /* some data already loaded into headerBuffer : transfer into inBuff */
+                    memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
+                    zbc->inPos = zbc->hPos;
+                    zbc->hPos = 0;
+                    zbc->stage = ZBUFFds_load;
+                    break;
+                }
+                zbc->stage = ZBUFFds_read;
+		/* fall-through */
+        case ZBUFFds_read:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                if (neededInSize==0)   /* end of frame */
+                {
+                    zbc->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize)
+                {
+                    /* directly decode from src */
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbc->stage = ZBUFFds_load;
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {
+                size_t neededInSize = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+                size_t toLoad = neededInSize - zbc->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbc->inBuffSize - zbc->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFF_limitCopy(zbc->inBuff + zbc->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbc->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+                {
+                    size_t decodedSize = ZSTD_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        zbc->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zbc->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbc->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFds_flush;
+                    /* ZBUFFds_flush follows */
+                }
+            }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {
+                size_t toFlushSize = zbc->outEnd - zbc->outStart;
+                size_t flushedSize = ZBUFF_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
+                op += flushedSize;
+                zbc->outStart += flushedSize;
+                if (flushedSize == toFlushSize)
+                {
+                    zbc->stage = ZBUFFds_read;
+                    if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
+                        zbc->outStart = zbc->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+        }
+    }
+
+    *srcSizePtr = ip-istart;
+    *maxDstSizePtr = op-ostart;
+
+    {
+        size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zbc->zc);
+        if (nextSrcSizeHint > 3) nextSrcSizeHint+= 3;   /* get the next block header while at it */
+        nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv04_isError(size_t errorCode) { return ERR_isError(errorCode); }
+const char* ZBUFFv04_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+size_t ZBUFFv04_recommendedDInSize()  { return BLOCKSIZE + 3; }
+size_t ZBUFFv04_recommendedDOutSize() { return BLOCKSIZE; }
+
+
+
+/*- ========================================================================= -*/
+
+/* final wrapping stage */
+
+size_t ZSTDv04_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
+}
+
+size_t ZSTDv04_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE==1)
+    size_t regenSize;
+    ZSTD_DCtx* dctx = ZSTD_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv04_decompressDCtx(dctx, dst, maxDstSize, src, srcSize);
+    ZSTD_freeDCtx(dctx);
+    return regenSize;
+#else
+    ZSTD_DCtx dctx;
+    return ZSTDv04_decompressDCtx(&dctx, dst, maxDstSize, src, srcSize);
+#endif
+}
+
+size_t ZSTDv04_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    return ZSTD_findFrameCompressedSize(src, srcSize);
+}
+
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx) { return ZSTD_resetDCtx(dctx); }
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx)
+{
+    return ZSTD_nextSrcSizeToDecompress(dctx);
+}
+
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTD_decompressContinue(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
+
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void) { return ZBUFF_createDCtx(); }
+size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx) { return ZBUFF_freeDCtx(dctx); }
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx) { return ZBUFF_decompressInit(dctx); }
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* src, size_t srcSize)
+{ return ZBUFF_decompressWithDictionary(dctx, src, srcSize); }
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    DEBUGLOG(5, "ZBUFFv04_decompressContinue");
+    return ZBUFF_decompressContinue(dctx, dst, maxDstSizePtr, src, srcSizePtr);
+}
+
+ZSTD_DCtx* ZSTDv04_createDCtx(void) { return ZSTD_createDCtx(); }
+size_t ZSTDv04_freeDCtx(ZSTD_DCtx* dctx) { return ZSTD_freeDCtx(dctx); }
+
+size_t ZSTDv04_getFrameParams(ZSTD_parameters* params, const void* src, size_t srcSize)
+{
+    return ZSTD_getFrameParams(params, src, srcSize);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstd_v04.h b/vendor/github.com/DataDog/zstd/zstd_v04.h
new file mode 100644
index 0000000..6391631
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v04.h

@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V04_H_91868324769238
+#define ZSTD_V04_H_91868324769238
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+*  Includes
+***************************************/
+#include <stddef.h>   /* size_t */
+
+
+/* *************************************
+*  Simple one-step function
+***************************************/
+/**
+ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format
+    compressedSize : is the exact source size
+    maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+                      It must be equal or larger than originalSize, otherwise decompression will fail.
+    return : the number of bytes decompressed into destination buffer (originalSize)
+             or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv04_getFrameSrcSize() : get the source length of a ZSTD frame compliant with v0.4.x format
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/**
+ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error
+*/
+unsigned ZSTDv04_isError(size_t code);
+
+
+/* *************************************
+*  Advanced functions
+***************************************/
+typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx;
+ZSTDv04_Dctx* ZSTDv04_createDCtx(void);
+size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx,
+                              void* dst, size_t maxOriginalSize,
+                        const void* src, size_t compressedSize);
+
+
+/* *************************************
+*  Direct Streaming
+***************************************/
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx);
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+  Use above functions alternatively.
+  ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block.
+  Result is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+
+/* *************************************
+*  Buffered Streaming
+***************************************/
+typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx;
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void);
+size_t         ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx);
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx);
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFF_DCtx object is required to track streaming operation.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation.
+*  ZBUFF_DCtx objects can be reused multiple times.
+*
+*  Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
+*  It must be the same content as the one set during compression phase.
+*  Dictionary content must remain accessible during the decompression process.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize
+*  output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : ZBUFF_recommendedDInSize==128Kb+3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+unsigned ZBUFFv04_isError(size_t errorCode);
+const char* ZBUFFv04_getErrorName(size_t errorCode);
+
+
+/** The below functions provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are not compulsory, they just tend to offer better latency */
+size_t ZBUFFv04_recommendedDInSize(void);
+size_t ZBUFFv04_recommendedDOutSize(void);
+
+
+/* *************************************
+*  Prefix - version detection
+***************************************/
+#define ZSTDv04_magicNumber 0xFD2FB524   /* v0.4 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V04_H_91868324769238 */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v05.c b/vendor/github.com/DataDog/zstd/zstd_v05.c
new file mode 100644
index 0000000..a5e1b1f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v05.c

@@ -0,0 +1,4011 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include "zstd_v05.h"
+#include "error_private.h"
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U32)((U32)p[0] + ((U32)p[1]<<8) + ((U32)p[2]<<16) + ((U32)p[3]<<24));
+    }
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U64)((U64)p[0] + ((U64)p[1]<<8) + ((U64)p[2]<<16) + ((U64)p[3]<<24)
+                     + ((U64)p[4]<<32) + ((U64)p[5]<<40) + ((U64)p[6]<<48) + ((U64)p[7]<<56));
+    }
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+#ifndef ZSTD_STATIC_H
+#define ZSTD_STATIC_H
+
+/* The prototypes defined within this file are considered experimental.
+ * They should not be used in the context DLL as they may change in the future.
+ * Prefer static linking if you need them, to control breaking version changes issues.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-*************************************
+*  Types
+***************************************/
+#define ZSTDv05_WINDOWLOG_ABSOLUTEMIN 11
+
+
+/*-*************************************
+*  Advanced functions
+***************************************/
+/*- Advanced Decompression functions -*/
+
+/*! ZSTDv05_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv05_decompress_usingDict, but using a reference context `preparedDCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedDCtx` must have been properly initialized using ZSTDv05_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference, which will not be modified, and 1 to run the decompression operation */
+size_t ZSTDv05_decompress_usingPreparedDCtx(
+                                             ZSTDv05_DCtx* dctx, const ZSTDv05_DCtx* preparedDCtx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize);
+
+
+/* **************************************
+*  Streaming functions (direct mode)
+****************************************/
+size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx);
+
+/*
+  Streaming decompression, direct mode (bufferless)
+
+  A ZSTDv05_DCtx object is required to track streaming operations.
+  Use ZSTDv05_createDCtx() / ZSTDv05_freeDCtx() to manage it.
+  A ZSTDv05_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTDv05_getFrameParams().
+  This operation is independent, and just needs enough input data to properly decode the frame header.
+  Objective is to retrieve *params.windowlog, to know minimum amount of memory required during decoding.
+  Result : 0 when successful, it means the ZSTDv05_parameters structure has been filled.
+           >0 : means there is not enough data into src. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTDv05_isError()
+
+  Start decompression, with ZSTDv05_decompressBegin() or ZSTDv05_decompressBegin_usingDict()
+  Alternatively, you can copy a prepared context, using ZSTDv05_copyDCtx()
+
+  Then use ZSTDv05_nextSrcSizeToDecompress() and ZSTDv05_decompressContinue() alternatively.
+  ZSTDv05_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTDv05_decompressContinue().
+  ZSTDv05_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTDv05_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTDv05_decompressContinue() is the number of bytes regenerated within 'dst'.
+  It can be zero, which is not an error; it just means ZSTDv05_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTDv05_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    User will have to take in charge required information to regenerate data, such as block sizes.
+
+    A few rules to respect :
+    - Uncompressed block size must be <= 128 KB
+    - Compressing or decompressing requires a context structure
+      + Use ZSTDv05_createCCtx() and ZSTDv05_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv05_compressBegin()
+      + decompression : ZSTDv05_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - When a block is considered not compressible enough, ZSTDv05_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv05_decompressBlock() doesn't accept uncompressed data as input !!
+*/
+
+size_t ZSTDv05_decompressBlock(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv05_STATIC_H */
+
+
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+*/
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv05_DICT_MAGIC  0xEC30A435
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BLOCKSIZE (128 KB)                 /* define, for static allocation */
+
+static const size_t ZSTDv05_blockHeaderSize = 3;
+static const size_t ZSTDv05_frameHeaderSize_min = 5;
+#define ZSTDv05_frameHeaderSize_max 5         /* define, for static allocation */
+
+#define BITv057 128
+#define BITv056  64
+#define BITv055  32
+#define BITv054  16
+#define BITv051   2
+#define BITv050   1
+
+#define IS_HUFv05 0
+#define IS_PCH 1
+#define IS_RAW 2
+#define IS_RLE 3
+
+#define MINMATCH 4
+#define REPCODE_STARTVALUE 1
+
+#define Litbits  8
+#define MLbits   7
+#define LLbits   6
+#define Offbits  5
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML  ((1<<MLbits) - 1)
+#define MaxLL  ((1<<LLbits) - 1)
+#define MaxOff ((1<<Offbits)- 1)
+#define MLFSEv05Log   10
+#define LLFSEv05Log   10
+#define OffFSEv05Log   9
+#define MaxSeq MAX(MaxLL, MaxML)
+
+#define FSEv05_ENCODING_RAW     0
+#define FSEv05_ENCODING_RLE     1
+#define FSEv05_ENCODING_STATIC  2
+#define FSEv05_ENCODING_DYNAMIC 3
+
+
+#define HufLog 12
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define WILDCOPY_OVERLENGTH 8
+
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTDv05_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
+#define COPY8(d,s) { ZSTDv05_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTDv05_wildcopy() :
+*   custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+MEM_STATIC void ZSTDv05_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* offCode;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* litLengthStart;
+    BYTE* litLength;
+    BYTE* matchLengthStart;
+    BYTE* matchLength;
+    BYTE* dumpsStart;
+    BYTE* dumps;
+    /* opt */
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+} seqStore_t;
+
+
+
+#endif   /* ZSTDv05_CCOMMON_H_MODULE */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   header file
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv05_H
+#define FSEv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Includes
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-****************************************
+*  FSEv05 simple functions
+******************************************/
+size_t FSEv05_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+FSEv05_decompress():
+    Decompress FSEv05 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSEv05_isError()
+
+    ** Important ** : FSEv05_decompress() doesn't decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/* *****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+unsigned    FSEv05_isError(size_t code);        /* tells if a return value is an error code */
+const char* FSEv05_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+
+
+/* *****************************************
+*  FSEv05 detailed API
+******************************************/
+/* *** DECOMPRESSION *** */
+
+/*!
+FSEv05_readNCount():
+   Read compactly saved 'normalizedCounter' from 'rBuffer'.
+   return : size read from 'rBuffer'
+            or an errorCode, which can be tested using FSEv05_isError()
+            maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv05_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*!
+Constructor and Destructor of type FSEv05_DTable
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSEv05_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSEv05_DTable* FSEv05_createDTable(unsigned tableLog);
+void        FSEv05_freeDTable(FSEv05_DTable* dt);
+
+/*!
+FSEv05_buildDTable():
+   Builds 'dt', which must be already allocated, using FSEv05_createDTable()
+   @return : 0,
+             or an errorCode, which can be tested using FSEv05_isError() */
+size_t FSEv05_buildDTable (FSEv05_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*!
+FSEv05_decompress_usingDTable():
+   Decompress compressed source @cSrc of size @cSrcSize using `dt`
+   into `dst` which must be already allocated.
+   @return : size of regenerated data (necessarily <= @dstCapacity)
+             or an errorCode, which can be tested using FSEv05_isError() */
+size_t FSEv05_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv05_DTable* dt);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv05_H */
+/* ******************************************************************
+   bitstream
+   Part of FSEv05 library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITv05STREAM_H_MODULE
+#define BITv05STREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which highly benefit from being inlined.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BITv05_DStream_t;
+
+typedef enum { BITv05_DStream_unfinished = 0,
+               BITv05_DStream_endOfBuffer = 1,
+               BITv05_DStream_completed = 2,
+               BITv05_DStream_overflow = 3 } BITv05_DStream_status;  /* result of BITv05_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv05_initDStream(BITv05_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BITv05_readBits(BITv05_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BITv05_DStream_status BITv05_reloadDStream(BITv05_DStream_t* bitD);
+MEM_STATIC unsigned BITv05_endOfDStream(const BITv05_DStream_t* bitD);
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Helper functions
+****************************************************************/
+MEM_STATIC unsigned BITv05_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*!BITv05_initDStream
+*  Initialize a BITv05_DStream_t.
+*  @bitD : a pointer to an already allocated BITv05_DStream_t structure
+*  @srcBuffer must point at the beginning of a bitStream
+*  @srcSize must be the exact size of the bitStream
+*  @result : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BITv05_initDStream(BITv05_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(size_t)) {  /* normal case */
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BITv05_highbit32(contain32);
+    } else {
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
+        if (contain32 == 0) return ERROR(GENERIC);   /* endMark not present */
+        bitD->bitsConsumed = 8 - BITv05_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BITv05_lookBits(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BITv05_lookBitsFast :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv05_lookBitsFast(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BITv05_skipBits(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BITv05_readBits(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BITv05_lookBits(bitD, nbBits);
+    BITv05_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*!BITv05_readBitsFast :
+*  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, U32 nbBits)
+{
+    size_t value = BITv05_lookBitsFast(bitD, nbBits);
+    BITv05_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BITv05_DStream_status BITv05_reloadDStream(BITv05_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv05_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv05_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv05_DStream_endOfBuffer;
+        return BITv05_DStream_completed;
+    }
+    {
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        BITv05_DStream_status result = BITv05_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv05_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BITv05_endOfDStream
+*   @return Tells if DStream has reached its exact end
+*/
+MEM_STATIC unsigned BITv05_endOfDStream(const BITv05_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITv05STREAM_H_MODULE */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv05_STATIC_H
+#define FSEv05_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* It is possible to statically allocate FSEv05 CTable/DTable as a table of unsigned using below macros */
+#define FSEv05_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/* *****************************************
+*  FSEv05 advanced API
+*******************************************/
+size_t FSEv05_buildDTable_raw (FSEv05_DTable* dt, unsigned nbBits);
+/* build a fake FSEv05_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv05_buildDTable_rle (FSEv05_DTable* dt, unsigned char symbolValue);
+/* build a fake FSEv05_DTable, designed to always generate the same symbolValue */
+
+
+
+/* *****************************************
+*  FSEv05 symbol decompression API
+*******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSEv05_DState_t;
+
+
+static void     FSEv05_initDState(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD, const FSEv05_DTable* dt);
+
+static unsigned char FSEv05_decodeSymbol(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD);
+
+static unsigned FSEv05_endOfDState(const FSEv05_DState_t* DStatePtr);
+
+
+
+/* *****************************************
+*  FSEv05 unsafe API
+*******************************************/
+static unsigned char FSEv05_decodeSymbolFast(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+/* decompression */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSEv05_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSEv05_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSEv05_initDState(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD, const FSEv05_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv05_DTableHeader* const DTableH = (const FSEv05_DTableHeader*)ptr;
+    DStatePtr->state = BITv05_readBits(bitD, DTableH->tableLog);
+    BITv05_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSEv05_peakSymbol(FSEv05_DState_t* DStatePtr)
+{
+    const FSEv05_decode_t DInfo = ((const FSEv05_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC BYTE FSEv05_decodeSymbol(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD)
+{
+    const FSEv05_decode_t DInfo = ((const FSEv05_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BITv05_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC BYTE FSEv05_decodeSymbolFast(FSEv05_DState_t* DStatePtr, BITv05_DStream_t* bitD)
+{
+    const FSEv05_decode_t DInfo = ((const FSEv05_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = BITv05_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSEv05_endOfDState(const FSEv05_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv05_STATIC_H */
+/* ******************************************************************
+   FSEv05 : Finite State Entropy coder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+#ifndef FSEv05_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv05_MAX_MEMORY_USAGE 14
+#define FSEv05_DEFAULT_MEMORY_USAGE 13
+
+/*!FSEv05_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv05_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv05_FUNCTION_TYPE BYTE
+#define FSEv05_FUNCTION_EXTENSION
+#define FSEv05_DECODE_TYPE FSEv05_decode_t
+
+
+#endif   /* !FSEv05_COMMONDEFS_ONLY */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv05_MAX_TABLELOG  (FSEv05_MAX_MEMORY_USAGE-2)
+#define FSEv05_MAX_TABLESIZE (1U<<FSEv05_MAX_TABLELOG)
+#define FSEv05_MAXTABLESIZE_MASK (FSEv05_MAX_TABLESIZE-1)
+#define FSEv05_DEFAULT_TABLELOG (FSEv05_DEFAULT_MEMORY_USAGE-2)
+#define FSEv05_MIN_TABLELOG 5
+
+#define FSEv05_TABLELOG_ABSOLUTE_MAX 15
+#if FSEv05_MAX_TABLELOG > FSEv05_TABLELOG_ABSOLUTE_MAX
+#error "FSEv05_MAX_TABLELOG > FSEv05_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv05_STATIC_ASSERT(c) { enum { FSEv05_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv05_DTABLE_SIZE_U32(FSEv05_MAX_TABLELOG)];
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSEv05_FUNCTION_EXTENSION
+#  error "FSEv05_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv05_FUNCTION_TYPE
+#  error "FSEv05_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSEv05_CAT(X,Y) X##Y
+#define FSEv05_FUNCTION_NAME(X,Y) FSEv05_CAT(X,Y)
+#define FSEv05_TYPE_NAME(X,Y) FSEv05_CAT(X,Y)
+
+
+/* Function templates */
+static U32 FSEv05_tableStep(U32 tableSize) { return (tableSize>>1) + (tableSize>>3) + 3; }
+
+
+
+FSEv05_DTable* FSEv05_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSEv05_TABLELOG_ABSOLUTE_MAX) tableLog = FSEv05_TABLELOG_ABSOLUTE_MAX;
+    return (FSEv05_DTable*)malloc( FSEv05_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSEv05_freeDTable (FSEv05_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSEv05_buildDTable(FSEv05_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSEv05_DTableHeader DTableH;
+    void* const tdPtr = dt+1;   /* because dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv05_DECODE_TYPE* const tableDecode = (FSEv05_DECODE_TYPE*) (tdPtr);
+    const U32 tableSize = 1 << tableLog;
+    const U32 tableMask = tableSize-1;
+    const U32 step = FSEv05_tableStep(tableSize);
+    U16 symbolNext[FSEv05_MAX_SYMBOL_VALUE+1];
+    U32 position = 0;
+    U32 highThreshold = tableSize-1;
+    const S16 largeLimit= (S16)(1 << (tableLog-1));
+    U32 noLarge = 1;
+    U32 s;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv05_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv05_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    DTableH.tableLog = (U16)tableLog;
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (normalizedCounter[s]==-1) {
+            tableDecode[highThreshold--].symbol = (FSEv05_FUNCTION_TYPE)s;
+            symbolNext[s] = 1;
+        } else {
+            if (normalizedCounter[s] >= largeLimit) noLarge=0;
+            symbolNext[s] = normalizedCounter[s];
+    }   }
+
+    /* Spread symbols */
+    for (s=0; s<=maxSymbolValue; s++) {
+        int i;
+        for (i=0; i<normalizedCounter[s]; i++) {
+            tableDecode[position].symbol = (FSEv05_FUNCTION_TYPE)s;
+            position = (position + step) & tableMask;
+            while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+    }   }
+
+    if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+
+    /* Build Decoding table */
+    {
+        U32 i;
+        for (i=0; i<tableSize; i++) {
+            FSEv05_FUNCTION_TYPE symbol = (FSEv05_FUNCTION_TYPE)(tableDecode[i].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[i].nbBits = (BYTE) (tableLog - BITv05_highbit32 ((U32)nextState) );
+            tableDecode[i].newState = (U16) ( (nextState << tableDecode[i].nbBits) - tableSize);
+    }   }
+
+    DTableH.fastMode = (U16)noLarge;
+    memcpy(dt, &DTableH, sizeof(DTableH));
+    return 0;
+}
+
+
+#ifndef FSEv05_COMMONDEFS_ONLY
+/*-****************************************
+*  FSEv05 helper functions
+******************************************/
+unsigned FSEv05_isError(size_t code) { return ERR_isError(code); }
+
+const char* FSEv05_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSEv05 NCount encoding-decoding
+****************************************************************/
+static short FSEv05_abs(short a) { return a<0 ? -a : a; }
+
+
+size_t FSEv05_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv05_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv05_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv05_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv05_buildDTable_rle (FSEv05_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSEv05_DTableHeader* const DTableH = (FSEv05_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv05_decode_t* const cell = (FSEv05_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSEv05_buildDTable_raw (FSEv05_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSEv05_DTableHeader* const DTableH = (FSEv05_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv05_decode_t* const dinfo = (FSEv05_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<=maxSymbolValue; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSEv05_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv05_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BITv05_DStream_t bitD;
+    FSEv05_DState_t state1;
+    FSEv05_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = BITv05_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSEv05_isError(errorCode)) return errorCode;
+
+    FSEv05_initDState(&state1, &bitD, dt);
+    FSEv05_initDState(&state2, &bitD, dt);
+
+#define FSEv05_GETSYMBOL(statePtr) fast ? FSEv05_decodeSymbolFast(statePtr, &bitD) : FSEv05_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv05_reloadDStream(&bitD)==BITv05_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv05_GETSYMBOL(&state1);
+
+        if (FSEv05_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv05_reloadDStream(&bitD);
+
+        op[1] = FSEv05_GETSYMBOL(&state2);
+
+        if (FSEv05_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv05_reloadDStream(&bitD) > BITv05_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSEv05_GETSYMBOL(&state1);
+
+        if (FSEv05_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv05_reloadDStream(&bitD);
+
+        op[3] = FSEv05_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv05_reloadDStream(&bitD) >= FSEv05_DStream_partiallyFilled; Ends at exactly BITv05_DStream_completed */
+    while (1) {
+        if ( (BITv05_reloadDStream(&bitD)>BITv05_DStream_completed) || (op==omax) || (BITv05_endOfDStream(&bitD) && (fast || FSEv05_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSEv05_GETSYMBOL(&state1);
+
+        if ( (BITv05_reloadDStream(&bitD)>BITv05_DStream_completed) || (op==omax) || (BITv05_endOfDStream(&bitD) && (fast || FSEv05_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSEv05_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (BITv05_endOfDStream(&bitD) && FSEv05_endOfDState(&state1) && FSEv05_endOfDState(&state2))
+        return op-ostart;
+
+    if (op==omax) return ERROR(dstSize_tooSmall);   /* dst buffer is full, but cSrc unfinished */
+
+    return ERROR(corruption_detected);
+}
+
+
+size_t FSEv05_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv05_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv05_DTableHeader* DTableH = (const FSEv05_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSEv05_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSEv05_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSEv05_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSEv05_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSEv05_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSEv05 decoding mode */
+    errorCode = FSEv05_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSEv05_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    errorCode = FSEv05_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSEv05_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSEv05_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+#endif   /* FSEv05_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFF0_H
+#define HUFF0_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Huff0 simple functions
+******************************************/
+size_t HUFv05_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*!
+HUFv05_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    @dstSize : must be the **exact** size of original (uncompressed) data.
+    Note : in contrast with FSEv05, HUFv05_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUFv05_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+/* Error Management */
+unsigned    HUFv05_isError(size_t code);        /* tells if a return value is an error code */
+const char* HUFv05_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUF0_H */
+/* ******************************************************************
+   Huff0 : Huffman codec, part of New Generation Entropy library
+   header file, for static linking only
+   Copyright (C) 2013-2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUF0_STATIC_H
+#define HUF0_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* static allocation of Huff0's DTable */
+#define HUFv05_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))
+#define HUFv05_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUFv05_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv05_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUFv05_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv05_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUFv05_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv05_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv05_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+/* ****************************************
+*  Huff0 detailed API
+******************************************/
+/*!
+HUFv05_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. build Huffman table from save, using HUFv05_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUFv05_decompressSXn_usingDTable
+*/
+size_t HUFv05_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+size_t HUFv05_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+size_t HUFv05_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv05_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* single stream variants */
+
+size_t HUFv05_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv05_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv05_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv05_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUF0_STATIC_H */
+/* ******************************************************************
+   Huff0 : Huffman coder, part of New Generation Entropy library
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSEv05+Huff0 source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUFv05_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUFv05_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUFv05_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUFv05_ABSOLUTEMAX_TABLELOG */
+#define HUFv05_DEFAULT_TABLELOG  HUFv05_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUFv05_MAX_SYMBOL_VALUE 255
+#if (HUFv05_MAX_TABLELOG > HUFv05_ABSOLUTEMAX_TABLELOG)
+#  error "HUFv05_MAX_TABLELOG is too large !"
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+unsigned HUFv05_isError(size_t code) { return ERR_isError(code); }
+const char* HUFv05_getErrorName(size_t code) { return ERR_getErrorName(code); }
+#define HUFv05_STATIC_ASSERT(c) { enum { HUFv05_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* *******************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUFv05_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv05_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/*! HUFv05_readStats
+    Read compact Huffman tree, saved by HUFv05_writeCTable
+    @huffWeight : destination buffer
+    @return : size read from `src`
+*/
+static size_t HUFv05_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    U32 tableLog;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else {   /* Incompressible */
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSEv05 (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv05_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv05_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUFv05_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++) {
+        if (huffWeight[n] >= HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        rankStats[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;
+    }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    tableLog = BITv05_highbit32(weightTotal) + 1;
+    if (tableLog > HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+    {   /* determine last weight */
+        U32 total = 1 << tableLog;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << BITv05_highbit32(rest);
+        U32 lastWeight = BITv05_highbit32(rest) + 1;
+        if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankStats[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    *tableLogPtr = tableLog;
+    return iSize+1;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+size_t HUFv05_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUFv05_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;
+    HUFv05_DEltX2* const dt = (HUFv05_DEltX2*)dtPtr;
+
+    HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv05_readStats(huffWeight, HUFv05_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv05_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof allocated DTable, from used size of DTable, in case of re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<=tableLog; n++) {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++) {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUFv05_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+static BYTE HUFv05_decodeSymbolX2(BITv05_DStream_t* Dstream, const HUFv05_DEltX2* dt, const U32 dtLog)
+{
+        const size_t val = BITv05_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const BYTE c = dt[val].byte;
+        BITv05_skipBits(Dstream, dt[val].nbBits);
+        return c;
+}
+
+#define HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv05_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUFv05_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv05_MAX_TABLELOG<=12)) \
+        HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUFv05_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv05_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUFv05_decodeStreamX2(BYTE* p, BITv05_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv05_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p <= pEnd-4)) {
+        HUFv05_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p < pEnd))
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv05_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+size_t HUFv05_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const U32 dtLog = DTable[0];
+    const void* dtPtr = DTable;
+    const HUFv05_DEltX2* const dt = ((const HUFv05_DEltX2*)dtPtr)+1;
+    BITv05_DStream_t bitD;
+
+    if (dstSize <= cSrcSize) return ERROR(dstSize_tooSmall);
+    { size_t const errorCode = BITv05_initDStream(&bitD, cSrc, cSrcSize);
+      if (HUFv05_isError(errorCode)) return errorCode; }
+
+    HUFv05_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    /* check */
+    if (!BITv05_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+size_t HUFv05_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX2(DTable, HUFv05_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUFv05_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUFv05_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUFv05_decompress1X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+size_t HUFv05_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    const BYTE* const istart = (const BYTE*) cSrc;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    const void* const dtPtr = DTable;
+    const HUFv05_DEltX2* const dt = ((const HUFv05_DEltX2*)dtPtr) +1;
+    const U32 dtLog = DTable[0];
+    size_t errorCode;
+
+    /* Init */
+    BITv05_DStream_t bitD1;
+    BITv05_DStream_t bitD2;
+    BITv05_DStream_t bitD3;
+    BITv05_DStream_t bitD4;
+    const size_t length1 = MEM_readLE16(istart);
+    const size_t length2 = MEM_readLE16(istart+2);
+    const size_t length3 = MEM_readLE16(istart+4);
+    size_t length4;
+    const BYTE* const istart1 = istart + 6;  /* jumpTable */
+    const BYTE* const istart2 = istart1 + length1;
+    const BYTE* const istart3 = istart2 + length2;
+    const BYTE* const istart4 = istart3 + length3;
+    const size_t segmentSize = (dstSize+3) / 4;
+    BYTE* const opStart2 = ostart + segmentSize;
+    BYTE* const opStart3 = opStart2 + segmentSize;
+    BYTE* const opStart4 = opStart3 + segmentSize;
+    BYTE* op1 = ostart;
+    BYTE* op2 = opStart2;
+    BYTE* op3 = opStart3;
+    BYTE* op4 = opStart4;
+    U32 endSignal;
+
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    length4 = cSrcSize - (length1 + length2 + length3 + 6);
+    if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+    errorCode = BITv05_initDStream(&bitD1, istart1, length1);
+    if (HUFv05_isError(errorCode)) return errorCode;
+    errorCode = BITv05_initDStream(&bitD2, istart2, length2);
+    if (HUFv05_isError(errorCode)) return errorCode;
+    errorCode = BITv05_initDStream(&bitD3, istart3, length3);
+    if (HUFv05_isError(errorCode)) return errorCode;
+    errorCode = BITv05_initDStream(&bitD4, istart4, length4);
+    if (HUFv05_isError(errorCode)) return errorCode;
+
+    /* 16-32 symbols per loop (4-8 symbols per stream) */
+    endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+    for ( ; (endSignal==BITv05_DStream_unfinished) && (op4<(oend-7)) ; ) {
+        HUFv05_DECODE_SYMBOLX2_2(op1, &bitD1);
+        HUFv05_DECODE_SYMBOLX2_2(op2, &bitD2);
+        HUFv05_DECODE_SYMBOLX2_2(op3, &bitD3);
+        HUFv05_DECODE_SYMBOLX2_2(op4, &bitD4);
+        HUFv05_DECODE_SYMBOLX2_1(op1, &bitD1);
+        HUFv05_DECODE_SYMBOLX2_1(op2, &bitD2);
+        HUFv05_DECODE_SYMBOLX2_1(op3, &bitD3);
+        HUFv05_DECODE_SYMBOLX2_1(op4, &bitD4);
+        HUFv05_DECODE_SYMBOLX2_2(op1, &bitD1);
+        HUFv05_DECODE_SYMBOLX2_2(op2, &bitD2);
+        HUFv05_DECODE_SYMBOLX2_2(op3, &bitD3);
+        HUFv05_DECODE_SYMBOLX2_2(op4, &bitD4);
+        HUFv05_DECODE_SYMBOLX2_0(op1, &bitD1);
+        HUFv05_DECODE_SYMBOLX2_0(op2, &bitD2);
+        HUFv05_DECODE_SYMBOLX2_0(op3, &bitD3);
+        HUFv05_DECODE_SYMBOLX2_0(op4, &bitD4);
+        endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+    }
+
+    /* check corruption */
+    if (op1 > opStart2) return ERROR(corruption_detected);
+    if (op2 > opStart3) return ERROR(corruption_detected);
+    if (op3 > opStart4) return ERROR(corruption_detected);
+    /* note : op4 supposed already verified within main loop */
+
+    /* finish bitStreams one by one */
+    HUFv05_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+    HUFv05_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+    HUFv05_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+    HUFv05_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+    /* check */
+    endSignal = BITv05_endOfDStream(&bitD1) & BITv05_endOfDStream(&bitD2) & BITv05_endOfDStream(&bitD3) & BITv05_endOfDStream(&bitD4);
+    if (!endSignal) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+
+size_t HUFv05_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX2(DTable, HUFv05_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+    size_t errorCode;
+
+    errorCode = HUFv05_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUFv05_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUFv05_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+static void HUFv05_fillDTableX4Level2(HUFv05_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUFv05_DEltX4 DElt;
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+    U32 s;
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }
+}
+
+typedef U32 rankVal_t[HUFv05_ABSOLUTEMAX_TABLELOG][HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+
+static void HUFv05_fillDTableX4(HUFv05_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUFv05_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUFv05_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            U32 i;
+            const U32 end = start + length;
+            HUFv05_DEltX4 DElt;
+
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits   = (BYTE)(nbBits);
+            DElt.length   = 1;
+            for (i = start; i < end; i++)
+                DTable[i] = DElt;
+        }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUFv05_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUFv05_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUFv05_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUFv05_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUFv05_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUFv05_DEltX4* const dt = ((HUFv05_DEltX4*)dtPtr) + 1;
+
+    HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv05_readStats(weightList, HUFv05_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv05_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {
+        U32 w, nextRankStart = 0;
+        for (w=1; w<=maxW; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {
+        U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 w = weightList[s];
+            U32 r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {
+        const U32 minBits = tableLog+1 - maxW;
+        U32 nextRankVal = 0;
+        U32 w, consumed;
+        const int rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+        U32* rankVal0 = rankVal[0];
+        for (w=1; w<=maxW; w++) {
+            U32 current = nextRankVal;
+            nextRankVal += rankStats[w] << (w+rescale);
+            rankVal0[w] = current;
+        }
+        for (consumed = minBits; consumed <= memLog - minBits; consumed++) {
+            U32* rankValPtr = rankVal[consumed];
+            for (w = 1; w <= maxW; w++) {
+                rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }
+
+    HUFv05_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUFv05_decodeSymbolX4(void* op, BITv05_DStream_t* DStream, const HUFv05_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv05_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BITv05_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUFv05_decodeLastSymbolX4(void* op, BITv05_DStream_t* DStream, const HUFv05_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv05_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BITv05_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BITv05_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }   }
+    return 1;
+}
+
+
+#define HUFv05_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv05_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv05_MAX_TABLELOG<=12)) \
+        ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv05_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv05_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUFv05_decodeStreamX4(BYTE* p, BITv05_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv05_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p < pEnd-7)) {
+        HUFv05_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv05_reloadDStream(bitDPtr) == BITv05_DStream_unfinished) && (p <= pEnd-2))
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUFv05_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUFv05_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+size_t HUFv05_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    const BYTE* const istart = (const BYTE*) cSrc;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+
+    const U32 dtLog = DTable[0];
+    const void* const dtPtr = DTable;
+    const HUFv05_DEltX4* const dt = ((const HUFv05_DEltX4*)dtPtr) +1;
+    size_t errorCode;
+
+    /* Init */
+    BITv05_DStream_t bitD;
+    errorCode = BITv05_initDStream(&bitD, istart, cSrcSize);
+    if (HUFv05_isError(errorCode)) return errorCode;
+
+    /* finish bitStreams one by one */
+    HUFv05_decodeStreamX4(ostart, &bitD, oend,     dt, dtLog);
+
+    /* check */
+    if (!BITv05_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+size_t HUFv05_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX4(DTable, HUFv05_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUFv05_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUFv05_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUFv05_decompress1X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+size_t HUFv05_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {
+        const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv05_DEltX4* const dt = ((const HUFv05_DEltX4*)dtPtr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv05_DStream_t bitD1;
+        BITv05_DStream_t bitD2;
+        BITv05_DStream_t bitD3;
+        BITv05_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BITv05_initDStream(&bitD1, istart1, length1);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD2, istart2, length2);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD3, istart3, length3);
+        if (HUFv05_isError(errorCode)) return errorCode;
+        errorCode = BITv05_initDStream(&bitD4, istart4, length4);
+        if (HUFv05_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv05_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv05_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv05_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv05_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv05_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv05_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv05_reloadDStream(&bitD1) | BITv05_reloadDStream(&bitD2) | BITv05_reloadDStream(&bitD3) | BITv05_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv05_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv05_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv05_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv05_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BITv05_endOfDStream(&bitD1) & BITv05_endOfDStream(&bitD2) & BITv05_endOfDStream(&bitD3) & BITv05_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv05_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv05_CREATE_STATIC_DTABLEX4(DTable, HUFv05_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUFv05_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUFv05_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUFv05_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUFv05_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUFv05_decompress4X2, HUFv05_decompress4X4, NULL };
+    /* estimate decompression time */
+    U32 Q;
+    const U32 D256 = (U32)(dstSize >> 8);
+    U32 Dtime[3];
+    U32 algoNb = 0;
+    int n;
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize >= dstSize) return ERROR(corruption_detected);   /* invalid, or not compressed, but not compressed already dealt with */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* decoder timing evaluation */
+    Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    for (n=0; n<3; n++)
+        Dtime[n] = algoTime[Q][n].tableTime + (algoTime[Q][n].decode256Time * D256);
+
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3; /* advantage to algorithms using less memory, for cache eviction */
+
+    if (Dtime[1] < Dtime[0]) algoNb = 1;
+
+    return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+
+    //return HUFv05_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUFv05_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+    //return HUFv05_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
+}
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv05_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTDv05_HEAPMODE
+#  define ZSTDv05_HEAPMODE 1
+#endif
+
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include <stdlib.h>      /* calloc */
+#include <string.h>      /* memcpy, memmove */
+#include <stdio.h>       /* debug only : printf */
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/*-*************************************
+*  Local types
+***************************************/
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+
+/* *******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTDv05_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/* *************************************
+*  Error Management
+***************************************/
+/*! ZSTDv05_isError() :
+*   tells if a return value is an error code */
+unsigned ZSTDv05_isError(size_t code) { return ERR_isError(code); }
+
+
+/*! ZSTDv05_getErrorName() :
+*   provides error code string (useful for debugging) */
+const char* ZSTDv05_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/* *************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDv05ds_getFrameHeaderSize, ZSTDv05ds_decodeFrameHeader,
+               ZSTDv05ds_decodeBlockHeader, ZSTDv05ds_decompressBlock } ZSTDv05_dStage;
+
+struct ZSTDv05_DCtx_s
+{
+    FSEv05_DTable LLTable[FSEv05_DTABLE_SIZE_U32(LLFSEv05Log)];
+    FSEv05_DTable OffTable[FSEv05_DTABLE_SIZE_U32(OffFSEv05Log)];
+    FSEv05_DTable MLTable[FSEv05_DTABLE_SIZE_U32(MLFSEv05Log)];
+    unsigned   hufTableX4[HUFv05_DTABLE_SIZE(HufLog)];
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;
+    size_t headerSize;
+    ZSTDv05_parameters params;
+    blockType_t bType;   /* used in ZSTDv05_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv05_dStage stage;
+    U32 flagStaticTables;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[BLOCKSIZE + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTDv05_frameHeaderSize_max];
+};  /* typedef'd to ZSTDv05_DCtx within "zstd_static.h" */
+
+size_t ZSTDv05_sizeofDCtx (void) { return sizeof(ZSTDv05_DCtx); }
+
+size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx)
+{
+    dctx->expected = ZSTDv05_frameHeaderSize_min;
+    dctx->stage = ZSTDv05ds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    dctx->hufTableX4[0] = HufLog;
+    dctx->flagStaticTables = 0;
+    return 0;
+}
+
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void)
+{
+    ZSTDv05_DCtx* dctx = (ZSTDv05_DCtx*)malloc(sizeof(ZSTDv05_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTDv05_decompressBegin(dctx);
+    return dctx;
+}
+
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx)
+{
+    free(dctx);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx)
+{
+    memcpy(dstDCtx, srcDCtx,
+           sizeof(ZSTDv05_DCtx) - (BLOCKSIZE+WILDCOPY_OVERLENGTH + ZSTDv05_frameHeaderSize_max));  /* no need to copy workspace */
+}
+
+
+/* *************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv05_MAGICNUMBER (defined within zstd_internal.h)
+      - 1 byte  - Window Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+/* Block format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+
+/** ZSTDv05_decodeFrameHeader_Part1() :
+*   decode the 1st part of the Frame Header, which tells Frame Header size.
+*   srcSize must be == ZSTDv05_frameHeaderSize_min.
+*   @return : the full size of the Frame Header */
+static size_t ZSTDv05_decodeFrameHeader_Part1(ZSTDv05_DCtx* zc, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize != ZSTDv05_frameHeaderSize_min)
+        return ERROR(srcSize_wrong);
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+    zc->headerSize = ZSTDv05_frameHeaderSize_min;
+    return zc->headerSize;
+}
+
+
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize)
+{
+    U32 magicNumber;
+    if (srcSize < ZSTDv05_frameHeaderSize_min) return ZSTDv05_frameHeaderSize_max;
+    magicNumber = MEM_readLE32(src);
+    if (magicNumber != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+    memset(params, 0, sizeof(*params));
+    params->windowLog = (((const BYTE*)src)[4] & 15) + ZSTDv05_WINDOWLOG_ABSOLUTEMIN;
+    if ((((const BYTE*)src)[4] >> 4) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits */
+    return 0;
+}
+
+/** ZSTDv05_decodeFrameHeader_Part2() :
+*   decode the full Frame Header.
+*   srcSize must be the size provided by ZSTDv05_decodeFrameHeader_Part1().
+*   @return : 0, or an error code, which can be tested using ZSTDv05_isError() */
+static size_t ZSTDv05_decodeFrameHeader_Part2(ZSTDv05_DCtx* zc, const void* src, size_t srcSize)
+{
+    size_t result;
+    if (srcSize != zc->headerSize)
+        return ERROR(srcSize_wrong);
+    result = ZSTDv05_getFrameParams(&(zc->params), src, srcSize);
+    if ((MEM_32bits()) && (zc->params.windowLog > 25)) return ERROR(frameParameter_unsupported);
+    return result;
+}
+
+
+size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    BYTE headerFlags;
+    U32 cSize;
+
+    if (srcSize < 3)
+        return ERROR(srcSize_wrong);
+
+    headerFlags = *in;
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+
+    bpPtr->blockType = (blockType_t)(headerFlags >> 6);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+
+static size_t ZSTDv05_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/*! ZSTDv05_decodeLiteralsBlock() :
+    @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTDv05_decodeLiteralsBlock(ZSTDv05_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(istart[0]>> 6)
+    {
+    case IS_HUFv05:
+        {
+            size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            if (HUFv05_isError(singleStream ?
+                            HUFv05_decompress1X2(dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv05_decompress   (dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_PCH:
+        {
+            size_t errorCode;
+            size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (!dctx->flagStaticTables)
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            errorCode = HUFv05_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTableX4);
+            if (HUFv05_isError(errorCode)) return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_RAW:
+        {
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case IS_RLE:
+        {
+            size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > BLOCKSIZE) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t* dumpsLengthPtr,
+                         FSEv05_DTable* DTableLL, FSEv05_DTable* DTableML, FSEv05_DTable* DTableOffb,
+                         const void* src, size_t srcSize, U32 flagStaticTable)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* ip = istart;
+    const BYTE* const iend = istart + srcSize;
+    U32 LLtype, Offtype, MLtype;
+    U32 LLlog, Offlog, MLlog;
+    size_t dumpsLength;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE)
+        return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    *nbSeq = *ip++;
+    if (*nbSeq==0) return 1;
+    if (*nbSeq >= 128) {
+        if (ip >= iend) return ERROR(srcSize_wrong);
+        *nbSeq = ((nbSeq[0]-128)<<8) + *ip++;
+    }
+
+    if (ip >= iend) return ERROR(srcSize_wrong);
+    LLtype  = *ip >> 6;
+    Offtype = (*ip >> 4) & 3;
+    MLtype  = (*ip >> 2) & 3;
+    if (*ip & 2) {
+        if (ip+3 > iend) return ERROR(srcSize_wrong);
+        dumpsLength  = ip[2];
+        dumpsLength += ip[1] << 8;
+        ip += 3;
+    } else {
+        if (ip+2 > iend) return ERROR(srcSize_wrong);
+        dumpsLength  = ip[1];
+        dumpsLength += (ip[0] & 1) << 8;
+        ip += 2;
+    }
+    *dumpsPtr = ip;
+    ip += dumpsLength;
+    *dumpsLengthPtr = dumpsLength;
+
+    /* check */
+    if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+    /* sequences */
+    {
+        S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL >= MaxOff */
+        size_t headerSize;
+
+        /* Build DTables */
+        switch(LLtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            LLlog = 0;
+            FSEv05_buildDTable_rle(DTableLL, *ip++);
+            break;
+        case FSEv05_ENCODING_RAW :
+            LLlog = LLbits;
+            FSEv05_buildDTable_raw(DTableLL, LLbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxLL;
+                headerSize = FSEv05_readNCount(norm, &max, &LLlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (LLlog > LLFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSEv05_buildDTable(DTableLL, norm, max, LLlog);
+        }   }
+
+        switch(Offtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            Offlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong);   /* min : "raw", hence no header, but at least xxLog bits */
+            FSEv05_buildDTable_rle(DTableOffb, *ip++ & MaxOff); /* if *ip > MaxOff, data is corrupted */
+            break;
+        case FSEv05_ENCODING_RAW :
+            Offlog = Offbits;
+            FSEv05_buildDTable_raw(DTableOffb, Offbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxOff;
+                headerSize = FSEv05_readNCount(norm, &max, &Offlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (Offlog > OffFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSEv05_buildDTable(DTableOffb, norm, max, Offlog);
+        }   }
+
+        switch(MLtype)
+        {
+        case FSEv05_ENCODING_RLE :
+            MLlog = 0;
+            if (ip > iend-2) return ERROR(srcSize_wrong); /* min : "raw", hence no header, but at least xxLog bits */
+            FSEv05_buildDTable_rle(DTableML, *ip++);
+            break;
+        case FSEv05_ENCODING_RAW :
+            MLlog = MLbits;
+            FSEv05_buildDTable_raw(DTableML, MLbits);
+            break;
+        case FSEv05_ENCODING_STATIC:
+            if (!flagStaticTable) return ERROR(corruption_detected);
+            break;
+        case FSEv05_ENCODING_DYNAMIC :
+        default :   /* impossible */
+            {   U32 max = MaxML;
+                headerSize = FSEv05_readNCount(norm, &max, &MLlog, ip, iend-ip);
+                if (FSEv05_isError(headerSize)) return ERROR(GENERIC);
+                if (MLlog > MLFSEv05Log) return ERROR(corruption_detected);
+                ip += headerSize;
+                FSEv05_buildDTable(DTableML, norm, max, MLlog);
+    }   }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+} seq_t;
+
+typedef struct {
+    BITv05_DStream_t DStream;
+    FSEv05_DState_t stateLL;
+    FSEv05_DState_t stateOffb;
+    FSEv05_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+    const BYTE* dumpsEnd;
+} seqState_t;
+
+
+
+static void ZSTDv05_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+    const BYTE* const de = seqState->dumpsEnd;
+
+    /* Literal length */
+    litLength = FSEv05_peakSymbol(&(seqState->stateLL));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    if (litLength == MaxLL) {
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else {
+            litLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no risk : dumps is always followed by seq tables > 1 byte */
+            if (litLength&1) litLength>>=1, dumps += 3;
+            else litLength = (U16)(litLength)>>1, dumps += 2;
+        }
+        if (dumps > de) { litLength = MaxLL+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+
+    /* Offset */
+    {
+        static const U32 offsetPrefix[MaxOff+1] = {
+                1 /*fake*/, 1, 2, 4, 8, 16, 32, 64, 128, 256,
+                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144,
+                524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, /*fake*/ 1, 1, 1, 1, 1 };
+        U32 offsetCode = FSEv05_peakSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+        U32 nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = offsetPrefix[offsetCode] + BITv05_readBits(&(seqState->DStream), nbBits);
+        if (MEM_32bits()) BITv05_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;   /* repcode, cmove */
+        if (offsetCode | !litLength) seqState->prevOffset = seq->offset;   /* cmove */
+        FSEv05_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));    /* update */
+    }
+
+    /* Literal length update */
+    FSEv05_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));   /* update */
+    if (MEM_32bits()) BITv05_reloadDStream(&(seqState->DStream));
+
+    /* MatchLength */
+    matchLength = FSEv05_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML) {
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else {
+            matchLength = MEM_readLE32(dumps) & 0xFFFFFF;  /* no pb : dumps is always followed by seq tables > 1 byte */
+            if (matchLength&1) matchLength>>=1, dumps += 3;
+            else matchLength = (U16)(matchLength)>>1, dumps += 2;
+        }
+        if (dumps > de) { matchLength = MaxML+255; }  /* late correction, to avoid using uninitialized memory */
+        if (dumps >= de) { dumps = de-1; }  /* late correction, to avoid read overflow (data is now corrupted anyway) */
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+
+#if 0   /* debug */
+    {
+        static U64 totalDecoded = 0;
+        printf("pos %6u : %3u literals & match %3u bytes at distance %6u \n",
+           (U32)(totalDecoded), (U32)litLength, (U32)matchLength, (U32)offset);
+        totalDecoded += litLength + matchLength;
+    }
+#endif
+}
+
+
+static size_t ZSTDv05_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    static const int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+    static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
+    BYTE* const oLitEnd = op + sequence.litLength;
+    const size_t sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const litEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (litEnd > litLimit) return ERROR(corruption_detected);   /* risk read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv05_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase))
+            return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {
+            size_t length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        const int sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv05_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv05_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_8) {
+            ZSTDv05_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd)
+            *op++ = *match++;
+    } else {
+        ZSTDv05_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+static size_t ZSTDv05_decompressSequences(
+                               ZSTDv05_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t errorCode, dumpsLength;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    int nbSeq;
+    const BYTE* dumps;
+    U32* DTableLL = dctx->LLTable;
+    U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+
+    /* Build Decoding Tables */
+    errorCode = ZSTDv05_decodeSeqHeaders(&nbSeq, &dumps, &dumpsLength,
+                                      DTableLL, DTableML, DTableOffb,
+                                      ip, seqSize, dctx->flagStaticTables);
+    if (ZSTDv05_isError(errorCode)) return errorCode;
+    ip += errorCode;
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = REPCODE_STARTVALUE;
+        seqState.dumps = dumps;
+        seqState.dumpsEnd = dumps + dumpsLength;
+        seqState.prevOffset = REPCODE_STARTVALUE;
+        errorCode = BITv05_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (ERR_isError(errorCode)) return ERROR(corruption_detected);
+        FSEv05_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv05_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv05_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BITv05_reloadDStream(&(seqState.DStream)) <= BITv05_DStream_completed) && nbSeq ; ) {
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTDv05_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTDv05_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTDv05_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+    }
+
+    /* last literal segment */
+    {
+        size_t lastLLSize = litEnd - litPtr;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+
+static void ZSTDv05_checkContinuity(ZSTDv05_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+static size_t ZSTDv05_decompressBlock_internal(ZSTDv05_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+    size_t litCSize;
+
+    if (srcSize >= BLOCKSIZE) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    litCSize = ZSTDv05_decodeLiteralsBlock(dctx, src, srcSize);
+    if (ZSTDv05_isError(litCSize)) return litCSize;
+    ip += litCSize;
+    srcSize -= litCSize;
+
+    return ZSTDv05_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+
+
+size_t ZSTDv05_decompressBlock(ZSTDv05_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    ZSTDv05_checkContinuity(dctx, dst);
+    return ZSTDv05_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+/*! ZSTDv05_decompress_continueDCtx
+*   dctx must have been properly initialized */
+static size_t ZSTDv05_decompress_continueDCtx(ZSTDv05_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + maxDstSize;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    {
+        size_t frameHeaderSize;
+        if (srcSize < ZSTDv05_frameHeaderSize_min+ZSTDv05_blockHeaderSize) return ERROR(srcSize_wrong);
+        frameHeaderSize = ZSTDv05_decodeFrameHeader_Part1(dctx, src, ZSTDv05_frameHeaderSize_min);
+        if (ZSTDv05_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTDv05_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+        frameHeaderSize = ZSTDv05_decodeFrameHeader_Part2(dctx, src, frameHeaderSize);
+        if (ZSTDv05_isError(frameHeaderSize)) return frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t decodedSize=0;
+        size_t cBlockSize = ZSTDv05_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv05_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv05_blockHeaderSize;
+        remainingSize -= ZSTDv05_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv05_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv05_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTDv05_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+size_t ZSTDv05_decompress_usingPreparedDCtx(ZSTDv05_DCtx* dctx, const ZSTDv05_DCtx* refDCtx,
+                                         void* dst, size_t maxDstSize,
+                                   const void* src, size_t srcSize)
+{
+    ZSTDv05_copyDCtx(dctx, refDCtx);
+    ZSTDv05_checkContinuity(dctx, dst);
+    return ZSTDv05_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    ZSTDv05_decompressBegin_usingDict(dctx, dict, dictSize);
+    ZSTDv05_checkContinuity(dctx, dst);
+    return ZSTDv05_decompress_continueDCtx(dctx, dst, maxDstSize, src, srcSize);
+}
+
+
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return ZSTDv05_decompress_usingDict(dctx, dst, maxDstSize, src, srcSize, NULL, 0);
+}
+
+size_t ZSTDv05_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv05_HEAPMODE) && (ZSTDv05_HEAPMODE==1)
+    size_t regenSize;
+    ZSTDv05_DCtx* dctx = ZSTDv05_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv05_decompressDCtx(dctx, dst, maxDstSize, src, srcSize);
+    ZSTDv05_freeDCtx(dctx);
+    return regenSize;
+#else
+    ZSTDv05_DCtx dctx;
+    return ZSTDv05_decompressDCtx(&dctx, dst, maxDstSize, src, srcSize);
+#endif
+}
+
+size_t ZSTDv05_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties;
+
+    /* Frame Header */
+    if (srcSize < ZSTDv05_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    if (MEM_readLE32(src) != ZSTDv05_MAGICNUMBER) return ERROR(prefix_unknown);
+    ip += ZSTDv05_frameHeaderSize_min; remainingSize -= ZSTDv05_frameHeaderSize_min;
+
+    /* Loop on each block */
+    while (1)
+    {
+        size_t cBlockSize = ZSTDv05_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTDv05_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv05_blockHeaderSize;
+        remainingSize -= ZSTDv05_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/* ******************************
+*  Streaming Decompression API
+********************************/
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    ZSTDv05_checkContinuity(dctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (dctx->stage)
+    {
+    case ZSTDv05ds_getFrameHeaderSize :
+        /* get frame header size */
+        if (srcSize != ZSTDv05_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        dctx->headerSize = ZSTDv05_decodeFrameHeader_Part1(dctx, src, ZSTDv05_frameHeaderSize_min);
+        if (ZSTDv05_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv05_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv05_frameHeaderSize_min) return ERROR(GENERIC); /* should never happen */
+        dctx->expected = 0;   /* not necessary to copy more */
+        /* fallthrough */
+    case ZSTDv05ds_decodeFrameHeader:
+        /* get frame header */
+        {   size_t const result = ZSTDv05_decodeFrameHeader_Part2(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv05_isError(result)) return result;
+            dctx->expected = ZSTDv05_blockHeaderSize;
+            dctx->stage = ZSTDv05ds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDv05ds_decodeBlockHeader:
+        {
+            /* Decode block header */
+            blockProperties_t bp;
+            size_t blockSize = ZSTDv05_getcBlockSize(src, ZSTDv05_blockHeaderSize, &bp);
+            if (ZSTDv05_isError(blockSize)) return blockSize;
+            if (bp.blockType == bt_end) {
+                dctx->expected = 0;
+                dctx->stage = ZSTDv05ds_getFrameHeaderSize;
+            }
+            else {
+                dctx->expected = blockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDv05ds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDv05ds_decompressBlock:
+        {
+            /* Decompress : block content */
+            size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv05_decompressBlock_internal(dctx, dst, maxDstSize, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv05_copyRawBlock(dst, maxDstSize, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDv05ds_decodeBlockHeader;
+            dctx->expected = ZSTDv05_blockHeaderSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static void ZSTDv05_refDictContent(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+}
+
+static size_t ZSTDv05_loadEntropy(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t hSize, offcodeHeaderSize, matchlengthHeaderSize, errorCode, litlengthHeaderSize;
+    short offcodeNCount[MaxOff+1];
+    U32 offcodeMaxValue=MaxOff, offcodeLog;
+    short matchlengthNCount[MaxML+1];
+    unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+    short litlengthNCount[MaxLL+1];
+    unsigned litlengthMaxValue = MaxLL, litlengthLog;
+
+    hSize = HUFv05_readDTableX4(dctx->hufTableX4, dict, dictSize);
+    if (HUFv05_isError(hSize)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + hSize;
+    dictSize -= hSize;
+
+    offcodeHeaderSize = FSEv05_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);
+    if (FSEv05_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+    if (offcodeLog > OffFSEv05Log) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + offcodeHeaderSize;
+    dictSize -= offcodeHeaderSize;
+
+    matchlengthHeaderSize = FSEv05_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+    if (FSEv05_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    if (matchlengthLog > MLFSEv05Log) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + matchlengthHeaderSize;
+    dictSize -= matchlengthHeaderSize;
+
+    litlengthHeaderSize = FSEv05_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+    if (litlengthLog > LLFSEv05Log) return ERROR(dictionary_corrupted);
+    if (FSEv05_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+    errorCode = FSEv05_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+    if (FSEv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+
+    dctx->flagStaticTables = 1;
+    return hSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;
+}
+
+static size_t ZSTDv05_decompress_insertDictionary(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t eSize;
+    U32 magic = MEM_readLE32(dict);
+    if (magic != ZSTDv05_DICT_MAGIC) {
+        /* pure content mode */
+        ZSTDv05_refDictContent(dctx, dict, dictSize);
+        return 0;
+    }
+    /* load entropy tables */
+    dict = (const char*)dict + 4;
+    dictSize -= 4;
+    eSize = ZSTDv05_loadEntropy(dctx, dict, dictSize);
+    if (ZSTDv05_isError(eSize)) return ERROR(dictionary_corrupted);
+
+    /* reference dictionary content */
+    dict = (const char*)dict + eSize;
+    dictSize -= eSize;
+    ZSTDv05_refDictContent(dctx, dict, dictSize);
+
+    return 0;
+}
+
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t errorCode;
+    errorCode = ZSTDv05_decompressBegin(dctx);
+    if (ZSTDv05_isError(errorCode)) return errorCode;
+
+    if (dict && dictSize) {
+        errorCode = ZSTDv05_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTDv05_isError(errorCode)) return ERROR(dictionary_corrupted);
+    }
+
+    return 0;
+}
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd source repository : https://github.com/Cyan4973/zstd
+    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* The objects defined into this file should be considered experimental.
+ * They are not labelled stable, as their prototype may change in the future.
+ * You can use them for tests, provide feedback, or if you can endure risk of future changes.
+ */
+
+
+
+/* *************************************
+*  Constants
+***************************************/
+static size_t ZBUFFv05_blockHeaderSize = 3;
+
+
+
+/* *** Compression *** */
+
+static size_t ZBUFFv05_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    size_t length = MIN(maxDstSize, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+
+
+
+/** ************************************************
+*  Streaming decompression
+*
+*  A ZBUFFv05_DCtx object is required to track streaming operation.
+*  Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+*  Use ZBUFFv05_decompressInit() to start a new decompression operation.
+*  ZBUFFv05_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *maxDstSizePtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst .
+*  return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFFv05_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  output : 128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded.
+*  input : just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* **************************************************/
+
+typedef enum { ZBUFFv05ds_init, ZBUFFv05ds_readHeader, ZBUFFv05ds_loadHeader, ZBUFFv05ds_decodeHeader,
+               ZBUFFv05ds_read, ZBUFFv05ds_load, ZBUFFv05ds_flush } ZBUFFv05_dStage;
+
+/* *** Resource management *** */
+
+#define ZSTDv05_frameHeaderSize_max 5   /* too magical, should come from reference */
+struct ZBUFFv05_DCtx_s {
+    ZSTDv05_DCtx* zc;
+    ZSTDv05_parameters params;
+    char* inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    char* outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t hPos;
+    ZBUFFv05_dStage stage;
+    unsigned char headerBuffer[ZSTDv05_frameHeaderSize_max];
+};   /* typedef'd to ZBUFFv05_DCtx within "zstd_buffered.h" */
+
+
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void)
+{
+    ZBUFFv05_DCtx* zbc = (ZBUFFv05_DCtx*)malloc(sizeof(ZBUFFv05_DCtx));
+    if (zbc==NULL) return NULL;
+    memset(zbc, 0, sizeof(*zbc));
+    zbc->zc = ZSTDv05_createDCtx();
+    zbc->stage = ZBUFFv05ds_init;
+    return zbc;
+}
+
+size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* zbc)
+{
+    if (zbc==NULL) return 0;   /* support free on null */
+    ZSTDv05_freeDCtx(zbc->zc);
+    free(zbc->inBuff);
+    free(zbc->outBuff);
+    free(zbc);
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* zbc, const void* dict, size_t dictSize)
+{
+    zbc->stage = ZBUFFv05ds_readHeader;
+    zbc->hPos = zbc->inPos = zbc->outStart = zbc->outEnd = 0;
+    return ZSTDv05_decompressBegin_usingDict(zbc->zc, dict, dictSize);
+}
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* zbc)
+{
+    return ZBUFFv05_decompressInitDictionary(zbc, NULL, 0);
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* zbc, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* ip = istart;
+    const char* const iend = istart + *srcSizePtr;
+    char* const ostart = (char*)dst;
+    char* op = ostart;
+    char* const oend = ostart + *maxDstSizePtr;
+    U32 notDone = 1;
+
+    while (notDone) {
+        switch(zbc->stage)
+        {
+        case ZBUFFv05ds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFv05ds_readHeader :
+            /* read header from src */
+            {
+                size_t headerSize = ZSTDv05_getFrameParams(&(zbc->params), src, *srcSizePtr);
+                if (ZSTDv05_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    memcpy(zbc->headerBuffer+zbc->hPos, src, *srcSizePtr);
+                    zbc->hPos += *srcSizePtr;
+                    *maxDstSizePtr = 0;
+                    zbc->stage = ZBUFFv05ds_loadHeader;
+                    return headerSize - zbc->hPos;
+                }
+                zbc->stage = ZBUFFv05ds_decodeHeader;
+                break;
+            }
+	    /* fall-through */
+        case ZBUFFv05ds_loadHeader:
+            /* complete header from src */
+            {
+                size_t headerSize = ZBUFFv05_limitCopy(
+                    zbc->headerBuffer + zbc->hPos, ZSTDv05_frameHeaderSize_max - zbc->hPos,
+                    src, *srcSizePtr);
+                zbc->hPos += headerSize;
+                ip += headerSize;
+                headerSize = ZSTDv05_getFrameParams(&(zbc->params), zbc->headerBuffer, zbc->hPos);
+                if (ZSTDv05_isError(headerSize)) return headerSize;
+                if (headerSize) {
+                    /* not enough input to decode header : tell how many bytes would be necessary */
+                    *maxDstSizePtr = 0;
+                    return headerSize - zbc->hPos;
+                }
+                // zbc->stage = ZBUFFv05ds_decodeHeader; break;   /* useless : stage follows */
+            }
+	    /* fall-through */
+        case ZBUFFv05ds_decodeHeader:
+                /* apply header to create / resize buffers */
+                {
+                    size_t neededOutSize = (size_t)1 << zbc->params.windowLog;
+                    size_t neededInSize = BLOCKSIZE;   /* a block is never > BLOCKSIZE */
+                    if (zbc->inBuffSize < neededInSize) {
+                        free(zbc->inBuff);
+                        zbc->inBuffSize = neededInSize;
+                        zbc->inBuff = (char*)malloc(neededInSize);
+                        if (zbc->inBuff == NULL) return ERROR(memory_allocation);
+                    }
+                    if (zbc->outBuffSize < neededOutSize) {
+                        free(zbc->outBuff);
+                        zbc->outBuffSize = neededOutSize;
+                        zbc->outBuff = (char*)malloc(neededOutSize);
+                        if (zbc->outBuff == NULL) return ERROR(memory_allocation);
+                }   }
+                if (zbc->hPos) {
+                    /* some data already loaded into headerBuffer : transfer into inBuff */
+                    memcpy(zbc->inBuff, zbc->headerBuffer, zbc->hPos);
+                    zbc->inPos = zbc->hPos;
+                    zbc->hPos = 0;
+                    zbc->stage = ZBUFFv05ds_load;
+                    break;
+                }
+                zbc->stage = ZBUFFv05ds_read;
+		/* fall-through */
+        case ZBUFFv05ds_read:
+            {
+                size_t neededInSize = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+                if (neededInSize==0) {  /* end of frame */
+                    zbc->stage = ZBUFFv05ds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {
+                    /* directly decode from src */
+                    size_t decodedSize = ZSTDv05_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        ip, neededInSize);
+                    if (ZSTDv05_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFv05ds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbc->stage = ZBUFFv05ds_load;
+            }
+	    /* fall-through */
+        case ZBUFFv05ds_load:
+            {
+                size_t neededInSize = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+                size_t toLoad = neededInSize - zbc->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbc->inBuffSize - zbc->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv05_limitCopy(zbc->inBuff + zbc->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbc->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+                {
+                    size_t decodedSize = ZSTDv05_decompressContinue(zbc->zc,
+                        zbc->outBuff + zbc->outStart, zbc->outBuffSize - zbc->outStart,
+                        zbc->inBuff, neededInSize);
+                    if (ZSTDv05_isError(decodedSize)) return decodedSize;
+                    zbc->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbc->stage = ZBUFFv05ds_read; break; }   /* this was just a header */
+                    zbc->outEnd = zbc->outStart +  decodedSize;
+                    zbc->stage = ZBUFFv05ds_flush;
+                    // break; /* ZBUFFv05ds_flush follows */
+                }
+	    }
+	    /* fall-through */
+        case ZBUFFv05ds_flush:
+            {
+                size_t toFlushSize = zbc->outEnd - zbc->outStart;
+                size_t flushedSize = ZBUFFv05_limitCopy(op, oend-op, zbc->outBuff + zbc->outStart, toFlushSize);
+                op += flushedSize;
+                zbc->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbc->stage = ZBUFFv05ds_read;
+                    if (zbc->outStart + BLOCKSIZE > zbc->outBuffSize)
+                        zbc->outStart = zbc->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    *srcSizePtr = ip-istart;
+    *maxDstSizePtr = op-ostart;
+
+    {   size_t nextSrcSizeHint = ZSTDv05_nextSrcSizeToDecompress(zbc->zc);
+        if (nextSrcSizeHint > ZBUFFv05_blockHeaderSize) nextSrcSizeHint+= ZBUFFv05_blockHeaderSize;   /* get next block header too */
+        nextSrcSizeHint -= zbc->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode) { return ERR_isError(errorCode); }
+const char* ZBUFFv05_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+size_t ZBUFFv05_recommendedDInSize(void)  { return BLOCKSIZE + ZBUFFv05_blockHeaderSize /* block header size*/ ; }
+size_t ZBUFFv05_recommendedDOutSize(void) { return BLOCKSIZE; }

diff --git a/vendor/github.com/DataDog/zstd/zstd_v05.h b/vendor/github.com/DataDog/zstd/zstd_v05.h
new file mode 100644
index 0000000..b68fd57
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v05.h

@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv05_H
+#define ZSTDv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>   /* size_t */
+#include "mem.h"      /* U64, U32 */
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv05_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */
+size_t ZSTDv05_decompress( void* dst, size_t dstCapacity,
+                     const void* src, size_t compressedSize);
+
+/**
+ZSTDv05_getFrameSrcSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv05_isError())
+*/
+size_t ZSTDv05_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/* *************************************
+*  Helper functions
+***************************************/
+/* Error Management */
+unsigned    ZSTDv05_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+const char* ZSTDv05_getErrorName(size_t code);     /*!< provides readable string for an error code */
+
+
+/* *************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx;
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void);
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv05_decompressDCtx() :
+*   Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  Simple Dictionary API
+*************************/
+/*! ZSTDv05_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const void* dict,size_t dictSize);
+
+/*-************************
+*  Advanced Streaming API
+***************************/
+typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy;
+typedef struct {
+    U64 srcSize;
+    U32 windowLog;     /* the only useful information to retrieve */
+    U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy;
+} ZSTDv05_parameters;
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize);
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize);
+void   ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx);
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx);
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  ZBUFF API
+*************************/
+typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx;
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void);
+size_t         ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx);
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx);
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression
+*
+*  A ZBUFFv05_DCtx object is required to track streaming operations.
+*  Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+*  Use ZBUFFv05_decompressInit() to start a new decompression operation,
+*   or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv05_DCtx objects can be reused multiple times.
+*
+*  Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
+*            or 0 when a frame is completely decoded
+*            or an error code, which can be tested using ZBUFFv05_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize()
+*  output : ZBUFFv05_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv05_recommendedDInSize==128Kb+3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode);
+const char* ZBUFFv05_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525   /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv0505_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v06.c b/vendor/github.com/DataDog/zstd/zstd_v06.c
new file mode 100644
index 0000000..8b068b3
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v06.c

@@ -0,0 +1,4124 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include "zstd_v06.h"
+#include <stddef.h>    /* size_t, ptrdiff_t */
+#include <string.h>    /* memcpy */
+#include <stdlib.h>    /* malloc, free, qsort */
+#include "error_private.h"
+
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+
+/*
+    zstd - standard compression library
+    Header File for static linking only
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+#ifndef ZSTDv06_STATIC_H
+#define ZSTDv06_STATIC_H
+
+/* The prototypes defined within this file are considered experimental.
+ * They should not be used in the context DLL as they may change in the future.
+ * Prefer static linking if you need them, to control breaking version changes issues.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*- Advanced Decompression functions -*/
+
+/*! ZSTDv06_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv06_decompress_usingDict, but using a reference context `preparedDCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedDCtx` must have been properly initialized using ZSTDv06_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference (preparedDCtx), which will not be modified, and 1 to run the decompression operation (dctx) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingPreparedDCtx(
+                                           ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize);
+
+
+
+#define ZSTDv06_FRAMEHEADERSIZE_MAX 13    /* for static allocation */
+static const size_t ZSTDv06_frameHeaderSize_min = 5;
+static const size_t ZSTDv06_frameHeaderSize_max = ZSTDv06_FRAMEHEADERSIZE_MAX;
+
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx);
+
+/*
+  Streaming decompression, direct mode (bufferless)
+
+  A ZSTDv06_DCtx object is required to track streaming operations.
+  Use ZSTDv06_createDCtx() / ZSTDv06_freeDCtx() to manage it.
+  A ZSTDv06_DCtx object can be re-used multiple times.
+
+  First optional operation is to retrieve frame parameters, using ZSTDv06_getFrameParams(), which doesn't consume the input.
+  It can provide the minimum size of rolling buffer required to properly decompress data,
+  and optionally the final size of uncompressed content.
+  (Note : content size is an optional info that may not be present. 0 means : content size unknown)
+  Frame parameters are extracted from the beginning of compressed frame.
+  The amount of data to read is variable, from ZSTDv06_frameHeaderSize_min to ZSTDv06_frameHeaderSize_max (so if `srcSize` >= ZSTDv06_frameHeaderSize_max, it will always work)
+  If `srcSize` is too small for operation to succeed, function will return the minimum size it requires to produce a result.
+  Result : 0 when successful, it means the ZSTDv06_frameParams structure has been filled.
+          >0 : means there is not enough data into `src`. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTDv06_isError()
+
+  Start decompression, with ZSTDv06_decompressBegin() or ZSTDv06_decompressBegin_usingDict().
+  Alternatively, you can copy a prepared context, using ZSTDv06_copyDCtx().
+
+  Then use ZSTDv06_nextSrcSizeToDecompress() and ZSTDv06_decompressContinue() alternatively.
+  ZSTDv06_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTDv06_decompressContinue().
+  ZSTDv06_decompressContinue() requires this exact amount of bytes, or it will fail.
+  ZSTDv06_decompressContinue() needs previous data blocks during decompression, up to (1 << windowlog).
+  They should preferably be located contiguously, prior to current block. Alternatively, a round buffer is also possible.
+
+  @result of ZSTDv06_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity)
+  It can be zero, which is not an error; it just means ZSTDv06_decompressContinue() has decoded some header.
+
+  A frame is fully decoded when ZSTDv06_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    User will have to take in charge required information to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Uncompressed block size must be <= ZSTDv06_BLOCKSIZE_MAX (128 KB)
+    - Compressing or decompressing requires a context structure
+      + Use ZSTDv06_createCCtx() and ZSTDv06_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv06_compressBegin()
+      + decompression : ZSTDv06_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - When a block is considered not compressible enough, ZSTDv06_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv06_decompressBlock() doesn't accept uncompressed data as input !!
+*/
+
+#define ZSTDv06_BLOCKSIZE_MAX (128 * 1024)   /* define, for static allocation */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBlock(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv06_STATIC_H */
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : https://www.zstd.net
+*/
+#ifndef ZSTDv06_CCOMMON_H_MODULE
+#define ZSTDv06_CCOMMON_H_MODULE
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv06_DICT_MAGIC  0xEC30A436
+
+#define ZSTDv06_REP_NUM    3
+#define ZSTDv06_REP_INIT   ZSTDv06_REP_NUM
+#define ZSTDv06_REP_MOVE   (ZSTDv06_REP_NUM-1)
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTDv06_WINDOWLOG_ABSOLUTEMIN 12
+static const size_t ZSTDv06_fcs_fieldSize[4] = { 0, 1, 2, 8 };
+
+#define ZSTDv06_BLOCKHEADERSIZE 3   /* because C standard does not allow a static const value to be defined using another static const value .... :( */
+static const size_t ZSTDv06_blockHeaderSize = ZSTDv06_BLOCKHEADERSIZE;
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+
+#define IS_HUF 0
+#define IS_PCH 1
+#define IS_RAW 2
+#define IS_RLE 3
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+#define REPCODE_STARTVALUE 1
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML  52
+#define MaxLL  35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+
+#define FSEv06_ENCODING_RAW     0
+#define FSEv06_ENCODING_RLE     1
+#define FSEv06_ENCODING_STATIC  2
+#define FSEv06_ENCODING_DYNAMIC 3
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+static const U32 LL_defaultNormLog = 6;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+static const U32 ML_defaultNormLog = 6;
+
+static const S16 OF_defaultNorm[MaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                              1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+static const U32 OF_defaultNormLog = 5;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTDv06_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+#define COPY8(d,s) { ZSTDv06_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTDv06_wildcopy() :
+*   custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTDv06_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTDv06_match_t;
+
+typedef struct {
+    U32 price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTDv06_REP_INIT];
+} ZSTDv06_optimal_t;
+
+typedef struct { U32  unused; } ZSTDv06_stats_t;
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* litStart;
+    BYTE* lit;
+    U16*  litLengthStart;
+    U16*  litLength;
+    BYTE* llCodeStart;
+    U16*  matchLengthStart;
+    U16*  matchLength;
+    BYTE* mlCodeStart;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+    /* opt */
+    ZSTDv06_optimal_t* priceTable;
+    ZSTDv06_match_t* matchTable;
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  matchSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+    U32  log2matchLengthSum;
+    U32  log2matchSum;
+    U32  log2litLengthSum;
+    U32  log2litSum;
+    U32  log2offCodeSum;
+    U32  factor;
+    U32  cachedPrice;
+    U32  cachedLitLength;
+    const BYTE* cachedLiterals;
+    ZSTDv06_stats_t stats;
+} seqStore_t;
+
+void ZSTDv06_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq);
+
+
+#endif   /* ZSTDv06_CCOMMON_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef FSEv06_H
+#define FSEv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSEv06_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSEv06_isError() .
+
+    ** Important ** : FSEv06_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+size_t FSEv06_decompress(void* dst,  size_t dstCapacity,
+                const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+size_t FSEv06_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+unsigned    FSEv06_isError(size_t code);        /* tells if a return value is an error code */
+const char* FSEv06_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+
+FSEv06_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSEv06_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSEv06_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv06_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSEv06_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSEv06_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSEv06_DTable* FSEv06_createDTable(unsigned tableLog);
+void        FSEv06_freeDTable(FSEv06_DTable* dt);
+
+/*! FSEv06_buildDTable():
+    Builds 'dt', which must be already allocated, using FSEv06_createDTable().
+    return : 0, or an errorCode, which can be tested using FSEv06_isError() */
+size_t FSEv06_buildDTable (FSEv06_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSEv06_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSEv06_isError() */
+size_t FSEv06_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv06_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSEv06_readNCount() if it was saved using FSEv06_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSEv06_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSEv06_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError().
+
+The next step is to build the decompression tables 'FSEv06_DTable' from 'normalizedCounter'.
+This is performed by the function FSEv06_buildDTable().
+The space required by 'FSEv06_DTable' must be already allocated using FSEv06_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError().
+
+`FSEv06_DTable` can then be used to decompress `cSrc`, with FSEv06_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSEv06_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSEv06_isError(). (ex: dst buffer too small)
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv06_H */
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BITv06_DStream_t;
+
+typedef enum { BITv06_DStream_unfinished = 0,
+               BITv06_DStream_endOfBuffer = 1,
+               BITv06_DStream_completed = 2,
+               BITv06_DStream_overflow = 3 } BITv06_DStream_status;  /* result of BITv06_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv06_initDStream(BITv06_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BITv06_readBits(BITv06_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BITv06_DStream_status BITv06_reloadDStream(BITv06_DStream_t* bitD);
+MEM_STATIC unsigned BITv06_endOfDStream(const BITv06_DStream_t* bitD);
+
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BITv06_highbit32 ( U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    unsigned r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BITv06_initDStream() :
+*   Initialize a BITv06_DStream_t.
+*   `bitD` : a pointer to an already allocated BITv06_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BITv06_initDStream(BITv06_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          if (lastByte == 0) return ERROR(GENERIC);   /* endMark not present */
+          bitD->bitsConsumed = 8 - BITv06_highbit32(lastByte); }
+    } else {
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          if (lastByte == 0) return ERROR(GENERIC);   /* endMark not present */
+          bitD->bitsConsumed = 8 - BITv06_highbit32(lastByte); }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+
+ MEM_STATIC size_t BITv06_lookBits(const BITv06_DStream_t* bitD, U32 nbBits)
+{
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BITv06_lookBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv06_lookBitsFast(const BITv06_DStream_t* bitD, U32 nbBits)
+{
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BITv06_skipBits(BITv06_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BITv06_readBits(BITv06_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BITv06_lookBits(bitD, nbBits);
+    BITv06_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BITv06_readBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BITv06_lookBitsFast(bitD, nbBits);
+    BITv06_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BITv06_DStream_status BITv06_reloadDStream(BITv06_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv06_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv06_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv06_DStream_endOfBuffer;
+        return BITv06_DStream_completed;
+    }
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BITv06_DStream_status result = BITv06_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv06_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BITv06_endOfDStream() :
+*   @return Tells if DStream has exactly reached its end (all bits consumed).
+*/
+MEM_STATIC unsigned BITv06_endOfDStream(const BITv06_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy coder
+   header file for static linking (only)
+   Copyright (C) 2013-2015, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+   - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef FSEv06_STATIC_H
+#define FSEv06_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSEv06_NCOUNTBOUND 512
+#define FSEv06_BLOCKBOUND(size) (size + (size>>7))
+#define FSEv06_COMPRESSBOUND(size) (FSEv06_NCOUNTBOUND + FSEv06_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+#define FSEv06_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+size_t FSEv06_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+/* same as FSEv06_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr  */
+
+size_t FSEv06_buildDTable_raw (FSEv06_DTable* dt, unsigned nbBits);
+/* build a fake FSEv06_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv06_buildDTable_rle (FSEv06_DTable* dt, unsigned char symbolValue);
+/* build a fake FSEv06_DTable, designed to always generate the same symbolValue */
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSEv06_DState_t;
+
+
+static void     FSEv06_initDState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD, const FSEv06_DTable* dt);
+
+static unsigned char FSEv06_decodeSymbol(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD);
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSEv06_decodeSymbolFast(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSEv06_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSEv06_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSEv06_initDState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD, const FSEv06_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv06_DTableHeader* const DTableH = (const FSEv06_DTableHeader*)ptr;
+    DStatePtr->state = BITv06_readBits(bitD, DTableH->tableLog);
+    BITv06_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSEv06_peekSymbol(const FSEv06_DState_t* DStatePtr)
+{
+    FSEv06_decode_t const DInfo = ((const FSEv06_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSEv06_updateState(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{
+    FSEv06_decode_t const DInfo = ((const FSEv06_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BITv06_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSEv06_decodeSymbol(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{
+    FSEv06_decode_t const DInfo = ((const FSEv06_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BITv06_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSEv06_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSEv06_decodeSymbolFast(FSEv06_DState_t* DStatePtr, BITv06_DStream_t* bitD)
+{
+    FSEv06_decode_t const DInfo = ((const FSEv06_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BITv06_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+
+
+#ifndef FSEv06_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv06_MAX_MEMORY_USAGE 14
+#define FSEv06_DEFAULT_MEMORY_USAGE 13
+
+/*!FSEv06_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv06_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv06_FUNCTION_TYPE BYTE
+#define FSEv06_FUNCTION_EXTENSION
+#define FSEv06_DECODE_TYPE FSEv06_decode_t
+
+
+#endif   /* !FSEv06_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv06_MAX_TABLELOG  (FSEv06_MAX_MEMORY_USAGE-2)
+#define FSEv06_MAX_TABLESIZE (1U<<FSEv06_MAX_TABLELOG)
+#define FSEv06_MAXTABLESIZE_MASK (FSEv06_MAX_TABLESIZE-1)
+#define FSEv06_DEFAULT_TABLELOG (FSEv06_DEFAULT_MEMORY_USAGE-2)
+#define FSEv06_MIN_TABLELOG 5
+
+#define FSEv06_TABLELOG_ABSOLUTE_MAX 15
+#if FSEv06_MAX_TABLELOG > FSEv06_TABLELOG_ABSOLUTE_MAX
+#error "FSEv06_MAX_TABLELOG > FSEv06_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSEv06_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv06_STATIC_H */
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+
+/*-****************************************
+*  FSE Error Management
+******************************************/
+unsigned FSEv06_isError(size_t code) { return ERR_isError(code); }
+
+const char* FSEv06_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/* **************************************************************
+*  HUF Error Management
+****************************************************************/
+unsigned HUFv06_isError(size_t code) { return ERR_isError(code); }
+
+const char* HUFv06_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSEv06_abs(short a) { return a<0 ? -a : a; }
+
+size_t FSEv06_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv06_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv06_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {   short const max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv06_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) && (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv06_isError ERR_isError
+#define FSEv06_STATIC_ASSERT(c) { enum { FSEv06_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv06_DTABLE_SIZE_U32(FSEv06_MAX_TABLELOG)];
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSEv06_FUNCTION_EXTENSION
+#  error "FSEv06_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv06_FUNCTION_TYPE
+#  error "FSEv06_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSEv06_CAT(X,Y) X##Y
+#define FSEv06_FUNCTION_NAME(X,Y) FSEv06_CAT(X,Y)
+#define FSEv06_TYPE_NAME(X,Y) FSEv06_CAT(X,Y)
+
+
+/* Function templates */
+FSEv06_DTable* FSEv06_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSEv06_TABLELOG_ABSOLUTE_MAX) tableLog = FSEv06_TABLELOG_ABSOLUTE_MAX;
+    return (FSEv06_DTable*)malloc( FSEv06_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSEv06_freeDTable (FSEv06_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSEv06_buildDTable(FSEv06_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv06_DECODE_TYPE* const tableDecode = (FSEv06_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSEv06_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv06_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv06_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSEv06_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSEv06_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSEv06_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSEv06_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSEv06_FUNCTION_TYPE const symbol = (FSEv06_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BITv06_highbit32 ((U32)nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+
+#ifndef FSEv06_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv06_buildDTable_rle (FSEv06_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSEv06_DTableHeader* const DTableH = (FSEv06_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv06_decode_t* const cell = (FSEv06_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSEv06_buildDTable_raw (FSEv06_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSEv06_DTableHeader* const DTableH = (FSEv06_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv06_decode_t* const dinfo = (FSEv06_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSEv06_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv06_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BITv06_DStream_t bitD;
+    FSEv06_DState_t state1;
+    FSEv06_DState_t state2;
+
+    /* Init */
+    { size_t const errorCode = BITv06_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+      if (FSEv06_isError(errorCode)) return errorCode; }
+
+    FSEv06_initDState(&state1, &bitD, dt);
+    FSEv06_initDState(&state2, &bitD, dt);
+
+#define FSEv06_GETSYMBOL(statePtr) fast ? FSEv06_decodeSymbolFast(statePtr, &bitD) : FSEv06_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv06_reloadDStream(&bitD)==BITv06_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv06_GETSYMBOL(&state1);
+
+        if (FSEv06_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv06_reloadDStream(&bitD);
+
+        op[1] = FSEv06_GETSYMBOL(&state2);
+
+        if (FSEv06_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv06_reloadDStream(&bitD) > BITv06_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSEv06_GETSYMBOL(&state1);
+
+        if (FSEv06_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv06_reloadDStream(&bitD);
+
+        op[3] = FSEv06_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv06_reloadDStream(&bitD) >= FSEv06_DStream_partiallyFilled; Ends at exactly BITv06_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv06_GETSYMBOL(&state1);
+
+        if (BITv06_reloadDStream(&bitD)==BITv06_DStream_overflow) {
+            *op++ = FSEv06_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv06_GETSYMBOL(&state2);
+
+        if (BITv06_reloadDStream(&bitD)==BITv06_DStream_overflow) {
+            *op++ = FSEv06_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSEv06_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv06_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv06_DTableHeader* DTableH = (const FSEv06_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSEv06_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSEv06_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSEv06_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSEv06_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSEv06_MAX_SYMBOL_VALUE;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    {   size_t const NCountLength = FSEv06_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+        if (FSEv06_isError(NCountLength)) return NCountLength;
+        if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+        ip += NCountLength;
+        cSrcSize -= NCountLength;
+    }
+
+    { size_t const errorCode = FSEv06_buildDTable (dt, counting, maxSymbolValue, tableLog);
+      if (FSEv06_isError(errorCode)) return errorCode; }
+
+    return FSEv06_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);   /* always return, even if it is an error code */
+}
+
+
+
+#endif   /* FSEv06_COMMONDEFS_ONLY */
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv06_H
+#define HUFv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  HUF simple functions
+******************************************/
+size_t HUFv06_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+/*
+HUFv06_decompress() :
+    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstSize'.
+    `dstSize` : must be the **exact** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUFv06_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize)
+              or an error code, which can be tested using HUFv06_isError()
+*/
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+size_t HUFv06_compressBound(size_t size);       /**< maximum compressed size */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFv06_H */
+/* ******************************************************************
+   Huffman codec, part of New Generation Entropy library
+   header file, for static linking only
+   Copyright (C) 2013-2016, Yann Collet
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv06_STATIC_H
+#define HUFv06_STATIC_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUFv06_CTABLEBOUND 129
+#define HUFv06_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUFv06_COMPRESSBOUND(size) (HUFv06_CTABLEBOUND + HUFv06_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's DTable */
+#define HUFv06_DTABLE_SIZE(maxTableLog)   (1 + (1<<maxTableLog))
+#define HUFv06_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        unsigned short DTable[HUFv06_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv06_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        unsigned int DTable[HUFv06_DTABLE_SIZE(maxTableLog)] = { maxTableLog }
+#define HUFv06_CREATE_STATIC_DTABLEX6(DTable, maxTableLog) \
+        unsigned int DTable[HUFv06_DTABLE_SIZE(maxTableLog) * 3 / 2] = { maxTableLog }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv06_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv06_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbols decoder */
+
+
+
+/*!
+HUFv06_decompress() does the following:
+1. select the decompression algorithm (X2, X4, X6) based on pre-computed heuristics
+2. build Huffman table from save, using HUFv06_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUFv06_decompressSXn_usingDTable
+*/
+size_t HUFv06_readDTableX2 (unsigned short* DTable, const void* src, size_t srcSize);
+size_t HUFv06_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize);
+
+size_t HUFv06_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv06_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+/* single stream variants */
+size_t HUFv06_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv06_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv06_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned short* DTable);
+size_t HUFv06_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const unsigned* DTable);
+
+
+
+/* **************************************************************
+*  Constants
+****************************************************************/
+#define HUFv06_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUFv06_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUFv06_MAX_TABLELOG  12           /* max configured tableLog (for static allocation); can be modified up to HUFv06_ABSOLUTEMAX_TABLELOG */
+#define HUFv06_DEFAULT_TABLELOG  HUFv06_MAX_TABLELOG   /* tableLog by default, when not specified */
+#define HUFv06_MAX_SYMBOL_VALUE 255
+#if (HUFv06_MAX_TABLELOG > HUFv06_ABSOLUTEMAX_TABLELOG)
+#  error "HUFv06_MAX_TABLELOG is too large !"
+#endif
+
+
+
+/*! HUFv06_readStats() :
+    Read compact Huffman tree, saved by HUFv06_writeCTable().
+    `huffWeight` is destination buffer.
+    @return : size read from `src`
+*/
+MEM_STATIC size_t HUFv06_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                            U32* nbSymbolsPtr, U32* tableLogPtr,
+                            const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static U32 l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else {   /* Incompressible */
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            {   U32 n;
+                for (n=0; n<oSize; n+=2) {
+                    huffWeight[n]   = ip[n/2] >> 4;
+                    huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv06_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv06_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUFv06_ABSOLUTEMAX_TABLELOG + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BITv06_highbit32(weightTotal) + 1;
+        if (tableLog > HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BITv06_highbit32(rest);
+            U32 const lastWeight = BITv06_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* HUFv06_STATIC_H */
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUFv06_STATIC_ASSERT(c) { enum { HUFv06_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+
+/* *******************************************************
+*  HUF : Huffman block decompression
+*********************************************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUFv06_DEltX2;   /* single-symbol decoding */
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv06_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+size_t HUFv06_readDTableX2 (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUFv06_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    size_t iSize;
+    U32 nbSymbols = 0;
+    U32 n;
+    U32 nextRankStart;
+    void* const dtPtr = DTable + 1;
+    HUFv06_DEltX2* const dt = (HUFv06_DEltX2*)dtPtr;
+
+    HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX2) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv06_readStats(huffWeight, HUFv06_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv06_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > DTable[0]) return ERROR(tableLog_tooLarge);   /* DTable is too small */
+    DTable[0] = (U16)tableLog;   /* maybe should separate sizeof allocated DTable, from used size of DTable, in case of re-use */
+
+    /* Prepare ranks */
+    nextRankStart = 0;
+    for (n=1; n<tableLog+1; n++) {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable */
+    for (n=0; n<nbSymbols; n++) {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUFv06_DEltX2 D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize;
+}
+
+
+static BYTE HUFv06_decodeSymbolX2(BITv06_DStream_t* Dstream, const HUFv06_DEltX2* dt, const U32 dtLog)
+{
+    const size_t val = BITv06_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    const BYTE c = dt[val].byte;
+    BITv06_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv06_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUFv06_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv06_MAX_TABLELOG<=12)) \
+        HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUFv06_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv06_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUFv06_decodeStreamX2(BYTE* p, BITv06_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv06_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && (p <= pEnd-4)) {
+        HUFv06_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && (p < pEnd))
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv06_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+size_t HUFv06_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const U32 dtLog = DTable[0];
+    const void* dtPtr = DTable;
+    const HUFv06_DEltX2* const dt = ((const HUFv06_DEltX2*)dtPtr)+1;
+    BITv06_DStream_t bitD;
+
+    { size_t const errorCode = BITv06_initDStream(&bitD, cSrc, cSrcSize);
+      if (HUFv06_isError(errorCode)) return errorCode; }
+
+    HUFv06_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    /* check */
+    if (!BITv06_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+size_t HUFv06_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv06_CREATE_STATIC_DTABLEX2(DTable, HUFv06_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const errorCode = HUFv06_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUFv06_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUFv06_decompress1X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+size_t HUFv06_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv06_DEltX2* const dt = ((const HUFv06_DEltX2*)dtPtr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv06_DStream_t bitD1;
+        BITv06_DStream_t bitD2;
+        BITv06_DStream_t bitD3;
+        BITv06_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BITv06_initDStream(&bitD1, istart1, length1);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD2, istart2, length2);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD3, istart3, length3);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD4, istart4, length4);
+        if (HUFv06_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv06_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv06_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv06_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv06_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv06_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv06_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BITv06_endOfDStream(&bitD1) & BITv06_endOfDStream(&bitD2) & BITv06_endOfDStream(&bitD3) & BITv06_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv06_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv06_CREATE_STATIC_DTABLEX2(DTable, HUFv06_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const errorCode = HUFv06_readDTableX2 (DTable, cSrc, cSrcSize);
+    if (HUFv06_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += errorCode;
+    cSrcSize -= errorCode;
+
+    return HUFv06_decompress4X2_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+static void HUFv06_fillDTableX4Level2(HUFv06_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUFv06_DEltX4 DElt;
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    { U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }}
+}
+
+typedef U32 rankVal_t[HUFv06_ABSOLUTEMAX_TABLELOG][HUFv06_ABSOLUTEMAX_TABLELOG + 1];
+
+static void HUFv06_fillDTableX4(HUFv06_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUFv06_ABSOLUTEMAX_TABLELOG + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUFv06_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUFv06_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 u;
+                const U32 end = start + length;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUFv06_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUFv06_MAX_SYMBOL_VALUE + 1];
+    sortedSymbol_t sortedSymbol[HUFv06_MAX_SYMBOL_VALUE + 1];
+    U32 rankStats[HUFv06_ABSOLUTEMAX_TABLELOG + 1] = { 0 };
+    U32 rankStart0[HUFv06_ABSOLUTEMAX_TABLELOG + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    const U32 memLog = DTable[0];
+    size_t iSize;
+    void* dtPtr = DTable;
+    HUFv06_DEltX4* const dt = ((HUFv06_DEltX4*)dtPtr) + 1;
+
+    HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX4) == sizeof(U32));   /* if compilation fails here, assertion is false */
+    if (memLog > HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv06_readStats(weightList, HUFv06_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv06_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > memLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (memLog-tableLog) - 1;   /* tableLog <= memLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < memLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUFv06_fillDTableX4(dt, memLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    return iSize;
+}
+
+
+static U32 HUFv06_decodeSymbolX4(void* op, BITv06_DStream_t* DStream, const HUFv06_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv06_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BITv06_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUFv06_decodeLastSymbolX4(void* op, BITv06_DStream_t* DStream, const HUFv06_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv06_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BITv06_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BITv06_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }   }
+    return 1;
+}
+
+
+#define HUFv06_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv06_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv06_MAX_TABLELOG<=12)) \
+        ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv06_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv06_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUFv06_decodeStreamX4(BYTE* p, BITv06_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv06_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && (p < pEnd-7)) {
+        HUFv06_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv06_reloadDStream(bitDPtr) == BITv06_DStream_unfinished) && (p <= pEnd-2))
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUFv06_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUFv06_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+size_t HUFv06_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    const BYTE* const istart = (const BYTE*) cSrc;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+
+    const U32 dtLog = DTable[0];
+    const void* const dtPtr = DTable;
+    const HUFv06_DEltX4* const dt = ((const HUFv06_DEltX4*)dtPtr) +1;
+
+    /* Init */
+    BITv06_DStream_t bitD;
+    { size_t const errorCode = BITv06_initDStream(&bitD, istart, cSrcSize);
+      if (HUFv06_isError(errorCode)) return errorCode; }
+
+    /* decode */
+    HUFv06_decodeStreamX4(ostart, &bitD, oend, dt, dtLog);
+
+    /* check */
+    if (!BITv06_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+size_t HUFv06_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv06_CREATE_STATIC_DTABLEX4(DTable, HUFv06_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUFv06_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUFv06_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUFv06_decompress1X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+size_t HUFv06_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U32* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable;
+        const HUFv06_DEltX4* const dt = ((const HUFv06_DEltX4*)dtPtr) +1;
+        const U32 dtLog = DTable[0];
+        size_t errorCode;
+
+        /* Init */
+        BITv06_DStream_t bitD1;
+        BITv06_DStream_t bitD2;
+        BITv06_DStream_t bitD3;
+        BITv06_DStream_t bitD4;
+        const size_t length1 = MEM_readLE16(istart);
+        const size_t length2 = MEM_readLE16(istart+2);
+        const size_t length3 = MEM_readLE16(istart+4);
+        size_t length4;
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+
+        length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        errorCode = BITv06_initDStream(&bitD1, istart1, length1);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD2, istart2, length2);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD3, istart3, length3);
+        if (HUFv06_isError(errorCode)) return errorCode;
+        errorCode = BITv06_initDStream(&bitD4, istart4, length4);
+        if (HUFv06_isError(errorCode)) return errorCode;
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv06_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv06_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv06_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv06_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv06_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv06_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv06_reloadDStream(&bitD1) | BITv06_reloadDStream(&bitD2) | BITv06_reloadDStream(&bitD3) | BITv06_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv06_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv06_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv06_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv06_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BITv06_endOfDStream(&bitD1) & BITv06_endOfDStream(&bitD2) & BITv06_endOfDStream(&bitD3) & BITv06_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv06_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv06_CREATE_STATIC_DTABLEX4(DTable, HUFv06_MAX_TABLELOG);
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUFv06_readDTableX4 (DTable, cSrc, cSrcSize);
+    if (HUFv06_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize;
+    cSrcSize -= hSize;
+
+    return HUFv06_decompress4X4_usingDTable (dst, dstSize, ip, cSrcSize, DTable);
+}
+
+
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUFv06_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[3] = { HUFv06_decompress4X2, HUFv06_decompress4X4, NULL };
+    U32 Dtime[3];   /* decompression time estimation */
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    /* decoder timing evaluation */
+    {   U32 const Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+        U32 const D256 = (U32)(dstSize >> 8);
+        U32 n; for (n=0; n<3; n++)
+            Dtime[n] = algoTime[Q][n].tableTime + (algoTime[Q][n].decode256Time * D256);
+    }
+
+    Dtime[1] += Dtime[1] >> 4; Dtime[2] += Dtime[2] >> 3; /* advantage to algorithms using less memory, for cache eviction */
+
+    {   U32 algoNb = 0;
+        if (Dtime[1] < Dtime[0]) algoNb = 1;
+        // if (Dtime[2] < Dtime[algoNb]) algoNb = 2;   /* current speed of HUFv06_decompress4X6 is not good */
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+
+    //return HUFv06_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUFv06_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+    //return HUFv06_decompress4X6(dst, dstSize, cSrc, cSrcSize);   /* multi-streams quad-symbols decoding */
+}
+/*
+    Common functions of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+/*-****************************************
+*  Version
+******************************************/
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTDv06_isError() :
+*   tells if a return value is an error code */
+unsigned ZSTDv06_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTDv06_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZSTDv06_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/* **************************************************************
+*  ZBUFF Error Management
+****************************************************************/
+unsigned ZBUFFv06_isError(size_t errorCode) { return ERR_isError(errorCode); }
+
+const char* ZBUFFv06_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv06_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTDv06_HEAPMODE
+#  define ZSTDv06_HEAPMODE 1
+#endif
+
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTDv06_isError ERR_isError   /* for inlining */
+#define FSEv06_isError  ERR_isError
+#define HUFv06_isError  ERR_isError
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTDv06_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock } ZSTDv06_dStage;
+
+struct ZSTDv06_DCtx_s
+{
+    FSEv06_DTable LLTable[FSEv06_DTABLE_SIZE_U32(LLFSELog)];
+    FSEv06_DTable OffTable[FSEv06_DTABLE_SIZE_U32(OffFSELog)];
+    FSEv06_DTable MLTable[FSEv06_DTABLE_SIZE_U32(MLFSELog)];
+    unsigned   hufTableX4[HUFv06_DTABLE_SIZE(HufLog)];
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;
+    size_t headerSize;
+    ZSTDv06_frameParams fParams;
+    blockType_t bType;   /* used in ZSTDv06_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv06_dStage stage;
+    U32 flagRepeatTable;
+    const BYTE* litPtr;
+    size_t litSize;
+    BYTE litBuffer[ZSTDv06_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTDv06_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTDv06_DCtx within "zstd_static.h" */
+
+size_t ZSTDv06_sizeofDCtx (void) { return sizeof(ZSTDv06_DCtx); }   /* non published interface */
+
+size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx)
+{
+    dctx->expected = ZSTDv06_frameHeaderSize_min;
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    dctx->hufTableX4[0] = HufLog;
+    dctx->flagRepeatTable = 0;
+    return 0;
+}
+
+ZSTDv06_DCtx* ZSTDv06_createDCtx(void)
+{
+    ZSTDv06_DCtx* dctx = (ZSTDv06_DCtx*)malloc(sizeof(ZSTDv06_DCtx));
+    if (dctx==NULL) return NULL;
+    ZSTDv06_decompressBegin(dctx);
+    return dctx;
+}
+
+size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx)
+{
+    free(dctx);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dstDCtx, const ZSTDv06_DCtx* srcDCtx)
+{
+    memcpy(dstDCtx, srcDCtx,
+           sizeof(ZSTDv06_DCtx) - (ZSTDv06_BLOCKSIZE_MAX+WILDCOPY_OVERLENGTH + ZSTDv06_frameHeaderSize_max));  /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv06_MAGICNUMBER (defined within zstd_static.h)
+      - 1 byte  - Frame Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+
+/* Frame descriptor
+
+   1 byte, using :
+   bit 0-3 : windowLog - ZSTDv06_WINDOWLOG_ABSOLUTEMIN   (see zstd_internal.h)
+   bit 4   : minmatch 4(0) or 3(1)
+   bit 5   : reserved (must be zero)
+   bit 6-7 : Frame content size : unknown, 1 byte, 2 bytes, 8 bytes
+
+   Optional : content size (0, 1, 2 or 8 bytes)
+   0 : unknown
+   1 : 0-255 bytes
+   2 : 256 - 65535+256
+   8 : up to 16 exa
+*/
+
+
+/* Compressed Block, format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+/** ZSTDv06_frameHeaderSize() :
+*   srcSize must be >= ZSTDv06_frameHeaderSize_min.
+*   @return : size of the Frame Header */
+static size_t ZSTDv06_frameHeaderSize(const void* src, size_t srcSize)
+{
+    if (srcSize < ZSTDv06_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    { U32 const fcsId = (((const BYTE*)src)[4]) >> 6;
+      return ZSTDv06_frameHeaderSize_min + ZSTDv06_fcs_fieldSize[fcsId]; }
+}
+
+
+/** ZSTDv06_getFrameParams() :
+*   decode Frame Header, or provide expected `srcSize`.
+*   @return : 0, `fparamsPtr` is correctly filled,
+*            >0, `srcSize` is too small, result is expected `srcSize`,
+*             or an error code, which can be tested using ZSTDv06_isError() */
+size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize < ZSTDv06_frameHeaderSize_min) return ZSTDv06_frameHeaderSize_min;
+    if (MEM_readLE32(src) != ZSTDv06_MAGICNUMBER) return ERROR(prefix_unknown);
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    { size_t const fhsize = ZSTDv06_frameHeaderSize(src, srcSize);
+      if (srcSize < fhsize) return fhsize; }
+
+    memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+    {   BYTE const frameDesc = ip[4];
+        fparamsPtr->windowLog = (frameDesc & 0xF) + ZSTDv06_WINDOWLOG_ABSOLUTEMIN;
+        if ((frameDesc & 0x20) != 0) return ERROR(frameParameter_unsupported);   /* reserved 1 bit */
+        switch(frameDesc >> 6)  /* fcsId */
+        {
+            default:   /* impossible */
+            case 0 : fparamsPtr->frameContentSize = 0; break;
+            case 1 : fparamsPtr->frameContentSize = ip[5]; break;
+            case 2 : fparamsPtr->frameContentSize = MEM_readLE16(ip+5)+256; break;
+            case 3 : fparamsPtr->frameContentSize = MEM_readLE64(ip+5); break;
+    }   }
+    return 0;
+}
+
+
+/** ZSTDv06_decodeFrameHeader() :
+*   `srcSize` must be the size provided by ZSTDv06_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTDv06_isError() */
+static size_t ZSTDv06_decodeFrameHeader(ZSTDv06_DCtx* zc, const void* src, size_t srcSize)
+{
+    size_t const result = ZSTDv06_getFrameParams(&(zc->fParams), src, srcSize);
+    if ((MEM_32bits()) && (zc->fParams.windowLog > 25)) return ERROR(frameParameter_unsupported);
+    return result;
+}
+
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+/*! ZSTDv06_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    U32 cSize;
+
+    if (srcSize < ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    bpPtr->blockType = (blockType_t)((*in) >> 6);
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+
+static size_t ZSTDv06_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/*! ZSTDv06_decodeLiteralsBlock() :
+    @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTDv06_decodeLiteralsBlock(ZSTDv06_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    /* any compressed block with literals segment must be at least this size */
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch(istart[0]>> 6)
+    {
+    case IS_HUF:
+        {   size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > ZSTDv06_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            if (HUFv06_isError(singleStream ?
+                            HUFv06_decompress1X2(dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv06_decompress   (dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_PCH:
+        {   size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (!dctx->flagRepeatTable)
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            {   size_t const errorCode = HUFv06_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTableX4);
+                if (HUFv06_isError(errorCode)) return ERROR(corruption_detected);
+            }
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case IS_RAW:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case IS_RLE:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > ZSTDv06_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+/*! ZSTDv06_buildSeqTable() :
+    @return : nb bytes read from src,
+              or an error code if it fails, testable with ZSTDv06_isError()
+*/
+size_t ZSTDv06_buildSeqTable(FSEv06_DTable* DTable, U32 type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
+{
+    switch(type)
+    {
+    case FSEv06_ENCODING_RLE :
+        if (!srcSize) return ERROR(srcSize_wrong);
+        if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        FSEv06_buildDTable_rle(DTable, *(const BYTE*)src);   /* if *src > max, data is corrupted */
+        return 1;
+    case FSEv06_ENCODING_RAW :
+        FSEv06_buildDTable(DTable, defaultNorm, max, defaultLog);
+        return 0;
+    case FSEv06_ENCODING_STATIC:
+        if (!flagRepeatTable) return ERROR(corruption_detected);
+        return 0;
+    default :   /* impossible */
+    case FSEv06_ENCODING_DYNAMIC :
+        {   U32 tableLog;
+            S16 norm[MaxSeq+1];
+            size_t const headerSize = FSEv06_readNCount(norm, &max, &tableLog, src, srcSize);
+            if (FSEv06_isError(headerSize)) return ERROR(corruption_detected);
+            if (tableLog > maxLog) return ERROR(corruption_detected);
+            FSEv06_buildDTable(DTable, norm, max, tableLog);
+            return headerSize;
+    }   }
+}
+
+
+size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr,
+                             FSEv06_DTable* DTableLL, FSEv06_DTable* DTableML, FSEv06_DTable* DTableOffb, U32 flagRepeatTable,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {
+                if (ip+2 > iend) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+            } else {
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    {   U32 const LLtype  = *ip >> 6;
+        U32 const Offtype = (*ip >> 4) & 3;
+        U32 const MLtype  = (*ip >> 2) & 3;
+        ip++;
+
+        /* check */
+        if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+        /* Build DTables */
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+        }
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableOffb, Offtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+        }
+        {   size_t const bhSize = ZSTDv06_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
+            if (ZSTDv06_isError(bhSize)) return ERROR(corruption_detected);
+            ip += bhSize;
+    }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+} seq_t;
+
+typedef struct {
+    BITv06_DStream_t DStream;
+    FSEv06_DState_t stateLL;
+    FSEv06_DState_t stateOffb;
+    FSEv06_DState_t stateML;
+    size_t prevOffset[ZSTDv06_REP_INIT];
+} seqState_t;
+
+
+
+static void ZSTDv06_decodeSequence(seq_t* seq, seqState_t* seqState)
+{
+    /* Literal length */
+    U32 const llCode = FSEv06_peekSymbol(&(seqState->stateLL));
+    U32 const mlCode = FSEv06_peekSymbol(&(seqState->stateML));
+    U32 const ofCode = FSEv06_peekSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,   11,    12,    13,    14,    15,
+                            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,   27,    28,    29,    30,    31,
+                            32, 34, 36, 38, 40, 44, 48, 56, 64, 80, 96, 0x80, 0x100, 0x200, 0x400, 0x800,
+                            0x1000, 0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       3,       7,     0xF,     0x1F,     0x3F,     0x7F,
+                 0xFF,   0x1FF,   0x3FF,   0x7FF,   0xFFF,   0x1FFF,   0x3FFF,   0x7FFF,
+                 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+                 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, /*fake*/ 1, 1 };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            offset = OF_base[ofCode] + BITv06_readBits(&(seqState->DStream), ofBits);   /* <=  26 bits */
+            if (MEM_32bits()) BITv06_reloadDStream(&(seqState->DStream));
+        }
+
+        if (offset < ZSTDv06_REP_NUM) {
+            if (llCode == 0 && offset <= 1) offset = 1-offset;
+
+            if (offset != 0) {
+                size_t temp = seqState->prevOffset[offset];
+                if (offset != 1) {
+                    seqState->prevOffset[2] = seqState->prevOffset[1];
+                }
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            offset -= ZSTDv06_REP_MOVE;
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq->offset = offset;
+    }
+
+    seq->matchLength = ML_base[mlCode] + MINMATCH + ((mlCode>31) ? BITv06_readBits(&(seqState->DStream), mlBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BITv06_reloadDStream(&(seqState->DStream));
+
+    seq->litLength = LL_base[llCode] + ((llCode>15) ? BITv06_readBits(&(seqState->DStream), llBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BITv06_reloadDStream(&(seqState->DStream));
+
+    /* ANS state update */
+    FSEv06_updateState(&(seqState->stateLL), &(seqState->DStream));   /* <=  9 bits */
+    FSEv06_updateState(&(seqState->stateML), &(seqState->DStream));   /* <=  9 bits */
+    if (MEM_32bits()) BITv06_reloadDStream(&(seqState->DStream));     /* <= 18 bits */
+    FSEv06_updateState(&(seqState->stateOffb), &(seqState->DStream)); /* <=  8 bits */
+}
+
+
+size_t ZSTDv06_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_8 = oend-8;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of 8 from oend */
+    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* overwrite beyond dst buffer */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv06_wildcopy(op, *litPtr, sequence.litLength);   /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_8 || sequence.matchLength < MINMATCH) {
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_8 */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv06_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv06_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_8) {
+            ZSTDv06_wildcopy(op, match, oend_8 - op);
+            match += oend_8 - op;
+            op = oend_8;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTDv06_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+static size_t ZSTDv06_decompressSequences(
+                               ZSTDv06_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    FSEv06_DTable* DTableLL = dctx->LLTable;
+    FSEv06_DTable* DTableML = dctx->MLTable;
+    FSEv06_DTable* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTDv06_decodeSeqHeaders(&nbSeq, DTableLL, DTableML, DTableOffb, dctx->flagRepeatTable, ip, seqSize);
+        if (ZSTDv06_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+        dctx->flagRepeatTable = 0;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seq_t sequence;
+        seqState_t seqState;
+
+        memset(&sequence, 0, sizeof(sequence));
+        sequence.offset = REPCODE_STARTVALUE;
+        { U32 i; for (i=0; i<ZSTDv06_REP_INIT; i++) seqState.prevOffset[i] = REPCODE_STARTVALUE; }
+        { size_t const errorCode = BITv06_initDStream(&(seqState.DStream), ip, iend-ip);
+          if (ERR_isError(errorCode)) return ERROR(corruption_detected); }
+        FSEv06_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv06_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv06_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BITv06_reloadDStream(&(seqState.DStream)) <= BITv06_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            ZSTDv06_decodeSequence(&sequence, &seqState);
+
+#if 0  /* debug */
+            static BYTE* start = NULL;
+            if (start==NULL) start = op;
+            size_t pos = (size_t)(op-start);
+            if ((pos >= 5810037) && (pos < 5810400))
+                printf("Dpos %6u :%5u literals & match %3u bytes at distance %6u \n",
+                       pos, (U32)sequence.litLength, (U32)sequence.matchLength, (U32)sequence.offset);
+#endif
+
+            {   size_t const oneSeqSize = ZSTDv06_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                if (ZSTDv06_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+
+static void ZSTDv06_checkContinuity(ZSTDv06_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+static size_t ZSTDv06_decompressBlock_internal(ZSTDv06_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize >= ZSTDv06_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    {   size_t const litCSize = ZSTDv06_decodeLiteralsBlock(dctx, src, srcSize);
+        if (ZSTDv06_isError(litCSize)) return litCSize;
+        ip += litCSize;
+        srcSize -= litCSize;
+    }
+    return ZSTDv06_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+
+
+size_t ZSTDv06_decompressBlock(ZSTDv06_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    ZSTDv06_checkContinuity(dctx, dst);
+    return ZSTDv06_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+/*! ZSTDv06_decompressFrame() :
+*   `dctx` must be properly initialized */
+static size_t ZSTDv06_decompressFrame(ZSTDv06_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstCapacity;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties = { bt_compressed, 0 };
+
+    /* check */
+    if (srcSize < ZSTDv06_frameHeaderSize_min+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+        if (ZSTDv06_decodeFrameHeader(dctx, src, frameHeaderSize)) return ERROR(corruption_detected);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize=0;
+        size_t const cBlockSize = ZSTDv06_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv06_blockHeaderSize;
+        remainingSize -= ZSTDv06_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv06_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv06_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            return ERROR(GENERIC);   /* not yet supported */
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        if (ZSTDv06_isError(decodedSize)) return decodedSize;
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+size_t ZSTDv06_decompress_usingPreparedDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* refDCtx,
+                                         void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    ZSTDv06_copyDCtx(dctx, refDCtx);
+    ZSTDv06_checkContinuity(dctx, dst);
+    return ZSTDv06_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    ZSTDv06_decompressBegin_usingDict(dctx, dict, dictSize);
+    ZSTDv06_checkContinuity(dctx, dst);
+    return ZSTDv06_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTDv06_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0);
+}
+
+
+size_t ZSTDv06_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv06_HEAPMODE) && (ZSTDv06_HEAPMODE==1)
+    size_t regenSize;
+    ZSTDv06_DCtx* dctx = ZSTDv06_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv06_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+    ZSTDv06_freeDCtx(dctx);
+    return regenSize;
+#else   /* stack mode */
+    ZSTDv06_DCtx dctx;
+    return ZSTDv06_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+size_t ZSTDv06_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+    blockProperties_t blockProperties = { bt_compressed, 0 };
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(frameHeaderSize)) return frameHeaderSize;
+        if (MEM_readLE32(src) != ZSTDv06_MAGICNUMBER) return ERROR(prefix_unknown);
+        if (srcSize < frameHeaderSize+ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t const cBlockSize = ZSTDv06_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv06_blockHeaderSize;
+        remainingSize -= ZSTDv06_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        if (cBlockSize == 0) break;   /* bt_end */
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/*_******************************
+*  Streaming Decompression API
+********************************/
+size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    if (dstCapacity) ZSTDv06_checkContinuity(dctx, dst);
+
+    /* Decompress : frame header; part 1 */
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        if (srcSize != ZSTDv06_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        dctx->headerSize = ZSTDv06_frameHeaderSize(src, ZSTDv06_frameHeaderSize_min);
+        if (ZSTDv06_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv06_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv06_frameHeaderSize_min) {
+            dctx->expected = dctx->headerSize - ZSTDv06_frameHeaderSize_min;
+            dctx->stage = ZSTDds_decodeFrameHeader;
+            return 0;
+        }
+        dctx->expected = 0;   /* not necessary to copy more */
+	/* fall-through */
+    case ZSTDds_decodeFrameHeader:
+        {   size_t result;
+            memcpy(dctx->headerBuffer + ZSTDv06_frameHeaderSize_min, src, dctx->expected);
+            result = ZSTDv06_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv06_isError(result)) return result;
+            dctx->expected = ZSTDv06_blockHeaderSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTDv06_getcBlockSize(src, ZSTDv06_blockHeaderSize, &bp);
+            if (ZSTDv06_isError(cBlockSize)) return cBlockSize;
+            if (bp.blockType == bt_end) {
+                dctx->expected = 0;
+                dctx->stage = ZSTDds_getFrameHeaderSize;
+            } else {
+                dctx->expected = cBlockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv06_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv06_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTDv06_blockHeaderSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            return rSize;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static void ZSTDv06_refDictContent(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+}
+
+static size_t ZSTDv06_loadEntropy(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t hSize, offcodeHeaderSize, matchlengthHeaderSize, litlengthHeaderSize;
+
+    hSize = HUFv06_readDTableX4(dctx->hufTableX4, dict, dictSize);
+    if (HUFv06_isError(hSize)) return ERROR(dictionary_corrupted);
+    dict = (const char*)dict + hSize;
+    dictSize -= hSize;
+
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue=MaxOff, offcodeLog;
+        offcodeHeaderSize = FSEv06_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize);
+        if (FSEv06_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dict = (const char*)dict + offcodeHeaderSize;
+        dictSize -= offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        matchlengthHeaderSize = FSEv06_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize);
+        if (FSEv06_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dict = (const char*)dict + matchlengthHeaderSize;
+        dictSize -= matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        litlengthHeaderSize = FSEv06_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize);
+        if (FSEv06_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv06_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+          if (FSEv06_isError(errorCode)) return ERROR(dictionary_corrupted); }
+    }
+
+    dctx->flagRepeatTable = 1;
+    return hSize + offcodeHeaderSize + matchlengthHeaderSize + litlengthHeaderSize;
+}
+
+static size_t ZSTDv06_decompress_insertDictionary(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    size_t eSize;
+    U32 const magic = MEM_readLE32(dict);
+    if (magic != ZSTDv06_DICT_MAGIC) {
+        /* pure content mode */
+        ZSTDv06_refDictContent(dctx, dict, dictSize);
+        return 0;
+    }
+    /* load entropy tables */
+    dict = (const char*)dict + 4;
+    dictSize -= 4;
+    eSize = ZSTDv06_loadEntropy(dctx, dict, dictSize);
+    if (ZSTDv06_isError(eSize)) return ERROR(dictionary_corrupted);
+
+    /* reference dictionary content */
+    dict = (const char*)dict + eSize;
+    dictSize -= eSize;
+    ZSTDv06_refDictContent(dctx, dict, dictSize);
+
+    return 0;
+}
+
+
+size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    { size_t const errorCode = ZSTDv06_decompressBegin(dctx);
+      if (ZSTDv06_isError(errorCode)) return errorCode; }
+
+    if (dict && dictSize) {
+        size_t const errorCode = ZSTDv06_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTDv06_isError(errorCode)) return ERROR(dictionary_corrupted);
+    }
+
+    return 0;
+}
+
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv06_DCtx object is required to track streaming operations.
+*  Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+*  Use ZBUFFv06_decompressInit() to start a new decompression operation,
+*   or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv06_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv06_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+*  output : ZBUFFv06_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv06_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+typedef enum { ZBUFFds_init, ZBUFFds_loadHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFFv06_dStage;
+
+/* *** Resource management *** */
+struct ZBUFFv06_DCtx_s {
+    ZSTDv06_DCtx* zd;
+    ZSTDv06_frameParams fParams;
+    ZBUFFv06_dStage stage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t blockSize;
+    BYTE headerBuffer[ZSTDv06_FRAMEHEADERSIZE_MAX];
+    size_t lhSize;
+};   /* typedef'd to ZBUFFv06_DCtx within "zstd_buffered.h" */
+
+
+ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void)
+{
+    ZBUFFv06_DCtx* zbd = (ZBUFFv06_DCtx*)malloc(sizeof(ZBUFFv06_DCtx));
+    if (zbd==NULL) return NULL;
+    memset(zbd, 0, sizeof(*zbd));
+    zbd->zd = ZSTDv06_createDCtx();
+    zbd->stage = ZBUFFds_init;
+    return zbd;
+}
+
+size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* zbd)
+{
+    if (zbd==NULL) return 0;   /* support free on null */
+    ZSTDv06_freeDCtx(zbd->zd);
+    free(zbd->inBuff);
+    free(zbd->outBuff);
+    free(zbd);
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    zbd->stage = ZBUFFds_loadHeader;
+    zbd->lhSize = zbd->inPos = zbd->outStart = zbd->outEnd = 0;
+    return ZSTDv06_decompressBegin_usingDict(zbd->zd, dict, dictSize);
+}
+
+size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* zbd)
+{
+    return ZBUFFv06_decompressInitDictionary(zbd, NULL, 0);
+}
+
+
+
+MEM_STATIC size_t ZBUFFv06_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t length = MIN(dstCapacity, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* const iend = istart + *srcSizePtr;
+    const char* ip = istart;
+    char* const ostart = (char*)dst;
+    char* const oend = ostart + *dstCapacityPtr;
+    char* op = ostart;
+    U32 notDone = 1;
+
+    while (notDone) {
+        switch(zbd->stage)
+        {
+        case ZBUFFds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFds_loadHeader :
+            {   size_t const hSize = ZSTDv06_getFrameParams(&(zbd->fParams), zbd->headerBuffer, zbd->lhSize);
+                if (hSize != 0) {
+                    size_t const toLoad = hSize - zbd->lhSize;   /* if hSize!=0, hSize > zbd->lhSize */
+                    if (ZSTDv06_isError(hSize)) return hSize;
+                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
+                        memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
+                        zbd->lhSize += iend-ip; ip = iend; notDone = 0;
+                        *dstCapacityPtr = 0;
+                        return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    memcpy(zbd->headerBuffer + zbd->lhSize, ip, toLoad); zbd->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* Consume header */
+            {   size_t const h1Size = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);  /* == ZSTDv06_frameHeaderSize_min */
+                size_t const h1Result = ZSTDv06_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer, h1Size);
+                if (ZSTDv06_isError(h1Result)) return h1Result;
+                if (h1Size < zbd->lhSize) {   /* long header */
+                    size_t const h2Size = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                    size_t const h2Result = ZSTDv06_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer+h1Size, h2Size);
+                    if (ZSTDv06_isError(h2Result)) return h2Result;
+            }   }
+
+            /* Frame header instruct buffer sizes */
+            {   size_t const blockSize = MIN(1 << zbd->fParams.windowLog, ZSTDv06_BLOCKSIZE_MAX);
+                zbd->blockSize = blockSize;
+                if (zbd->inBuffSize < blockSize) {
+                    free(zbd->inBuff);
+                    zbd->inBuffSize = blockSize;
+                    zbd->inBuff = (char*)malloc(blockSize);
+                    if (zbd->inBuff == NULL) return ERROR(memory_allocation);
+                }
+                {   size_t const neededOutSize = ((size_t)1 << zbd->fParams.windowLog) + blockSize + WILDCOPY_OVERLENGTH * 2;
+                    if (zbd->outBuffSize < neededOutSize) {
+                        free(zbd->outBuff);
+                        zbd->outBuffSize = neededOutSize;
+                        zbd->outBuff = (char*)malloc(neededOutSize);
+                        if (zbd->outBuff == NULL) return ERROR(memory_allocation);
+            }   }   }
+            zbd->stage = ZBUFFds_read;
+	    /* fall-through */
+        case ZBUFFds_read:
+            {   size_t const neededInSize = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                if (neededInSize==0) {  /* end of frame */
+                    zbd->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    size_t const decodedSize = ZSTDv06_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        ip, neededInSize);
+                    if (ZSTDv06_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize) break;   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbd->stage = ZBUFFds_load;
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {   size_t const neededInSize = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+                size_t const toLoad = neededInSize - zbd->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbd->inBuffSize - zbd->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv06_limitCopy(zbd->inBuff + zbd->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbd->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {   size_t const decodedSize = ZSTDv06_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        zbd->inBuff, neededInSize);
+                    if (ZSTDv06_isError(decodedSize)) return decodedSize;
+                    zbd->inPos = 0;   /* input is consumed */
+                    if (!decodedSize) { zbd->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    // break; /* ZBUFFds_flush follows */
+                }
+	    }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {   size_t const toFlushSize = zbd->outEnd - zbd->outStart;
+                size_t const flushedSize = ZBUFFv06_limitCopy(op, oend-op, zbd->outBuff + zbd->outStart, toFlushSize);
+                op += flushedSize;
+                zbd->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbd->stage = ZBUFFds_read;
+                    if (zbd->outStart + zbd->blockSize > zbd->outBuffSize)
+                        zbd->outStart = zbd->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    *srcSizePtr = ip-istart;
+    *dstCapacityPtr = op-ostart;
+    {   size_t nextSrcSizeHint = ZSTDv06_nextSrcSizeToDecompress(zbd->zd);
+        if (nextSrcSizeHint > ZSTDv06_blockHeaderSize) nextSrcSizeHint+= ZSTDv06_blockHeaderSize;   /* get following block header too */
+        nextSrcSizeHint -= zbd->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFFv06_recommendedDInSize(void)  { return ZSTDv06_BLOCKSIZE_MAX + ZSTDv06_blockHeaderSize /* block header size*/ ; }
+size_t ZBUFFv06_recommendedDOutSize(void) { return ZSTDv06_BLOCKSIZE_MAX; }

diff --git a/vendor/github.com/DataDog/zstd/zstd_v06.h b/vendor/github.com/DataDog/zstd/zstd_v06.h
new file mode 100644
index 0000000..fb4eb37
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v06.h

@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv06_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+#  define ZSTDLIBv06_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_getFrameSrcSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv06_isError())
+*/
+size_t ZSTDv06_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/* *************************************
+*  Helper functions
+***************************************/
+ZSTDLIBv06_API size_t      ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
+
+/* Error Management */
+ZSTDLIBv06_API unsigned    ZSTDv06_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code);     /*!< provides readable string for an error code */
+
+
+/* *************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx;
+ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void);
+ZSTDLIBv06_API size_t     ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv06_decompressDCtx() :
+*   Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+*  Dictionary API
+*************************/
+/*! ZSTDv06_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+*   Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/*-************************
+*  Advanced Streaming API
+***************************/
+struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; };
+typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams;
+
+ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv06_API void   ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx);
+
+ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+/* *************************************
+*  ZBUFF API
+***************************************/
+
+typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx;
+ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void);
+ZSTDLIBv06_API size_t         ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx,
+                                                  void* dst, size_t* dstCapacityPtr,
+                                            const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv06_DCtx object is required to track streaming operations.
+*  Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+*  Use ZBUFFv06_decompressInit() to start a new decompression operation,
+*   or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv06_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv06_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+*  output : ZBUFFv06_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv06_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode);
+ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void);
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv06_MAGICNUMBER 0xFD2FB526   /* v0.6 */
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv06_BUFFERED_H */

diff --git a/vendor/github.com/DataDog/zstd/zstd_v07.c b/vendor/github.com/DataDog/zstd/zstd_v07.c
new file mode 100644
index 0000000..70b170f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v07.c

@@ -0,0 +1,4502 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*- Dependencies -*/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+#include <stdlib.h>     /* malloc, free, qsort */
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY    /* XXH64_state_t */
+#endif
+#include "xxhash.h"                  /* XXH64_* */
+#include "zstd_v07.h"
+
+#define FSEv07_STATIC_LINKING_ONLY   /* FSEv07_MIN_TABLELOG */
+#define HUFv07_STATIC_LINKING_ONLY   /* HUFv07_TABLELOG_ABSOLUTEMAX */
+#define ZSTDv07_STATIC_LINKING_ONLY
+
+#include "error_private.h"
+
+
+#ifdef ZSTDv07_STATIC_LINKING_ONLY
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Constants ---*/
+#define ZSTDv07_MAGIC_SKIPPABLE_START  0x184D2A50U
+
+#define ZSTDv07_WINDOWLOG_MAX_32  25
+#define ZSTDv07_WINDOWLOG_MAX_64  27
+#define ZSTDv07_WINDOWLOG_MAX    ((U32)(MEM_32bits() ? ZSTDv07_WINDOWLOG_MAX_32 : ZSTDv07_WINDOWLOG_MAX_64))
+#define ZSTDv07_WINDOWLOG_MIN     18
+#define ZSTDv07_CHAINLOG_MAX     (ZSTDv07_WINDOWLOG_MAX+1)
+#define ZSTDv07_CHAINLOG_MIN       4
+#define ZSTDv07_HASHLOG_MAX       ZSTDv07_WINDOWLOG_MAX
+#define ZSTDv07_HASHLOG_MIN       12
+#define ZSTDv07_HASHLOG3_MAX      17
+#define ZSTDv07_SEARCHLOG_MAX    (ZSTDv07_WINDOWLOG_MAX-1)
+#define ZSTDv07_SEARCHLOG_MIN      1
+#define ZSTDv07_SEARCHLENGTH_MAX   7
+#define ZSTDv07_SEARCHLENGTH_MIN   3
+#define ZSTDv07_TARGETLENGTH_MIN   4
+#define ZSTDv07_TARGETLENGTH_MAX 999
+
+#define ZSTDv07_FRAMEHEADERSIZE_MAX 18    /* for static allocation */
+static const size_t ZSTDv07_frameHeaderSize_min = 5;
+static const size_t ZSTDv07_frameHeaderSize_max = ZSTDv07_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTDv07_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/* custom memory allocation functions */
+typedef void* (*ZSTDv07_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTDv07_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTDv07_allocFunction customAlloc; ZSTDv07_freeFunction customFree; void* opaque; } ZSTDv07_customMem;
+
+
+/*--- Advanced Decompression functions ---*/
+
+/*! ZSTDv07_estimateDCtxSize() :
+ *  Gives the potential amount of memory allocated to create a ZSTDv07_DCtx */
+ZSTDLIBv07_API size_t ZSTDv07_estimateDCtxSize(void);
+
+/*! ZSTDv07_createDCtx_advanced() :
+ *  Create a ZSTD decompression context using external alloc and free functions */
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx_advanced(ZSTDv07_customMem customMem);
+
+/*! ZSTDv07_sizeofDCtx() :
+ *  Gives the amount of memory used by a given ZSTDv07_DCtx */
+ZSTDLIBv07_API size_t ZSTDv07_sizeofDCtx(const ZSTDv07_DCtx* dctx);
+
+
+/* ******************************************************************
+*  Buffer-less streaming functions (synchronous mode)
+********************************************************************/
+
+ZSTDLIBv07_API size_t ZSTDv07_decompressBegin(ZSTDv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZSTDv07_decompressBegin_usingDict(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv07_API void   ZSTDv07_copyDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* preparedDCtx);
+
+ZSTDLIBv07_API size_t ZSTDv07_nextSrcSizeToDecompress(ZSTDv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/*
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTDv07_DCtx object is required to track streaming operations.
+  Use ZSTDv07_createDCtx() / ZSTDv07_freeDCtx() to manage it.
+  A ZSTDv07_DCtx object can be re-used multiple times.
+
+  First optional operation is to retrieve frame parameters, using ZSTDv07_getFrameParams(), which doesn't consume the input.
+  It can provide the minimum size of rolling buffer required to properly decompress data (`windowSize`),
+  and optionally the final size of uncompressed content.
+  (Note : content size is an optional info that may not be present. 0 means : content size unknown)
+  Frame parameters are extracted from the beginning of compressed frame.
+  The amount of data to read is variable, from ZSTDv07_frameHeaderSize_min to ZSTDv07_frameHeaderSize_max (so if `srcSize` >= ZSTDv07_frameHeaderSize_max, it will always work)
+  If `srcSize` is too small for operation to succeed, function will return the minimum size it requires to produce a result.
+  Result : 0 when successful, it means the ZSTDv07_frameParams structure has been filled.
+          >0 : means there is not enough data into `src`. Provides the expected size to successfully decode header.
+           errorCode, which can be tested using ZSTDv07_isError()
+
+  Start decompression, with ZSTDv07_decompressBegin() or ZSTDv07_decompressBegin_usingDict().
+  Alternatively, you can copy a prepared context, using ZSTDv07_copyDCtx().
+
+  Then use ZSTDv07_nextSrcSizeToDecompress() and ZSTDv07_decompressContinue() alternatively.
+  ZSTDv07_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTDv07_decompressContinue().
+  ZSTDv07_decompressContinue() requires this exact amount of bytes, or it will fail.
+
+  @result of ZSTDv07_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero, which is not an error; it just means ZSTDv07_decompressContinue() has decoded some header.
+
+  ZSTDv07_decompressContinue() needs previous data blocks during decompression, up to `windowSize`.
+  They should preferably be located contiguously, prior to current block.
+  Alternatively, a round buffer of sufficient size is also possible. Sufficient size is determined by frame parameters.
+  ZSTDv07_decompressContinue() is very sensitive to contiguity,
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+    or that previous contiguous segment is large enough to properly handle maximum back-reference.
+
+  A frame is fully decoded when ZSTDv07_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow the integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by a decompressor. The format of skippable frame is following:
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTDv07_decompressContinue() always returns 0.
+  For skippable frames ZSTDv07_getFrameParams() returns fparamsPtr->windowLog==0 what means that a frame is skippable.
+  It also returns Frame Size as fparamsPtr->frameContentSize.
+*/
+
+
+/* **************************************
+*  Block functions
+****************************************/
+/*! Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    User will have to take in charge required information to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTDv07_createCCtx() and ZSTDv07_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTDv07_compressBegin()
+      + decompression : ZSTDv07_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - Block size is limited, it must be <= ZSTDv07_getBlockSizeMax()
+      + If you need to compress more, cut data into multiple blocks
+      + Consider using the regular ZSTDv07_compress() instead, as frame metadata costs become negligible when source size is large.
+    - When a block is considered not compressible enough, ZSTDv07_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTDv07_decompressBlock() doesn't accept uncompressed data as input !!!
+      + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history.
+        Use ZSTDv07_insertBlock() in such a case.
+*/
+
+#define ZSTDv07_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)   /* define, for static allocation */
+ZSTDLIBv07_API size_t ZSTDv07_decompressBlock(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert block into `dctx` history. Useful for uncompressed blocks */
+
+
+#endif   /* ZSTDv07_STATIC_LINKING_ONLY */
+
+
+/* ******************************************************************
+   mem.h
+   low-level memory access routines
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+#else
+  typedef unsigned char       BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BITv07_DStream_t;
+
+typedef enum { BITv07_DStream_unfinished = 0,
+               BITv07_DStream_endOfBuffer = 1,
+               BITv07_DStream_completed = 2,
+               BITv07_DStream_overflow = 3 } BITv07_DStream_status;  /* result of BITv07_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BITv07_initDStream(BITv07_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BITv07_readBits(BITv07_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BITv07_DStream_status BITv07_reloadDStream(BITv07_DStream_t* bitD);
+MEM_STATIC unsigned BITv07_endOfDStream(const BITv07_DStream_t* bitD);
+
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BITv07_highbit32 (U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+#   endif
+}
+
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BITv07_initDStream() :
+*   Initialize a BITv07_DStream_t.
+*   `bitD` : a pointer to an already allocated BITv07_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BITv07_initDStream(BITv07_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BITv07_highbit32(lastByte) : 0;
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);/* fall-through */
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);/* fall-through */
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);/* fall-through */
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; /* fall-through */
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; /* fall-through */
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8; /* fall-through */
+            default: break;
+        }
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BITv07_highbit32(lastByte) : 0;
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+
+ MEM_STATIC size_t BITv07_lookBits(const BITv07_DStream_t* bitD, U32 nbBits)
+{
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+}
+
+/*! BITv07_lookBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv07_lookBitsFast(const BITv07_DStream_t* bitD, U32 nbBits)
+{
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BITv07_skipBits(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+MEM_STATIC size_t BITv07_readBits(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BITv07_lookBits(bitD, nbBits);
+    BITv07_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BITv07_readBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BITv07_lookBitsFast(bitD, nbBits);
+    BITv07_skipBits(bitD, nbBits);
+    return value;
+}
+
+MEM_STATIC BITv07_DStream_status BITv07_reloadDStream(BITv07_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should not happen => corruption detected */
+        return BITv07_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BITv07_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BITv07_DStream_endOfBuffer;
+        return BITv07_DStream_completed;
+    }
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BITv07_DStream_status result = BITv07_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BITv07_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BITv07_endOfDStream() :
+*   @return Tells if DStream has exactly reached its end (all bits consumed).
+*/
+MEM_STATIC unsigned BITv07_endOfDStream(const BITv07_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef FSEv07_H
+#define FSEv07_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+
+/*! FSEv07_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSEv07_isError() .
+
+    ** Important ** : FSEv07_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+size_t FSEv07_decompress(void* dst,  size_t dstCapacity,
+                const void* cSrc, size_t cSrcSize);
+
+
+/* Error Management */
+unsigned    FSEv07_isError(size_t code);        /* tells if a return value is an error code */
+const char* FSEv07_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSEv07_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSEv07_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSEv07_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+size_t FSEv07_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSEv07_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSEv07_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSEv07_DTable* FSEv07_createDTable(unsigned tableLog);
+void        FSEv07_freeDTable(FSEv07_DTable* dt);
+
+/*! FSEv07_buildDTable():
+    Builds 'dt', which must be already allocated, using FSEv07_createDTable().
+    return : 0, or an errorCode, which can be tested using FSEv07_isError() */
+size_t FSEv07_buildDTable (FSEv07_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSEv07_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSEv07_isError() */
+size_t FSEv07_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSEv07_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSEv07_readNCount() if it was saved using FSEv07_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSEv07_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSEv07_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError().
+
+The next step is to build the decompression tables 'FSEv07_DTable' from 'normalizedCounter'.
+This is performed by the function FSEv07_buildDTable().
+The space required by 'FSEv07_DTable' must be already allocated using FSEv07_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError().
+
+`FSEv07_DTable` can then be used to decompress `cSrc`, with FSEv07_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSEv07_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSEv07_isError(). (ex: dst buffer too small)
+*/
+
+
+#ifdef FSEv07_STATIC_LINKING_ONLY
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSEv07_NCOUNTBOUND 512
+#define FSEv07_BLOCKBOUND(size) (size + (size>>7))
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of unsigned using below macros */
+#define FSEv07_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+size_t FSEv07_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+/**< same as FSEv07_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr  */
+
+unsigned FSEv07_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSEv07_optimalTableLog(), which used `minus==2` */
+
+size_t FSEv07_buildDTable_raw (FSEv07_DTable* dt, unsigned nbBits);
+/**< build a fake FSEv07_DTable, designed to read an uncompressed bitstream where each symbol uses nbBits */
+
+size_t FSEv07_buildDTable_rle (FSEv07_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSEv07_DTable, designed to always generate the same symbolValue */
+
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct
+{
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSEv07_DState_t;
+
+
+static void     FSEv07_initDState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD, const FSEv07_DTable* dt);
+
+static unsigned char FSEv07_decodeSymbol(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD);
+
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSEv07_decodeSymbolFast(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSEv07_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSEv07_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSEv07_initDState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD, const FSEv07_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv07_DTableHeader* const DTableH = (const FSEv07_DTableHeader*)ptr;
+    DStatePtr->state = BITv07_readBits(bitD, DTableH->tableLog);
+    BITv07_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSEv07_peekSymbol(const FSEv07_DState_t* DStatePtr)
+{
+    FSEv07_decode_t const DInfo = ((const FSEv07_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSEv07_updateState(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    FSEv07_decode_t const DInfo = ((const FSEv07_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BITv07_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSEv07_decodeSymbol(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    FSEv07_decode_t const DInfo = ((const FSEv07_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BITv07_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSEv07_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSEv07_decodeSymbolFast(FSEv07_DState_t* DStatePtr, BITv07_DStream_t* bitD)
+{
+    FSEv07_decode_t const DInfo = ((const FSEv07_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BITv07_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+
+
+#ifndef FSEv07_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#define FSEv07_MAX_MEMORY_USAGE 14
+#define FSEv07_DEFAULT_MEMORY_USAGE 13
+
+/*!FSEv07_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#define FSEv07_MAX_SYMBOL_VALUE 255
+
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSEv07_FUNCTION_TYPE BYTE
+#define FSEv07_FUNCTION_EXTENSION
+#define FSEv07_DECODE_TYPE FSEv07_decode_t
+
+
+#endif   /* !FSEv07_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSEv07_MAX_TABLELOG  (FSEv07_MAX_MEMORY_USAGE-2)
+#define FSEv07_MAX_TABLESIZE (1U<<FSEv07_MAX_TABLELOG)
+#define FSEv07_MAXTABLESIZE_MASK (FSEv07_MAX_TABLESIZE-1)
+#define FSEv07_DEFAULT_TABLELOG (FSEv07_DEFAULT_MEMORY_USAGE-2)
+#define FSEv07_MIN_TABLELOG 5
+
+#define FSEv07_TABLELOG_ABSOLUTE_MAX 15
+#if FSEv07_MAX_TABLELOG > FSEv07_TABLELOG_ABSOLUTE_MAX
+#  error "FSEv07_MAX_TABLELOG > FSEv07_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSEv07_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSEv07_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSEv07_H */
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUFv07_H_298734234
+#define HUFv07_H_298734234
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* *** simple functions *** */
+/**
+HUFv07_decompress() :
+    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated buffer 'dst', of minimum size 'dstSize'.
+    `dstSize` : **must** be the ***exact*** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUFv07_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== dstSize),
+              or an error code, which can be tested using HUFv07_isError()
+*/
+size_t HUFv07_decompress(void* dst,  size_t dstSize,
+                const void* cSrc, size_t cSrcSize);
+
+
+/* ****************************************
+*  Tool functions
+******************************************/
+#define HUFv07_BLOCKSIZE_MAX (128 * 1024)
+
+/* Error Management */
+unsigned    HUFv07_isError(size_t code);        /**< tells if a return value is an error code */
+const char* HUFv07_getErrorName(size_t code);   /**< provides error code string (useful for debugging) */
+
+
+/* *** Advanced function *** */
+
+
+#ifdef HUFv07_STATIC_LINKING_ONLY
+
+
+/* *** Constants *** */
+#define HUFv07_TABLELOG_ABSOLUTEMAX  16   /* absolute limit of HUFv07_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUFv07_TABLELOG_MAX  12           /* max configured tableLog (for static allocation); can be modified up to HUFv07_ABSOLUTEMAX_TABLELOG */
+#define HUFv07_TABLELOG_DEFAULT  11       /* tableLog by default, when not specified */
+#define HUFv07_SYMBOLVALUE_MAX 255
+#if (HUFv07_TABLELOG_MAX > HUFv07_TABLELOG_ABSOLUTEMAX)
+#  error "HUFv07_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUFv07_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUFv07_DTable;
+#define HUFv07_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUFv07_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUFv07_DTable DTable[HUFv07_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1)*0x1000001) }
+#define HUFv07_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        HUFv07_DTable DTable[HUFv07_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog)*0x1000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUFv07_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUFv07_decompress4X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUFv07_decompress4X_hufOnly(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUFv07_decompress4X2_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress4X4_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUFv07_decompress1X2_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUFv07_decompress1X4_DCtx(HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+*  HUF detailed API
+******************************************/
+/*!
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSEv07_count() : find it within "fse.h" */
+
+/*! HUFv07_readStats() :
+    Read compact Huffman tree, saved by HUFv07_writeCTable().
+    `huffWeight` is destination buffer.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUFv07_readCTable() and HUFv07_readDTableXn() . */
+size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+
+/*
+HUFv07_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUFv07_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUFv07_decompressSXn_usingDTable
+*/
+
+/** HUFv07_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUFv07_decompress4X2, 1==HUFv07_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUFv07_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+size_t HUFv07_readDTableX2 (HUFv07_DTable* DTable, const void* src, size_t srcSize);
+size_t HUFv07_readDTableX4 (HUFv07_DTable* DTable, const void* src, size_t srcSize);
+
+size_t HUFv07_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+
+
+/* single stream variants */
+size_t HUFv07_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUFv07_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUFv07_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+size_t HUFv07_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUFv07_DTable* DTable);
+
+
+#endif /* HUFv07_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUFv07_H_298734234 */
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+
+
+/*-****************************************
+*  FSE Error Management
+******************************************/
+unsigned FSEv07_isError(size_t code) { return ERR_isError(code); }
+
+const char* FSEv07_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/* **************************************************************
+*  HUF Error Management
+****************************************************************/
+unsigned HUFv07_isError(size_t code) { return ERR_isError(code); }
+
+const char* HUFv07_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+static short FSEv07_abs(short a) { return (short)(a<0 ? -a : a); }
+
+size_t FSEv07_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSEv07_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSEv07_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0+=24;
+                if (ip < iend-5) {
+                    ip+=2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount+=16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {   short const max = (short)((2*threshold-1)-remaining);
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            } else {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSEv07_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) && (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(GENERIC);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return ERROR(srcSize_wrong);
+    return ip-istart;
+}
+
+
+/*! HUFv07_readStats() :
+    Read compact Huffman tree, saved by HUFv07_writeCTable().
+    `huffWeight` is destination buffer.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUFv07_readCTable() and HUFv07_readDTableXn() .
+*/
+size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    //memset(huffWeight, 0, hwSize);   /* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128)  { /* special header */
+        if (iSize >= (242)) {  /* RLE */
+            static U32 l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, hwSize);
+            iSize = 0;
+        }
+        else {   /* Incompressible */
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+            if (oSize >= hwSize) return ERROR(corruption_detected);
+            ip += 1;
+            {   U32 n;
+                for (n=0; n<oSize; n+=2) {
+                    huffWeight[n]   = ip[n/2] >> 4;
+                    huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSEv07_decompress(huffWeight, hwSize-1, ip+1, iSize);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSEv07_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUFv07_TABLELOG_ABSOLUTEMAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BITv07_highbit32(weightTotal) + 1;
+        if (tableLog > HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BITv07_highbit32(rest);
+            U32 const lastWeight = BITv07_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSEv07_isError ERR_isError
+#define FSEv07_STATIC_ASSERT(c) { enum { FSEv07_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Complex types
+****************************************************************/
+typedef U32 DTable_max_t[FSEv07_DTABLE_SIZE_U32(FSEv07_MAX_TABLELOG)];
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSEv07_FUNCTION_EXTENSION
+#  error "FSEv07_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSEv07_FUNCTION_TYPE
+#  error "FSEv07_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSEv07_CAT(X,Y) X##Y
+#define FSEv07_FUNCTION_NAME(X,Y) FSEv07_CAT(X,Y)
+#define FSEv07_TYPE_NAME(X,Y) FSEv07_CAT(X,Y)
+
+
+/* Function templates */
+FSEv07_DTable* FSEv07_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSEv07_TABLELOG_ABSOLUTE_MAX) tableLog = FSEv07_TABLELOG_ABSOLUTE_MAX;
+    return (FSEv07_DTable*)malloc( FSEv07_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSEv07_freeDTable (FSEv07_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSEv07_buildDTable(FSEv07_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSEv07_DECODE_TYPE* const tableDecode = (FSEv07_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSEv07_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSEv07_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSEv07_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSEv07_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSEv07_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSEv07_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSEv07_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSEv07_FUNCTION_TYPE const symbol = (FSEv07_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BITv07_highbit32 ((U32)nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+
+#ifndef FSEv07_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSEv07_buildDTable_rle (FSEv07_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSEv07_DTableHeader* const DTableH = (FSEv07_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv07_decode_t* const cell = (FSEv07_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSEv07_buildDTable_raw (FSEv07_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSEv07_DTableHeader* const DTableH = (FSEv07_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSEv07_decode_t* const dinfo = (FSEv07_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSEv07_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSEv07_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BITv07_DStream_t bitD;
+    FSEv07_DState_t state1;
+    FSEv07_DState_t state2;
+
+    /* Init */
+    { size_t const errorCode = BITv07_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+      if (FSEv07_isError(errorCode)) return errorCode; }
+
+    FSEv07_initDState(&state1, &bitD, dt);
+    FSEv07_initDState(&state2, &bitD, dt);
+
+#define FSEv07_GETSYMBOL(statePtr) fast ? FSEv07_decodeSymbolFast(statePtr, &bitD) : FSEv07_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BITv07_reloadDStream(&bitD)==BITv07_DStream_unfinished) && (op<olimit) ; op+=4) {
+        op[0] = FSEv07_GETSYMBOL(&state1);
+
+        if (FSEv07_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv07_reloadDStream(&bitD);
+
+        op[1] = FSEv07_GETSYMBOL(&state2);
+
+        if (FSEv07_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BITv07_reloadDStream(&bitD) > BITv07_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSEv07_GETSYMBOL(&state1);
+
+        if (FSEv07_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BITv07_reloadDStream(&bitD);
+
+        op[3] = FSEv07_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BITv07_reloadDStream(&bitD) >= FSEv07_DStream_partiallyFilled; Ends at exactly BITv07_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv07_GETSYMBOL(&state1);
+
+        if (BITv07_reloadDStream(&bitD)==BITv07_DStream_overflow) {
+            *op++ = FSEv07_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+
+        *op++ = FSEv07_GETSYMBOL(&state2);
+
+        if (BITv07_reloadDStream(&bitD)==BITv07_DStream_overflow) {
+            *op++ = FSEv07_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSEv07_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSEv07_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSEv07_DTableHeader* DTableH = (const FSEv07_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSEv07_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSEv07_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSEv07_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSEv07_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSEv07_MAX_SYMBOL_VALUE;
+
+    if (cSrcSize<2) return ERROR(srcSize_wrong);   /* too small input size */
+
+    /* normal FSE decoding mode */
+    {   size_t const NCountLength = FSEv07_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+        if (FSEv07_isError(NCountLength)) return NCountLength;
+        if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size */
+        ip += NCountLength;
+        cSrcSize -= NCountLength;
+    }
+
+    { size_t const errorCode = FSEv07_buildDTable (dt, counting, maxSymbolValue, tableLog);
+      if (FSEv07_isError(errorCode)) return errorCode; }
+
+    return FSEv07_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);   /* always return, even if it is an error code */
+}
+
+
+
+#endif   /* FSEv07_COMMONDEFS_ONLY */
+
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUFv07_STATIC_ASSERT(c) { enum { HUFv07_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUFv07_getDTableDesc(const HUFv07_DTable* table)
+{
+    DTableDesc dtd;
+    memcpy(&dtd, table, sizeof(dtd));
+    return dtd;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+typedef struct { BYTE byte; BYTE nbBits; } HUFv07_DEltX2;   /* single-symbol decoding */
+
+size_t HUFv07_readDTableX2 (HUFv07_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUFv07_SYMBOLVALUE_MAX + 1];
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;
+    HUFv07_DEltX2* const dt = (HUFv07_DEltX2*)dtPtr;
+
+    HUFv07_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUFv07_DTable));
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv07_readStats(huffWeight, HUFv07_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv07_isError(iSize)) return iSize;
+
+    /* Table header */
+    {   DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Prepare ranks */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill DTable */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;
+            U32 i;
+            HUFv07_DEltX2 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (i = rankVal[w]; i < rankVal[w] + length; i++)
+                dt[i] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+
+static BYTE HUFv07_decodeSymbolX2(BITv07_DStream_t* Dstream, const HUFv07_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BITv07_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BITv07_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUFv07_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUFv07_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv07_TABLELOG_MAX<=12)) \
+        HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUFv07_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUFv07_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUFv07_decodeStreamX2(BYTE* p, BITv07_DStream_t* const bitDPtr, BYTE* const pEnd, const HUFv07_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p <= pEnd-4)) {
+        HUFv07_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p < pEnd))
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUFv07_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+static size_t HUFv07_decompress1X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const void* dtPtr = DTable + 1;
+    const HUFv07_DEltX2* const dt = (const HUFv07_DEltX2*)dtPtr;
+    BITv07_DStream_t bitD;
+    DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    { size_t const errorCode = BITv07_initDStream(&bitD, cSrc, cSrcSize);
+      if (HUFv07_isError(errorCode)) return errorCode; }
+
+    HUFv07_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    /* check */
+    if (!BITv07_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+size_t HUFv07_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUFv07_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUFv07_decompress1X2_DCtx (HUFv07_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUFv07_readDTableX2 (DCtx, cSrc, cSrcSize);
+    if (HUFv07_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUFv07_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+size_t HUFv07_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX2(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+static size_t HUFv07_decompress4X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUFv07_DEltX2* const dt = (const HUFv07_DEltX2*)dtPtr;
+
+        /* Init */
+        BITv07_DStream_t bitD1;
+        BITv07_DStream_t bitD2;
+        BITv07_DStream_t bitD3;
+        BITv07_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BITv07_initDStream(&bitD1, istart1, length1);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD2, istart2, length2);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD3, istart3, length3);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD4, istart4, length4);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv07_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv07_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv07_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv07_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv07_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv07_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BITv07_endOfDStream(&bitD1) & BITv07_endOfDStream(&bitD2) & BITv07_endOfDStream(&bitD3) & BITv07_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv07_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUFv07_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+
+size_t HUFv07_decompress4X2_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUFv07_readDTableX2 (dctx, cSrc, cSrcSize);
+    if (HUFv07_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUFv07_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
+}
+
+size_t HUFv07_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX2(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUFv07_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+static void HUFv07_fillDTableX4Level2(HUFv07_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUFv07_DEltX4 DElt;
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    { U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+        const U32 symbol = sortedSymbols[s].symbol;
+        const U32 weight = sortedSymbols[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 length = 1 << (sizeLog-nbBits);
+        const U32 start = rankVal[weight];
+        U32 i = start;
+        const U32 end = start + length;
+
+        MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+        DElt.nbBits = (BYTE)(nbBits + consumed);
+        DElt.length = 2;
+        do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+        rankVal[weight] += length;
+    }}
+}
+
+typedef U32 rankVal_t[HUFv07_TABLELOG_ABSOLUTEMAX][HUFv07_TABLELOG_ABSOLUTEMAX + 1];
+
+static void HUFv07_fillDTableX4(HUFv07_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUFv07_TABLELOG_ABSOLUTEMAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUFv07_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUFv07_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 u;
+                const U32 end = start + length;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUFv07_readDTableX4 (HUFv07_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUFv07_SYMBOLVALUE_MAX + 1];
+    sortedSymbol_t sortedSymbol[HUFv07_SYMBOLVALUE_MAX + 1];
+    U32 rankStats[HUFv07_TABLELOG_ABSOLUTEMAX + 1] = { 0 };
+    U32 rankStart0[HUFv07_TABLELOG_ABSOLUTEMAX + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUFv07_DEltX4* const dt = (HUFv07_DEltX4*)dtPtr;
+
+    HUFv07_STATIC_ASSERT(sizeof(HUFv07_DEltX4) == sizeof(HUFv07_DTable));   /* if compilation fails here, assertion is false */
+    if (maxTableLog > HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(tableLog_tooLarge);
+    //memset(weightList, 0, sizeof(weightList));   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUFv07_readStats(weightList, HUFv07_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUFv07_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUFv07_fillDTableX4(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+
+static U32 HUFv07_decodeSymbolX4(void* op, BITv07_DStream_t* DStream, const HUFv07_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv07_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BITv07_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUFv07_decodeLastSymbolX4(void* op, BITv07_DStream_t* DStream, const HUFv07_DEltX4* dt, const U32 dtLog)
+{
+    const size_t val = BITv07_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BITv07_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BITv07_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }   }
+    return 1;
+}
+
+
+#define HUFv07_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv07_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUFv07_TABLELOG_MAX<=12)) \
+        ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUFv07_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUFv07_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUFv07_decodeStreamX4(BYTE* p, BITv07_DStream_t* bitDPtr, BYTE* const pEnd, const HUFv07_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p < pEnd-7)) {
+        HUFv07_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BITv07_reloadDStream(bitDPtr) == BITv07_DStream_unfinished) && (p <= pEnd-2))
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUFv07_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUFv07_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+static size_t HUFv07_decompress1X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    BITv07_DStream_t bitD;
+
+    /* Init */
+    {   size_t const errorCode = BITv07_initDStream(&bitD, cSrc, cSrcSize);
+        if (HUFv07_isError(errorCode)) return errorCode;
+    }
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUFv07_DEltX4* const dt = (const HUFv07_DEltX4*)dtPtr;
+        DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+        HUFv07_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BITv07_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+size_t HUFv07_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUFv07_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUFv07_decompress1X4_DCtx (HUFv07_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUFv07_readDTableX4 (DCtx, cSrc, cSrcSize);
+    if (HUFv07_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUFv07_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+size_t HUFv07_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX4(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+static size_t HUFv07_decompress4X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUFv07_DEltX4* const dt = (const HUFv07_DEltX4*)dtPtr;
+
+        /* Init */
+        BITv07_DStream_t bitD1;
+        BITv07_DStream_t bitD2;
+        BITv07_DStream_t bitD3;
+        BITv07_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BITv07_initDStream(&bitD1, istart1, length1);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD2, istart2, length2);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD3, istart3, length3);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BITv07_initDStream(&bitD4, istart4, length4);
+          if (HUFv07_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        for ( ; (endSignal==BITv07_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUFv07_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUFv07_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUFv07_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUFv07_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUFv07_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BITv07_reloadDStream(&bitD1) | BITv07_reloadDStream(&bitD2) | BITv07_reloadDStream(&bitD3) | BITv07_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUFv07_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUFv07_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUFv07_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUFv07_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BITv07_endOfDStream(&bitD1) & BITv07_endOfDStream(&bitD2) & BITv07_endOfDStream(&bitD3) & BITv07_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUFv07_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUFv07_DTable* DTable)
+{
+    DTableDesc dtd = HUFv07_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUFv07_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+
+size_t HUFv07_decompress4X4_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUFv07_readDTableX4 (dctx, cSrc, cSrcSize);
+    if (HUFv07_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUFv07_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+size_t HUFv07_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUFv07_CREATE_STATIC_DTABLEX4(DTable, HUFv07_TABLELOG_MAX);
+    return HUFv07_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUFv07_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUFv07_DTable* DTable)
+{
+    DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+    return dtd.tableType ? HUFv07_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
+                           HUFv07_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUFv07_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUFv07_DTable* DTable)
+{
+    DTableDesc const dtd = HUFv07_getDTableDesc(DTable);
+    return dtd.tableType ? HUFv07_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
+                           HUFv07_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUFv07_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUFv07_decompress4X2, 1==HUFv07_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUFv07_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    /* decoder timing evaluation */
+    U32 const Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    U32 const D256 = (U32)(dstSize >> 8);
+    U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+    U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+    DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, for cache eviction */
+
+    return DTime1 < DTime0;
+}
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUFv07_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[2] = { HUFv07_decompress4X2, HUFv07_decompress4X4 };
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUFv07_selectDecoder(dstSize, cSrcSize);
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+
+    //return HUFv07_decompress4X2(dst, dstSize, cSrc, cSrcSize);   /* multi-streams single-symbol decoding */
+    //return HUFv07_decompress4X4(dst, dstSize, cSrc, cSrcSize);   /* multi-streams double-symbols decoding */
+}
+
+size_t HUFv07_decompress4X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUFv07_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUFv07_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUFv07_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUFv07_decompress4X_hufOnly (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) return ERROR(corruption_detected);   /* invalid */
+
+    {   U32 const algoNb = HUFv07_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUFv07_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUFv07_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUFv07_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUFv07_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUFv07_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+/*
+    Common functions of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTDv07_isError() :
+*   tells if a return value is an error code */
+unsigned ZSTDv07_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTDv07_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZSTDv07_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+
+/* **************************************************************
+*  ZBUFF Error Management
+****************************************************************/
+unsigned ZBUFFv07_isError(size_t errorCode) { return ERR_isError(errorCode); }
+
+const char* ZBUFFv07_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+
+
+void* ZSTDv07_defaultAllocFunction(void* opaque, size_t size)
+{
+    void* address = malloc(size);
+    (void)opaque;
+    /* printf("alloc %p, %d opaque=%p \n", address, (int)size, opaque); */
+    return address;
+}
+
+void ZSTDv07_defaultFreeFunction(void* opaque, void* address)
+{
+    (void)opaque;
+    /* if (address) printf("free %p opaque=%p \n", address, opaque); */
+    free(address);
+}
+/*
+    zstd_internal - common functions to include
+    Header File for include
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : https://www.zstd.net
+*/
+#ifndef ZSTDv07_CCOMMON_H_MODULE
+#define ZSTDv07_CCOMMON_H_MODULE
+
+
+/*-*************************************
+*  Common macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTDv07_OPT_NUM    (1<<12)
+#define ZSTDv07_DICT_MAGIC  0xEC30A437   /* v0.7 */
+
+#define ZSTDv07_REP_NUM    3
+#define ZSTDv07_REP_INIT   ZSTDv07_REP_NUM
+#define ZSTDv07_REP_MOVE   (ZSTDv07_REP_NUM-1)
+static const U32 repStartValue[ZSTDv07_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTDv07_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTDv07_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTDv07_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTDv07_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTDv07_blockHeaderSize = ZSTDv07_BLOCKHEADERSIZE;
+typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { lbt_huffman, lbt_repeat, lbt_raw, lbt_rle } litBlockType_t;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML  52
+#define MaxLL  35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+
+#define FSEv07_ENCODING_RAW     0
+#define FSEv07_ENCODING_RLE     1
+#define FSEv07_ENCODING_STATIC  2
+#define FSEv07_ENCODING_DYNAMIC 3
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+static const U32 LL_defaultNormLog = 6;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+static const U32 ML_defaultNormLog = 6;
+
+static const S16 OF_defaultNorm[MaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                              1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+static const U32 OF_defaultNormLog = 5;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTDv07_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+#define COPY8(d,s) { ZSTDv07_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTDv07_wildcopy() :
+*   custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTDv07_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct ZSTDv07_stats_s ZSTDv07_stats_t;
+
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTDv07_match_t;
+
+typedef struct {
+    U32 price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTDv07_REP_INIT];
+} ZSTDv07_optimal_t;
+
+struct ZSTDv07_stats_s { U32 unused; };
+
+typedef struct {
+    void* buffer;
+    U32*  offsetStart;
+    U32*  offset;
+    BYTE* offCodeStart;
+    BYTE* litStart;
+    BYTE* lit;
+    U16*  litLengthStart;
+    U16*  litLength;
+    BYTE* llCodeStart;
+    U16*  matchLengthStart;
+    U16*  matchLength;
+    BYTE* mlCodeStart;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+    /* opt */
+    ZSTDv07_optimal_t* priceTable;
+    ZSTDv07_match_t* matchTable;
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  matchSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+    U32  log2matchLengthSum;
+    U32  log2matchSum;
+    U32  log2litLengthSum;
+    U32  log2litSum;
+    U32  log2offCodeSum;
+    U32  factor;
+    U32  cachedPrice;
+    U32  cachedLitLength;
+    const BYTE* cachedLiterals;
+    ZSTDv07_stats_t stats;
+} seqStore_t;
+
+void ZSTDv07_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq);
+
+/* custom memory allocation functions */
+static const ZSTDv07_customMem defaultCustomMem = { ZSTDv07_defaultAllocFunction, ZSTDv07_defaultFreeFunction, NULL };
+
+#endif   /* ZSTDv07_CCOMMON_H_MODULE */
+/*
+    zstd - standard compression library
+    Copyright (C) 2014-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net
+*/
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTDv07_decompress() will allocate memory,
+ * in memory stack (0), or in memory heap (1, requires malloc())
+ */
+#ifndef ZSTDv07_HEAPMODE
+#  define ZSTDv07_HEAPMODE 1
+#endif
+
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#endif
+
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTDv07_isError ERR_isError   /* for inlining */
+#define FSEv07_isError  ERR_isError
+#define HUFv07_isError  ERR_isError
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTDv07_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTDv07_dStage;
+
+struct ZSTDv07_DCtx_s
+{
+    FSEv07_DTable LLTable[FSEv07_DTABLE_SIZE_U32(LLFSELog)];
+    FSEv07_DTable OffTable[FSEv07_DTABLE_SIZE_U32(OffFSELog)];
+    FSEv07_DTable MLTable[FSEv07_DTABLE_SIZE_U32(MLFSELog)];
+    HUFv07_DTable hufTable[HUFv07_DTABLE_SIZE(HufLog)];  /* can accommodate HUFv07_decompress4X */
+    const void* previousDstEnd;
+    const void* base;
+    const void* vBase;
+    const void* dictEnd;
+    size_t expected;
+    U32 rep[3];
+    ZSTDv07_frameParams fParams;
+    blockType_t bType;   /* used in ZSTDv07_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTDv07_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    U32 dictID;
+    const BYTE* litPtr;
+    ZSTDv07_customMem customMem;
+    size_t litSize;
+    BYTE litBuffer[ZSTDv07_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTDv07_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTDv07_DCtx within "zstd_static.h" */
+
+int ZSTDv07_isSkipFrame(ZSTDv07_DCtx* dctx);
+
+size_t ZSTDv07_sizeofDCtx (const ZSTDv07_DCtx* dctx) { return sizeof(*dctx); }
+
+size_t ZSTDv07_estimateDCtxSize(void) { return sizeof(ZSTDv07_DCtx); }
+
+size_t ZSTDv07_decompressBegin(ZSTDv07_DCtx* dctx)
+{
+    dctx->expected = ZSTDv07_frameHeaderSize_min;
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    dctx->hufTable[0] = (HUFv07_DTable)((HufLog)*0x1000001);
+    dctx->litEntropy = dctx->fseEntropy = 0;
+    dctx->dictID = 0;
+    { int i; for (i=0; i<ZSTDv07_REP_NUM; i++) dctx->rep[i] = repStartValue[i]; }
+    return 0;
+}
+
+ZSTDv07_DCtx* ZSTDv07_createDCtx_advanced(ZSTDv07_customMem customMem)
+{
+    ZSTDv07_DCtx* dctx;
+
+    if (!customMem.customAlloc && !customMem.customFree)
+        customMem = defaultCustomMem;
+
+    if (!customMem.customAlloc || !customMem.customFree)
+        return NULL;
+
+    dctx = (ZSTDv07_DCtx*) customMem.customAlloc(customMem.opaque, sizeof(ZSTDv07_DCtx));
+    if (!dctx) return NULL;
+    memcpy(&dctx->customMem, &customMem, sizeof(ZSTDv07_customMem));
+    ZSTDv07_decompressBegin(dctx);
+    return dctx;
+}
+
+ZSTDv07_DCtx* ZSTDv07_createDCtx(void)
+{
+    return ZSTDv07_createDCtx_advanced(defaultCustomMem);
+}
+
+size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support free on NULL */
+    dctx->customMem.customFree(dctx->customMem.opaque, dctx);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTDv07_copyDCtx(ZSTDv07_DCtx* dstDCtx, const ZSTDv07_DCtx* srcDCtx)
+{
+    memcpy(dstDCtx, srcDCtx,
+           sizeof(ZSTDv07_DCtx) - (ZSTDv07_BLOCKSIZE_ABSOLUTEMAX+WILDCOPY_OVERLENGTH + ZSTDv07_frameHeaderSize_max));  /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+*   Decompression section
+***************************************************************/
+
+/* Frame format description
+   Frame Header -  [ Block Header - Block ] - Frame End
+   1) Frame Header
+      - 4 bytes - Magic Number : ZSTDv07_MAGICNUMBER (defined within zstd.h)
+      - 1 byte  - Frame Descriptor
+   2) Block Header
+      - 3 bytes, starting with a 2-bits descriptor
+                 Uncompressed, Compressed, Frame End, unused
+   3) Block
+      See Block Format Description
+   4) Frame End
+      - 3 bytes, compatible with Block Header
+*/
+
+
+/* Frame Header :
+
+   1 byte - FrameHeaderDescription :
+   bit 0-1 : dictID (0, 1, 2 or 4 bytes)
+   bit 2   : checksumFlag
+   bit 3   : reserved (must be zero)
+   bit 4   : reserved (unused, can be any value)
+   bit 5   : Single Segment (if 1, WindowLog byte is not present)
+   bit 6-7 : FrameContentFieldSize (0, 2, 4, or 8)
+             if (SkippedWindowLog && !FrameContentFieldsize) FrameContentFieldsize=1;
+
+   Optional : WindowLog (0 or 1 byte)
+   bit 0-2 : octal Fractional (1/8th)
+   bit 3-7 : Power of 2, with 0 = 1 KB (up to 2 TB)
+
+   Optional : dictID (0, 1, 2 or 4 bytes)
+   Automatic adaptation
+   0 : no dictID
+   1 : 1 - 255
+   2 : 256 - 65535
+   4 : all other values
+
+   Optional : content size (0, 1, 2, 4 or 8 bytes)
+   0 : unknown          (fcfs==0 and swl==0)
+   1 : 0-255 bytes      (fcfs==0 and swl==1)
+   2 : 256 - 65535+256  (fcfs==1)
+   4 : 0 - 4GB-1        (fcfs==2)
+   8 : 0 - 16EB-1       (fcfs==3)
+*/
+
+
+/* Compressed Block, format description
+
+   Block = Literal Section - Sequences Section
+   Prerequisite : size of (compressed) block, maximum size of regenerated data
+
+   1) Literal Section
+
+   1.1) Header : 1-5 bytes
+        flags: 2 bits
+            00 compressed by Huff0
+            01 unused
+            10 is Raw (uncompressed)
+            11 is Rle
+            Note : using 01 => Huff0 with precomputed table ?
+            Note : delta map ? => compressed ?
+
+   1.1.1) Huff0-compressed literal block : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+   1.1.2) Raw (uncompressed) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RAW<<6) + (0<<4) + size
+               12 bits: (IS_RAW<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RAW<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.3) Rle (repeated single byte) literal block header : 1-3 bytes
+        size :  5 bits: (IS_RLE<<6) + (0<<4) + size
+               12 bits: (IS_RLE<<6) + (2<<4) + (size>>8)
+                        size&255
+               20 bits: (IS_RLE<<6) + (3<<4) + (size>>16)
+                        size>>8&255
+                        size&255
+
+   1.1.4) Huff0-compressed literal block, using precomputed CTables : 3-5 bytes
+            srcSize < 1 KB => 3 bytes (2-2-10-10) => single stream
+            srcSize < 1 KB => 3 bytes (2-2-10-10)
+            srcSize < 16KB => 4 bytes (2-2-14-14)
+            else           => 5 bytes (2-2-18-18)
+            big endian convention
+
+        1- CTable available (stored into workspace ?)
+        2- Small input (fast heuristic ? Full comparison ? depend on clevel ?)
+
+
+   1.2) Literal block content
+
+   1.2.1) Huff0 block, using sizes from header
+        See Huff0 format
+
+   1.2.2) Huff0 block, using prepared table
+
+   1.2.3) Raw content
+
+   1.2.4) single byte
+
+
+   2) Sequences section
+      TO DO
+*/
+
+/** ZSTDv07_frameHeaderSize() :
+*   srcSize must be >= ZSTDv07_frameHeaderSize_min.
+*   @return : size of the Frame Header */
+static size_t ZSTDv07_frameHeaderSize(const void* src, size_t srcSize)
+{
+    if (srcSize < ZSTDv07_frameHeaderSize_min) return ERROR(srcSize_wrong);
+    {   BYTE const fhd = ((const BYTE*)src)[4];
+        U32 const dictID= fhd & 3;
+        U32 const directMode = (fhd >> 5) & 1;
+        U32 const fcsId = fhd >> 6;
+        return ZSTDv07_frameHeaderSize_min + !directMode + ZSTDv07_did_fieldSize[dictID] + ZSTDv07_fcs_fieldSize[fcsId]
+                + (directMode && !ZSTDv07_fcs_fieldSize[fcsId]);
+    }
+}
+
+
+/** ZSTDv07_getFrameParams() :
+*   decode Frame Header, or require larger `srcSize`.
+*   @return : 0, `fparamsPtr` is correctly filled,
+*            >0, `srcSize` is too small, result is expected `srcSize`,
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize < ZSTDv07_frameHeaderSize_min) return ZSTDv07_frameHeaderSize_min;
+    if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) {
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
+            if (srcSize < ZSTDv07_skippableHeaderSize) return ZSTDv07_skippableHeaderSize; /* magic number + skippable frame length */
+            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+            fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
+            fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
+            return 0;
+        }
+        return ERROR(prefix_unknown);
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    { size_t const fhsize = ZSTDv07_frameHeaderSize(src, srcSize);
+      if (srcSize < fhsize) return fhsize; }
+
+    {   BYTE const fhdByte = ip[4];
+        size_t pos = 5;
+        U32 const dictIDSizeCode = fhdByte&3;
+        U32 const checksumFlag = (fhdByte>>2)&1;
+        U32 const directMode = (fhdByte>>5)&1;
+        U32 const fcsID = fhdByte>>6;
+        U32 const windowSizeMax = 1U << ZSTDv07_WINDOWLOG_MAX;
+        U32 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = 0;
+        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if (!directMode) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTDv07_WINDOWLOG_ABSOLUTEMIN;
+            if (windowLog > ZSTDv07_WINDOWLOG_MAX) return ERROR(frameParameter_unsupported);
+            windowSize = (1U << windowLog);
+            windowSize += (windowSize >> 3) * (wlByte&7);
+        }
+
+        switch(dictIDSizeCode)
+        {
+            default:   /* impossible */
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)
+        {
+            default:   /* impossible */
+            case 0 : if (directMode) frameContentSize = ip[pos]; break;
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (!windowSize) windowSize = (U32)frameContentSize;
+        if (windowSize > windowSizeMax) return ERROR(frameParameter_unsupported);
+        fparamsPtr->frameContentSize = frameContentSize;
+        fparamsPtr->windowSize = windowSize;
+        fparamsPtr->dictID = dictID;
+        fparamsPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
+
+
+/** ZSTDv07_getDecompressedSize() :
+*   compatible with legacy mode
+*   @return : decompressed size if known, 0 otherwise
+              note : 0 can mean any of the following :
+                   - decompressed size is not provided within frame header
+                   - frame header unknown / not supported
+                   - frame header not completely provided (`srcSize` too small) */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize)
+{
+    {   ZSTDv07_frameParams fparams;
+        size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
+        if (frResult!=0) return 0;
+        return fparams.frameContentSize;
+    }
+}
+
+
+/** ZSTDv07_decodeFrameHeader() :
+*   `srcSize` must be the size provided by ZSTDv07_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTDv07_isError() */
+static size_t ZSTDv07_decodeFrameHeader(ZSTDv07_DCtx* dctx, const void* src, size_t srcSize)
+{
+    size_t const result = ZSTDv07_getFrameParams(&(dctx->fParams), src, srcSize);
+    if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) return ERROR(dictionary_wrong);
+    if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);
+    return result;
+}
+
+
+typedef struct
+{
+    blockType_t blockType;
+    U32 origSize;
+} blockProperties_t;
+
+/*! ZSTDv07_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    const BYTE* const in = (const BYTE* const)src;
+    U32 cSize;
+
+    if (srcSize < ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    bpPtr->blockType = (blockType_t)((*in) >> 6);
+    cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
+    bpPtr->origSize = (bpPtr->blockType == bt_rle) ? cSize : 0;
+
+    if (bpPtr->blockType == bt_end) return 0;
+    if (bpPtr->blockType == bt_rle) return 1;
+    return cSize;
+}
+
+
+static size_t ZSTDv07_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+/*! ZSTDv07_decodeLiteralsBlock() :
+    @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTDv07_decodeLiteralsBlock(ZSTDv07_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    const BYTE* const istart = (const BYTE*) src;
+
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    switch((litBlockType_t)(istart[0]>> 6))
+    {
+    case lbt_huffman:
+        {   size_t litSize, litCSize, singleStream=0;
+            U32 lhSize = (istart[0] >> 4) & 3;
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for lhSize, + cSize (+nbSeq) */
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                /* 2 - 2 - 10 - 10 */
+                lhSize=3;
+                singleStream = istart[0] & 16;
+                litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+                litCSize = ((istart[1] &  3) << 8) + istart[2];
+                break;
+            case 2:
+                /* 2 - 2 - 14 - 14 */
+                lhSize=4;
+                litSize  = ((istart[0] & 15) << 10) + (istart[1] << 2) + (istart[2] >> 6);
+                litCSize = ((istart[2] & 63) <<  8) + istart[3];
+                break;
+            case 3:
+                /* 2 - 2 - 18 - 18 */
+                lhSize=5;
+                litSize  = ((istart[0] & 15) << 14) + (istart[1] << 6) + (istart[2] >> 2);
+                litCSize = ((istart[2] &  3) << 16) + (istart[3] << 8) + istart[4];
+                break;
+            }
+            if (litSize > ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            if (HUFv07_isError(singleStream ?
+                            HUFv07_decompress1X2_DCtx(dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                            HUFv07_decompress4X_hufOnly (dctx->hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) ))
+                return ERROR(corruption_detected);
+
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            dctx->litEntropy = 1;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case lbt_repeat:
+        {   size_t litSize, litCSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            if (lhSize != 1)  /* only case supported for now : small litSize, single stream */
+                return ERROR(corruption_detected);
+            if (dctx->litEntropy==0)
+                return ERROR(dictionary_corrupted);
+
+            /* 2 - 2 - 10 - 10 */
+            lhSize=3;
+            litSize  = ((istart[0] & 15) << 6) + (istart[1] >> 2);
+            litCSize = ((istart[1] &  3) << 8) + istart[2];
+            if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+            {   size_t const errorCode = HUFv07_decompress1X4_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTable);
+                if (HUFv07_isError(errorCode)) return ERROR(corruption_detected);
+            }
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+            return litCSize + lhSize;
+        }
+    case lbt_raw:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize=1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                break;
+            }
+
+            if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return lhSize+litSize;
+            }
+            /* direct reference into compressed stream */
+            dctx->litPtr = istart+lhSize;
+            dctx->litSize = litSize;
+            return lhSize+litSize;
+        }
+    case lbt_rle:
+        {   size_t litSize;
+            U32 lhSize = ((istart[0]) >> 4) & 3;
+            switch(lhSize)
+            {
+            case 0: case 1: default:   /* note : default is impossible, since lhSize into [0..3] */
+                lhSize = 1;
+                litSize = istart[0] & 31;
+                break;
+            case 2:
+                litSize = ((istart[0] & 15) << 8) + istart[1];
+                break;
+            case 3:
+                litSize = ((istart[0] & 15) << 16) + (istart[1] << 8) + istart[2];
+                if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                break;
+            }
+            if (litSize > ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);
+            memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+            dctx->litPtr = dctx->litBuffer;
+            dctx->litSize = litSize;
+            return lhSize+1;
+        }
+    default:
+        return ERROR(corruption_detected);   /* impossible */
+    }
+}
+
+
+/*! ZSTDv07_buildSeqTable() :
+    @return : nb bytes read from src,
+              or an error code if it fails, testable with ZSTDv07_isError()
+*/
+size_t ZSTDv07_buildSeqTable(FSEv07_DTable* DTable, U32 type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable)
+{
+    switch(type)
+    {
+    case FSEv07_ENCODING_RLE :
+        if (!srcSize) return ERROR(srcSize_wrong);
+        if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        FSEv07_buildDTable_rle(DTable, *(const BYTE*)src);   /* if *src > max, data is corrupted */
+        return 1;
+    case FSEv07_ENCODING_RAW :
+        FSEv07_buildDTable(DTable, defaultNorm, max, defaultLog);
+        return 0;
+    case FSEv07_ENCODING_STATIC:
+        if (!flagRepeatTable) return ERROR(corruption_detected);
+        return 0;
+    default :   /* impossible */
+    case FSEv07_ENCODING_DYNAMIC :
+        {   U32 tableLog;
+            S16 norm[MaxSeq+1];
+            size_t const headerSize = FSEv07_readNCount(norm, &max, &tableLog, src, srcSize);
+            if (FSEv07_isError(headerSize)) return ERROR(corruption_detected);
+            if (tableLog > maxLog) return ERROR(corruption_detected);
+            FSEv07_buildDTable(DTable, norm, max, tableLog);
+            return headerSize;
+    }   }
+}
+
+
+size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr,
+                             FSEv07_DTable* DTableLL, FSEv07_DTable* DTableML, FSEv07_DTable* DTableOffb, U32 flagRepeatTable,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {
+                if (ip+2 > iend) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+            } else {
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    {   U32 const LLtype  = *ip >> 6;
+        U32 const OFtype = (*ip >> 4) & 3;
+        U32 const MLtype  = (*ip >> 2) & 3;
+        ip++;
+
+        /* check */
+        if (ip > iend-3) return ERROR(srcSize_wrong); /* min : all 3 are "raw", hence no header, but at least xxLog bits per type */
+
+        /* Build DTables */
+        {   size_t const llhSize = ZSTDv07_buildSeqTable(DTableLL, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_defaultNorm, LL_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(llhSize)) return ERROR(corruption_detected);
+            ip += llhSize;
+        }
+        {   size_t const ofhSize = ZSTDv07_buildSeqTable(DTableOffb, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_defaultNorm, OF_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(ofhSize)) return ERROR(corruption_detected);
+            ip += ofhSize;
+        }
+        {   size_t const mlhSize = ZSTDv07_buildSeqTable(DTableML, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_defaultNorm, ML_defaultNormLog, flagRepeatTable);
+            if (ZSTDv07_isError(mlhSize)) return ERROR(corruption_detected);
+            ip += mlhSize;
+    }   }
+
+    return ip-istart;
+}
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+} seq_t;
+
+typedef struct {
+    BITv07_DStream_t DStream;
+    FSEv07_DState_t stateLL;
+    FSEv07_DState_t stateOffb;
+    FSEv07_DState_t stateML;
+    size_t prevOffset[ZSTDv07_REP_INIT];
+} seqState_t;
+
+
+static seq_t ZSTDv07_decodeSequence(seqState_t* seqState)
+{
+    seq_t seq;
+
+    U32 const llCode = FSEv07_peekSymbol(&(seqState->stateLL));
+    U32 const mlCode = FSEv07_peekSymbol(&(seqState->stateML));
+    U32 const ofCode = FSEv07_peekSymbol(&(seqState->stateOffb));   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                            0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            offset = OF_base[ofCode] + BITv07_readBits(&(seqState->DStream), ofBits);   /* <=  (ZSTDv07_WINDOWLOG_MAX-1) bits */
+            if (MEM_32bits()) BITv07_reloadDStream(&(seqState->DStream));
+        }
+
+        if (ofCode <= 1) {
+            if ((llCode == 0) & (offset <= 1)) offset = 1-offset;
+            if (offset) {
+                size_t const temp = seqState->prevOffset[offset];
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BITv07_readBits(&(seqState->DStream), mlBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BITv07_reloadDStream(&(seqState->DStream));
+
+    seq.litLength = LL_base[llCode] + ((llCode>15) ? BITv07_readBits(&(seqState->DStream), llBits) : 0);   /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BITv07_reloadDStream(&(seqState->DStream));
+
+    /* ANS state update */
+    FSEv07_updateState(&(seqState->stateLL), &(seqState->DStream));   /* <=  9 bits */
+    FSEv07_updateState(&(seqState->stateML), &(seqState->DStream));   /* <=  9 bits */
+    if (MEM_32bits()) BITv07_reloadDStream(&(seqState->DStream));     /* <= 18 bits */
+    FSEv07_updateState(&(seqState->stateOffb), &(seqState->DStream)); /* <=  8 bits */
+
+    return seq;
+}
+
+
+static
+size_t ZSTDv07_execSequence(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend-WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if ((oLitEnd>oend_w) | (oMatchEnd>oend)) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+
+    /* copy Literals */
+    ZSTDv07_wildcopy(op, *litPtr, sequence.litLength);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              while (op < oMatchEnd) *op++ = *match++;
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* substracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTDv07_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTDv07_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTDv07_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTDv07_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+
+static size_t ZSTDv07_decompressSequences(
+                               ZSTDv07_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    FSEv07_DTable* DTableLL = dctx->LLTable;
+    FSEv07_DTable* DTableML = dctx->MLTable;
+    FSEv07_DTable* DTableOffb = dctx->OffTable;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTDv07_decodeSeqHeaders(&nbSeq, DTableLL, DTableML, DTableOffb, dctx->fseEntropy, ip, seqSize);
+        if (ZSTDv07_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTDv07_REP_INIT; i++) seqState.prevOffset[i] = dctx->rep[i]; }
+        { size_t const errorCode = BITv07_initDStream(&(seqState.DStream), ip, iend-ip);
+          if (ERR_isError(errorCode)) return ERROR(corruption_detected); }
+        FSEv07_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSEv07_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSEv07_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
+
+        for ( ; (BITv07_reloadDStream(&(seqState.DStream)) <= BITv07_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            {   seq_t const sequence = ZSTDv07_decodeSequence(&seqState);
+                size_t const oneSeqSize = ZSTDv07_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                if (ZSTDv07_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTDv07_REP_INIT; i++) dctx->rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        //if (litPtr > litEnd) return ERROR(corruption_detected);   /* too many literals already used */
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+
+static void ZSTDv07_checkContinuity(ZSTDv07_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+
+static size_t ZSTDv07_decompressBlock_internal(ZSTDv07_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize >= ZSTDv07_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals sub-block */
+    {   size_t const litCSize = ZSTDv07_decodeLiteralsBlock(dctx, src, srcSize);
+        if (ZSTDv07_isError(litCSize)) return litCSize;
+        ip += litCSize;
+        srcSize -= litCSize;
+    }
+    return ZSTDv07_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+
+
+size_t ZSTDv07_decompressBlock(ZSTDv07_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t dSize;
+    ZSTDv07_checkContinuity(dctx, dst);
+    dSize = ZSTDv07_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+    dctx->previousDstEnd = (char*)dst + dSize;
+    return dSize;
+}
+
+
+/** ZSTDv07_insertBlock() :
+    insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    ZSTDv07_checkContinuity(dctx, blockStart);
+    dctx->previousDstEnd = (const char*)blockStart + blockSize;
+    return blockSize;
+}
+
+
+size_t ZSTDv07_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+    if (length > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, byte, length);
+    return length;
+}
+
+
+/*! ZSTDv07_decompressFrame() :
+*   `dctx` must be properly initialized */
+static size_t ZSTDv07_decompressFrame(ZSTDv07_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t remainingSize = srcSize;
+
+    /* check */
+    if (srcSize < ZSTDv07_frameHeaderSize_min+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(frameHeaderSize)) return frameHeaderSize;
+        if (srcSize < frameHeaderSize+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+        if (ZSTDv07_decodeFrameHeader(dctx, src, frameHeaderSize)) return ERROR(corruption_detected);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTDv07_getcBlockSize(ip, iend-ip, &blockProperties);
+        if (ZSTDv07_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv07_blockHeaderSize;
+        remainingSize -= ZSTDv07_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTDv07_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTDv07_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            decodedSize = ZSTDv07_generateNxBytes(op, oend-op, *ip, blockProperties.origSize);
+            break;
+        case bt_end :
+            /* end of frame */
+            if (remainingSize) return ERROR(srcSize_wrong);
+            decodedSize = 0;
+            break;
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+        if (blockProperties.blockType == bt_end) break;   /* bt_end */
+
+        if (ZSTDv07_isError(decodedSize)) return decodedSize;
+        if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, op, decodedSize);
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return op-ostart;
+}
+
+
+/*! ZSTDv07_decompress_usingPreparedDCtx() :
+*   Same as ZSTDv07_decompress_usingDict, but using a reference context `preparedDCtx`, where dictionary has been loaded.
+*   It avoids reloading the dictionary each time.
+*   `preparedDCtx` must have been properly initialized using ZSTDv07_decompressBegin_usingDict().
+*   Requires 2 contexts : 1 for reference (preparedDCtx), which will not be modified, and 1 to run the decompression operation (dctx) */
+size_t ZSTDv07_decompress_usingPreparedDCtx(ZSTDv07_DCtx* dctx, const ZSTDv07_DCtx* refDCtx,
+                                         void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    ZSTDv07_copyDCtx(dctx, refDCtx);
+    ZSTDv07_checkContinuity(dctx, dst);
+    return ZSTDv07_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    ZSTDv07_decompressBegin_usingDict(dctx, dict, dictSize);
+    ZSTDv07_checkContinuity(dctx, dst);
+    return ZSTDv07_decompressFrame(dctx, dst, dstCapacity, src, srcSize);
+}
+
+
+size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTDv07_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0);
+}
+
+
+size_t ZSTDv07_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTDv07_HEAPMODE) && (ZSTDv07_HEAPMODE==1)
+    size_t regenSize;
+    ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTDv07_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+    ZSTDv07_freeDCtx(dctx);
+    return regenSize;
+#else   /* stack mode */
+    ZSTDv07_DCtx dctx;
+    return ZSTDv07_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
+
+size_t ZSTDv07_findFrameCompressedSize(const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    size_t remainingSize = srcSize;
+
+    /* check */
+    if (srcSize < ZSTDv07_frameHeaderSize_min+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(frameHeaderSize)) return frameHeaderSize;
+        if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) return ERROR(prefix_unknown);
+        if (srcSize < frameHeaderSize+ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong);
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTDv07_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTDv07_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTDv07_blockHeaderSize;
+        remainingSize -= ZSTDv07_blockHeaderSize;
+
+        if (blockProperties.blockType == bt_end) break;
+
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+    }
+
+    return ip - (const BYTE*)src;
+}
+
+/*_******************************
+*  Streaming Decompression API
+********************************/
+size_t ZSTDv07_nextSrcSizeToDecompress(ZSTDv07_DCtx* dctx)
+{
+    return dctx->expected;
+}
+
+int ZSTDv07_isSkipFrame(ZSTDv07_DCtx* dctx)
+{
+    return dctx->stage == ZSTDds_skipFrame;
+}
+
+/** ZSTDv07_decompressContinue() :
+*   @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity)
+*             or an error code, which can be tested using ZSTDv07_isError() */
+size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    if (dstCapacity) ZSTDv07_checkContinuity(dctx, dst);
+
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        if (srcSize != ZSTDv07_frameHeaderSize_min) return ERROR(srcSize_wrong);   /* impossible */
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
+            memcpy(dctx->headerBuffer, src, ZSTDv07_frameHeaderSize_min);
+            dctx->expected = ZSTDv07_skippableHeaderSize - ZSTDv07_frameHeaderSize_min; /* magic number + skippable frame length */
+            dctx->stage = ZSTDds_decodeSkippableHeader;
+            return 0;
+        }
+        dctx->headerSize = ZSTDv07_frameHeaderSize(src, ZSTDv07_frameHeaderSize_min);
+        if (ZSTDv07_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTDv07_frameHeaderSize_min);
+        if (dctx->headerSize > ZSTDv07_frameHeaderSize_min) {
+            dctx->expected = dctx->headerSize - ZSTDv07_frameHeaderSize_min;
+            dctx->stage = ZSTDds_decodeFrameHeader;
+            return 0;
+        }
+        dctx->expected = 0;   /* not necessary to copy more */
+	/* fall-through */
+    case ZSTDds_decodeFrameHeader:
+        {   size_t result;
+            memcpy(dctx->headerBuffer + ZSTDv07_frameHeaderSize_min, src, dctx->expected);
+            result = ZSTDv07_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize);
+            if (ZSTDv07_isError(result)) return result;
+            dctx->expected = ZSTDv07_blockHeaderSize;
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            return 0;
+        }
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTDv07_getcBlockSize(src, ZSTDv07_blockHeaderSize, &bp);
+            if (ZSTDv07_isError(cBlockSize)) return cBlockSize;
+            if (bp.blockType == bt_end) {
+                if (dctx->fParams.checksumFlag) {
+                    U64 const h64 = XXH64_digest(&dctx->xxhState);
+                    U32 const h32 = (U32)(h64>>11) & ((1<<22)-1);
+                    const BYTE* const ip = (const BYTE*)src;
+                    U32 const check32 = ip[2] + (ip[1] << 8) + ((ip[0] & 0x3F) << 16);
+                    if (check32 != h32) return ERROR(checksum_wrong);
+                }
+                dctx->expected = 0;
+                dctx->stage = ZSTDds_getFrameHeaderSize;
+            } else {
+                dctx->expected = cBlockSize;
+                dctx->bType = bp.blockType;
+                dctx->stage = ZSTDds_decompressBlock;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressBlock:
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTDv07_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTDv07_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                return ERROR(GENERIC);   /* not yet handled */
+                break;
+            case bt_end :   /* should never happen (filtered at phase 1) */
+                rSize = 0;
+                break;
+            default:
+                return ERROR(GENERIC);   /* impossible */
+            }
+            dctx->stage = ZSTDds_decodeBlockHeader;
+            dctx->expected = ZSTDv07_blockHeaderSize;
+            dctx->previousDstEnd = (char*)dst + rSize;
+            if (ZSTDv07_isError(rSize)) return rSize;
+            if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+            return rSize;
+        }
+    case ZSTDds_decodeSkippableHeader:
+        {   memcpy(dctx->headerBuffer + ZSTDv07_frameHeaderSize_min, src, dctx->expected);
+            dctx->expected = MEM_readLE32(dctx->headerBuffer + 4);
+            dctx->stage = ZSTDds_skipFrame;
+            return 0;
+        }
+    case ZSTDds_skipFrame:
+        {   dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
+
+
+static size_t ZSTDv07_refDictContent(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+    return 0;
+}
+
+static size_t ZSTDv07_loadEntropy(ZSTDv07_DCtx* dctx, const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    {   size_t const hSize = HUFv07_readDTableX4(dctx->hufTable, dict, dictSize);
+        if (HUFv07_isError(hSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hSize;
+    }
+
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue=MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSEv07_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->OffTable, offcodeNCount, offcodeMaxValue, offcodeLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSEv07_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSEv07_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSEv07_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        { size_t const errorCode = FSEv07_buildDTable(dctx->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog);
+          if (FSEv07_isError(errorCode)) return ERROR(dictionary_corrupted); }
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    dctx->rep[0] = MEM_readLE32(dictPtr+0); if (dctx->rep[0] == 0 || dctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted);
+    dctx->rep[1] = MEM_readLE32(dictPtr+4); if (dctx->rep[1] == 0 || dctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted);
+    dctx->rep[2] = MEM_readLE32(dictPtr+8); if (dctx->rep[2] == 0 || dctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted);
+    dictPtr += 12;
+
+    dctx->litEntropy = dctx->fseEntropy = 1;
+    return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTDv07_decompress_insertDictionary(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    if (dictSize < 8) return ZSTDv07_refDictContent(dctx, dict, dictSize);
+    {   U32 const magic = MEM_readLE32(dict);
+        if (magic != ZSTDv07_DICT_MAGIC) {
+            return ZSTDv07_refDictContent(dctx, dict, dictSize);   /* pure content mode */
+    }   }
+    dctx->dictID = MEM_readLE32((const char*)dict + 4);
+
+    /* load entropy tables */
+    dict = (const char*)dict + 8;
+    dictSize -= 8;
+    {   size_t const eSize = ZSTDv07_loadEntropy(dctx, dict, dictSize);
+        if (ZSTDv07_isError(eSize)) return ERROR(dictionary_corrupted);
+        dict = (const char*)dict + eSize;
+        dictSize -= eSize;
+    }
+
+    /* reference dictionary content */
+    return ZSTDv07_refDictContent(dctx, dict, dictSize);
+}
+
+
+size_t ZSTDv07_decompressBegin_usingDict(ZSTDv07_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    { size_t const errorCode = ZSTDv07_decompressBegin(dctx);
+      if (ZSTDv07_isError(errorCode)) return errorCode; }
+
+    if (dict && dictSize) {
+        size_t const errorCode = ZSTDv07_decompress_insertDictionary(dctx, dict, dictSize);
+        if (ZSTDv07_isError(errorCode)) return ERROR(dictionary_corrupted);
+    }
+
+    return 0;
+}
+
+
+struct ZSTDv07_DDict_s {
+    void* dict;
+    size_t dictSize;
+    ZSTDv07_DCtx* refContext;
+};  /* typedef'd tp ZSTDv07_CDict within zstd.h */
+
+ZSTDv07_DDict* ZSTDv07_createDDict_advanced(const void* dict, size_t dictSize, ZSTDv07_customMem customMem)
+{
+    if (!customMem.customAlloc && !customMem.customFree)
+        customMem = defaultCustomMem;
+
+    if (!customMem.customAlloc || !customMem.customFree)
+        return NULL;
+
+    {   ZSTDv07_DDict* const ddict = (ZSTDv07_DDict*) customMem.customAlloc(customMem.opaque, sizeof(*ddict));
+        void* const dictContent = customMem.customAlloc(customMem.opaque, dictSize);
+        ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx_advanced(customMem);
+
+        if (!dictContent || !ddict || !dctx) {
+            customMem.customFree(customMem.opaque, dictContent);
+            customMem.customFree(customMem.opaque, ddict);
+            customMem.customFree(customMem.opaque, dctx);
+            return NULL;
+        }
+
+        memcpy(dictContent, dict, dictSize);
+        {   size_t const errorCode = ZSTDv07_decompressBegin_usingDict(dctx, dictContent, dictSize);
+            if (ZSTDv07_isError(errorCode)) {
+                customMem.customFree(customMem.opaque, dictContent);
+                customMem.customFree(customMem.opaque, ddict);
+                customMem.customFree(customMem.opaque, dctx);
+                return NULL;
+        }   }
+
+        ddict->dict = dictContent;
+        ddict->dictSize = dictSize;
+        ddict->refContext = dctx;
+        return ddict;
+    }
+}
+
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression without startup delay.
+*   `dict` can be released after `ZSTDv07_DDict` creation */
+ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTDv07_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTDv07_createDDict_advanced(dict, dictSize, allocator);
+}
+
+size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict)
+{
+    ZSTDv07_freeFunction const cFree = ddict->refContext->customMem.customFree;
+    void* const opaque = ddict->refContext->customMem.opaque;
+    ZSTDv07_freeDCtx(ddict->refContext);
+    cFree(opaque, ddict->dict);
+    cFree(opaque, ddict);
+    return 0;
+}
+
+/*! ZSTDv07_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Use dictionary without significant overhead. */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const ZSTDv07_DDict* ddict)
+{
+    return ZSTDv07_decompress_usingPreparedDCtx(dctx, ddict->refContext,
+                                           dst, dstCapacity,
+                                           src, srcSize);
+}
+/*
+    Buffered version of Zstd compression library
+    Copyright (C) 2015-2016, Yann Collet.
+
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - zstd homepage : http://www.zstd.net/
+*/
+
+
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv07_DCtx object is required to track streaming operations.
+*  Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+*  Use ZBUFFv07_decompressInit() to start a new decompression operation,
+*   or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv07_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change @dst.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv07_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+*  output : ZBUFFv07_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv07_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+typedef enum { ZBUFFds_init, ZBUFFds_loadHeader,
+               ZBUFFds_read, ZBUFFds_load, ZBUFFds_flush } ZBUFFv07_dStage;
+
+/* *** Resource management *** */
+struct ZBUFFv07_DCtx_s {
+    ZSTDv07_DCtx* zd;
+    ZSTDv07_frameParams fParams;
+    ZBUFFv07_dStage stage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t blockSize;
+    BYTE headerBuffer[ZSTDv07_FRAMEHEADERSIZE_MAX];
+    size_t lhSize;
+    ZSTDv07_customMem customMem;
+};   /* typedef'd to ZBUFFv07_DCtx within "zstd_buffered.h" */
+
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx_advanced(ZSTDv07_customMem customMem);
+
+ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void)
+{
+    return ZBUFFv07_createDCtx_advanced(defaultCustomMem);
+}
+
+ZBUFFv07_DCtx* ZBUFFv07_createDCtx_advanced(ZSTDv07_customMem customMem)
+{
+    ZBUFFv07_DCtx* zbd;
+
+    if (!customMem.customAlloc && !customMem.customFree)
+        customMem = defaultCustomMem;
+
+    if (!customMem.customAlloc || !customMem.customFree)
+        return NULL;
+
+    zbd = (ZBUFFv07_DCtx*)customMem.customAlloc(customMem.opaque, sizeof(ZBUFFv07_DCtx));
+    if (zbd==NULL) return NULL;
+    memset(zbd, 0, sizeof(ZBUFFv07_DCtx));
+    memcpy(&zbd->customMem, &customMem, sizeof(ZSTDv07_customMem));
+    zbd->zd = ZSTDv07_createDCtx_advanced(customMem);
+    if (zbd->zd == NULL) { ZBUFFv07_freeDCtx(zbd); return NULL; }
+    zbd->stage = ZBUFFds_init;
+    return zbd;
+}
+
+size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* zbd)
+{
+    if (zbd==NULL) return 0;   /* support free on null */
+    ZSTDv07_freeDCtx(zbd->zd);
+    if (zbd->inBuff) zbd->customMem.customFree(zbd->customMem.opaque, zbd->inBuff);
+    if (zbd->outBuff) zbd->customMem.customFree(zbd->customMem.opaque, zbd->outBuff);
+    zbd->customMem.customFree(zbd->customMem.opaque, zbd);
+    return 0;
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    zbd->stage = ZBUFFds_loadHeader;
+    zbd->lhSize = zbd->inPos = zbd->outStart = zbd->outEnd = 0;
+    return ZSTDv07_decompressBegin_usingDict(zbd->zd, dict, dictSize);
+}
+
+size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* zbd)
+{
+    return ZBUFFv07_decompressInitDictionary(zbd, NULL, 0);
+}
+
+
+/* internal util function */
+MEM_STATIC size_t ZBUFFv07_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    const char* const istart = (const char*)src;
+    const char* const iend = istart + *srcSizePtr;
+    const char* ip = istart;
+    char* const ostart = (char*)dst;
+    char* const oend = ostart + *dstCapacityPtr;
+    char* op = ostart;
+    U32 notDone = 1;
+
+    while (notDone) {
+        switch(zbd->stage)
+        {
+        case ZBUFFds_init :
+            return ERROR(init_missing);
+
+        case ZBUFFds_loadHeader :
+            {   size_t const hSize = ZSTDv07_getFrameParams(&(zbd->fParams), zbd->headerBuffer, zbd->lhSize);
+                if (ZSTDv07_isError(hSize)) return hSize;
+                if (hSize != 0) {
+                    size_t const toLoad = hSize - zbd->lhSize;   /* if hSize!=0, hSize > zbd->lhSize */
+                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
+                        memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
+                        zbd->lhSize += iend-ip;
+                        *dstCapacityPtr = 0;
+                        return (hSize - zbd->lhSize) + ZSTDv07_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    memcpy(zbd->headerBuffer + zbd->lhSize, ip, toLoad); zbd->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* Consume header */
+            {   size_t const h1Size = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);  /* == ZSTDv07_frameHeaderSize_min */
+                size_t const h1Result = ZSTDv07_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer, h1Size);
+                if (ZSTDv07_isError(h1Result)) return h1Result;
+                if (h1Size < zbd->lhSize) {   /* long header */
+                    size_t const h2Size = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                    size_t const h2Result = ZSTDv07_decompressContinue(zbd->zd, NULL, 0, zbd->headerBuffer+h1Size, h2Size);
+                    if (ZSTDv07_isError(h2Result)) return h2Result;
+            }   }
+
+            zbd->fParams.windowSize = MAX(zbd->fParams.windowSize, 1U << ZSTDv07_WINDOWLOG_ABSOLUTEMIN);
+
+            /* Frame header instruct buffer sizes */
+            {   size_t const blockSize = MIN(zbd->fParams.windowSize, ZSTDv07_BLOCKSIZE_ABSOLUTEMAX);
+                zbd->blockSize = blockSize;
+                if (zbd->inBuffSize < blockSize) {
+                    zbd->customMem.customFree(zbd->customMem.opaque, zbd->inBuff);
+                    zbd->inBuffSize = blockSize;
+                    zbd->inBuff = (char*)zbd->customMem.customAlloc(zbd->customMem.opaque, blockSize);
+                    if (zbd->inBuff == NULL) return ERROR(memory_allocation);
+                }
+                {   size_t const neededOutSize = zbd->fParams.windowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+                    if (zbd->outBuffSize < neededOutSize) {
+                        zbd->customMem.customFree(zbd->customMem.opaque, zbd->outBuff);
+                        zbd->outBuffSize = neededOutSize;
+                        zbd->outBuff = (char*)zbd->customMem.customAlloc(zbd->customMem.opaque, neededOutSize);
+                        if (zbd->outBuff == NULL) return ERROR(memory_allocation);
+            }   }   }
+            zbd->stage = ZBUFFds_read;
+            /* pass-through */
+	    /* fall-through */
+        case ZBUFFds_read:
+            {   size_t const neededInSize = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                if (neededInSize==0) {  /* end of frame */
+                    zbd->stage = ZBUFFds_init;
+                    notDone = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    const int isSkipFrame = ZSTDv07_isSkipFrame(zbd->zd);
+                    size_t const decodedSize = ZSTDv07_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, (isSkipFrame ? 0 : zbd->outBuffSize - zbd->outStart),
+                        ip, neededInSize);
+                    if (ZSTDv07_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize && !isSkipFrame) break;   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    break;
+                }
+                if (ip==iend) { notDone = 0; break; }   /* no more input */
+                zbd->stage = ZBUFFds_load;
+            }
+	    /* fall-through */
+        case ZBUFFds_load:
+            {   size_t const neededInSize = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+                size_t const toLoad = neededInSize - zbd->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zbd->inBuffSize - zbd->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZBUFFv07_limitCopy(zbd->inBuff + zbd->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zbd->inPos += loadedSize;
+                if (loadedSize < toLoad) { notDone = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {  const int isSkipFrame = ZSTDv07_isSkipFrame(zbd->zd);
+                   size_t const decodedSize = ZSTDv07_decompressContinue(zbd->zd,
+                        zbd->outBuff + zbd->outStart, zbd->outBuffSize - zbd->outStart,
+                        zbd->inBuff, neededInSize);
+                    if (ZSTDv07_isError(decodedSize)) return decodedSize;
+                    zbd->inPos = 0;   /* input is consumed */
+                    if (!decodedSize && !isSkipFrame) { zbd->stage = ZBUFFds_read; break; }   /* this was just a header */
+                    zbd->outEnd = zbd->outStart +  decodedSize;
+                    zbd->stage = ZBUFFds_flush;
+                    /* break; */
+                    /* pass-through */
+                }
+	    }
+	    /* fall-through */
+        case ZBUFFds_flush:
+            {   size_t const toFlushSize = zbd->outEnd - zbd->outStart;
+                size_t const flushedSize = ZBUFFv07_limitCopy(op, oend-op, zbd->outBuff + zbd->outStart, toFlushSize);
+                op += flushedSize;
+                zbd->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {
+                    zbd->stage = ZBUFFds_read;
+                    if (zbd->outStart + zbd->blockSize > zbd->outBuffSize)
+                        zbd->outStart = zbd->outEnd = 0;
+                    break;
+                }
+                /* cannot flush everything */
+                notDone = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    *srcSizePtr = ip-istart;
+    *dstCapacityPtr = op-ostart;
+    {   size_t nextSrcSizeHint = ZSTDv07_nextSrcSizeToDecompress(zbd->zd);
+        nextSrcSizeHint -= zbd->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFFv07_recommendedDInSize(void)  { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX + ZSTDv07_blockHeaderSize /* block header size*/ ; }
+size_t ZBUFFv07_recommendedDOutSize(void) { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX; }

diff --git a/vendor/github.com/DataDog/zstd/zstd_v07.h b/vendor/github.com/DataDog/zstd/zstd_v07.h
new file mode 100644
index 0000000..6591cd3
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd_v07.h

@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv07_H_235446
+#define ZSTDv07_H_235446
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv07_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1)
+#  define ZSTDLIBv07_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv07_API
+#endif
+
+
+/* *************************************
+*  Simple API
+***************************************/
+/*! ZSTDv07_getDecompressedSize() :
+*   @return : decompressed size if known, 0 otherwise.
+       note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause.
+       note 2 : decompressed size could be wrong or intentionally modified !
+                always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+    `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+    `dstCapacity` must be equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+                                    const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_getFrameSrcSize() : get the source length of a ZSTD frame
+    compressedSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    return : the number of bytes that would be read to decompress this frame
+             or an errorCode if it fails (which can be tested using ZSTDv07_isError())
+*/
+size_t ZSTDv07_findFrameCompressedSize(const void* src, size_t compressedSize);
+
+/*======  Helper functions  ======*/
+ZSTDLIBv07_API unsigned    ZSTDv07_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code);     /*!< provides readable string from an error code */
+
+
+/*-*************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t     ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+*   Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+*  Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression.
+*   Note : This function load the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/*-**************************
+*  Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression operation without startup delay.
+*   `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t      ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*! ZSTDv07_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+                                                    void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const ZSTDv07_DDict* ddict);
+
+typedef struct {
+    unsigned long long frameContentSize;
+    unsigned windowSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+} ZSTDv07_frameParams;
+
+ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+
+
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx;
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void);
+ZSTDLIBv07_API size_t      ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFFv07_DCtx object is required to track streaming operations.
+*  Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+*  Use ZBUFFv07_decompressInit() to start a new decompression operation,
+*   or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFFv07_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+*            or 0 when a frame is completely decoded,
+*            or an error code, which can be tested using ZBUFFv07_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+*  output : ZBUFFv07_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFFv07_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode);
+ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void);
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void);
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv07_MAGICNUMBER            0xFD2FB527   /* v0.7 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv07_H_235446 */

diff --git a/vendor/github.com/DataDog/zstd/zstdmt_compress.c b/vendor/github.com/DataDog/zstd/zstdmt_compress.c
new file mode 100755
index 0000000..c7a205d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstdmt_compress.c

@@ -0,0 +1,1831 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Tuning parameters   ====== */
+#define ZSTDMT_NBWORKERS_MAX 200
+#define ZSTDMT_JOBSIZE_MAX  (MEM_32bits() ? (512 MB) : (2 GB))  /* note : limited by `jobSize` type, which is `unsigned` */
+#define ZSTDMT_OVERLAPLOG_DEFAULT 6
+
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)   /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+/* ======   Dependencies   ====== */
+#include <string.h>      /* memcpy, memset */
+#include <limits.h>      /* INT_MAX */
+#include "pool.h"        /* threadpool */
+#include "threading.h"   /* mutex */
+#include "zstd_compress_internal.h"  /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
+#include "zstd_ldm.h"
+#include "zstdmt_compress.h"
+
+/* Guards code to support resizing the SeqPool.
+ * We will want to resize the SeqPool to save memory in the future.
+ * Until then, comment the code out since it is unused.
+ */
+#define ZSTD_RESIZE_SEQPOOL 0
+
+/* ======   Debug   ====== */
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+
+#  include <stdio.h>
+#  include <unistd.h>
+#  include <sys/times.h>
+#  define DEBUGLOGRAW(l, ...) if (l<=ZSTD_DEBUG) { fprintf(stderr, __VA_ARGS__); }
+
+#  define DEBUG_PRINTHEX(l,p,n) {            \
+    unsigned debug_u;                        \
+    for (debug_u=0; debug_u<(n); debug_u++)  \
+        DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+    DEBUGLOGRAW(l, " \n");                   \
+}
+
+static unsigned long long GetCurrentClockTimeMicroseconds(void)
+{
+   static clock_t _ticksPerSecond = 0;
+   if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK);
+
+   { struct tms junk; clock_t newTicks = (clock_t) times(&junk);
+     return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); }
+}
+
+#define MUTEX_WAIT_TIME_DLEVEL 6
+#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) {          \
+    if (ZSTD_DEBUG >= MUTEX_WAIT_TIME_DLEVEL) {   \
+        unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
+        ZSTD_pthread_mutex_lock(mutex);           \
+        {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
+            unsigned long long const elapsedTime = (afterTime-beforeTime); \
+            if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
+                DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
+                   elapsedTime, #mutex);          \
+        }   }                                     \
+    } else {                                      \
+        ZSTD_pthread_mutex_lock(mutex);           \
+    }                                             \
+}
+
+#else
+
+#  define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)
+#  define DEBUG_PRINTHEX(l,p,n) {}
+
+#endif
+
+
+/* =====   Buffer Pool   ===== */
+/* a single Buffer Pool can be invoked from multiple threads in parallel */
+
+typedef struct buffer_s {
+    void* start;
+    size_t capacity;
+} buffer_t;
+
+static const buffer_t g_nullBuffer = { NULL, 0 };
+
+typedef struct ZSTDMT_bufferPool_s {
+    ZSTD_pthread_mutex_t poolMutex;
+    size_t bufferSize;
+    unsigned totalBuffers;
+    unsigned nbBuffers;
+    ZSTD_customMem cMem;
+    buffer_t bTable[1];   /* variable size */
+} ZSTDMT_bufferPool;
+
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem)
+{
+    unsigned const maxNbBuffers = 2*nbWorkers + 3;
+    ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_calloc(
+        sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem);
+    if (bufPool==NULL) return NULL;
+    if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
+        ZSTD_free(bufPool, cMem);
+        return NULL;
+    }
+    bufPool->bufferSize = 64 KB;
+    bufPool->totalBuffers = maxNbBuffers;
+    bufPool->nbBuffers = 0;
+    bufPool->cMem = cMem;
+    return bufPool;
+}
+
+static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    unsigned u;
+    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
+    if (!bufPool) return;   /* compatibility with free on NULL */
+    for (u=0; u<bufPool->totalBuffers; u++) {
+        DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start);
+        ZSTD_free(bufPool->bTable[u].start, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
+    ZSTD_free(bufPool, bufPool->cMem);
+}
+
+/* only works at initialization, not during compression */
+static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const poolSize = sizeof(*bufPool)
+                          + (bufPool->totalBuffers - 1) * sizeof(buffer_t);
+    unsigned u;
+    size_t totalBufferSize = 0;
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    for (u=0; u<bufPool->totalBuffers; u++)
+        totalBufferSize += bufPool->bTable[u].capacity;
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+
+    return poolSize + totalBufferSize;
+}
+
+/* ZSTDMT_setBufferSize() :
+ * all future buffers provided by this buffer pool will have _at least_ this size
+ * note : it's better for all buffers to have same size,
+ * as they become freely interchangeable, reducing malloc/free usages and memory fragmentation */
+static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const bSize)
+{
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    DEBUGLOG(4, "ZSTDMT_setBufferSize: bSize = %u", (U32)bSize);
+    bufPool->bufferSize = bSize;
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+}
+
+/** ZSTDMT_getBuffer() :
+ *  assumption : bufPool must be valid
+ * @return : a buffer, with start pointer and size
+ *  note: allocation may fail, in this case, start==NULL and size==0 */
+static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool)
+{
+    size_t const bSize = bufPool->bufferSize;
+    DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize);
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers) {   /* try to use an existing buffer */
+        buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)];
+        size_t const availBufferSize = buf.capacity;
+        bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer;
+        if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) {
+            /* large enough, but not too much */
+            DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u",
+                        bufPool->nbBuffers, (U32)buf.capacity);
+            ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+            return buf;
+        }
+        /* size conditions not respected : scratch this buffer, create new one */
+        DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing");
+        ZSTD_free(buf.start, bufPool->cMem);
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    /* create new buffer */
+    DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer");
+    {   buffer_t buffer;
+        void* const start = ZSTD_malloc(bSize, bufPool->cMem);
+        buffer.start = start;   /* note : start can be NULL if malloc fails ! */
+        buffer.capacity = (start==NULL) ? 0 : bSize;
+        if (start==NULL) {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: buffer allocation failure !!");
+        } else {
+            DEBUGLOG(5, "ZSTDMT_getBuffer: created buffer of size %u", (U32)bSize);
+        }
+        return buffer;
+    }
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+/** ZSTDMT_resizeBuffer() :
+ * assumption : bufPool must be valid
+ * @return : a buffer that is at least the buffer pool buffer size.
+ *           If a reallocation happens, the data in the input buffer is copied.
+ */
+static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer)
+{
+    size_t const bSize = bufPool->bufferSize;
+    if (buffer.capacity < bSize) {
+        void* const start = ZSTD_malloc(bSize, bufPool->cMem);
+        buffer_t newBuffer;
+        newBuffer.start = start;
+        newBuffer.capacity = start == NULL ? 0 : bSize;
+        if (start != NULL) {
+            assert(newBuffer.capacity >= buffer.capacity);
+            memcpy(newBuffer.start, buffer.start, buffer.capacity);
+            DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize);
+            return newBuffer;
+        }
+        DEBUGLOG(5, "ZSTDMT_resizeBuffer: buffer allocation failure !!");
+    }
+    return buffer;
+}
+#endif
+
+/* store buffer for later re-use, up to pool capacity */
+static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
+{
+    if (buf.start == NULL) return;   /* compatible with release on NULL */
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer");
+    ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
+    if (bufPool->nbBuffers < bufPool->totalBuffers) {
+        bufPool->bTable[bufPool->nbBuffers++] = buf;  /* stored for later use */
+        DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u",
+                    (U32)buf.capacity, (U32)(bufPool->nbBuffers-1));
+        ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+        return;
+    }
+    ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
+    /* Reached bufferPool capacity (should not happen) */
+    DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
+    ZSTD_free(buf.start, bufPool->cMem);
+}
+
+
+/* =====   Seq Pool Wrapper   ====== */
+
+static rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0};
+
+typedef ZSTDMT_bufferPool ZSTDMT_seqPool;
+
+static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool)
+{
+    return ZSTDMT_sizeof_bufferPool(seqPool);
+}
+
+static rawSeqStore_t bufferToSeq(buffer_t buffer)
+{
+    rawSeqStore_t seq = {NULL, 0, 0, 0};
+    seq.seq = (rawSeq*)buffer.start;
+    seq.capacity = buffer.capacity / sizeof(rawSeq);
+    return seq;
+}
+
+static buffer_t seqToBuffer(rawSeqStore_t seq)
+{
+    buffer_t buffer;
+    buffer.start = seq.seq;
+    buffer.capacity = seq.capacity * sizeof(rawSeq);
+    return buffer;
+}
+
+static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool)
+{
+    if (seqPool->bufferSize == 0) {
+        return kNullRawSeqStore;
+    }
+    return bufferToSeq(ZSTDMT_getBuffer(seqPool));
+}
+
+#if ZSTD_RESIZE_SEQPOOL
+static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{
+  return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq)));
+}
+#endif
+
+static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq)
+{
+  ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq));
+}
+
+static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
+{
+  ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq));
+}
+
+static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
+{
+    ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_setNbSeq(seqPool, 0);
+    return seqPool;
+}
+
+static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool)
+{
+    ZSTDMT_freeBufferPool(seqPool);
+}
+
+
+
+/* =====   CCtx Pool   ===== */
+/* a single CCtx Pool can be invoked from multiple threads in parallel */
+
+typedef struct {
+    ZSTD_pthread_mutex_t poolMutex;
+    unsigned totalCCtx;
+    unsigned availCCtx;
+    ZSTD_customMem cMem;
+    ZSTD_CCtx* cctx[1];   /* variable size */
+} ZSTDMT_CCtxPool;
+
+/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
+static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
+{
+    unsigned u;
+    for (u=0; u<pool->totalCCtx; u++)
+        ZSTD_freeCCtx(pool->cctx[u]);  /* note : compatible with free on NULL */
+    ZSTD_pthread_mutex_destroy(&pool->poolMutex);
+    ZSTD_free(pool, pool->cMem);
+}
+
+/* ZSTDMT_createCCtxPool() :
+ * implies nbWorkers >= 1 , checked by caller ZSTDMT_createCCtx() */
+static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbWorkers,
+                                              ZSTD_customMem cMem)
+{
+    ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) ZSTD_calloc(
+        sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem);
+    assert(nbWorkers > 0);
+    if (!cctxPool) return NULL;
+    if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) {
+        ZSTD_free(cctxPool, cMem);
+        return NULL;
+    }
+    cctxPool->cMem = cMem;
+    cctxPool->totalCCtx = nbWorkers;
+    cctxPool->availCCtx = 1;   /* at least one cctx for single-thread mode */
+    cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem);
+    if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
+    DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
+    return cctxPool;
+}
+
+/* only works during initialization phase, not during compression */
+static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
+{
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    {   unsigned const nbWorkers = cctxPool->totalCCtx;
+        size_t const poolSize = sizeof(*cctxPool)
+                                + (nbWorkers-1) * sizeof(ZSTD_CCtx*);
+        unsigned u;
+        size_t totalCCtxSize = 0;
+        for (u=0; u<nbWorkers; u++) {
+            totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctx[u]);
+        }
+        ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+        assert(nbWorkers > 0);
+        return poolSize + totalCCtxSize;
+    }
+}
+
+static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool)
+{
+    DEBUGLOG(5, "ZSTDMT_getCCtx");
+    ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
+    if (cctxPool->availCCtx) {
+        cctxPool->availCCtx--;
+        {   ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx];
+            ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+            return cctx;
+    }   }
+    ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
+    DEBUGLOG(5, "create one more CCtx");
+    return ZSTD_createCCtx_advanced(cctxPool->cMem);   /* note : can be NULL, when creation fails ! */
+}
+
+static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return;   /* compatibility with release on NULL */
+    ZSTD_pthread_mutex_lock(&pool->poolMutex);
+    if (pool->availCCtx < pool->totalCCtx)
+        pool->cctx[pool->availCCtx++] = cctx;
+    else {
+        /* pool overflow : should not happen, since totalCCtx==nbWorkers */
+        DEBUGLOG(4, "CCtx pool overflow : free cctx");
+        ZSTD_freeCCtx(cctx);
+    }
+    ZSTD_pthread_mutex_unlock(&pool->poolMutex);
+}
+
+/* ====   Serial State   ==== */
+
+typedef struct {
+    void const* start;
+    size_t size;
+} range_t;
+
+typedef struct {
+    /* All variables in the struct are protected by mutex. */
+    ZSTD_pthread_mutex_t mutex;
+    ZSTD_pthread_cond_t cond;
+    ZSTD_CCtx_params params;
+    ldmState_t ldmState;
+    XXH64_state_t xxhState;
+    unsigned nextJobID;
+    /* Protects ldmWindow.
+     * Must be acquired after the main mutex when acquiring both.
+     */
+    ZSTD_pthread_mutex_t ldmWindowMutex;
+    ZSTD_pthread_cond_t ldmWindowCond;  /* Signaled when ldmWindow is udpated */
+    ZSTD_window_t ldmWindow;  /* A thread-safe copy of ldmState.window */
+} serialState_t;
+
+static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params)
+{
+    /* Adjust parameters */
+    if (params.ldmParams.enableLdm) {
+        DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
+        params.ldmParams.windowLog = params.cParams.windowLog;
+        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+        assert(params.ldmParams.hashEveryLog < 32);
+        serialState->ldmState.hashPower =
+                ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
+    } else {
+        memset(&params.ldmParams, 0, sizeof(params.ldmParams));
+    }
+    serialState->nextJobID = 0;
+    if (params.fParams.checksumFlag)
+        XXH64_reset(&serialState->xxhState, 0);
+    if (params.ldmParams.enableLdm) {
+        ZSTD_customMem cMem = params.customMem;
+        unsigned const hashLog = params.ldmParams.hashLog;
+        size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
+        unsigned const bucketLog =
+            params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
+        size_t const bucketSize = (size_t)1 << bucketLog;
+        unsigned const prevBucketLog =
+            serialState->params.ldmParams.hashLog -
+            serialState->params.ldmParams.bucketSizeLog;
+        /* Size the seq pool tables */
+        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, params.jobSize));
+        /* Reset the window */
+        ZSTD_window_clear(&serialState->ldmState.window);
+        serialState->ldmWindow = serialState->ldmState.window;
+        /* Resize tables and output space if necessary. */
+        if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {
+            ZSTD_free(serialState->ldmState.hashTable, cMem);
+            serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_malloc(hashSize, cMem);
+        }
+        if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
+            ZSTD_free(serialState->ldmState.bucketOffsets, cMem);
+            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_malloc(bucketSize, cMem);
+        }
+        if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
+            return 1;
+        /* Zero the tables */
+        memset(serialState->ldmState.hashTable, 0, hashSize);
+        memset(serialState->ldmState.bucketOffsets, 0, bucketSize);
+    }
+    serialState->params = params;
+    return 0;
+}
+
+static int ZSTDMT_serialState_init(serialState_t* serialState)
+{
+    int initError = 0;
+    memset(serialState, 0, sizeof(*serialState));
+    initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL);
+    initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL);
+    initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL);
+    initError |= ZSTD_pthread_cond_init(&serialState->ldmWindowCond, NULL);
+    return initError;
+}
+
+static void ZSTDMT_serialState_free(serialState_t* serialState)
+{
+    ZSTD_customMem cMem = serialState->params.customMem;
+    ZSTD_pthread_mutex_destroy(&serialState->mutex);
+    ZSTD_pthread_cond_destroy(&serialState->cond);
+    ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex);
+    ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond);
+    ZSTD_free(serialState->ldmState.hashTable, cMem);
+    ZSTD_free(serialState->ldmState.bucketOffsets, cMem);
+}
+
+static void ZSTDMT_serialState_update(serialState_t* serialState,
+                                      ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore,
+                                      range_t src, unsigned jobID)
+{
+    /* Wait for our turn */
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    while (serialState->nextJobID < jobID) {
+        ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
+    }
+    /* A future job may error and skip our job */
+    if (serialState->nextJobID == jobID) {
+        /* It is now our turn, do any processing necessary */
+        if (serialState->params.ldmParams.enableLdm) {
+            size_t error;
+            assert(seqStore.seq != NULL && seqStore.pos == 0 &&
+                   seqStore.size == 0 && seqStore.capacity > 0);
+            ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
+            error = ZSTD_ldm_generateSequences(
+                &serialState->ldmState, &seqStore,
+                &serialState->params.ldmParams, src.start, src.size);
+            /* We provide a large enough buffer to never fail. */
+            assert(!ZSTD_isError(error)); (void)error;
+            /* Update ldmWindow to match the ldmState.window and signal the main
+             * thread if it is waiting for a buffer.
+             */
+            ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
+            serialState->ldmWindow = serialState->ldmState.window;
+            ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+            ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+        }
+        if (serialState->params.fParams.checksumFlag && src.size > 0)
+            XXH64_update(&serialState->xxhState, src.start, src.size);
+    }
+    /* Now it is the next jobs turn */
+    serialState->nextJobID++;
+    ZSTD_pthread_cond_broadcast(&serialState->cond);
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+    if (seqStore.size > 0) {
+        size_t const err = ZSTD_referenceExternalSequences(
+            jobCCtx, seqStore.seq, seqStore.size);
+        assert(serialState->params.ldmParams.enableLdm);
+        assert(!ZSTD_isError(err));
+        (void)err;
+    }
+}
+
+static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState,
+                                              unsigned jobID, size_t cSize)
+{
+    ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
+    if (serialState->nextJobID <= jobID) {
+        assert(ZSTD_isError(cSize)); (void)cSize;
+        DEBUGLOG(5, "Skipping past job %u because of error", jobID);
+        serialState->nextJobID = jobID + 1;
+        ZSTD_pthread_cond_broadcast(&serialState->cond);
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&serialState->ldmWindowMutex);
+        ZSTD_window_clear(&serialState->ldmWindow);
+        ZSTD_pthread_cond_signal(&serialState->ldmWindowCond);
+        ZSTD_pthread_mutex_unlock(&serialState->ldmWindowMutex);
+    }
+    ZSTD_pthread_mutex_unlock(&serialState->mutex);
+
+}
+
+
+/* ------------------------------------------ */
+/* =====          Worker thread         ===== */
+/* ------------------------------------------ */
+
+static const range_t kNullRange = { NULL, 0 };
+
+typedef struct {
+    size_t   consumed;                   /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
+    size_t   cSize;                      /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
+    ZSTD_pthread_mutex_t job_mutex;      /* Thread-safe - used by mtctx and worker */
+    ZSTD_pthread_cond_t job_cond;        /* Thread-safe - used by mtctx and worker */
+    ZSTDMT_CCtxPool* cctxPool;           /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_bufferPool* bufPool;          /* Thread-safe - used by mtctx and (all) workers */
+    ZSTDMT_seqPool* seqPool;             /* Thread-safe - used by mtctx and (all) workers */
+    serialState_t* serial;               /* Thread-safe - used by mtctx and (all) workers */
+    buffer_t dstBuff;                    /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
+    range_t prefix;                      /* set by mtctx, then read by worker & mtctx => no barrier */
+    range_t src;                         /* set by mtctx, then read by worker & mtctx => no barrier */
+    unsigned jobID;                      /* set by mtctx, then read by worker => no barrier */
+    unsigned firstJob;                   /* set by mtctx, then read by worker => no barrier */
+    unsigned lastJob;                    /* set by mtctx, then read by worker => no barrier */
+    ZSTD_CCtx_params params;             /* set by mtctx, then read by worker => no barrier */
+    const ZSTD_CDict* cdict;             /* set by mtctx, then read by worker => no barrier */
+    unsigned long long fullFrameSize;    /* set by mtctx, then read by worker => no barrier */
+    size_t   dstFlushed;                 /* used only by mtctx */
+    unsigned frameChecksumNeeded;        /* used only by mtctx */
+} ZSTDMT_jobDescription;
+
+/* ZSTDMT_compressionJob() is a POOL_function type */
+void ZSTDMT_compressionJob(void* jobDescription)
+{
+    ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
+    ZSTD_CCtx_params jobParams = job->params;   /* do not modify job->params ! copy it, modify the copy */
+    ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
+    rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
+    buffer_t dstBuff = job->dstBuff;
+
+    /* Don't compute the checksum for chunks, since we compute it externally,
+     * but write it in the header.
+     */
+    if (job->jobID != 0) jobParams.fParams.checksumFlag = 0;
+    /* Don't run LDM for the chunks, since we handle it externally */
+    jobParams.ldmParams.enableLdm = 0;
+
+    /* ressources */
+    if (cctx==NULL) {
+        job->cSize = ERROR(memory_allocation);
+        goto _endJob;
+    }
+    if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
+        dstBuff = ZSTDMT_getBuffer(job->bufPool);
+        if (dstBuff.start==NULL) {
+            job->cSize = ERROR(memory_allocation);
+            goto _endJob;
+        }
+        job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
+    }
+
+    /* init */
+    if (job->cdict) {
+        size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, job->cdict, jobParams, job->fullFrameSize);
+        assert(job->firstJob);  /* only allowed for first job */
+        if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
+    } else {  /* srcStart points at reloaded section */
+        U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
+        {   size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob);
+            if (ZSTD_isError(forceWindowError)) {
+                job->cSize = forceWindowError;
+                goto _endJob;
+        }   }
+        {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
+                                        job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
+                                        NULL, /*cdict*/
+                                        jobParams, pledgedSrcSize);
+            if (ZSTD_isError(initError)) {
+                job->cSize = initError;
+                goto _endJob;
+    }   }   }
+
+    /* Perform serial step as early as possible, but after CCtx initialization */
+    ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
+
+    if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
+        size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
+        if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; }
+        DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
+        ZSTD_invalidateRepCodes(cctx);
+    }
+
+    /* compress */
+    {   size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX;
+        int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize);
+        const BYTE* ip = (const BYTE*) job->src.start;
+        BYTE* const ostart = (BYTE*)dstBuff.start;
+        BYTE* op = ostart;
+        BYTE* oend = op + dstBuff.capacity;
+        int chunkNb;
+        if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize);   /* check overflow */
+        DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
+        assert(job->cSize == 0);
+        for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
+            size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
+            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            ip += chunkSize;
+            op += cSize; assert(op < oend);
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            job->consumed = chunkSize * chunkNb;
+            DEBUGLOG(5, "ZSTDMT_compressionJob: compress new block : cSize==%u bytes (total: %u)",
+                        (U32)cSize, (U32)job->cSize);
+            ZSTD_pthread_cond_signal(&job->job_cond);   /* warns some more data is ready to be flushed */
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+        }
+        /* last block */
+        assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
+            size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
+            size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
+            size_t const cSize = (job->lastJob) ?
+                 ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
+                 ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
+            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            /* stats */
+            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+            job->cSize += cSize;
+            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+    }   }
+
+_endJob:
+    ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
+    if (job->prefix.size > 0)
+        DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
+    DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
+    /* release resources */
+    ZSTDMT_releaseSeq(job->seqPool, rawSeqStore);
+    ZSTDMT_releaseCCtx(job->cctxPool, cctx);
+    /* report */
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
+    job->consumed = job->src.size;
+    ZSTD_pthread_cond_signal(&job->job_cond);
+    ZSTD_pthread_mutex_unlock(&job->job_mutex);
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+typedef struct {
+    range_t prefix;         /* read-only non-owned prefix buffer */
+    buffer_t buffer;
+    size_t filled;
+} inBuff_t;
+
+typedef struct {
+  BYTE* buffer;     /* The round input buffer. All jobs get references
+                     * to pieces of the buffer. ZSTDMT_tryGetInputRange()
+                     * handles handing out job input buffers, and makes
+                     * sure it doesn't overlap with any pieces still in use.
+                     */
+  size_t capacity;  /* The capacity of buffer. */
+  size_t pos;       /* The position of the current inBuff in the round
+                     * buffer. Updated past the end if the inBuff once
+                     * the inBuff is sent to the worker thread.
+                     * pos <= capacity.
+                     */
+} roundBuff_t;
+
+static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
+
+struct ZSTDMT_CCtx_s {
+    POOL_ctx* factory;
+    ZSTDMT_jobDescription* jobs;
+    ZSTDMT_bufferPool* bufPool;
+    ZSTDMT_CCtxPool* cctxPool;
+    ZSTDMT_seqPool* seqPool;
+    ZSTD_CCtx_params params;
+    size_t targetSectionSize;
+    size_t targetPrefixSize;
+    roundBuff_t roundBuff;
+    inBuff_t inBuff;
+    int jobReady;        /* 1 => one job is already prepared, but pool has shortage of workers. Don't create another one. */
+    serialState_t serial;
+    unsigned singleBlockingThread;
+    unsigned jobIDMask;
+    unsigned doneJobID;
+    unsigned nextJobID;
+    unsigned frameEnded;
+    unsigned allJobsCompleted;
+    unsigned long long frameContentSize;
+    unsigned long long consumed;
+    unsigned long long produced;
+    ZSTD_customMem cMem;
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+};
+
+static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem)
+{
+    U32 jobNb;
+    if (jobTable == NULL) return;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        ZSTD_pthread_mutex_destroy(&jobTable[jobNb].job_mutex);
+        ZSTD_pthread_cond_destroy(&jobTable[jobNb].job_cond);
+    }
+    ZSTD_free(jobTable, cMem);
+}
+
+/* ZSTDMT_allocJobsTable()
+ * allocate and init a job table.
+ * update *nbJobsPtr to next power of 2 value, as size of table */
+static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_customMem cMem)
+{
+    U32 const nbJobsLog2 = ZSTD_highbit32(*nbJobsPtr) + 1;
+    U32 const nbJobs = 1 << nbJobsLog2;
+    U32 jobNb;
+    ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*)
+                ZSTD_calloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem);
+    int initError = 0;
+    if (jobTable==NULL) return NULL;
+    *nbJobsPtr = nbJobs;
+    for (jobNb=0; jobNb<nbJobs; jobNb++) {
+        initError |= ZSTD_pthread_mutex_init(&jobTable[jobNb].job_mutex, NULL);
+        initError |= ZSTD_pthread_cond_init(&jobTable[jobNb].job_cond, NULL);
+    }
+    if (initError != 0) {
+        ZSTDMT_freeJobsTable(jobTable, nbJobs, cMem);
+        return NULL;
+    }
+    return jobTable;
+}
+
+/* ZSTDMT_CCtxParam_setNbWorkers():
+ * Internal use only */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers)
+{
+    if (nbWorkers > ZSTDMT_NBWORKERS_MAX) nbWorkers = ZSTDMT_NBWORKERS_MAX;
+    params->nbWorkers = nbWorkers;
+    params->overlapSizeLog = ZSTDMT_OVERLAPLOG_DEFAULT;
+    params->jobSize = 0;
+    return nbWorkers;
+}
+
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem)
+{
+    ZSTDMT_CCtx* mtctx;
+    U32 nbJobs = nbWorkers + 2;
+    int initError;
+    DEBUGLOG(3, "ZSTDMT_createCCtx_advanced (nbWorkers = %u)", nbWorkers);
+
+    if (nbWorkers < 1) return NULL;
+    nbWorkers = MIN(nbWorkers , ZSTDMT_NBWORKERS_MAX);
+    if ((cMem.customAlloc!=NULL) ^ (cMem.customFree!=NULL))
+        /* invalid custom allocator */
+        return NULL;
+
+    mtctx = (ZSTDMT_CCtx*) ZSTD_calloc(sizeof(ZSTDMT_CCtx), cMem);
+    if (!mtctx) return NULL;
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
+    mtctx->cMem = cMem;
+    mtctx->allJobsCompleted = 1;
+    mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem);
+    mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
+    assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0);  /* ensure nbJobs is a power of 2 */
+    mtctx->jobIDMask = nbJobs - 1;
+    mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
+    initError = ZSTDMT_serialState_init(&mtctx->serial);
+    mtctx->roundBuff = kNullRoundBuff;
+    if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
+        ZSTDMT_freeCCtx(mtctx);
+        return NULL;
+    }
+    DEBUGLOG(3, "mt_cctx created, for %u threads", nbWorkers);
+    return mtctx;
+}
+
+ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers)
+{
+    return ZSTDMT_createCCtx_advanced(nbWorkers, ZSTD_defaultCMem);
+}
+
+
+/* ZSTDMT_releaseAllJobResources() :
+ * note : ensure all workers are killed first ! */
+static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx)
+{
+    unsigned jobID;
+    DEBUGLOG(3, "ZSTDMT_releaseAllJobResources");
+    for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) {
+        DEBUGLOG(4, "job%02u: release dst address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].dstBuff.start);
+        ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
+        mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+        mtctx->jobs[jobID].cSize = 0;
+    }
+    memset(mtctx->jobs, 0, (mtctx->jobIDMask+1)*sizeof(ZSTDMT_jobDescription));
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->inBuff.filled = 0;
+    mtctx->allJobsCompleted = 1;
+}
+
+static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx)
+{
+    DEBUGLOG(4, "ZSTDMT_waitForAllJobsCompleted");
+    while (mtctx->doneJobID < mtctx->nextJobID) {
+        unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask;
+        ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
+        while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
+            DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
+            ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
+        }
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
+        mtctx->doneJobID++;
+    }
+}
+
+size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx)
+{
+    if (mtctx==NULL) return 0;   /* compatible with free on NULL */
+    POOL_free(mtctx->factory);   /* stop and free worker threads */
+    ZSTDMT_releaseAllJobResources(mtctx);  /* release job resources into pools first */
+    ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+    ZSTDMT_freeBufferPool(mtctx->bufPool);
+    ZSTDMT_freeCCtxPool(mtctx->cctxPool);
+    ZSTDMT_freeSeqPool(mtctx->seqPool);
+    ZSTDMT_serialState_free(&mtctx->serial);
+    ZSTD_freeCDict(mtctx->cdictLocal);
+    if (mtctx->roundBuff.buffer)
+        ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
+    ZSTD_free(mtctx, mtctx->cMem);
+    return 0;
+}
+
+size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx)
+{
+    if (mtctx == NULL) return 0;   /* supports sizeof NULL */
+    return sizeof(*mtctx)
+            + POOL_sizeof(mtctx->factory)
+            + ZSTDMT_sizeof_bufferPool(mtctx->bufPool)
+            + (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription)
+            + ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool)
+            + ZSTDMT_sizeof_seqPool(mtctx->seqPool)
+            + ZSTD_sizeof_CDict(mtctx->cdictLocal)
+            + mtctx->roundBuff.capacity;
+}
+
+/* Internal only */
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params,
+                                ZSTDMT_parameter parameter, unsigned value) {
+    DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter");
+    switch(parameter)
+    {
+    case ZSTDMT_p_jobSize :
+        DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter : set jobSize to %u", value);
+        if ( (value > 0)  /* value==0 => automatic job size */
+           & (value < ZSTDMT_JOBSIZE_MIN) )
+            value = ZSTDMT_JOBSIZE_MIN;
+        params->jobSize = value;
+        return value;
+    case ZSTDMT_p_overlapSectionLog :
+        if (value > 9) value = 9;
+        DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", value);
+        params->overlapSizeLog = (value >= 9) ? 9 : value;
+        return value;
+    default :
+        return ERROR(parameter_unsupported);
+    }
+}
+
+size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value)
+{
+    DEBUGLOG(4, "ZSTDMT_setMTCtxParameter");
+    switch(parameter)
+    {
+    case ZSTDMT_p_jobSize :
+        return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value);
+    case ZSTDMT_p_overlapSectionLog :
+        return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value);
+    default :
+        return ERROR(parameter_unsupported);
+    }
+}
+
+/* Sets parameters relevant to the compression job,
+ * initializing others to default values. */
+static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params)
+{
+    ZSTD_CCtx_params jobParams;
+    memset(&jobParams, 0, sizeof(jobParams));
+
+    jobParams.cParams = params.cParams;
+    jobParams.fParams = params.fParams;
+    jobParams.compressionLevel = params.compressionLevel;
+    jobParams.disableLiteralCompression = params.disableLiteralCompression;
+
+    return jobParams;
+}
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams)
+{
+    U32 const saved_wlog = mtctx->params.cParams.windowLog;   /* Do not modify windowLog while compressing */
+    int const compressionLevel = cctxParams->compressionLevel;
+    DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)",
+                compressionLevel);
+    mtctx->params.compressionLevel = compressionLevel;
+    {   ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, 0, 0);
+        cParams.windowLog = saved_wlog;
+        mtctx->params.cParams = cParams;
+    }
+}
+
+/* ZSTDMT_getNbWorkers():
+ * @return nb threads currently active in mtctx.
+ * mtctx must be valid */
+unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx)
+{
+    assert(mtctx != NULL);
+    return mtctx->params.nbWorkers;
+}
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ * Note : mutex will be acquired during statistics collection. */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
+{
+    ZSTD_frameProgression fps;
+    DEBUGLOG(6, "ZSTDMT_getFrameProgression");
+    fps.consumed = mtctx->consumed;
+    fps.produced = mtctx->produced;
+    fps.ingested = mtctx->consumed + mtctx->inBuff.filled;
+    {   unsigned jobNb;
+        unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
+        DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
+                    mtctx->doneJobID, lastJobNb, mtctx->jobReady)
+        for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
+            unsigned const wJobID = jobNb & mtctx->jobIDMask;
+            ZSTD_pthread_mutex_lock(&mtctx->jobs[wJobID].job_mutex);
+            {   size_t const cResult = mtctx->jobs[wJobID].cSize;
+                size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
+                fps.consumed += mtctx->jobs[wJobID].consumed;
+                fps.ingested += mtctx->jobs[wJobID].src.size;
+                fps.produced += produced;
+            }
+            ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+        }
+    }
+    return fps;
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+static size_t ZSTDMT_computeTargetJobLog(ZSTD_CCtx_params const params)
+{
+    if (params.ldmParams.enableLdm)
+        return MAX(21, params.cParams.chainLog + 4);
+    return MAX(20, params.cParams.windowLog + 2);
+}
+
+static size_t ZSTDMT_computeOverlapLog(ZSTD_CCtx_params const params)
+{
+    unsigned const overlapRLog = (params.overlapSizeLog>9) ? 0 : 9-params.overlapSizeLog;
+    if (params.ldmParams.enableLdm)
+        return (MIN(params.cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2) - overlapRLog);
+    return overlapRLog >= 9 ? 0 : (params.cParams.windowLog - overlapRLog);
+}
+
+static unsigned ZSTDMT_computeNbJobs(ZSTD_CCtx_params params, size_t srcSize, unsigned nbWorkers) {
+    assert(nbWorkers>0);
+    {   size_t const jobSizeTarget = (size_t)1 << ZSTDMT_computeTargetJobLog(params);
+        size_t const jobMaxSize = jobSizeTarget << 2;
+        size_t const passSizeMax = jobMaxSize * nbWorkers;
+        unsigned const multiplier = (unsigned)(srcSize / passSizeMax) + 1;
+        unsigned const nbJobsLarge = multiplier * nbWorkers;
+        unsigned const nbJobsMax = (unsigned)(srcSize / jobSizeTarget) + 1;
+        unsigned const nbJobsSmall = MIN(nbJobsMax, nbWorkers);
+        return (multiplier>1) ? nbJobsLarge : nbJobsSmall;
+}   }
+
+/* ZSTDMT_compress_advanced_internal() :
+ * This is a blocking function : it will only give back control to caller after finishing its compression job.
+ */
+static size_t ZSTDMT_compress_advanced_internal(
+                ZSTDMT_CCtx* mtctx,
+                void* dst, size_t dstCapacity,
+          const void* src, size_t srcSize,
+          const ZSTD_CDict* cdict,
+                ZSTD_CCtx_params params)
+{
+    ZSTD_CCtx_params const jobParams = ZSTDMT_initJobCCtxParams(params);
+    size_t const overlapSize = (size_t)1 << ZSTDMT_computeOverlapLog(params);
+    unsigned const nbJobs = ZSTDMT_computeNbJobs(params, srcSize, params.nbWorkers);
+    size_t const proposedJobSize = (srcSize + (nbJobs-1)) / nbJobs;
+    size_t const avgJobSize = (((proposedJobSize-1) & 0x1FFFF) < 0x7FFF) ? proposedJobSize + 0xFFFF : proposedJobSize;   /* avoid too small last block */
+    const char* const srcStart = (const char*)src;
+    size_t remainingSrcSize = srcSize;
+    unsigned const compressWithinDst = (dstCapacity >= ZSTD_compressBound(srcSize)) ? nbJobs : (unsigned)(dstCapacity / ZSTD_compressBound(avgJobSize));  /* presumes avgJobSize >= 256 KB, which should be the case */
+    size_t frameStartPos = 0, dstBufferPos = 0;
+    assert(jobParams.nbWorkers == 0);
+    assert(mtctx->cctxPool->totalCCtx == params.nbWorkers);
+
+    params.jobSize = (U32)avgJobSize;
+    DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: nbJobs=%2u (rawSize=%u bytes; fixedSize=%u) ",
+                nbJobs, (U32)proposedJobSize, (U32)avgJobSize);
+
+    if ((nbJobs==1) | (params.nbWorkers<=1)) {   /* fallback to single-thread mode : this is a blocking invocation anyway */
+        ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0];
+        DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: fallback to single-thread mode");
+        if (cdict) return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, jobParams.fParams);
+        return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, NULL, 0, jobParams);
+    }
+
+    assert(avgJobSize >= 256 KB);  /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) );
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params))
+        return ERROR(memory_allocation);
+
+    if (nbJobs > mtctx->jobIDMask+1) {  /* enlarge job table */
+        U32 jobsTableSize = nbJobs;
+        ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+        mtctx->jobIDMask = 0;
+        mtctx->jobs = ZSTDMT_createJobsTable(&jobsTableSize, mtctx->cMem);
+        if (mtctx->jobs==NULL) return ERROR(memory_allocation);
+        assert((jobsTableSize != 0) && ((jobsTableSize & (jobsTableSize - 1)) == 0));  /* ensure jobsTableSize is a power of 2 */
+        mtctx->jobIDMask = jobsTableSize - 1;
+    }
+
+    {   unsigned u;
+        for (u=0; u<nbJobs; u++) {
+            size_t const jobSize = MIN(remainingSrcSize, avgJobSize);
+            size_t const dstBufferCapacity = ZSTD_compressBound(jobSize);
+            buffer_t const dstAsBuffer = { (char*)dst + dstBufferPos, dstBufferCapacity };
+            buffer_t const dstBuffer = u < compressWithinDst ? dstAsBuffer : g_nullBuffer;
+            size_t dictSize = u ? overlapSize : 0;
+
+            mtctx->jobs[u].prefix.start = srcStart + frameStartPos - dictSize;
+            mtctx->jobs[u].prefix.size = dictSize;
+            mtctx->jobs[u].src.start = srcStart + frameStartPos;
+            mtctx->jobs[u].src.size = jobSize; assert(jobSize > 0);  /* avoid job.src.size == 0 */
+            mtctx->jobs[u].consumed = 0;
+            mtctx->jobs[u].cSize = 0;
+            mtctx->jobs[u].cdict = (u==0) ? cdict : NULL;
+            mtctx->jobs[u].fullFrameSize = srcSize;
+            mtctx->jobs[u].params = jobParams;
+            /* do not calculate checksum within sections, but write it in header for first section */
+            mtctx->jobs[u].dstBuff = dstBuffer;
+            mtctx->jobs[u].cctxPool = mtctx->cctxPool;
+            mtctx->jobs[u].bufPool = mtctx->bufPool;
+            mtctx->jobs[u].seqPool = mtctx->seqPool;
+            mtctx->jobs[u].serial = &mtctx->serial;
+            mtctx->jobs[u].jobID = u;
+            mtctx->jobs[u].firstJob = (u==0);
+            mtctx->jobs[u].lastJob = (u==nbJobs-1);
+
+            DEBUGLOG(5, "ZSTDMT_compress_advanced_internal: posting job %u  (%u bytes)", u, (U32)jobSize);
+            DEBUG_PRINTHEX(6, mtctx->jobs[u].prefix.start, 12);
+            POOL_add(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[u]);
+
+            frameStartPos += jobSize;
+            dstBufferPos += dstBufferCapacity;
+            remainingSrcSize -= jobSize;
+    }   }
+
+    /* collect result */
+    {   size_t error = 0, dstPos = 0;
+        unsigned jobID;
+        for (jobID=0; jobID<nbJobs; jobID++) {
+            DEBUGLOG(5, "waiting for job %u ", jobID);
+            ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
+            while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
+                DEBUGLOG(5, "waiting for jobCompleted signal from job %u", jobID);
+                ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
+            }
+            ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
+            DEBUGLOG(5, "ready to write job %u ", jobID);
+
+            {   size_t const cSize = mtctx->jobs[jobID].cSize;
+                if (ZSTD_isError(cSize)) error = cSize;
+                if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall);
+                if (jobID) {   /* note : job 0 is written directly at dst, which is correct position */
+                    if (!error)
+                        memmove((char*)dst + dstPos, mtctx->jobs[jobID].dstBuff.start, cSize);  /* may overlap when job compressed within dst */
+                    if (jobID >= compressWithinDst) {  /* job compressed into its own buffer, which must be released */
+                        DEBUGLOG(5, "releasing buffer %u>=%u", jobID, compressWithinDst);
+                        ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
+                }   }
+                mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+                mtctx->jobs[jobID].cSize = 0;
+                dstPos += cSize ;
+            }
+        }  /* for (jobID=0; jobID<nbJobs; jobID++) */
+
+        DEBUGLOG(4, "checksumFlag : %u ", params.fParams.checksumFlag);
+        if (params.fParams.checksumFlag) {
+            U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
+            if (dstPos + 4 > dstCapacity) {
+                error = ERROR(dstSize_tooSmall);
+            } else {
+                DEBUGLOG(4, "writing checksum : %08X \n", checksum);
+                MEM_writeLE32((char*)dst + dstPos, checksum);
+                dstPos += 4;
+        }   }
+
+        if (!error) DEBUGLOG(4, "compressed size : %u  ", (U32)dstPos);
+        return error ? error : dstPos;
+    }
+}
+
+size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const ZSTD_CDict* cdict,
+                               ZSTD_parameters params,
+                               unsigned overlapLog)
+{
+    ZSTD_CCtx_params cctxParams = mtctx->params;
+    cctxParams.cParams = params.cParams;
+    cctxParams.fParams = params.fParams;
+    cctxParams.overlapSizeLog = overlapLog;
+    return ZSTDMT_compress_advanced_internal(mtctx,
+                                             dst, dstCapacity,
+                                             src, srcSize,
+                                             cdict, cctxParams);
+}
+
+
+size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+                           void* dst, size_t dstCapacity,
+                     const void* src, size_t srcSize,
+                           int compressionLevel)
+{
+    U32 const overlapLog = (compressionLevel >= ZSTD_maxCLevel()) ? 9 : ZSTDMT_OVERLAPLOG_DEFAULT;
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0);
+    params.fParams.contentSizeFlag = 1;
+    return ZSTDMT_compress_advanced(mtctx, dst, dstCapacity, src, srcSize, NULL, params, overlapLog);
+}
+
+
+/* ====================================== */
+/* =======      Streaming API     ======= */
+/* ====================================== */
+
+size_t ZSTDMT_initCStream_internal(
+        ZSTDMT_CCtx* mtctx,
+        const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+        const ZSTD_CDict* cdict, ZSTD_CCtx_params params,
+        unsigned long long pledgedSrcSize)
+{
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u, disableLiteralCompression=%i)",
+                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx, params.disableLiteralCompression);
+    /* params are supposed to be fully validated at this point */
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
+    assert(mtctx->cctxPool->totalCCtx == params.nbWorkers);
+
+    /* init */
+    if (params.jobSize == 0) {
+        params.jobSize = 1U << ZSTDMT_computeTargetJobLog(params);
+    }
+    if (params.jobSize > ZSTDMT_JOBSIZE_MAX) params.jobSize = ZSTDMT_JOBSIZE_MAX;
+
+    mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN);  /* do not trigger multi-threading when srcSize is too small */
+    if (mtctx->singleBlockingThread) {
+        ZSTD_CCtx_params const singleThreadParams = ZSTDMT_initJobCCtxParams(params);
+        DEBUGLOG(5, "ZSTDMT_initCStream_internal: switch to single blocking thread mode");
+        assert(singleThreadParams.nbWorkers == 0);
+        return ZSTD_initCStream_internal(mtctx->cctxPool->cctx[0],
+                                         dict, dictSize, cdict,
+                                         singleThreadParams, pledgedSrcSize);
+    }
+
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers);
+
+    if (mtctx->allJobsCompleted == 0) {   /* previous compression not correctly finished */
+        ZSTDMT_waitForAllJobsCompleted(mtctx);
+        ZSTDMT_releaseAllJobResources(mtctx);
+        mtctx->allJobsCompleted = 1;
+    }
+
+    mtctx->params = params;
+    mtctx->frameContentSize = pledgedSrcSize;
+    if (dict) {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize,
+                                                    ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */
+                                                    params.cParams, mtctx->cMem);
+        mtctx->cdict = mtctx->cdictLocal;
+        if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation);
+    } else {
+        ZSTD_freeCDict(mtctx->cdictLocal);
+        mtctx->cdictLocal = NULL;
+        mtctx->cdict = cdict;
+    }
+
+    mtctx->targetPrefixSize = (size_t)1 << ZSTDMT_computeOverlapLog(params);
+    DEBUGLOG(4, "overlapLog=%u => %u KB", params.overlapSizeLog, (U32)(mtctx->targetPrefixSize>>10));
+    mtctx->targetSectionSize = params.jobSize;
+    if (mtctx->targetSectionSize < ZSTDMT_JOBSIZE_MIN) mtctx->targetSectionSize = ZSTDMT_JOBSIZE_MIN;
+    if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize;  /* job size must be >= overlap size */
+    DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize);
+    DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
+    ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
+    {
+        /* If ldm is enabled we need windowSize space. */
+        size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0;
+        /* Two buffers of slack, plus extra space for the overlap
+         * This is the minimum slack that LDM works with. One extra because
+         * flush might waste up to targetSectionSize-1 bytes. Another extra
+         * for the overlap (if > 0), then one to fill which doesn't overlap
+         * with the LDM window.
+         */
+        size_t const nbSlackBuffers = 2 + (mtctx->targetPrefixSize > 0);
+        size_t const slackSize = mtctx->targetSectionSize * nbSlackBuffers;
+        /* Compute the total size, and always have enough slack */
+        size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1);
+        size_t const sectionsSize = mtctx->targetSectionSize * nbWorkers;
+        size_t const capacity = MAX(windowSize, sectionsSize) + slackSize;
+        if (mtctx->roundBuff.capacity < capacity) {
+            if (mtctx->roundBuff.buffer)
+                ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
+            mtctx->roundBuff.buffer = (BYTE*)ZSTD_malloc(capacity, mtctx->cMem);
+            if (mtctx->roundBuff.buffer == NULL) {
+                mtctx->roundBuff.capacity = 0;
+                return ERROR(memory_allocation);
+            }
+            mtctx->roundBuff.capacity = capacity;
+        }
+    }
+    DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10));
+    mtctx->roundBuff.pos = 0;
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->inBuff.filled = 0;
+    mtctx->inBuff.prefix = kNullRange;
+    mtctx->doneJobID = 0;
+    mtctx->nextJobID = 0;
+    mtctx->frameEnded = 0;
+    mtctx->allJobsCompleted = 0;
+    mtctx->consumed = 0;
+    mtctx->produced = 0;
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params))
+        return ERROR(memory_allocation);
+    return 0;
+}
+
+size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params,
+                                   unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params cctxParams = mtctx->params;  /* retrieve sticky params */
+    DEBUGLOG(4, "ZSTDMT_initCStream_advanced (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
+    cctxParams.cParams = params.cParams;
+    cctxParams.fParams = params.fParams;
+    return ZSTDMT_initCStream_internal(mtctx, dict, dictSize, ZSTD_dct_auto, NULL,
+                                       cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+                               const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize)
+{
+    ZSTD_CCtx_params cctxParams = mtctx->params;
+    if (cdict==NULL) return ERROR(dictionary_wrong);   /* method incompatible with NULL cdict */
+    cctxParams.cParams = ZSTD_getCParamsFromCDict(cdict);
+    cctxParams.fParams = fParams;
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0 /*dictSize*/, ZSTD_dct_auto, cdict,
+                                       cctxParams, pledgedSrcSize);
+}
+
+
+/* ZSTDMT_resetCStream() :
+ * pledgedSrcSize can be zero == unknown (for the time being)
+ * prefer using ZSTD_CONTENTSIZE_UNKNOWN,
+ * as `0` might mean "empty" in the future */
+size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize)
+{
+    if (!pledgedSrcSize) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, 0, mtctx->params,
+                                       pledgedSrcSize);
+}
+
+size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel) {
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    ZSTD_CCtx_params cctxParams = mtctx->params;   /* retrieve sticky params */
+    DEBUGLOG(4, "ZSTDMT_initCStream (cLevel=%i)", compressionLevel);
+    cctxParams.cParams = params.cParams;
+    cctxParams.fParams = params.fParams;
+    return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+
+/* ZSTDMT_writeLastEmptyBlock()
+ * Write a single empty block with an end-of-frame to finish a frame.
+ * Job must be created from streaming variant.
+ * This function is always successfull if expected conditions are fulfilled.
+ */
+static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job)
+{
+    assert(job->lastJob == 1);
+    assert(job->src.size == 0);   /* last job is empty -> will be simplified into a last empty block */
+    assert(job->firstJob == 0);   /* cannot be first job, as it also needs to create frame header */
+    assert(job->dstBuff.start == NULL);   /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */
+    job->dstBuff = ZSTDMT_getBuffer(job->bufPool);
+    if (job->dstBuff.start == NULL) {
+      job->cSize = ERROR(memory_allocation);
+      return;
+    }
+    assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize);   /* no buffer should ever be that small */
+    job->src = kNullRange;
+    job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity);
+    assert(!ZSTD_isError(job->cSize));
+    assert(job->consumed == 0);
+}
+
+static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZSTD_EndDirective endOp)
+{
+    unsigned const jobID = mtctx->nextJobID & mtctx->jobIDMask;
+    int const endFrame = (endOp == ZSTD_e_end);
+
+    if (mtctx->nextJobID > mtctx->doneJobID + mtctx->jobIDMask) {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: will not create new job : table is full");
+        assert((mtctx->nextJobID & mtctx->jobIDMask) == (mtctx->doneJobID & mtctx->jobIDMask));
+        return 0;
+    }
+
+    if (!mtctx->jobReady) {
+        BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start;
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ",
+                    mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size);
+        mtctx->jobs[jobID].src.start = src;
+        mtctx->jobs[jobID].src.size = srcSize;
+        assert(mtctx->inBuff.filled >= srcSize);
+        mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix;
+        mtctx->jobs[jobID].consumed = 0;
+        mtctx->jobs[jobID].cSize = 0;
+        mtctx->jobs[jobID].params = mtctx->params;
+        mtctx->jobs[jobID].cdict = mtctx->nextJobID==0 ? mtctx->cdict : NULL;
+        mtctx->jobs[jobID].fullFrameSize = mtctx->frameContentSize;
+        mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+        mtctx->jobs[jobID].cctxPool = mtctx->cctxPool;
+        mtctx->jobs[jobID].bufPool = mtctx->bufPool;
+        mtctx->jobs[jobID].seqPool = mtctx->seqPool;
+        mtctx->jobs[jobID].serial = &mtctx->serial;
+        mtctx->jobs[jobID].jobID = mtctx->nextJobID;
+        mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0);
+        mtctx->jobs[jobID].lastJob = endFrame;
+        mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag;
+        mtctx->jobs[jobID].dstFlushed = 0;
+
+        /* Update the round buffer pos and clear the input buffer to be reset */
+        mtctx->roundBuff.pos += srcSize;
+        mtctx->inBuff.buffer = g_nullBuffer;
+        mtctx->inBuff.filled = 0;
+        /* Set the prefix */
+        if (!endFrame) {
+            size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize);
+            mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize;
+            mtctx->inBuff.prefix.size = newPrefixSize;
+        } else {   /* endFrame==1 => no need for another input buffer */
+            mtctx->inBuff.prefix = kNullRange;
+            mtctx->frameEnded = endFrame;
+            if (mtctx->nextJobID == 0) {
+                /* single job exception : checksum is already calculated directly within worker thread */
+                mtctx->params.fParams.checksumFlag = 0;
+        }   }
+
+        if ( (srcSize == 0)
+          && (mtctx->nextJobID>0)/*single job must also write frame header*/ ) {
+            DEBUGLOG(5, "ZSTDMT_createCompressionJob: creating a last empty block to end frame");
+            assert(endOp == ZSTD_e_end);  /* only possible case : need to end the frame with an empty last block */
+            ZSTDMT_writeLastEmptyBlock(mtctx->jobs + jobID);
+            mtctx->nextJobID++;
+            return 0;
+        }
+    }
+
+    DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes  (end:%u, jobNb == %u (mod:%u))",
+                mtctx->nextJobID,
+                (U32)mtctx->jobs[jobID].src.size,
+                mtctx->jobs[jobID].lastJob,
+                mtctx->nextJobID,
+                jobID);
+    if (POOL_tryAdd(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[jobID])) {
+        mtctx->nextJobID++;
+        mtctx->jobReady = 0;
+    } else {
+        DEBUGLOG(5, "ZSTDMT_createCompressionJob: no worker available for job %u", mtctx->nextJobID);
+        mtctx->jobReady = 1;
+    }
+    return 0;
+}
+
+
+/*! ZSTDMT_flushProduced() :
+ * `output` : `pos` will be updated with amount of data flushed .
+ * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush .
+ * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
+static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, unsigned blockToFlush, ZSTD_EndDirective end)
+{
+    unsigned const wJobID = mtctx->doneJobID & mtctx->jobIDMask;
+    DEBUGLOG(5, "ZSTDMT_flushProduced (blocking:%u , job %u <= %u)",
+                blockToFlush, mtctx->doneJobID, mtctx->nextJobID);
+    assert(output->size >= output->pos);
+
+    ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+    if (  blockToFlush
+      && (mtctx->doneJobID < mtctx->nextJobID) ) {
+        assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize);
+        while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) {  /* nothing to flush */
+            if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) {
+                DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none",
+                            mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size);
+                break;
+            }
+            DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+            ZSTD_pthread_cond_wait(&mtctx->jobs[wJobID].job_cond, &mtctx->jobs[wJobID].job_mutex);  /* block when nothing to flush but some to come */
+    }   }
+
+    /* try to flush something */
+    {   size_t cSize = mtctx->jobs[wJobID].cSize;                  /* shared */
+        size_t const srcConsumed = mtctx->jobs[wJobID].consumed;   /* shared */
+        size_t const srcSize = mtctx->jobs[wJobID].src.size;        /* read-only, could be done after mutex lock, but no-declaration-after-statement */
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+        if (ZSTD_isError(cSize)) {
+            DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
+                        mtctx->doneJobID, ZSTD_getErrorName(cSize));
+            ZSTDMT_waitForAllJobsCompleted(mtctx);
+            ZSTDMT_releaseAllJobResources(mtctx);
+            return cSize;
+        }
+        /* add frame checksum if necessary (can only happen once) */
+        assert(srcConsumed <= srcSize);
+        if ( (srcConsumed == srcSize)   /* job completed -> worker no longer active */
+          && mtctx->jobs[wJobID].frameChecksumNeeded ) {
+            U32 const checksum = (U32)XXH64_digest(&mtctx->serial.xxhState);
+            DEBUGLOG(4, "ZSTDMT_flushProduced: writing checksum : %08X \n", checksum);
+            MEM_writeLE32((char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].cSize, checksum);
+            cSize += 4;
+            mtctx->jobs[wJobID].cSize += 4;  /* can write this shared value, as worker is no longer active */
+            mtctx->jobs[wJobID].frameChecksumNeeded = 0;
+        }
+        if (cSize > 0) {   /* compression is ongoing or completed */
+            size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);
+            DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
+                        (U32)toFlush, mtctx->doneJobID, (U32)srcConsumed, (U32)srcSize, (U32)cSize);
+            assert(mtctx->doneJobID < mtctx->nextJobID);
+            assert(cSize >= mtctx->jobs[wJobID].dstFlushed);
+            assert(mtctx->jobs[wJobID].dstBuff.start != NULL);
+            memcpy((char*)output->dst + output->pos,
+                   (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed,
+                   toFlush);
+            output->pos += toFlush;
+            mtctx->jobs[wJobID].dstFlushed += toFlush;  /* can write : this value is only used by mtctx */
+
+            if ( (srcConsumed == srcSize)    /* job completed */
+              && (mtctx->jobs[wJobID].dstFlushed == cSize) ) {   /* output buffer fully flushed => free this job position */
+                DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
+                        mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
+                ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
+                mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
+                mtctx->jobs[wJobID].cSize = 0;   /* ensure this job slot is considered "not started" in future check */
+                mtctx->consumed += srcSize;
+                mtctx->produced += cSize;
+                mtctx->doneJobID++;
+        }   }
+
+        /* return value : how many bytes left in buffer ; fake it to 1 when unknown but >0 */
+        if (cSize > mtctx->jobs[wJobID].dstFlushed) return (cSize - mtctx->jobs[wJobID].dstFlushed);
+        if (srcSize > srcConsumed) return 1;   /* current job not completely compressed */
+    }
+    if (mtctx->doneJobID < mtctx->nextJobID) return 1;   /* some more jobs ongoing */
+    if (mtctx->jobReady) return 1;      /* one job is ready to push, just not yet in the list */
+    if (mtctx->inBuff.filled > 0) return 1;   /* input is not empty, and still needs to be converted into a job */
+    mtctx->allJobsCompleted = mtctx->frameEnded;   /* all jobs are entirely flushed => if this one is last one, frame is completed */
+    if (end == ZSTD_e_end) return !mtctx->frameEnded;  /* for ZSTD_e_end, question becomes : is frame completed ? instead of : are internal buffers fully flushed ? */
+    return 0;   /* internal buffers fully flushed */
+}
+
+/**
+ * Returns the range of data used by the earliest job that is not yet complete.
+ * If the data of the first job is broken up into two segments, we cover both
+ * sections.
+ */
+static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx)
+{
+    unsigned const firstJobID = mtctx->doneJobID;
+    unsigned const lastJobID = mtctx->nextJobID;
+    unsigned jobID;
+
+    for (jobID = firstJobID; jobID < lastJobID; ++jobID) {
+        unsigned const wJobID = jobID & mtctx->jobIDMask;
+        size_t consumed;
+
+        ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
+        consumed = mtctx->jobs[wJobID].consumed;
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+
+        if (consumed < mtctx->jobs[wJobID].src.size) {
+            range_t range = mtctx->jobs[wJobID].prefix;
+            if (range.size == 0) {
+                /* Empty prefix */
+                range = mtctx->jobs[wJobID].src;
+            }
+            /* Job source in multiple segments not supported yet */
+            assert(range.start <= mtctx->jobs[wJobID].src.start);
+            return range;
+        }
+    }
+    return kNullRange;
+}
+
+/**
+ * Returns non-zero iff buffer and range overlap.
+ */
+static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
+{
+    BYTE const* const bufferStart = (BYTE const*)buffer.start;
+    BYTE const* const bufferEnd = bufferStart + buffer.capacity;
+    BYTE const* const rangeStart = (BYTE const*)range.start;
+    BYTE const* const rangeEnd = rangeStart + range.size;
+
+    if (rangeStart == NULL || bufferStart == NULL)
+        return 0;
+    /* Empty ranges cannot overlap */
+    if (bufferStart == bufferEnd || rangeStart == rangeEnd)
+        return 0;
+
+    return bufferStart < rangeEnd && rangeStart < bufferEnd;
+}
+
+static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
+{
+    range_t extDict;
+    range_t prefix;
+
+    extDict.start = window.dictBase + window.lowLimit;
+    extDict.size = window.dictLimit - window.lowLimit;
+
+    prefix.start = window.base + window.dictLimit;
+    prefix.size = window.nextSrc - (window.base + window.dictLimit);
+    DEBUGLOG(5, "extDict [0x%zx, 0x%zx)",
+                (size_t)extDict.start,
+                (size_t)extDict.start + extDict.size);
+    DEBUGLOG(5, "prefix  [0x%zx, 0x%zx)",
+                (size_t)prefix.start,
+                (size_t)prefix.start + prefix.size);
+
+    return ZSTDMT_isOverlapped(buffer, extDict)
+        || ZSTDMT_isOverlapped(buffer, prefix);
+}
+
+static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)
+{
+    if (mtctx->params.ldmParams.enableLdm) {
+        ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
+        DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
+                    (size_t)buffer.start,
+                    (size_t)buffer.start + buffer.capacity);
+        ZSTD_PTHREAD_MUTEX_LOCK(mutex);
+        while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {
+            DEBUGLOG(6, "Waiting for LDM to finish...");
+            ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
+        }
+        DEBUGLOG(6, "Done waiting for LDM to finish");
+        ZSTD_pthread_mutex_unlock(mutex);
+    }
+}
+
+/**
+ * Attempts to set the inBuff to the next section to fill.
+ * If any part of the new section is still in use we give up.
+ * Returns non-zero if the buffer is filled.
+ */
+static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
+{
+    range_t const inUse = ZSTDMT_getInputDataInUse(mtctx);
+    size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos;
+    size_t const target = mtctx->targetSectionSize;
+    buffer_t buffer;
+
+    assert(mtctx->inBuff.buffer.start == NULL);
+    assert(mtctx->roundBuff.capacity >= target);
+
+    if (spaceLeft < target) {
+        /* ZSTD_invalidateRepCodes() doesn't work for extDict variants.
+         * Simply copy the prefix to the beginning in that case.
+         */
+        BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
+        size_t const prefixSize = mtctx->inBuff.prefix.size;
+
+        buffer.start = start;
+        buffer.capacity = prefixSize;
+        if (ZSTDMT_isOverlapped(buffer, inUse)) {
+            DEBUGLOG(6, "Waiting for buffer...");
+            return 0;
+        }
+        ZSTDMT_waitForLdmComplete(mtctx, buffer);
+        memmove(start, mtctx->inBuff.prefix.start, prefixSize);
+        mtctx->inBuff.prefix.start = start;
+        mtctx->roundBuff.pos = prefixSize;
+    }
+    buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
+    buffer.capacity = target;
+
+    if (ZSTDMT_isOverlapped(buffer, inUse)) {
+        DEBUGLOG(6, "Waiting for buffer...");
+        return 0;
+    }
+    assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
+
+    ZSTDMT_waitForLdmComplete(mtctx, buffer);
+
+    DEBUGLOG(5, "Using prefix range [%zx, %zx)",
+                (size_t)mtctx->inBuff.prefix.start,
+                (size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
+    DEBUGLOG(5, "Using source range [%zx, %zx)",
+                (size_t)buffer.start,
+                (size_t)buffer.start + buffer.capacity);
+
+
+    mtctx->inBuff.buffer = buffer;
+    mtctx->inBuff.filled = 0;
+    assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
+    return 1;
+}
+
+
+/** ZSTDMT_compressStream_generic() :
+ *  internal use only - exposed to be invoked from zstd_compress.c
+ *  assumption : output and input are valid (pos <= size)
+ * @return : minimum amount of data remaining to flush, 0 if none */
+size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                     ZSTD_outBuffer* output,
+                                     ZSTD_inBuffer* input,
+                                     ZSTD_EndDirective endOp)
+{
+    unsigned forwardInputProgress = 0;
+    DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)",
+                (U32)endOp, (U32)(input->size - input->pos));
+    assert(output->pos <= output->size);
+    assert(input->pos  <= input->size);
+
+    if (mtctx->singleBlockingThread) {  /* delegate to single-thread (synchronous) */
+        return ZSTD_compressStream_generic(mtctx->cctxPool->cctx[0], output, input, endOp);
+    }
+
+    if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) {
+        /* current frame being ended. Only flush/end are allowed */
+        return ERROR(stage_wrong);
+    }
+
+    /* single-pass shortcut (note : synchronous-mode) */
+    if ( (mtctx->nextJobID == 0)      /* just started */
+      && (mtctx->inBuff.filled == 0)  /* nothing buffered */
+      && (!mtctx->jobReady)           /* no job already created */
+      && (endOp == ZSTD_e_end)        /* end order */
+      && (output->size - output->pos >= ZSTD_compressBound(input->size - input->pos)) ) { /* enough space in dst */
+        size_t const cSize = ZSTDMT_compress_advanced_internal(mtctx,
+                (char*)output->dst + output->pos, output->size - output->pos,
+                (const char*)input->src + input->pos, input->size - input->pos,
+                mtctx->cdict, mtctx->params);
+        if (ZSTD_isError(cSize)) return cSize;
+        input->pos = input->size;
+        output->pos += cSize;
+        mtctx->allJobsCompleted = 1;
+        mtctx->frameEnded = 1;
+        return 0;
+    }
+
+    /* fill input buffer */
+    if ( (!mtctx->jobReady)
+      && (input->size > input->pos) ) {   /* support NULL input */
+        if (mtctx->inBuff.buffer.start == NULL) {
+            assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */
+            if (!ZSTDMT_tryGetInputRange(mtctx)) {
+                /* It is only possible for this operation to fail if there are
+                 * still compression jobs ongoing.
+                 */
+                assert(mtctx->doneJobID != mtctx->nextJobID);
+            }
+        }
+        if (mtctx->inBuff.buffer.start != NULL) {
+            size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
+            assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize);
+            DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u",
+                        (U32)toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize);
+            memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, toLoad);
+            input->pos += toLoad;
+            mtctx->inBuff.filled += toLoad;
+            forwardInputProgress = toLoad>0;
+        }
+        if ((input->pos < input->size) && (endOp == ZSTD_e_end))
+            endOp = ZSTD_e_flush;   /* can't end now : not all input consumed */
+    }
+
+    if ( (mtctx->jobReady)
+      || (mtctx->inBuff.filled >= mtctx->targetSectionSize)  /* filled enough : let's compress */
+      || ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0))  /* something to flush : let's go */
+      || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) {   /* must finish the frame with a zero-size block */
+        size_t const jobSize = mtctx->inBuff.filled;
+        assert(mtctx->inBuff.filled <= mtctx->targetSectionSize);
+        CHECK_F( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) );
+    }
+
+    /* check for potential compressed data ready to be flushed */
+    {   size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
+        if (input->pos < input->size) return MAX(remainingToFlush, 1);  /* input not consumed : do not end flush yet */
+        return remainingToFlush;
+    }
+}
+
+
+size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    CHECK_F( ZSTDMT_compressStream_generic(mtctx, output, input, ZSTD_e_continue) );
+
+    /* recommended next input size : fill current input buffer */
+    return mtctx->targetSectionSize - mtctx->inBuff.filled;   /* note : could be zero when input buffer is fully filled and no more availability to create new job */
+}
+
+
+static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_EndDirective endFrame)
+{
+    size_t const srcSize = mtctx->inBuff.filled;
+    DEBUGLOG(5, "ZSTDMT_flushStream_internal");
+
+    if ( mtctx->jobReady     /* one job ready for a worker to pick up */
+      || (srcSize > 0)       /* still some data within input buffer */
+      || ((endFrame==ZSTD_e_end) && !mtctx->frameEnded)) {  /* need a last 0-size block to end frame */
+           DEBUGLOG(5, "ZSTDMT_flushStream_internal : create a new job (%u bytes, end:%u)",
+                        (U32)srcSize, (U32)endFrame);
+        CHECK_F( ZSTDMT_createCompressionJob(mtctx, srcSize, endFrame) );
+    }
+
+    /* check if there is any data available to flush */
+    return ZSTDMT_flushProduced(mtctx, output, 1 /* blockToFlush */, endFrame);
+}
+
+
+size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output)
+{
+    DEBUGLOG(5, "ZSTDMT_flushStream");
+    if (mtctx->singleBlockingThread)
+        return ZSTD_flushStream(mtctx->cctxPool->cctx[0], output);
+    return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_flush);
+}
+
+size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output)
+{
+    DEBUGLOG(4, "ZSTDMT_endStream");
+    if (mtctx->singleBlockingThread)
+        return ZSTD_endStream(mtctx->cctxPool->cctx[0], output);
+    return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_end);
+}

diff --git a/vendor/github.com/DataDog/zstd/zstdmt_compress.h b/vendor/github.com/DataDog/zstd/zstdmt_compress.h
new file mode 100755
index 0000000..f79e3b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstdmt_compress.h

@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ *        Some methods are still exposed (ZSTDLIB_API),
+ *        because it used to be the only way to invoke MT compression.
+ *        Now, it's recommended to use ZSTD_compress_generic() instead.
+ *        These methods will stop being exposed in a future version */
+
+/* ===   Dependencies   === */
+#include <stddef.h>                /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+#include "zstd.h"            /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+
+
+/* ===   Memory management   === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+                                                    ZSTD_customMem cMem);
+ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
+
+ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
+
+
+/* ===   Simple one-pass compression function   === */
+
+ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                       int compressionLevel);
+
+
+
+/* ===   Streaming functions   === */
+
+ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize);  /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
+
+ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);   /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);     /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+
+
+/* ===   Advanced functions and parameters  === */
+
+#ifndef ZSTDMT_JOBSIZE_MIN
+#  define ZSTDMT_JOBSIZE_MIN (1U << 20)   /* 1 MB - Minimum size of each compression job */
+#endif
+
+ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const ZSTD_CDict* cdict,
+                                           ZSTD_parameters params,
+                                           unsigned overlapLog);
+
+ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+                                        const void* dict, size_t dictSize,   /* dict can be released after init, a local copy is preserved within zcs */
+                                        ZSTD_parameters params,
+                                        unsigned long long pledgedSrcSize);  /* pledgedSrcSize is optional and can be zero == unknown */
+
+ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+                                        const ZSTD_CDict* cdict,
+                                        ZSTD_frameParameters fparams,
+                                        unsigned long long pledgedSrcSize);  /* note : zero means empty */
+
+/* ZSTDMT_parameter :
+ * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
+typedef enum {
+    ZSTDMT_p_jobSize,           /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
+    ZSTDMT_p_overlapSectionLog  /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
+} ZSTDMT_parameter;
+
+/* ZSTDMT_setMTCtxParameter() :
+ * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter.
+ * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__
+ * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value);
+
+
+/*! ZSTDMT_compressStream_generic() :
+ *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
+ *  depending on flush directive.
+ * @return : minimum amount of data still to be flushed
+ *           0 if fully flushed
+ *           or an error code
+ *  note : needs to be init using any ZSTD_initCStream*() variant */
+ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                                ZSTD_outBuffer* output,
+                                                ZSTD_inBuffer* input,
+                                                ZSTD_EndDirective endOp);
+
+
+/* ========================================================
+ * ===  Private interface, for use by ZSTD_compress.c   ===
+ * ===  Not exposed in libzstd. Never invoke directly   ===
+ * ======================================================== */
+
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
+
+/* ZSTDMT_CCtxParam_setNbWorkers()
+ * Set nbWorkers, and clamp it.
+ * Also reset jobSize and overlapLog */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/* ZSTDMT_getNbWorkers():
+ * @return nb threads currently active in mtctx.
+ * mtctx must be valid */
+unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx);
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+
+
+/*! ZSTDMT_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  @return : 0, or an error code */
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+                    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+                    const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTDMT_COMPRESS_H */

diff --git a/vendor/github.com/Shopify/sarama/.gitignore b/vendor/github.com/Shopify/sarama/.gitignore
new file mode 100644
index 0000000..6e362e4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/.gitignore

@@ -0,0 +1,27 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+*.test
+
+# Folders
+_obj
+_test
+.vagrant
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+
+coverage.txt
+profile.out

diff --git a/vendor/github.com/Shopify/sarama/.travis.yml b/vendor/github.com/Shopify/sarama/.travis.yml
new file mode 100644
index 0000000..eb54a0d
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/.travis.yml

@@ -0,0 +1,35 @@
+language: go
+go:
+- 1.10.x
+- 1.11.x
+
+env:
+  global:
+  - KAFKA_PEERS=localhost:9091,localhost:9092,localhost:9093,localhost:9094,localhost:9095
+  - TOXIPROXY_ADDR=http://localhost:8474
+  - KAFKA_INSTALL_ROOT=/home/travis/kafka
+  - KAFKA_HOSTNAME=localhost
+  - DEBUG=true
+  matrix:
+  - KAFKA_VERSION=1.1.1
+  - KAFKA_VERSION=2.0.1
+  - KAFKA_VERSION=2.1.0
+
+before_install:
+- export REPOSITORY_ROOT=${TRAVIS_BUILD_DIR}
+- vagrant/install_cluster.sh
+- vagrant/boot_cluster.sh
+- vagrant/create_topics.sh
+
+install: make install_dependencies
+
+script:
+- make test
+- make vet
+- make errcheck
+- if [[ "$TRAVIS_GO_VERSION" == 1.11* ]]; then make fmt; fi
+
+after_success:
+- bash <(curl -s https://codecov.io/bash)
+
+after_script: vagrant/halt_cluster.sh

diff --git a/vendor/github.com/Shopify/sarama/CHANGELOG.md b/vendor/github.com/Shopify/sarama/CHANGELOG.md
new file mode 100644
index 0000000..9f955d5
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/CHANGELOG.md

@@ -0,0 +1,623 @@
+# Changelog
+
+#### Version 1.20.0 (2018-12-10)
+
+New Features:
+ - Add support for zstd compression
+   ([#1170](https://github.com/Shopify/sarama/pull/1170)).
+ - Add support for Idempotent Producer
+   ([#1152](https://github.com/Shopify/sarama/pull/1152)).
+ - Add support support for Kafka 2.1.0
+   ([#1229](https://github.com/Shopify/sarama/pull/1229)).
+ - Add support support for OffsetCommit request/response pairs versions v1 to v5
+   ([#1201](https://github.com/Shopify/sarama/pull/1201)).
+ - Add support support for OffsetFetch request/response pair up to version v5
+   ([#1198](https://github.com/Shopify/sarama/pull/1198)).
+
+Improvements:
+ - Export broker's Rack setting
+   ([#1173](https://github.com/Shopify/sarama/pull/1173)).
+ - Always use latest patch version of Go on CI
+   ([#1202](https://github.com/Shopify/sarama/pull/1202)).
+ - Add error codes 61 to 72
+   ([#1195](https://github.com/Shopify/sarama/pull/1195)).
+
+Bug Fixes:
+ - Fix build without cgo
+   ([#1182](https://github.com/Shopify/sarama/pull/1182)).
+ - Fix go vet suggestion in consumer group file
+   ([#1209](https://github.com/Shopify/sarama/pull/1209)).
+ - Fix typos in code and comments
+   ([#1228](https://github.com/Shopify/sarama/pull/1228)).
+
+#### Version 1.19.0 (2018-09-27)
+
+New Features:
+ - Implement a higher-level consumer group
+   ([#1099](https://github.com/Shopify/sarama/pull/1099)).
+
+Improvements:
+ - Add support for Go 1.11
+   ([#1176](https://github.com/Shopify/sarama/pull/1176)).
+
+Bug Fixes:
+ - Fix encoding of `MetadataResponse` with version 2 and higher
+   ([#1174](https://github.com/Shopify/sarama/pull/1174)).
+ - Fix race condition in mock async producer
+   ([#1174](https://github.com/Shopify/sarama/pull/1174)).
+
+#### Version 1.18.0 (2018-09-07)
+
+New Features:
+ - Make `Partitioner.RequiresConsistency` vary per-message
+   ([#1112](https://github.com/Shopify/sarama/pull/1112)).
+ - Add customizable partitioner
+   ([#1118](https://github.com/Shopify/sarama/pull/1118)).
+ - Add `ClusterAdmin` support for `CreateTopic`, `DeleteTopic`, `CreatePartitions`,
+   `DeleteRecords`, `DescribeConfig`, `AlterConfig`, `CreateACL`, `ListAcls`, `DeleteACL`
+   ([#1055](https://github.com/Shopify/sarama/pull/1055)).
+
+Improvements:
+ - Add support for Kafka 2.0.0
+   ([#1149](https://github.com/Shopify/sarama/pull/1149)).
+ - Allow setting `LocalAddr` when dialing an address to support multi-homed hosts
+   ([#1123](https://github.com/Shopify/sarama/pull/1123)).
+ - Simpler offset management
+   ([#1127](https://github.com/Shopify/sarama/pull/1127)).
+
+Bug Fixes:
+ - Fix mutation of `ProducerMessage.MetaData` when producing to Kafka
+   ([#1110](https://github.com/Shopify/sarama/pull/1110)).
+ - Fix consumer block when response did not contain all the
+   expected topic/partition blocks
+   ([#1086](https://github.com/Shopify/sarama/pull/1086)).
+ - Fix consumer block when response contains only constrol messages
+   ([#1115](https://github.com/Shopify/sarama/pull/1115)).
+ - Add timeout config for ClusterAdmin requests
+   ([#1142](https://github.com/Shopify/sarama/pull/1142)).
+ - Add version check when producing message with headers
+   ([#1117](https://github.com/Shopify/sarama/pull/1117)).
+ - Fix `MetadataRequest` for empty list of topics
+   ([#1132](https://github.com/Shopify/sarama/pull/1132)).
+ - Fix producer topic metadata on-demand fetch when topic error happens in metadata response
+   ([#1125](https://github.com/Shopify/sarama/pull/1125)).
+
+#### Version 1.17.0 (2018-05-30)
+
+New Features:
+ - Add support for gzip compression levels
+   ([#1044](https://github.com/Shopify/sarama/pull/1044)).
+ - Add support for Metadata request/response pairs versions v1 to v5
+   ([#1047](https://github.com/Shopify/sarama/pull/1047),
+    [#1069](https://github.com/Shopify/sarama/pull/1069)).
+ - Add versioning to JoinGroup request/response pairs
+   ([#1098](https://github.com/Shopify/sarama/pull/1098))
+ - Add support for CreatePartitions, DeleteGroups, DeleteRecords request/response pairs
+   ([#1065](https://github.com/Shopify/sarama/pull/1065),
+    [#1096](https://github.com/Shopify/sarama/pull/1096),
+    [#1027](https://github.com/Shopify/sarama/pull/1027)).
+ - Add `Controller()` method to Client interface
+   ([#1063](https://github.com/Shopify/sarama/pull/1063)).
+
+Improvements:
+ - ConsumerMetadataReq/Resp has been migrated to FindCoordinatorReq/Resp
+   ([#1010](https://github.com/Shopify/sarama/pull/1010)).
+ - Expose missing protocol parts: `msgSet` and `recordBatch`
+   ([#1049](https://github.com/Shopify/sarama/pull/1049)).
+ - Add support for v1 DeleteTopics Request
+   ([#1052](https://github.com/Shopify/sarama/pull/1052)).
+ - Add support for Go 1.10
+   ([#1064](https://github.com/Shopify/sarama/pull/1064)).
+ - Claim support for Kafka 1.1.0
+   ([#1073](https://github.com/Shopify/sarama/pull/1073)).
+
+Bug Fixes:
+ - Fix FindCoordinatorResponse.encode to allow nil Coordinator
+   ([#1050](https://github.com/Shopify/sarama/pull/1050),
+    [#1051](https://github.com/Shopify/sarama/pull/1051)).
+ - Clear all metadata when we have the latest topic info
+   ([#1033](https://github.com/Shopify/sarama/pull/1033)).
+ - Make `PartitionConsumer.Close` idempotent
+   ([#1092](https://github.com/Shopify/sarama/pull/1092)).
+
+#### Version 1.16.0 (2018-02-12)
+
+New Features:
+ - Add support for the Create/Delete Topics request/response pairs
+   ([#1007](https://github.com/Shopify/sarama/pull/1007),
+    [#1008](https://github.com/Shopify/sarama/pull/1008)).
+ - Add support for the Describe/Create/Delete ACL request/response pairs
+   ([#1009](https://github.com/Shopify/sarama/pull/1009)).
+ - Add support for the five transaction-related request/response pairs
+   ([#1016](https://github.com/Shopify/sarama/pull/1016)).
+
+Improvements:
+ - Permit setting version on mock producer responses
+   ([#999](https://github.com/Shopify/sarama/pull/999)).
+ - Add `NewMockBrokerListener` helper for testing TLS connections
+   ([#1019](https://github.com/Shopify/sarama/pull/1019)).
+ - Changed the default value for `Consumer.Fetch.Default` from 32KiB to 1MiB
+   which results in much higher throughput in most cases
+   ([#1024](https://github.com/Shopify/sarama/pull/1024)).
+ - Reuse the `time.Ticker` across fetch requests in the PartitionConsumer to
+   reduce CPU and memory usage when processing many partitions
+   ([#1028](https://github.com/Shopify/sarama/pull/1028)).
+ - Assign relative offsets to messages in the producer to save the brokers a
+   recompression pass
+   ([#1002](https://github.com/Shopify/sarama/pull/1002),
+    [#1015](https://github.com/Shopify/sarama/pull/1015)).
+
+Bug Fixes:
+ - Fix producing uncompressed batches with the new protocol format
+   ([#1032](https://github.com/Shopify/sarama/issues/1032)).
+ - Fix consuming compacted topics with the new protocol format
+   ([#1005](https://github.com/Shopify/sarama/issues/1005)).
+ - Fix consuming topics with a mix of protocol formats
+   ([#1021](https://github.com/Shopify/sarama/issues/1021)).
+ - Fix consuming when the broker includes multiple batches in a single response
+   ([#1022](https://github.com/Shopify/sarama/issues/1022)).
+ - Fix detection of `PartialTrailingMessage` when the partial message was
+   truncated before the magic value indicating its version
+   ([#1030](https://github.com/Shopify/sarama/pull/1030)).
+ - Fix expectation-checking in the mock of `SyncProducer.SendMessages`
+   ([#1035](https://github.com/Shopify/sarama/pull/1035)).
+
+#### Version 1.15.0 (2017-12-08)
+
+New Features:
+ - Claim official support for Kafka 1.0, though it did already work
+   ([#984](https://github.com/Shopify/sarama/pull/984)).
+ - Helper methods for Kafka version numbers to/from strings
+   ([#989](https://github.com/Shopify/sarama/pull/989)).
+ - Implement CreatePartitions request/response
+   ([#985](https://github.com/Shopify/sarama/pull/985)).
+
+Improvements:
+ - Add error codes 45-60
+   ([#986](https://github.com/Shopify/sarama/issues/986)).
+
+Bug Fixes:
+ - Fix slow consuming for certain Kafka 0.11/1.0 configurations
+   ([#982](https://github.com/Shopify/sarama/pull/982)).
+ - Correctly determine when a FetchResponse contains the new message format
+   ([#990](https://github.com/Shopify/sarama/pull/990)).
+ - Fix producing with multiple headers
+   ([#996](https://github.com/Shopify/sarama/pull/996)).
+ - Fix handling of truncated record batches
+   ([#998](https://github.com/Shopify/sarama/pull/998)).
+ - Fix leaking metrics when closing brokers
+   ([#991](https://github.com/Shopify/sarama/pull/991)).
+
+#### Version 1.14.0 (2017-11-13)
+
+New Features:
+ - Add support for the new Kafka 0.11 record-batch format, including the wire
+   protocol and the necessary behavioural changes in the producer and consumer.
+   Transactions and idempotency are not yet supported, but producing and
+   consuming should work with all the existing bells and whistles (batching,
+   compression, etc) as well as the new custom headers. Thanks to Vlad Hanciuta
+   of Arista Networks for this work. Part of
+   ([#901](https://github.com/Shopify/sarama/issues/901)).
+
+Bug Fixes:
+ - Fix encoding of ProduceResponse versions in test
+   ([#970](https://github.com/Shopify/sarama/pull/970)).
+ - Return partial replicas list when we have it
+   ([#975](https://github.com/Shopify/sarama/pull/975)).
+
+#### Version 1.13.0 (2017-10-04)
+
+New Features:
+ - Support for FetchRequest version 3
+   ([#905](https://github.com/Shopify/sarama/pull/905)).
+ - Permit setting version on mock FetchResponses
+   ([#939](https://github.com/Shopify/sarama/pull/939)).
+ - Add a configuration option to support storing only minimal metadata for
+   extremely large clusters
+   ([#937](https://github.com/Shopify/sarama/pull/937)).
+ - Add `PartitionOffsetManager.ResetOffset` for backtracking tracked offsets
+   ([#932](https://github.com/Shopify/sarama/pull/932)).
+
+Improvements:
+ - Provide the block-level timestamp when consuming compressed messages
+   ([#885](https://github.com/Shopify/sarama/issues/885)).
+ - `Client.Replicas` and `Client.InSyncReplicas` now respect the order returned
+   by the broker, which can be meaningful
+   ([#930](https://github.com/Shopify/sarama/pull/930)).
+ - Use a `Ticker` to reduce consumer timer overhead at the cost of higher
+   variance in the actual timeout
+   ([#933](https://github.com/Shopify/sarama/pull/933)).
+
+Bug Fixes:
+ - Gracefully handle messages with negative timestamps
+   ([#907](https://github.com/Shopify/sarama/pull/907)).
+ - Raise a proper error when encountering an unknown message version
+   ([#940](https://github.com/Shopify/sarama/pull/940)).
+
+#### Version 1.12.0 (2017-05-08)
+
+New Features:
+ - Added support for the `ApiVersions` request and response pair, and Kafka
+   version 0.10.2 ([#867](https://github.com/Shopify/sarama/pull/867)). Note
+   that you still need to specify the Kafka version in the Sarama configuration
+   for the time being.
+ - Added a `Brokers` method to the Client which returns the complete set of
+   active brokers ([#813](https://github.com/Shopify/sarama/pull/813)).
+ - Added an `InSyncReplicas` method to the Client which returns the set of all
+   in-sync broker IDs for the given partition, now that the Kafka versions for
+   which this was misleading are no longer in our supported set
+   ([#872](https://github.com/Shopify/sarama/pull/872)).
+ - Added a `NewCustomHashPartitioner` method which allows constructing a hash
+   partitioner with a custom hash method in case the default (FNV-1a) is not
+   suitable
+   ([#837](https://github.com/Shopify/sarama/pull/837),
+    [#841](https://github.com/Shopify/sarama/pull/841)).
+
+Improvements:
+ - Recognize more Kafka error codes
+   ([#859](https://github.com/Shopify/sarama/pull/859)).
+
+Bug Fixes:
+ - Fix an issue where decoding a malformed FetchRequest would not return the
+   correct error ([#818](https://github.com/Shopify/sarama/pull/818)).
+ - Respect ordering of group protocols in JoinGroupRequests. This fix is
+   transparent if you're using the `AddGroupProtocol` or
+   `AddGroupProtocolMetadata` helpers; otherwise you will need to switch from
+   the `GroupProtocols` field (now deprecated) to use `OrderedGroupProtocols`
+   ([#812](https://github.com/Shopify/sarama/issues/812)).
+ - Fix an alignment-related issue with atomics on 32-bit architectures
+   ([#859](https://github.com/Shopify/sarama/pull/859)).
+
+#### Version 1.11.0 (2016-12-20)
+
+_Important:_ As of Sarama 1.11 it is necessary to set the config value of
+`Producer.Return.Successes` to true in order to use the SyncProducer. Previous
+versions would silently override this value when instantiating a SyncProducer
+which led to unexpected values and data races.
+
+New Features:
+ - Metrics! Thanks to Sébastien Launay for all his work on this feature
+   ([#701](https://github.com/Shopify/sarama/pull/701),
+    [#746](https://github.com/Shopify/sarama/pull/746),
+    [#766](https://github.com/Shopify/sarama/pull/766)).
+ - Add support for LZ4 compression
+   ([#786](https://github.com/Shopify/sarama/pull/786)).
+ - Add support for ListOffsetRequest v1 and Kafka 0.10.1
+   ([#775](https://github.com/Shopify/sarama/pull/775)).
+ - Added a `HighWaterMarks` method to the Consumer which aggregates the
+   `HighWaterMarkOffset` values of its child topic/partitions
+   ([#769](https://github.com/Shopify/sarama/pull/769)).
+
+Bug Fixes:
+ - Fixed producing when using timestamps, compression and Kafka 0.10
+   ([#759](https://github.com/Shopify/sarama/pull/759)).
+ - Added missing decoder methods to DescribeGroups response
+   ([#756](https://github.com/Shopify/sarama/pull/756)).
+ - Fix producer shutdown when `Return.Errors` is disabled
+   ([#787](https://github.com/Shopify/sarama/pull/787)).
+ - Don't mutate configuration in SyncProducer
+   ([#790](https://github.com/Shopify/sarama/pull/790)).
+ - Fix crash on SASL initialization failure
+   ([#795](https://github.com/Shopify/sarama/pull/795)).
+
+#### Version 1.10.1 (2016-08-30)
+
+Bug Fixes:
+ - Fix the documentation for `HashPartitioner` which was incorrect
+   ([#717](https://github.com/Shopify/sarama/pull/717)).
+ - Permit client creation even when it is limited by ACLs
+   ([#722](https://github.com/Shopify/sarama/pull/722)).
+ - Several fixes to the consumer timer optimization code, regressions introduced
+   in v1.10.0. Go's timers are finicky
+   ([#730](https://github.com/Shopify/sarama/pull/730),
+    [#733](https://github.com/Shopify/sarama/pull/733),
+    [#734](https://github.com/Shopify/sarama/pull/734)).
+ - Handle consuming compressed relative offsets with Kafka 0.10
+   ([#735](https://github.com/Shopify/sarama/pull/735)).
+
+#### Version 1.10.0 (2016-08-02)
+
+_Important:_ As of Sarama 1.10 it is necessary to tell Sarama the version of
+Kafka you are running against (via the `config.Version` value) in order to use
+features that may not be compatible with old Kafka versions. If you don't
+specify this value it will default to 0.8.2 (the minimum supported), and trying
+to use more recent features (like the offset manager) will fail with an error.
+
+_Also:_ The offset-manager's behaviour has been changed to match the upstream
+java consumer (see [#705](https://github.com/Shopify/sarama/pull/705) and
+[#713](https://github.com/Shopify/sarama/pull/713)). If you use the
+offset-manager, please ensure that you are committing one *greater* than the
+last consumed message offset or else you may end up consuming duplicate
+messages.
+
+New Features:
+ - Support for Kafka 0.10
+   ([#672](https://github.com/Shopify/sarama/pull/672),
+    [#678](https://github.com/Shopify/sarama/pull/678),
+    [#681](https://github.com/Shopify/sarama/pull/681), and others).
+ - Support for configuring the target Kafka version
+   ([#676](https://github.com/Shopify/sarama/pull/676)).
+ - Batch producing support in the SyncProducer
+   ([#677](https://github.com/Shopify/sarama/pull/677)).
+ - Extend producer mock to allow setting expectations on message contents
+   ([#667](https://github.com/Shopify/sarama/pull/667)).
+
+Improvements:
+ - Support `nil` compressed messages for deleting in compacted topics
+   ([#634](https://github.com/Shopify/sarama/pull/634)).
+ - Pre-allocate decoding errors, greatly reducing heap usage and GC time against
+   misbehaving brokers ([#690](https://github.com/Shopify/sarama/pull/690)).
+ - Re-use consumer expiry timers, removing one allocation per consumed message
+   ([#707](https://github.com/Shopify/sarama/pull/707)).
+
+Bug Fixes:
+ - Actually default the client ID to "sarama" like we say we do
+   ([#664](https://github.com/Shopify/sarama/pull/664)).
+ - Fix a rare issue where `Client.Leader` could return the wrong error
+   ([#685](https://github.com/Shopify/sarama/pull/685)).
+ - Fix a possible tight loop in the consumer
+   ([#693](https://github.com/Shopify/sarama/pull/693)).
+ - Match upstream's offset-tracking behaviour
+   ([#705](https://github.com/Shopify/sarama/pull/705)).
+ - Report UnknownTopicOrPartition errors from the offset manager
+   ([#706](https://github.com/Shopify/sarama/pull/706)).
+ - Fix possible negative partition value from the HashPartitioner
+   ([#709](https://github.com/Shopify/sarama/pull/709)).
+
+#### Version 1.9.0 (2016-05-16)
+
+New Features:
+ - Add support for custom offset manager retention durations
+   ([#602](https://github.com/Shopify/sarama/pull/602)).
+ - Publish low-level mocks to enable testing of third-party producer/consumer
+   implementations ([#570](https://github.com/Shopify/sarama/pull/570)).
+ - Declare support for Golang 1.6
+   ([#611](https://github.com/Shopify/sarama/pull/611)).
+ - Support for SASL plain-text auth
+   ([#648](https://github.com/Shopify/sarama/pull/648)).
+
+Improvements:
+ - Simplified broker locking scheme slightly
+   ([#604](https://github.com/Shopify/sarama/pull/604)).
+ - Documentation cleanup
+   ([#605](https://github.com/Shopify/sarama/pull/605),
+    [#621](https://github.com/Shopify/sarama/pull/621),
+    [#654](https://github.com/Shopify/sarama/pull/654)).
+
+Bug Fixes:
+ - Fix race condition shutting down the OffsetManager
+   ([#658](https://github.com/Shopify/sarama/pull/658)).
+
+#### Version 1.8.0 (2016-02-01)
+
+New Features:
+ - Full support for Kafka 0.9:
+   - All protocol messages and fields
+   ([#586](https://github.com/Shopify/sarama/pull/586),
+   [#588](https://github.com/Shopify/sarama/pull/588),
+   [#590](https://github.com/Shopify/sarama/pull/590)).
+   - Verified that TLS support works
+   ([#581](https://github.com/Shopify/sarama/pull/581)).
+   - Fixed the OffsetManager compatibility
+   ([#585](https://github.com/Shopify/sarama/pull/585)).
+
+Improvements:
+ - Optimize for fewer system calls when reading from the network
+   ([#584](https://github.com/Shopify/sarama/pull/584)).
+ - Automatically retry `InvalidMessage` errors to match upstream behaviour
+   ([#589](https://github.com/Shopify/sarama/pull/589)).
+
+#### Version 1.7.0 (2015-12-11)
+
+New Features:
+ - Preliminary support for Kafka 0.9
+   ([#572](https://github.com/Shopify/sarama/pull/572)). This comes with several
+   caveats:
+   - Protocol-layer support is mostly in place
+     ([#577](https://github.com/Shopify/sarama/pull/577)), however Kafka 0.9
+     renamed some messages and fields, which we did not in order to preserve API
+     compatibility.
+   - The producer and consumer work against 0.9, but the offset manager does
+     not ([#573](https://github.com/Shopify/sarama/pull/573)).
+   - TLS support may or may not work
+     ([#581](https://github.com/Shopify/sarama/pull/581)).
+
+Improvements:
+ - Don't wait for request timeouts on dead brokers, greatly speeding recovery
+   when the TCP connection is left hanging
+   ([#548](https://github.com/Shopify/sarama/pull/548)).
+ - Refactored part of the producer. The new version provides a much more elegant
+   solution to [#449](https://github.com/Shopify/sarama/pull/449). It is also
+   slightly more efficient, and much more precise in calculating batch sizes
+   when compression is used
+   ([#549](https://github.com/Shopify/sarama/pull/549),
+   [#550](https://github.com/Shopify/sarama/pull/550),
+   [#551](https://github.com/Shopify/sarama/pull/551)).
+
+Bug Fixes:
+ - Fix race condition in consumer test mock
+   ([#553](https://github.com/Shopify/sarama/pull/553)).
+
+#### Version 1.6.1 (2015-09-25)
+
+Bug Fixes:
+ - Fix panic that could occur if a user-supplied message value failed to encode
+   ([#449](https://github.com/Shopify/sarama/pull/449)).
+
+#### Version 1.6.0 (2015-09-04)
+
+New Features:
+ - Implementation of a consumer offset manager using the APIs introduced in
+   Kafka 0.8.2. The API is designed mainly for integration into a future
+   high-level consumer, not for direct use, although it is *possible* to use it
+   directly.
+   ([#461](https://github.com/Shopify/sarama/pull/461)).
+
+Improvements:
+ - CRC32 calculation is much faster on machines with SSE4.2 instructions,
+   removing a major hotspot from most profiles
+   ([#255](https://github.com/Shopify/sarama/pull/255)).
+
+Bug Fixes:
+ - Make protocol decoding more robust against some malformed packets generated
+   by go-fuzz ([#523](https://github.com/Shopify/sarama/pull/523),
+   [#525](https://github.com/Shopify/sarama/pull/525)) or found in other ways
+   ([#528](https://github.com/Shopify/sarama/pull/528)).
+ - Fix a potential race condition panic in the consumer on shutdown
+   ([#529](https://github.com/Shopify/sarama/pull/529)).
+
+#### Version 1.5.0 (2015-08-17)
+
+New Features:
+ - TLS-encrypted network connections are now supported. This feature is subject
+   to change when Kafka releases built-in TLS support, but for now this is
+   enough to work with TLS-terminating proxies
+   ([#154](https://github.com/Shopify/sarama/pull/154)).
+
+Improvements:
+ - The consumer will not block if a single partition is not drained by the user;
+   all other partitions will continue to consume normally
+   ([#485](https://github.com/Shopify/sarama/pull/485)).
+ - Formatting of error strings has been much improved
+   ([#495](https://github.com/Shopify/sarama/pull/495)).
+ - Internal refactoring of the producer for code cleanliness and to enable
+   future work ([#300](https://github.com/Shopify/sarama/pull/300)).
+
+Bug Fixes:
+ - Fix a potential deadlock in the consumer on shutdown
+   ([#475](https://github.com/Shopify/sarama/pull/475)).
+
+#### Version 1.4.3 (2015-07-21)
+
+Bug Fixes:
+ - Don't include the partitioner in the producer's "fetch partitions"
+   circuit-breaker ([#466](https://github.com/Shopify/sarama/pull/466)).
+ - Don't retry messages until the broker is closed when abandoning a broker in
+   the producer ([#468](https://github.com/Shopify/sarama/pull/468)).
+ - Update the import path for snappy-go, it has moved again and the API has
+   changed slightly ([#486](https://github.com/Shopify/sarama/pull/486)).
+
+#### Version 1.4.2 (2015-05-27)
+
+Bug Fixes:
+ - Update the import path for snappy-go, it has moved from google code to github
+   ([#456](https://github.com/Shopify/sarama/pull/456)).
+
+#### Version 1.4.1 (2015-05-25)
+
+Improvements:
+ - Optimizations when decoding snappy messages, thanks to John Potocny
+   ([#446](https://github.com/Shopify/sarama/pull/446)).
+
+Bug Fixes:
+ - Fix hypothetical race conditions on producer shutdown
+   ([#450](https://github.com/Shopify/sarama/pull/450),
+   [#451](https://github.com/Shopify/sarama/pull/451)).
+
+#### Version 1.4.0 (2015-05-01)
+
+New Features:
+ - The consumer now implements `Topics()` and `Partitions()` methods to enable
+   users to dynamically choose what topics/partitions to consume without
+   instantiating a full client
+   ([#431](https://github.com/Shopify/sarama/pull/431)).
+ - The partition-consumer now exposes the high water mark offset value returned
+   by the broker via the `HighWaterMarkOffset()` method ([#339](https://github.com/Shopify/sarama/pull/339)).
+ - Added a `kafka-console-consumer` tool capable of handling multiple
+   partitions, and deprecated the now-obsolete `kafka-console-partitionConsumer`
+   ([#439](https://github.com/Shopify/sarama/pull/439),
+   [#442](https://github.com/Shopify/sarama/pull/442)).
+
+Improvements:
+ - The producer's logging during retry scenarios is more consistent, more
+   useful, and slightly less verbose
+   ([#429](https://github.com/Shopify/sarama/pull/429)).
+ - The client now shuffles its initial list of seed brokers in order to prevent
+   thundering herd on the first broker in the list
+   ([#441](https://github.com/Shopify/sarama/pull/441)).
+
+Bug Fixes:
+ - The producer now correctly manages its state if retries occur when it is
+   shutting down, fixing several instances of confusing behaviour and at least
+   one potential deadlock ([#419](https://github.com/Shopify/sarama/pull/419)).
+ - The consumer now handles messages for different partitions asynchronously,
+   making it much more resilient to specific user code ordering
+   ([#325](https://github.com/Shopify/sarama/pull/325)).
+
+#### Version 1.3.0 (2015-04-16)
+
+New Features:
+ - The client now tracks consumer group coordinators using
+   ConsumerMetadataRequests similar to how it tracks partition leadership using
+   regular MetadataRequests ([#411](https://github.com/Shopify/sarama/pull/411)).
+   This adds two methods to the client API:
+   - `Coordinator(consumerGroup string) (*Broker, error)`
+   - `RefreshCoordinator(consumerGroup string) error`
+
+Improvements:
+ - ConsumerMetadataResponses now automatically create a Broker object out of the
+   ID/address/port combination for the Coordinator; accessing the fields
+   individually has been deprecated
+   ([#413](https://github.com/Shopify/sarama/pull/413)).
+ - Much improved handling of `OffsetOutOfRange` errors in the consumer.
+   Consumers will fail to start if the provided offset is out of range
+   ([#418](https://github.com/Shopify/sarama/pull/418))
+   and they will automatically shut down if the offset falls out of range
+   ([#424](https://github.com/Shopify/sarama/pull/424)).
+ - Small performance improvement in encoding and decoding protocol messages
+   ([#427](https://github.com/Shopify/sarama/pull/427)).
+
+Bug Fixes:
+ - Fix a rare race condition in the client's background metadata refresher if
+   it happens to be activated while the client is being closed
+   ([#422](https://github.com/Shopify/sarama/pull/422)).
+
+#### Version 1.2.0 (2015-04-07)
+
+Improvements:
+ - The producer's behaviour when `Flush.Frequency` is set is now more intuitive
+   ([#389](https://github.com/Shopify/sarama/pull/389)).
+ - The producer is now somewhat more memory-efficient during and after retrying
+   messages due to an improved queue implementation
+   ([#396](https://github.com/Shopify/sarama/pull/396)).
+ - The consumer produces much more useful logging output when leadership
+   changes ([#385](https://github.com/Shopify/sarama/pull/385)).
+ - The client's `GetOffset` method will now automatically refresh metadata and
+   retry once in the event of stale information or similar
+   ([#394](https://github.com/Shopify/sarama/pull/394)).
+ - Broker connections now have support for using TCP keepalives
+   ([#407](https://github.com/Shopify/sarama/issues/407)).
+
+Bug Fixes:
+ - The OffsetCommitRequest message now correctly implements all three possible
+   API versions ([#390](https://github.com/Shopify/sarama/pull/390),
+   [#400](https://github.com/Shopify/sarama/pull/400)).
+
+#### Version 1.1.0 (2015-03-20)
+
+Improvements:
+ - Wrap the producer's partitioner call in a circuit-breaker so that repeatedly
+   broken topics don't choke throughput
+   ([#373](https://github.com/Shopify/sarama/pull/373)).
+
+Bug Fixes:
+ - Fix the producer's internal reference counting in certain unusual scenarios
+   ([#367](https://github.com/Shopify/sarama/pull/367)).
+ - Fix the consumer's internal reference counting in certain unusual scenarios
+   ([#369](https://github.com/Shopify/sarama/pull/369)).
+ - Fix a condition where the producer's internal control messages could have
+   gotten stuck ([#368](https://github.com/Shopify/sarama/pull/368)).
+ - Fix an issue where invalid partition lists would be cached when asking for
+   metadata for a non-existant topic ([#372](https://github.com/Shopify/sarama/pull/372)).
+
+
+#### Version 1.0.0 (2015-03-17)
+
+Version 1.0.0 is the first tagged version, and is almost a complete rewrite. The primary differences with previous untagged versions are:
+
+- The producer has been rewritten; there is now a `SyncProducer` with a blocking API, and an `AsyncProducer` that is non-blocking.
+- The consumer has been rewritten to only open one connection per broker instead of one connection per partition.
+- The main types of Sarama are now interfaces to make depedency injection easy; mock implementations for `Consumer`, `SyncProducer` and `AsyncProducer` are provided in the `github.com/Shopify/sarama/mocks` package.
+- For most uses cases, it is no longer necessary to open a `Client`; this will be done for you.
+- All the configuration values have been unified in the `Config` struct.
+- Much improved test suite.

diff --git a/vendor/github.com/Shopify/sarama/LICENSE b/vendor/github.com/Shopify/sarama/LICENSE
new file mode 100644
index 0000000..d2bf435
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/LICENSE

@@ -0,0 +1,20 @@
+Copyright (c) 2013 Shopify
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

diff --git a/vendor/github.com/Shopify/sarama/Makefile b/vendor/github.com/Shopify/sarama/Makefile
new file mode 100644
index 0000000..8fcf219
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/Makefile

@@ -0,0 +1,30 @@
+default: fmt vet errcheck test
+
+# Taken from https://github.com/codecov/example-go#caveat-multiple-files
+test:
+	echo "" > coverage.txt
+	for d in `go list ./... | grep -v vendor`; do \
+		go test -p 1 -v -timeout 240s -race -coverprofile=profile.out -covermode=atomic $$d || exit 1; \
+		if [ -f profile.out ]; then \
+			cat profile.out >> coverage.txt; \
+			rm profile.out; \
+		fi \
+	done
+
+vet:
+	go vet ./...
+
+# See https://github.com/kisielk/errcheck/pull/141 for details on ignorepkg
+errcheck:
+	errcheck -ignorepkg fmt github.com/Shopify/sarama/...
+
+fmt:
+	@if [ -n "$$(go fmt ./...)" ]; then echo 'Please run go fmt on your code.' && exit 1; fi
+
+install_dependencies: install_errcheck get
+
+install_errcheck:
+	go get github.com/kisielk/errcheck
+
+get:
+	go get -t

diff --git a/vendor/github.com/Shopify/sarama/README.md b/vendor/github.com/Shopify/sarama/README.md
new file mode 100644
index 0000000..b997093
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/README.md

@@ -0,0 +1,39 @@
+sarama
+======
+
+[![GoDoc](https://godoc.org/github.com/Shopify/sarama?status.png)](https://godoc.org/github.com/Shopify/sarama)
+[![Build Status](https://travis-ci.org/Shopify/sarama.svg?branch=master)](https://travis-ci.org/Shopify/sarama)
+[![Coverage](https://codecov.io/gh/Shopify/sarama/branch/master/graph/badge.svg)](https://codecov.io/gh/Shopify/sarama)
+
+Sarama is an MIT-licensed Go client library for [Apache Kafka](https://kafka.apache.org/) version 0.8 (and later).
+
+### Getting started
+
+- API documentation and examples are available via [godoc](https://godoc.org/github.com/Shopify/sarama).
+- Mocks for testing are available in the [mocks](./mocks) subpackage.
+- The [examples](./examples) directory contains more elaborate example applications.
+- The [tools](./tools) directory contains command line tools that can be useful for testing, diagnostics, and instrumentation.
+
+You might also want to look at the [Frequently Asked Questions](https://github.com/Shopify/sarama/wiki/Frequently-Asked-Questions).
+
+### Compatibility and API stability
+
+Sarama provides a "2 releases + 2 months" compatibility guarantee: we support
+the two latest stable releases of Kafka and Go, and we provide a two month
+grace period for older releases. This means we currently officially support
+Go 1.8 through 1.11, and Kafka 1.0 through 2.0, although older releases are
+still likely to work.
+
+Sarama follows semantic versioning and provides API stability via the gopkg.in service.
+You can import a version with a guaranteed stable API via http://gopkg.in/Shopify/sarama.v1.
+A changelog is available [here](CHANGELOG.md).
+
+### Contributing
+
+* Get started by checking our [contribution guidelines](https://github.com/Shopify/sarama/blob/master/.github/CONTRIBUTING.md).
+* Read the [Sarama wiki](https://github.com/Shopify/sarama/wiki) for more
+  technical and design details.
+* The [Kafka Protocol Specification](https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol)
+  contains a wealth of useful information.
+* For more general issues, there is [a google group](https://groups.google.com/forum/#!forum/kafka-clients) for Kafka client developers.
+* If you have any questions, just ask!

diff --git a/vendor/github.com/Shopify/sarama/Vagrantfile b/vendor/github.com/Shopify/sarama/Vagrantfile
new file mode 100644
index 0000000..f4b848a
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/Vagrantfile

@@ -0,0 +1,20 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
+VAGRANTFILE_API_VERSION = "2"
+
+# We have 5 * 192MB ZK processes and 5 * 320MB Kafka processes => 2560MB
+MEMORY = 3072
+
+Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
+  config.vm.box = "ubuntu/trusty64"
+
+  config.vm.provision :shell, path: "vagrant/provision.sh"
+
+  config.vm.network "private_network", ip: "192.168.100.67"
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = MEMORY
+  end
+end

diff --git a/vendor/github.com/Shopify/sarama/acl_bindings.go b/vendor/github.com/Shopify/sarama/acl_bindings.go
new file mode 100644
index 0000000..5151735
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_bindings.go

@@ -0,0 +1,119 @@
+package sarama
+
+type Resource struct {
+	ResourceType AclResourceType
+	ResourceName string
+}
+
+func (r *Resource) encode(pe packetEncoder) error {
+	pe.putInt8(int8(r.ResourceType))
+
+	if err := pe.putString(r.ResourceName); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *Resource) decode(pd packetDecoder, version int16) (err error) {
+	resourceType, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	r.ResourceType = AclResourceType(resourceType)
+
+	if r.ResourceName, err = pd.getString(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+type Acl struct {
+	Principal      string
+	Host           string
+	Operation      AclOperation
+	PermissionType AclPermissionType
+}
+
+func (a *Acl) encode(pe packetEncoder) error {
+	if err := pe.putString(a.Principal); err != nil {
+		return err
+	}
+
+	if err := pe.putString(a.Host); err != nil {
+		return err
+	}
+
+	pe.putInt8(int8(a.Operation))
+	pe.putInt8(int8(a.PermissionType))
+
+	return nil
+}
+
+func (a *Acl) decode(pd packetDecoder, version int16) (err error) {
+	if a.Principal, err = pd.getString(); err != nil {
+		return err
+	}
+
+	if a.Host, err = pd.getString(); err != nil {
+		return err
+	}
+
+	operation, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	a.Operation = AclOperation(operation)
+
+	permissionType, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	a.PermissionType = AclPermissionType(permissionType)
+
+	return nil
+}
+
+type ResourceAcls struct {
+	Resource
+	Acls []*Acl
+}
+
+func (r *ResourceAcls) encode(pe packetEncoder) error {
+	if err := r.Resource.encode(pe); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(r.Acls)); err != nil {
+		return err
+	}
+	for _, acl := range r.Acls {
+		if err := acl.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *ResourceAcls) decode(pd packetDecoder, version int16) error {
+	if err := r.Resource.decode(pd, version); err != nil {
+		return err
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Acls = make([]*Acl, n)
+	for i := 0; i < n; i++ {
+		r.Acls[i] = new(Acl)
+		if err := r.Acls[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_create_request.go b/vendor/github.com/Shopify/sarama/acl_create_request.go
new file mode 100644
index 0000000..0b6ecbe
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_create_request.go

@@ -0,0 +1,76 @@
+package sarama
+
+type CreateAclsRequest struct {
+	AclCreations []*AclCreation
+}
+
+func (c *CreateAclsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(c.AclCreations)); err != nil {
+		return err
+	}
+
+	for _, aclCreation := range c.AclCreations {
+		if err := aclCreation.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (c *CreateAclsRequest) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	c.AclCreations = make([]*AclCreation, n)
+
+	for i := 0; i < n; i++ {
+		c.AclCreations[i] = new(AclCreation)
+		if err := c.AclCreations[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *CreateAclsRequest) key() int16 {
+	return 30
+}
+
+func (d *CreateAclsRequest) version() int16 {
+	return 0
+}
+
+func (d *CreateAclsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type AclCreation struct {
+	Resource
+	Acl
+}
+
+func (a *AclCreation) encode(pe packetEncoder) error {
+	if err := a.Resource.encode(pe); err != nil {
+		return err
+	}
+	if err := a.Acl.encode(pe); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (a *AclCreation) decode(pd packetDecoder, version int16) (err error) {
+	if err := a.Resource.decode(pd, version); err != nil {
+		return err
+	}
+	if err := a.Acl.decode(pd, version); err != nil {
+		return err
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_create_response.go b/vendor/github.com/Shopify/sarama/acl_create_response.go
new file mode 100644
index 0000000..8a56f35
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_create_response.go

@@ -0,0 +1,88 @@
+package sarama
+
+import "time"
+
+type CreateAclsResponse struct {
+	ThrottleTime         time.Duration
+	AclCreationResponses []*AclCreationResponse
+}
+
+func (c *CreateAclsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(c.ThrottleTime / time.Millisecond))
+
+	if err := pe.putArrayLength(len(c.AclCreationResponses)); err != nil {
+		return err
+	}
+
+	for _, aclCreationResponse := range c.AclCreationResponses {
+		if err := aclCreationResponse.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (c *CreateAclsResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	c.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	c.AclCreationResponses = make([]*AclCreationResponse, n)
+	for i := 0; i < n; i++ {
+		c.AclCreationResponses[i] = new(AclCreationResponse)
+		if err := c.AclCreationResponses[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *CreateAclsResponse) key() int16 {
+	return 30
+}
+
+func (d *CreateAclsResponse) version() int16 {
+	return 0
+}
+
+func (d *CreateAclsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type AclCreationResponse struct {
+	Err    KError
+	ErrMsg *string
+}
+
+func (a *AclCreationResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(a.Err))
+
+	if err := pe.putNullableString(a.ErrMsg); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (a *AclCreationResponse) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	a.Err = KError(kerr)
+
+	if a.ErrMsg, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_delete_request.go b/vendor/github.com/Shopify/sarama/acl_delete_request.go
new file mode 100644
index 0000000..4133dce
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_delete_request.go

@@ -0,0 +1,48 @@
+package sarama
+
+type DeleteAclsRequest struct {
+	Filters []*AclFilter
+}
+
+func (d *DeleteAclsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(d.Filters)); err != nil {
+		return err
+	}
+
+	for _, filter := range d.Filters {
+		if err := filter.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *DeleteAclsRequest) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	d.Filters = make([]*AclFilter, n)
+	for i := 0; i < n; i++ {
+		d.Filters[i] = new(AclFilter)
+		if err := d.Filters[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *DeleteAclsRequest) key() int16 {
+	return 31
+}
+
+func (d *DeleteAclsRequest) version() int16 {
+	return 0
+}
+
+func (d *DeleteAclsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_delete_response.go b/vendor/github.com/Shopify/sarama/acl_delete_response.go
new file mode 100644
index 0000000..b5e1c45
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_delete_response.go

@@ -0,0 +1,155 @@
+package sarama
+
+import "time"
+
+type DeleteAclsResponse struct {
+	ThrottleTime    time.Duration
+	FilterResponses []*FilterResponse
+}
+
+func (a *DeleteAclsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(a.ThrottleTime / time.Millisecond))
+
+	if err := pe.putArrayLength(len(a.FilterResponses)); err != nil {
+		return err
+	}
+
+	for _, filterResponse := range a.FilterResponses {
+		if err := filterResponse.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (a *DeleteAclsResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	a.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	a.FilterResponses = make([]*FilterResponse, n)
+
+	for i := 0; i < n; i++ {
+		a.FilterResponses[i] = new(FilterResponse)
+		if err := a.FilterResponses[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *DeleteAclsResponse) key() int16 {
+	return 31
+}
+
+func (d *DeleteAclsResponse) version() int16 {
+	return 0
+}
+
+func (d *DeleteAclsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type FilterResponse struct {
+	Err          KError
+	ErrMsg       *string
+	MatchingAcls []*MatchingAcl
+}
+
+func (f *FilterResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(f.Err))
+	if err := pe.putNullableString(f.ErrMsg); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(f.MatchingAcls)); err != nil {
+		return err
+	}
+	for _, matchingAcl := range f.MatchingAcls {
+		if err := matchingAcl.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (f *FilterResponse) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	f.Err = KError(kerr)
+
+	if f.ErrMsg, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	f.MatchingAcls = make([]*MatchingAcl, n)
+	for i := 0; i < n; i++ {
+		f.MatchingAcls[i] = new(MatchingAcl)
+		if err := f.MatchingAcls[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+type MatchingAcl struct {
+	Err    KError
+	ErrMsg *string
+	Resource
+	Acl
+}
+
+func (m *MatchingAcl) encode(pe packetEncoder) error {
+	pe.putInt16(int16(m.Err))
+	if err := pe.putNullableString(m.ErrMsg); err != nil {
+		return err
+	}
+
+	if err := m.Resource.encode(pe); err != nil {
+		return err
+	}
+
+	if err := m.Acl.encode(pe); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (m *MatchingAcl) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	m.Err = KError(kerr)
+
+	if m.ErrMsg, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	if err := m.Resource.decode(pd, version); err != nil {
+		return err
+	}
+
+	if err := m.Acl.decode(pd, version); err != nil {
+		return err
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_describe_request.go b/vendor/github.com/Shopify/sarama/acl_describe_request.go
new file mode 100644
index 0000000..02a5a1f
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_describe_request.go

@@ -0,0 +1,25 @@
+package sarama
+
+type DescribeAclsRequest struct {
+	AclFilter
+}
+
+func (d *DescribeAclsRequest) encode(pe packetEncoder) error {
+	return d.AclFilter.encode(pe)
+}
+
+func (d *DescribeAclsRequest) decode(pd packetDecoder, version int16) (err error) {
+	return d.AclFilter.decode(pd, version)
+}
+
+func (d *DescribeAclsRequest) key() int16 {
+	return 29
+}
+
+func (d *DescribeAclsRequest) version() int16 {
+	return 0
+}
+
+func (d *DescribeAclsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_describe_response.go b/vendor/github.com/Shopify/sarama/acl_describe_response.go
new file mode 100644
index 0000000..5bc9497
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_describe_response.go

@@ -0,0 +1,80 @@
+package sarama
+
+import "time"
+
+type DescribeAclsResponse struct {
+	ThrottleTime time.Duration
+	Err          KError
+	ErrMsg       *string
+	ResourceAcls []*ResourceAcls
+}
+
+func (d *DescribeAclsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(d.ThrottleTime / time.Millisecond))
+	pe.putInt16(int16(d.Err))
+
+	if err := pe.putNullableString(d.ErrMsg); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(d.ResourceAcls)); err != nil {
+		return err
+	}
+
+	for _, resourceAcl := range d.ResourceAcls {
+		if err := resourceAcl.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *DescribeAclsResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	d.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	d.Err = KError(kerr)
+
+	errmsg, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	if errmsg != "" {
+		d.ErrMsg = &errmsg
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	d.ResourceAcls = make([]*ResourceAcls, n)
+
+	for i := 0; i < n; i++ {
+		d.ResourceAcls[i] = new(ResourceAcls)
+		if err := d.ResourceAcls[i].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *DescribeAclsResponse) key() int16 {
+	return 29
+}
+
+func (d *DescribeAclsResponse) version() int16 {
+	return 0
+}
+
+func (d *DescribeAclsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_filter.go b/vendor/github.com/Shopify/sarama/acl_filter.go
new file mode 100644
index 0000000..9706354
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_filter.go

@@ -0,0 +1,61 @@
+package sarama
+
+type AclFilter struct {
+	ResourceType   AclResourceType
+	ResourceName   *string
+	Principal      *string
+	Host           *string
+	Operation      AclOperation
+	PermissionType AclPermissionType
+}
+
+func (a *AclFilter) encode(pe packetEncoder) error {
+	pe.putInt8(int8(a.ResourceType))
+	if err := pe.putNullableString(a.ResourceName); err != nil {
+		return err
+	}
+	if err := pe.putNullableString(a.Principal); err != nil {
+		return err
+	}
+	if err := pe.putNullableString(a.Host); err != nil {
+		return err
+	}
+	pe.putInt8(int8(a.Operation))
+	pe.putInt8(int8(a.PermissionType))
+
+	return nil
+}
+
+func (a *AclFilter) decode(pd packetDecoder, version int16) (err error) {
+	resourceType, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	a.ResourceType = AclResourceType(resourceType)
+
+	if a.ResourceName, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	if a.Principal, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	if a.Host, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	operation, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	a.Operation = AclOperation(operation)
+
+	permissionType, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	a.PermissionType = AclPermissionType(permissionType)
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/acl_types.go b/vendor/github.com/Shopify/sarama/acl_types.go
new file mode 100644
index 0000000..19da6f2
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/acl_types.go

@@ -0,0 +1,42 @@
+package sarama
+
+type AclOperation int
+
+// ref: https://github.com/apache/kafka/blob/trunk/clients/src/main/java/org/apache/kafka/common/acl/AclOperation.java
+const (
+	AclOperationUnknown         AclOperation = 0
+	AclOperationAny             AclOperation = 1
+	AclOperationAll             AclOperation = 2
+	AclOperationRead            AclOperation = 3
+	AclOperationWrite           AclOperation = 4
+	AclOperationCreate          AclOperation = 5
+	AclOperationDelete          AclOperation = 6
+	AclOperationAlter           AclOperation = 7
+	AclOperationDescribe        AclOperation = 8
+	AclOperationClusterAction   AclOperation = 9
+	AclOperationDescribeConfigs AclOperation = 10
+	AclOperationAlterConfigs    AclOperation = 11
+	AclOperationIdempotentWrite AclOperation = 12
+)
+
+type AclPermissionType int
+
+// ref: https://github.com/apache/kafka/blob/trunk/clients/src/main/java/org/apache/kafka/common/acl/AclPermissionType.java
+const (
+	AclPermissionUnknown AclPermissionType = 0
+	AclPermissionAny     AclPermissionType = 1
+	AclPermissionDeny    AclPermissionType = 2
+	AclPermissionAllow   AclPermissionType = 3
+)
+
+type AclResourceType int
+
+// ref: https://github.com/apache/kafka/blob/trunk/clients/src/main/java/org/apache/kafka/common/resource/ResourceType.java
+const (
+	AclResourceUnknown         AclResourceType = 0
+	AclResourceAny             AclResourceType = 1
+	AclResourceTopic           AclResourceType = 2
+	AclResourceGroup           AclResourceType = 3
+	AclResourceCluster         AclResourceType = 4
+	AclResourceTransactionalID AclResourceType = 5
+)

diff --git a/vendor/github.com/Shopify/sarama/add_offsets_to_txn_request.go b/vendor/github.com/Shopify/sarama/add_offsets_to_txn_request.go
new file mode 100644
index 0000000..6da166c
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/add_offsets_to_txn_request.go

@@ -0,0 +1,52 @@
+package sarama
+
+type AddOffsetsToTxnRequest struct {
+	TransactionalID string
+	ProducerID      int64
+	ProducerEpoch   int16
+	GroupID         string
+}
+
+func (a *AddOffsetsToTxnRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(a.TransactionalID); err != nil {
+		return err
+	}
+
+	pe.putInt64(a.ProducerID)
+
+	pe.putInt16(a.ProducerEpoch)
+
+	if err := pe.putString(a.GroupID); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (a *AddOffsetsToTxnRequest) decode(pd packetDecoder, version int16) (err error) {
+	if a.TransactionalID, err = pd.getString(); err != nil {
+		return err
+	}
+	if a.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if a.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+	if a.GroupID, err = pd.getString(); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (a *AddOffsetsToTxnRequest) key() int16 {
+	return 25
+}
+
+func (a *AddOffsetsToTxnRequest) version() int16 {
+	return 0
+}
+
+func (a *AddOffsetsToTxnRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/add_offsets_to_txn_response.go b/vendor/github.com/Shopify/sarama/add_offsets_to_txn_response.go
new file mode 100644
index 0000000..3a46151
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/add_offsets_to_txn_response.go

@@ -0,0 +1,44 @@
+package sarama
+
+import (
+	"time"
+)
+
+type AddOffsetsToTxnResponse struct {
+	ThrottleTime time.Duration
+	Err          KError
+}
+
+func (a *AddOffsetsToTxnResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(a.ThrottleTime / time.Millisecond))
+	pe.putInt16(int16(a.Err))
+	return nil
+}
+
+func (a *AddOffsetsToTxnResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	a.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	a.Err = KError(kerr)
+
+	return nil
+}
+
+func (a *AddOffsetsToTxnResponse) key() int16 {
+	return 25
+}
+
+func (a *AddOffsetsToTxnResponse) version() int16 {
+	return 0
+}
+
+func (a *AddOffsetsToTxnResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/add_partitions_to_txn_request.go b/vendor/github.com/Shopify/sarama/add_partitions_to_txn_request.go
new file mode 100644
index 0000000..a8a5922
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/add_partitions_to_txn_request.go

@@ -0,0 +1,76 @@
+package sarama
+
+type AddPartitionsToTxnRequest struct {
+	TransactionalID string
+	ProducerID      int64
+	ProducerEpoch   int16
+	TopicPartitions map[string][]int32
+}
+
+func (a *AddPartitionsToTxnRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(a.TransactionalID); err != nil {
+		return err
+	}
+	pe.putInt64(a.ProducerID)
+	pe.putInt16(a.ProducerEpoch)
+
+	if err := pe.putArrayLength(len(a.TopicPartitions)); err != nil {
+		return err
+	}
+	for topic, partitions := range a.TopicPartitions {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putInt32Array(partitions); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (a *AddPartitionsToTxnRequest) decode(pd packetDecoder, version int16) (err error) {
+	if a.TransactionalID, err = pd.getString(); err != nil {
+		return err
+	}
+	if a.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if a.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	a.TopicPartitions = make(map[string][]int32)
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		partitions, err := pd.getInt32Array()
+		if err != nil {
+			return err
+		}
+
+		a.TopicPartitions[topic] = partitions
+	}
+
+	return nil
+}
+
+func (a *AddPartitionsToTxnRequest) key() int16 {
+	return 24
+}
+
+func (a *AddPartitionsToTxnRequest) version() int16 {
+	return 0
+}
+
+func (a *AddPartitionsToTxnRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/add_partitions_to_txn_response.go b/vendor/github.com/Shopify/sarama/add_partitions_to_txn_response.go
new file mode 100644
index 0000000..581c556
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/add_partitions_to_txn_response.go

@@ -0,0 +1,108 @@
+package sarama
+
+import (
+	"time"
+)
+
+type AddPartitionsToTxnResponse struct {
+	ThrottleTime time.Duration
+	Errors       map[string][]*PartitionError
+}
+
+func (a *AddPartitionsToTxnResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(a.ThrottleTime / time.Millisecond))
+	if err := pe.putArrayLength(len(a.Errors)); err != nil {
+		return err
+	}
+
+	for topic, e := range a.Errors {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(e)); err != nil {
+			return err
+		}
+		for _, partitionError := range e {
+			if err := partitionError.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (a *AddPartitionsToTxnResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	a.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	a.Errors = make(map[string][]*PartitionError)
+
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		m, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		a.Errors[topic] = make([]*PartitionError, m)
+
+		for j := 0; j < m; j++ {
+			a.Errors[topic][j] = new(PartitionError)
+			if err := a.Errors[topic][j].decode(pd, version); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (a *AddPartitionsToTxnResponse) key() int16 {
+	return 24
+}
+
+func (a *AddPartitionsToTxnResponse) version() int16 {
+	return 0
+}
+
+func (a *AddPartitionsToTxnResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type PartitionError struct {
+	Partition int32
+	Err       KError
+}
+
+func (p *PartitionError) encode(pe packetEncoder) error {
+	pe.putInt32(p.Partition)
+	pe.putInt16(int16(p.Err))
+	return nil
+}
+
+func (p *PartitionError) decode(pd packetDecoder, version int16) (err error) {
+	if p.Partition, err = pd.getInt32(); err != nil {
+		return err
+	}
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	p.Err = KError(kerr)
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/admin.go b/vendor/github.com/Shopify/sarama/admin.go
new file mode 100644
index 0000000..5272575
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/admin.go

@@ -0,0 +1,382 @@
+package sarama
+
+import "errors"
+
+// ClusterAdmin is the administrative client for Kafka, which supports managing and inspecting topics,
+// brokers, configurations and ACLs. The minimum broker version required is 0.10.0.0.
+// Methods with stricter requirements will specify the minimum broker version required.
+// You MUST call Close() on a client to avoid leaks
+type ClusterAdmin interface {
+	// Creates a new topic. This operation is supported by brokers with version 0.10.1.0 or higher.
+	// It may take several seconds after CreateTopic returns success for all the brokers
+	// to become aware that the topic has been created. During this time, listTopics
+	// may not return information about the new topic.The validateOnly option is supported from version 0.10.2.0.
+	CreateTopic(topic string, detail *TopicDetail, validateOnly bool) error
+
+	// Delete a topic. It may take several seconds after the DeleteTopic to returns success
+	// and for all the brokers to become aware that the topics are gone.
+	// During this time, listTopics  may continue to return information about the deleted topic.
+	// If delete.topic.enable is false on the brokers, deleteTopic will mark
+	// the topic for deletion, but not actually delete them.
+	// This operation is supported by brokers with version 0.10.1.0 or higher.
+	DeleteTopic(topic string) error
+
+	// Increase the number of partitions of the topics  according to the corresponding values.
+	// If partitions are increased for a topic that has a key, the partition logic or ordering of
+	// the messages will be affected. It may take several seconds after this method returns
+	// success for all the brokers to become aware that the partitions have been created.
+	// During this time, ClusterAdmin#describeTopics may not return information about the
+	// new partitions. This operation is supported by brokers with version 1.0.0 or higher.
+	CreatePartitions(topic string, count int32, assignment [][]int32, validateOnly bool) error
+
+	// Delete records whose offset is smaller than the given offset of the corresponding partition.
+	// This operation is supported by brokers with version 0.11.0.0 or higher.
+	DeleteRecords(topic string, partitionOffsets map[int32]int64) error
+
+	// Get the configuration for the specified resources.
+	// The returned configuration includes default values and the Default is true
+	// can be used to distinguish them from user supplied values.
+	// Config entries where ReadOnly is true cannot be updated.
+	// The value of config entries where Sensitive is true is always nil so
+	// sensitive information is not disclosed.
+	// This operation is supported by brokers with version 0.11.0.0 or higher.
+	DescribeConfig(resource ConfigResource) ([]ConfigEntry, error)
+
+	// Update the configuration for the specified resources with the default options.
+	// This operation is supported by brokers with version 0.11.0.0 or higher.
+	// The resources with their configs (topic is the only resource type with configs
+	// that can be updated currently Updates are not transactional so they may succeed
+	// for some resources while fail for others. The configs for a particular resource are updated automatically.
+	AlterConfig(resourceType ConfigResourceType, name string, entries map[string]*string, validateOnly bool) error
+
+	// Creates access control lists (ACLs) which are bound to specific resources.
+	// This operation is not transactional so it may succeed for some ACLs while fail for others.
+	// If you attempt to add an ACL that duplicates an existing ACL, no error will be raised, but
+	// no changes will be made. This operation is supported by brokers with version 0.11.0.0 or higher.
+	CreateACL(resource Resource, acl Acl) error
+
+	// Lists access control lists (ACLs) according to the supplied filter.
+	// it may take some time for changes made by createAcls or deleteAcls to be reflected in the output of ListAcls
+	// This operation is supported by brokers with version 0.11.0.0 or higher.
+	ListAcls(filter AclFilter) ([]ResourceAcls, error)
+
+	// Deletes access control lists (ACLs) according to the supplied filters.
+	// This operation is not transactional so it may succeed for some ACLs while fail for others.
+	// This operation is supported by brokers with version 0.11.0.0 or higher.
+	DeleteACL(filter AclFilter, validateOnly bool) ([]MatchingAcl, error)
+
+	// Close shuts down the admin and closes underlying client.
+	Close() error
+}
+
+type clusterAdmin struct {
+	client Client
+	conf   *Config
+}
+
+// NewClusterAdmin creates a new ClusterAdmin using the given broker addresses and configuration.
+func NewClusterAdmin(addrs []string, conf *Config) (ClusterAdmin, error) {
+	client, err := NewClient(addrs, conf)
+	if err != nil {
+		return nil, err
+	}
+
+	//make sure we can retrieve the controller
+	_, err = client.Controller()
+	if err != nil {
+		return nil, err
+	}
+
+	ca := &clusterAdmin{
+		client: client,
+		conf:   client.Config(),
+	}
+	return ca, nil
+}
+
+func (ca *clusterAdmin) Close() error {
+	return ca.client.Close()
+}
+
+func (ca *clusterAdmin) Controller() (*Broker, error) {
+	return ca.client.Controller()
+}
+
+func (ca *clusterAdmin) CreateTopic(topic string, detail *TopicDetail, validateOnly bool) error {
+
+	if topic == "" {
+		return ErrInvalidTopic
+	}
+
+	if detail == nil {
+		return errors.New("You must specify topic details")
+	}
+
+	topicDetails := make(map[string]*TopicDetail)
+	topicDetails[topic] = detail
+
+	request := &CreateTopicsRequest{
+		TopicDetails: topicDetails,
+		ValidateOnly: validateOnly,
+		Timeout:      ca.conf.Admin.Timeout,
+	}
+
+	if ca.conf.Version.IsAtLeast(V0_11_0_0) {
+		request.Version = 1
+	}
+	if ca.conf.Version.IsAtLeast(V1_0_0_0) {
+		request.Version = 2
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	rsp, err := b.CreateTopics(request)
+	if err != nil {
+		return err
+	}
+
+	topicErr, ok := rsp.TopicErrors[topic]
+	if !ok {
+		return ErrIncompleteResponse
+	}
+
+	if topicErr.Err != ErrNoError {
+		return topicErr.Err
+	}
+
+	return nil
+}
+
+func (ca *clusterAdmin) DeleteTopic(topic string) error {
+
+	if topic == "" {
+		return ErrInvalidTopic
+	}
+
+	request := &DeleteTopicsRequest{
+		Topics:  []string{topic},
+		Timeout: ca.conf.Admin.Timeout,
+	}
+
+	if ca.conf.Version.IsAtLeast(V0_11_0_0) {
+		request.Version = 1
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	rsp, err := b.DeleteTopics(request)
+	if err != nil {
+		return err
+	}
+
+	topicErr, ok := rsp.TopicErrorCodes[topic]
+	if !ok {
+		return ErrIncompleteResponse
+	}
+
+	if topicErr != ErrNoError {
+		return topicErr
+	}
+	return nil
+}
+
+func (ca *clusterAdmin) CreatePartitions(topic string, count int32, assignment [][]int32, validateOnly bool) error {
+	if topic == "" {
+		return ErrInvalidTopic
+	}
+
+	topicPartitions := make(map[string]*TopicPartition)
+	topicPartitions[topic] = &TopicPartition{Count: count, Assignment: assignment}
+
+	request := &CreatePartitionsRequest{
+		TopicPartitions: topicPartitions,
+		Timeout:         ca.conf.Admin.Timeout,
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	rsp, err := b.CreatePartitions(request)
+	if err != nil {
+		return err
+	}
+
+	topicErr, ok := rsp.TopicPartitionErrors[topic]
+	if !ok {
+		return ErrIncompleteResponse
+	}
+
+	if topicErr.Err != ErrNoError {
+		return topicErr.Err
+	}
+
+	return nil
+}
+
+func (ca *clusterAdmin) DeleteRecords(topic string, partitionOffsets map[int32]int64) error {
+
+	if topic == "" {
+		return ErrInvalidTopic
+	}
+
+	topics := make(map[string]*DeleteRecordsRequestTopic)
+	topics[topic] = &DeleteRecordsRequestTopic{PartitionOffsets: partitionOffsets}
+	request := &DeleteRecordsRequest{
+		Topics:  topics,
+		Timeout: ca.conf.Admin.Timeout,
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	rsp, err := b.DeleteRecords(request)
+	if err != nil {
+		return err
+	}
+
+	_, ok := rsp.Topics[topic]
+	if !ok {
+		return ErrIncompleteResponse
+	}
+
+	//todo since we are dealing with couple of partitions it would be good if we return slice of errors
+	//for each partition instead of one error
+	return nil
+}
+
+func (ca *clusterAdmin) DescribeConfig(resource ConfigResource) ([]ConfigEntry, error) {
+
+	var entries []ConfigEntry
+	var resources []*ConfigResource
+	resources = append(resources, &resource)
+
+	request := &DescribeConfigsRequest{
+		Resources: resources,
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return nil, err
+	}
+
+	rsp, err := b.DescribeConfigs(request)
+	if err != nil {
+		return nil, err
+	}
+
+	for _, rspResource := range rsp.Resources {
+		if rspResource.Name == resource.Name {
+			if rspResource.ErrorMsg != "" {
+				return nil, errors.New(rspResource.ErrorMsg)
+			}
+			for _, cfgEntry := range rspResource.Configs {
+				entries = append(entries, *cfgEntry)
+			}
+		}
+	}
+	return entries, nil
+}
+
+func (ca *clusterAdmin) AlterConfig(resourceType ConfigResourceType, name string, entries map[string]*string, validateOnly bool) error {
+
+	var resources []*AlterConfigsResource
+	resources = append(resources, &AlterConfigsResource{
+		Type:          resourceType,
+		Name:          name,
+		ConfigEntries: entries,
+	})
+
+	request := &AlterConfigsRequest{
+		Resources:    resources,
+		ValidateOnly: validateOnly,
+	}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	rsp, err := b.AlterConfigs(request)
+	if err != nil {
+		return err
+	}
+
+	for _, rspResource := range rsp.Resources {
+		if rspResource.Name == name {
+			if rspResource.ErrorMsg != "" {
+				return errors.New(rspResource.ErrorMsg)
+			}
+		}
+	}
+	return nil
+}
+
+func (ca *clusterAdmin) CreateACL(resource Resource, acl Acl) error {
+	var acls []*AclCreation
+	acls = append(acls, &AclCreation{resource, acl})
+	request := &CreateAclsRequest{AclCreations: acls}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return err
+	}
+
+	_, err = b.CreateAcls(request)
+	return err
+}
+
+func (ca *clusterAdmin) ListAcls(filter AclFilter) ([]ResourceAcls, error) {
+
+	request := &DescribeAclsRequest{AclFilter: filter}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return nil, err
+	}
+
+	rsp, err := b.DescribeAcls(request)
+	if err != nil {
+		return nil, err
+	}
+
+	var lAcls []ResourceAcls
+	for _, rAcl := range rsp.ResourceAcls {
+		lAcls = append(lAcls, *rAcl)
+	}
+	return lAcls, nil
+}
+
+func (ca *clusterAdmin) DeleteACL(filter AclFilter, validateOnly bool) ([]MatchingAcl, error) {
+	var filters []*AclFilter
+	filters = append(filters, &filter)
+	request := &DeleteAclsRequest{Filters: filters}
+
+	b, err := ca.Controller()
+	if err != nil {
+		return nil, err
+	}
+
+	rsp, err := b.DeleteAcls(request)
+	if err != nil {
+		return nil, err
+	}
+
+	var mAcls []MatchingAcl
+	for _, fr := range rsp.FilterResponses {
+		for _, mACL := range fr.MatchingAcls {
+			mAcls = append(mAcls, *mACL)
+		}
+
+	}
+	return mAcls, nil
+}

diff --git a/vendor/github.com/Shopify/sarama/alter_configs_request.go b/vendor/github.com/Shopify/sarama/alter_configs_request.go
new file mode 100644
index 0000000..48c44ea
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/alter_configs_request.go

@@ -0,0 +1,120 @@
+package sarama
+
+type AlterConfigsRequest struct {
+	Resources    []*AlterConfigsResource
+	ValidateOnly bool
+}
+
+type AlterConfigsResource struct {
+	Type          ConfigResourceType
+	Name          string
+	ConfigEntries map[string]*string
+}
+
+func (acr *AlterConfigsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(acr.Resources)); err != nil {
+		return err
+	}
+
+	for _, r := range acr.Resources {
+		if err := r.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	pe.putBool(acr.ValidateOnly)
+	return nil
+}
+
+func (acr *AlterConfigsRequest) decode(pd packetDecoder, version int16) error {
+	resourceCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	acr.Resources = make([]*AlterConfigsResource, resourceCount)
+	for i := range acr.Resources {
+		r := &AlterConfigsResource{}
+		err = r.decode(pd, version)
+		if err != nil {
+			return err
+		}
+		acr.Resources[i] = r
+	}
+
+	validateOnly, err := pd.getBool()
+	if err != nil {
+		return err
+	}
+
+	acr.ValidateOnly = validateOnly
+
+	return nil
+}
+
+func (ac *AlterConfigsResource) encode(pe packetEncoder) error {
+	pe.putInt8(int8(ac.Type))
+
+	if err := pe.putString(ac.Name); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(ac.ConfigEntries)); err != nil {
+		return err
+	}
+	for configKey, configValue := range ac.ConfigEntries {
+		if err := pe.putString(configKey); err != nil {
+			return err
+		}
+		if err := pe.putNullableString(configValue); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (ac *AlterConfigsResource) decode(pd packetDecoder, version int16) error {
+	t, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	ac.Type = ConfigResourceType(t)
+
+	name, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	ac.Name = name
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		ac.ConfigEntries = make(map[string]*string, n)
+		for i := 0; i < n; i++ {
+			configKey, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			if ac.ConfigEntries[configKey], err = pd.getNullableString(); err != nil {
+				return err
+			}
+		}
+	}
+	return err
+}
+
+func (acr *AlterConfigsRequest) key() int16 {
+	return 33
+}
+
+func (acr *AlterConfigsRequest) version() int16 {
+	return 0
+}
+
+func (acr *AlterConfigsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/alter_configs_response.go b/vendor/github.com/Shopify/sarama/alter_configs_response.go
new file mode 100644
index 0000000..29b09e1
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/alter_configs_response.go

@@ -0,0 +1,95 @@
+package sarama
+
+import "time"
+
+type AlterConfigsResponse struct {
+	ThrottleTime time.Duration
+	Resources    []*AlterConfigsResourceResponse
+}
+
+type AlterConfigsResourceResponse struct {
+	ErrorCode int16
+	ErrorMsg  string
+	Type      ConfigResourceType
+	Name      string
+}
+
+func (ct *AlterConfigsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(ct.ThrottleTime / time.Millisecond))
+
+	if err := pe.putArrayLength(len(ct.Resources)); err != nil {
+		return err
+	}
+
+	for i := range ct.Resources {
+		pe.putInt16(ct.Resources[i].ErrorCode)
+		err := pe.putString(ct.Resources[i].ErrorMsg)
+		if err != nil {
+			return nil
+		}
+		pe.putInt8(int8(ct.Resources[i].Type))
+		err = pe.putString(ct.Resources[i].Name)
+		if err != nil {
+			return nil
+		}
+	}
+
+	return nil
+}
+
+func (acr *AlterConfigsResponse) decode(pd packetDecoder, version int16) error {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	acr.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	responseCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	acr.Resources = make([]*AlterConfigsResourceResponse, responseCount)
+
+	for i := range acr.Resources {
+		acr.Resources[i] = new(AlterConfigsResourceResponse)
+
+		errCode, err := pd.getInt16()
+		if err != nil {
+			return err
+		}
+		acr.Resources[i].ErrorCode = errCode
+
+		e, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		acr.Resources[i].ErrorMsg = e
+
+		t, err := pd.getInt8()
+		if err != nil {
+			return err
+		}
+		acr.Resources[i].Type = ConfigResourceType(t)
+
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		acr.Resources[i].Name = name
+	}
+
+	return nil
+}
+
+func (r *AlterConfigsResponse) key() int16 {
+	return 32
+}
+
+func (r *AlterConfigsResponse) version() int16 {
+	return 0
+}
+
+func (r *AlterConfigsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/api_versions_request.go b/vendor/github.com/Shopify/sarama/api_versions_request.go
new file mode 100644
index 0000000..ab65f01
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/api_versions_request.go

@@ -0,0 +1,24 @@
+package sarama
+
+type ApiVersionsRequest struct {
+}
+
+func (r *ApiVersionsRequest) encode(pe packetEncoder) error {
+	return nil
+}
+
+func (r *ApiVersionsRequest) decode(pd packetDecoder, version int16) (err error) {
+	return nil
+}
+
+func (r *ApiVersionsRequest) key() int16 {
+	return 18
+}
+
+func (r *ApiVersionsRequest) version() int16 {
+	return 0
+}
+
+func (r *ApiVersionsRequest) requiredVersion() KafkaVersion {
+	return V0_10_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/api_versions_response.go b/vendor/github.com/Shopify/sarama/api_versions_response.go
new file mode 100644
index 0000000..23bc326
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/api_versions_response.go

@@ -0,0 +1,87 @@
+package sarama
+
+type ApiVersionsResponseBlock struct {
+	ApiKey     int16
+	MinVersion int16
+	MaxVersion int16
+}
+
+func (b *ApiVersionsResponseBlock) encode(pe packetEncoder) error {
+	pe.putInt16(b.ApiKey)
+	pe.putInt16(b.MinVersion)
+	pe.putInt16(b.MaxVersion)
+	return nil
+}
+
+func (b *ApiVersionsResponseBlock) decode(pd packetDecoder) error {
+	var err error
+
+	if b.ApiKey, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	if b.MinVersion, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	if b.MaxVersion, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+type ApiVersionsResponse struct {
+	Err         KError
+	ApiVersions []*ApiVersionsResponseBlock
+}
+
+func (r *ApiVersionsResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+	if err := pe.putArrayLength(len(r.ApiVersions)); err != nil {
+		return err
+	}
+	for _, apiVersion := range r.ApiVersions {
+		if err := apiVersion.encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (r *ApiVersionsResponse) decode(pd packetDecoder, version int16) error {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	r.Err = KError(kerr)
+
+	numBlocks, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.ApiVersions = make([]*ApiVersionsResponseBlock, numBlocks)
+	for i := 0; i < numBlocks; i++ {
+		block := new(ApiVersionsResponseBlock)
+		if err := block.decode(pd); err != nil {
+			return err
+		}
+		r.ApiVersions[i] = block
+	}
+
+	return nil
+}
+
+func (r *ApiVersionsResponse) key() int16 {
+	return 18
+}
+
+func (r *ApiVersionsResponse) version() int16 {
+	return 0
+}
+
+func (r *ApiVersionsResponse) requiredVersion() KafkaVersion {
+	return V0_10_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/async_producer.go b/vendor/github.com/Shopify/sarama/async_producer.go
new file mode 100644
index 0000000..fe9df3b
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/async_producer.go

@@ -0,0 +1,1046 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/eapache/go-resiliency/breaker"
+	"github.com/eapache/queue"
+)
+
+// AsyncProducer publishes Kafka messages using a non-blocking API. It routes messages
+// to the correct broker for the provided topic-partition, refreshing metadata as appropriate,
+// and parses responses for errors. You must read from the Errors() channel or the
+// producer will deadlock. You must call Close() or AsyncClose() on a producer to avoid
+// leaks: it will not be garbage-collected automatically when it passes out of
+// scope.
+type AsyncProducer interface {
+
+	// AsyncClose triggers a shutdown of the producer. The shutdown has completed
+	// when both the Errors and Successes channels have been closed. When calling
+	// AsyncClose, you *must* continue to read from those channels in order to
+	// drain the results of any messages in flight.
+	AsyncClose()
+
+	// Close shuts down the producer and waits for any buffered messages to be
+	// flushed. You must call this function before a producer object passes out of
+	// scope, as it may otherwise leak memory. You must call this before calling
+	// Close on the underlying client.
+	Close() error
+
+	// Input is the input channel for the user to write messages to that they
+	// wish to send.
+	Input() chan<- *ProducerMessage
+
+	// Successes is the success output channel back to the user when Return.Successes is
+	// enabled. If Return.Successes is true, you MUST read from this channel or the
+	// Producer will deadlock. It is suggested that you send and read messages
+	// together in a single select statement.
+	Successes() <-chan *ProducerMessage
+
+	// Errors is the error output channel back to the user. You MUST read from this
+	// channel or the Producer will deadlock when the channel is full. Alternatively,
+	// you can set Producer.Return.Errors in your config to false, which prevents
+	// errors to be returned.
+	Errors() <-chan *ProducerError
+}
+
+// transactionManager keeps the state necessary to ensure idempotent production
+type transactionManager struct {
+	producerID      int64
+	producerEpoch   int16
+	sequenceNumbers map[string]int32
+	mutex           sync.Mutex
+}
+
+const (
+	noProducerID    = -1
+	noProducerEpoch = -1
+)
+
+func (t *transactionManager) getAndIncrementSequenceNumber(topic string, partition int32) int32 {
+	key := fmt.Sprintf("%s-%d", topic, partition)
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+	sequence := t.sequenceNumbers[key]
+	t.sequenceNumbers[key] = sequence + 1
+	return sequence
+}
+
+func newTransactionManager(conf *Config, client Client) (*transactionManager, error) {
+	txnmgr := &transactionManager{
+		producerID:    noProducerID,
+		producerEpoch: noProducerEpoch,
+	}
+
+	if conf.Producer.Idempotent {
+		initProducerIDResponse, err := client.InitProducerID()
+		if err != nil {
+			return nil, err
+		}
+		txnmgr.producerID = initProducerIDResponse.ProducerID
+		txnmgr.producerEpoch = initProducerIDResponse.ProducerEpoch
+		txnmgr.sequenceNumbers = make(map[string]int32)
+		txnmgr.mutex = sync.Mutex{}
+
+		Logger.Printf("Obtained a ProducerId: %d and ProducerEpoch: %d\n", txnmgr.producerID, txnmgr.producerEpoch)
+	}
+
+	return txnmgr, nil
+}
+
+type asyncProducer struct {
+	client    Client
+	conf      *Config
+	ownClient bool
+
+	errors                    chan *ProducerError
+	input, successes, retries chan *ProducerMessage
+	inFlight                  sync.WaitGroup
+
+	brokers    map[*Broker]*brokerProducer
+	brokerRefs map[*brokerProducer]int
+	brokerLock sync.Mutex
+
+	txnmgr *transactionManager
+}
+
+// NewAsyncProducer creates a new AsyncProducer using the given broker addresses and configuration.
+func NewAsyncProducer(addrs []string, conf *Config) (AsyncProducer, error) {
+	client, err := NewClient(addrs, conf)
+	if err != nil {
+		return nil, err
+	}
+
+	p, err := NewAsyncProducerFromClient(client)
+	if err != nil {
+		return nil, err
+	}
+	p.(*asyncProducer).ownClient = true
+	return p, nil
+}
+
+// NewAsyncProducerFromClient creates a new Producer using the given client. It is still
+// necessary to call Close() on the underlying client when shutting down this producer.
+func NewAsyncProducerFromClient(client Client) (AsyncProducer, error) {
+	// Check that we are not dealing with a closed Client before processing any other arguments
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	txnmgr, err := newTransactionManager(client.Config(), client)
+	if err != nil {
+		return nil, err
+	}
+
+	p := &asyncProducer{
+		client:     client,
+		conf:       client.Config(),
+		errors:     make(chan *ProducerError),
+		input:      make(chan *ProducerMessage),
+		successes:  make(chan *ProducerMessage),
+		retries:    make(chan *ProducerMessage),
+		brokers:    make(map[*Broker]*brokerProducer),
+		brokerRefs: make(map[*brokerProducer]int),
+		txnmgr:     txnmgr,
+	}
+
+	// launch our singleton dispatchers
+	go withRecover(p.dispatcher)
+	go withRecover(p.retryHandler)
+
+	return p, nil
+}
+
+type flagSet int8
+
+const (
+	syn      flagSet = 1 << iota // first message from partitionProducer to brokerProducer
+	fin                          // final message from partitionProducer to brokerProducer and back
+	shutdown                     // start the shutdown process
+)
+
+// ProducerMessage is the collection of elements passed to the Producer in order to send a message.
+type ProducerMessage struct {
+	Topic string // The Kafka topic for this message.
+	// The partitioning key for this message. Pre-existing Encoders include
+	// StringEncoder and ByteEncoder.
+	Key Encoder
+	// The actual message to store in Kafka. Pre-existing Encoders include
+	// StringEncoder and ByteEncoder.
+	Value Encoder
+
+	// The headers are key-value pairs that are transparently passed
+	// by Kafka between producers and consumers.
+	Headers []RecordHeader
+
+	// This field is used to hold arbitrary data you wish to include so it
+	// will be available when receiving on the Successes and Errors channels.
+	// Sarama completely ignores this field and is only to be used for
+	// pass-through data.
+	Metadata interface{}
+
+	// Below this point are filled in by the producer as the message is processed
+
+	// Offset is the offset of the message stored on the broker. This is only
+	// guaranteed to be defined if the message was successfully delivered and
+	// RequiredAcks is not NoResponse.
+	Offset int64
+	// Partition is the partition that the message was sent to. This is only
+	// guaranteed to be defined if the message was successfully delivered.
+	Partition int32
+	// Timestamp is the timestamp assigned to the message by the broker. This
+	// is only guaranteed to be defined if the message was successfully
+	// delivered, RequiredAcks is not NoResponse, and the Kafka broker is at
+	// least version 0.10.0.
+	Timestamp time.Time
+
+	retries        int
+	flags          flagSet
+	expectation    chan *ProducerError
+	sequenceNumber int32
+}
+
+const producerMessageOverhead = 26 // the metadata overhead of CRC, flags, etc.
+
+func (m *ProducerMessage) byteSize(version int) int {
+	var size int
+	if version >= 2 {
+		size = maximumRecordOverhead
+		for _, h := range m.Headers {
+			size += len(h.Key) + len(h.Value) + 2*binary.MaxVarintLen32
+		}
+	} else {
+		size = producerMessageOverhead
+	}
+	if m.Key != nil {
+		size += m.Key.Length()
+	}
+	if m.Value != nil {
+		size += m.Value.Length()
+	}
+	return size
+}
+
+func (m *ProducerMessage) clear() {
+	m.flags = 0
+	m.retries = 0
+}
+
+// ProducerError is the type of error generated when the producer fails to deliver a message.
+// It contains the original ProducerMessage as well as the actual error value.
+type ProducerError struct {
+	Msg *ProducerMessage
+	Err error
+}
+
+func (pe ProducerError) Error() string {
+	return fmt.Sprintf("kafka: Failed to produce message to topic %s: %s", pe.Msg.Topic, pe.Err)
+}
+
+// ProducerErrors is a type that wraps a batch of "ProducerError"s and implements the Error interface.
+// It can be returned from the Producer's Close method to avoid the need to manually drain the Errors channel
+// when closing a producer.
+type ProducerErrors []*ProducerError
+
+func (pe ProducerErrors) Error() string {
+	return fmt.Sprintf("kafka: Failed to deliver %d messages.", len(pe))
+}
+
+func (p *asyncProducer) Errors() <-chan *ProducerError {
+	return p.errors
+}
+
+func (p *asyncProducer) Successes() <-chan *ProducerMessage {
+	return p.successes
+}
+
+func (p *asyncProducer) Input() chan<- *ProducerMessage {
+	return p.input
+}
+
+func (p *asyncProducer) Close() error {
+	p.AsyncClose()
+
+	if p.conf.Producer.Return.Successes {
+		go withRecover(func() {
+			for range p.successes {
+			}
+		})
+	}
+
+	var errors ProducerErrors
+	if p.conf.Producer.Return.Errors {
+		for event := range p.errors {
+			errors = append(errors, event)
+		}
+	} else {
+		<-p.errors
+	}
+
+	if len(errors) > 0 {
+		return errors
+	}
+	return nil
+}
+
+func (p *asyncProducer) AsyncClose() {
+	go withRecover(p.shutdown)
+}
+
+// singleton
+// dispatches messages by topic
+func (p *asyncProducer) dispatcher() {
+	handlers := make(map[string]chan<- *ProducerMessage)
+	shuttingDown := false
+
+	for msg := range p.input {
+		if msg == nil {
+			Logger.Println("Something tried to send a nil message, it was ignored.")
+			continue
+		}
+
+		if msg.flags&shutdown != 0 {
+			shuttingDown = true
+			p.inFlight.Done()
+			continue
+		} else if msg.retries == 0 {
+			if shuttingDown {
+				// we can't just call returnError here because that decrements the wait group,
+				// which hasn't been incremented yet for this message, and shouldn't be
+				pErr := &ProducerError{Msg: msg, Err: ErrShuttingDown}
+				if p.conf.Producer.Return.Errors {
+					p.errors <- pErr
+				} else {
+					Logger.Println(pErr)
+				}
+				continue
+			}
+			p.inFlight.Add(1)
+		}
+
+		version := 1
+		if p.conf.Version.IsAtLeast(V0_11_0_0) {
+			version = 2
+		} else if msg.Headers != nil {
+			p.returnError(msg, ConfigurationError("Producing headers requires Kafka at least v0.11"))
+			continue
+		}
+		if msg.byteSize(version) > p.conf.Producer.MaxMessageBytes {
+			p.returnError(msg, ErrMessageSizeTooLarge)
+			continue
+		}
+
+		handler := handlers[msg.Topic]
+		if handler == nil {
+			handler = p.newTopicProducer(msg.Topic)
+			handlers[msg.Topic] = handler
+		}
+
+		handler <- msg
+	}
+
+	for _, handler := range handlers {
+		close(handler)
+	}
+}
+
+// one per topic
+// partitions messages, then dispatches them by partition
+type topicProducer struct {
+	parent *asyncProducer
+	topic  string
+	input  <-chan *ProducerMessage
+
+	breaker     *breaker.Breaker
+	handlers    map[int32]chan<- *ProducerMessage
+	partitioner Partitioner
+}
+
+func (p *asyncProducer) newTopicProducer(topic string) chan<- *ProducerMessage {
+	input := make(chan *ProducerMessage, p.conf.ChannelBufferSize)
+	tp := &topicProducer{
+		parent:      p,
+		topic:       topic,
+		input:       input,
+		breaker:     breaker.New(3, 1, 10*time.Second),
+		handlers:    make(map[int32]chan<- *ProducerMessage),
+		partitioner: p.conf.Producer.Partitioner(topic),
+	}
+	go withRecover(tp.dispatch)
+	return input
+}
+
+func (tp *topicProducer) dispatch() {
+	for msg := range tp.input {
+		if msg.retries == 0 {
+			if err := tp.partitionMessage(msg); err != nil {
+				tp.parent.returnError(msg, err)
+				continue
+			}
+		}
+		// All messages being retried (sent or not) have already had their retry count updated
+		if tp.parent.conf.Producer.Idempotent && msg.retries == 0 {
+			msg.sequenceNumber = tp.parent.txnmgr.getAndIncrementSequenceNumber(msg.Topic, msg.Partition)
+		}
+
+		handler := tp.handlers[msg.Partition]
+		if handler == nil {
+			handler = tp.parent.newPartitionProducer(msg.Topic, msg.Partition)
+			tp.handlers[msg.Partition] = handler
+		}
+
+		handler <- msg
+	}
+
+	for _, handler := range tp.handlers {
+		close(handler)
+	}
+}
+
+func (tp *topicProducer) partitionMessage(msg *ProducerMessage) error {
+	var partitions []int32
+
+	err := tp.breaker.Run(func() (err error) {
+		var requiresConsistency = false
+		if ep, ok := tp.partitioner.(DynamicConsistencyPartitioner); ok {
+			requiresConsistency = ep.MessageRequiresConsistency(msg)
+		} else {
+			requiresConsistency = tp.partitioner.RequiresConsistency()
+		}
+
+		if requiresConsistency {
+			partitions, err = tp.parent.client.Partitions(msg.Topic)
+		} else {
+			partitions, err = tp.parent.client.WritablePartitions(msg.Topic)
+		}
+		return
+	})
+
+	if err != nil {
+		return err
+	}
+
+	numPartitions := int32(len(partitions))
+
+	if numPartitions == 0 {
+		return ErrLeaderNotAvailable
+	}
+
+	choice, err := tp.partitioner.Partition(msg, numPartitions)
+
+	if err != nil {
+		return err
+	} else if choice < 0 || choice >= numPartitions {
+		return ErrInvalidPartition
+	}
+
+	msg.Partition = partitions[choice]
+
+	return nil
+}
+
+// one per partition per topic
+// dispatches messages to the appropriate broker
+// also responsible for maintaining message order during retries
+type partitionProducer struct {
+	parent    *asyncProducer
+	topic     string
+	partition int32
+	input     <-chan *ProducerMessage
+
+	leader         *Broker
+	breaker        *breaker.Breaker
+	brokerProducer *brokerProducer
+
+	// highWatermark tracks the "current" retry level, which is the only one where we actually let messages through,
+	// all other messages get buffered in retryState[msg.retries].buf to preserve ordering
+	// retryState[msg.retries].expectChaser simply tracks whether we've seen a fin message for a given level (and
+	// therefore whether our buffer is complete and safe to flush)
+	highWatermark int
+	retryState    []partitionRetryState
+}
+
+type partitionRetryState struct {
+	buf          []*ProducerMessage
+	expectChaser bool
+}
+
+func (p *asyncProducer) newPartitionProducer(topic string, partition int32) chan<- *ProducerMessage {
+	input := make(chan *ProducerMessage, p.conf.ChannelBufferSize)
+	pp := &partitionProducer{
+		parent:    p,
+		topic:     topic,
+		partition: partition,
+		input:     input,
+
+		breaker:    breaker.New(3, 1, 10*time.Second),
+		retryState: make([]partitionRetryState, p.conf.Producer.Retry.Max+1),
+	}
+	go withRecover(pp.dispatch)
+	return input
+}
+
+func (pp *partitionProducer) dispatch() {
+	// try to prefetch the leader; if this doesn't work, we'll do a proper call to `updateLeader`
+	// on the first message
+	pp.leader, _ = pp.parent.client.Leader(pp.topic, pp.partition)
+	if pp.leader != nil {
+		pp.brokerProducer = pp.parent.getBrokerProducer(pp.leader)
+		pp.parent.inFlight.Add(1) // we're generating a syn message; track it so we don't shut down while it's still inflight
+		pp.brokerProducer.input <- &ProducerMessage{Topic: pp.topic, Partition: pp.partition, flags: syn}
+	}
+
+	for msg := range pp.input {
+		if msg.retries > pp.highWatermark {
+			// a new, higher, retry level; handle it and then back off
+			pp.newHighWatermark(msg.retries)
+			time.Sleep(pp.parent.conf.Producer.Retry.Backoff)
+		} else if pp.highWatermark > 0 {
+			// we are retrying something (else highWatermark would be 0) but this message is not a *new* retry level
+			if msg.retries < pp.highWatermark {
+				// in fact this message is not even the current retry level, so buffer it for now (unless it's a just a fin)
+				if msg.flags&fin == fin {
+					pp.retryState[msg.retries].expectChaser = false
+					pp.parent.inFlight.Done() // this fin is now handled and will be garbage collected
+				} else {
+					pp.retryState[msg.retries].buf = append(pp.retryState[msg.retries].buf, msg)
+				}
+				continue
+			} else if msg.flags&fin == fin {
+				// this message is of the current retry level (msg.retries == highWatermark) and the fin flag is set,
+				// meaning this retry level is done and we can go down (at least) one level and flush that
+				pp.retryState[pp.highWatermark].expectChaser = false
+				pp.flushRetryBuffers()
+				pp.parent.inFlight.Done() // this fin is now handled and will be garbage collected
+				continue
+			}
+		}
+
+		// if we made it this far then the current msg contains real data, and can be sent to the next goroutine
+		// without breaking any of our ordering guarantees
+
+		if pp.brokerProducer == nil {
+			if err := pp.updateLeader(); err != nil {
+				pp.parent.returnError(msg, err)
+				time.Sleep(pp.parent.conf.Producer.Retry.Backoff)
+				continue
+			}
+			Logger.Printf("producer/leader/%s/%d selected broker %d\n", pp.topic, pp.partition, pp.leader.ID())
+		}
+
+		pp.brokerProducer.input <- msg
+	}
+
+	if pp.brokerProducer != nil {
+		pp.parent.unrefBrokerProducer(pp.leader, pp.brokerProducer)
+	}
+}
+
+func (pp *partitionProducer) newHighWatermark(hwm int) {
+	Logger.Printf("producer/leader/%s/%d state change to [retrying-%d]\n", pp.topic, pp.partition, hwm)
+	pp.highWatermark = hwm
+
+	// send off a fin so that we know when everything "in between" has made it
+	// back to us and we can safely flush the backlog (otherwise we risk re-ordering messages)
+	pp.retryState[pp.highWatermark].expectChaser = true
+	pp.parent.inFlight.Add(1) // we're generating a fin message; track it so we don't shut down while it's still inflight
+	pp.brokerProducer.input <- &ProducerMessage{Topic: pp.topic, Partition: pp.partition, flags: fin, retries: pp.highWatermark - 1}
+
+	// a new HWM means that our current broker selection is out of date
+	Logger.Printf("producer/leader/%s/%d abandoning broker %d\n", pp.topic, pp.partition, pp.leader.ID())
+	pp.parent.unrefBrokerProducer(pp.leader, pp.brokerProducer)
+	pp.brokerProducer = nil
+}
+
+func (pp *partitionProducer) flushRetryBuffers() {
+	Logger.Printf("producer/leader/%s/%d state change to [flushing-%d]\n", pp.topic, pp.partition, pp.highWatermark)
+	for {
+		pp.highWatermark--
+
+		if pp.brokerProducer == nil {
+			if err := pp.updateLeader(); err != nil {
+				pp.parent.returnErrors(pp.retryState[pp.highWatermark].buf, err)
+				goto flushDone
+			}
+			Logger.Printf("producer/leader/%s/%d selected broker %d\n", pp.topic, pp.partition, pp.leader.ID())
+		}
+
+		for _, msg := range pp.retryState[pp.highWatermark].buf {
+			pp.brokerProducer.input <- msg
+		}
+
+	flushDone:
+		pp.retryState[pp.highWatermark].buf = nil
+		if pp.retryState[pp.highWatermark].expectChaser {
+			Logger.Printf("producer/leader/%s/%d state change to [retrying-%d]\n", pp.topic, pp.partition, pp.highWatermark)
+			break
+		} else if pp.highWatermark == 0 {
+			Logger.Printf("producer/leader/%s/%d state change to [normal]\n", pp.topic, pp.partition)
+			break
+		}
+	}
+}
+
+func (pp *partitionProducer) updateLeader() error {
+	return pp.breaker.Run(func() (err error) {
+		if err = pp.parent.client.RefreshMetadata(pp.topic); err != nil {
+			return err
+		}
+
+		if pp.leader, err = pp.parent.client.Leader(pp.topic, pp.partition); err != nil {
+			return err
+		}
+
+		pp.brokerProducer = pp.parent.getBrokerProducer(pp.leader)
+		pp.parent.inFlight.Add(1) // we're generating a syn message; track it so we don't shut down while it's still inflight
+		pp.brokerProducer.input <- &ProducerMessage{Topic: pp.topic, Partition: pp.partition, flags: syn}
+
+		return nil
+	})
+}
+
+// one per broker; also constructs an associated flusher
+func (p *asyncProducer) newBrokerProducer(broker *Broker) *brokerProducer {
+	var (
+		input     = make(chan *ProducerMessage)
+		bridge    = make(chan *produceSet)
+		responses = make(chan *brokerProducerResponse)
+	)
+
+	bp := &brokerProducer{
+		parent:         p,
+		broker:         broker,
+		input:          input,
+		output:         bridge,
+		responses:      responses,
+		buffer:         newProduceSet(p),
+		currentRetries: make(map[string]map[int32]error),
+	}
+	go withRecover(bp.run)
+
+	// minimal bridge to make the network response `select`able
+	go withRecover(func() {
+		for set := range bridge {
+			request := set.buildRequest()
+
+			response, err := broker.Produce(request)
+
+			responses <- &brokerProducerResponse{
+				set: set,
+				err: err,
+				res: response,
+			}
+		}
+		close(responses)
+	})
+
+	return bp
+}
+
+type brokerProducerResponse struct {
+	set *produceSet
+	err error
+	res *ProduceResponse
+}
+
+// groups messages together into appropriately-sized batches for sending to the broker
+// handles state related to retries etc
+type brokerProducer struct {
+	parent *asyncProducer
+	broker *Broker
+
+	input     chan *ProducerMessage
+	output    chan<- *produceSet
+	responses <-chan *brokerProducerResponse
+
+	buffer     *produceSet
+	timer      <-chan time.Time
+	timerFired bool
+
+	closing        error
+	currentRetries map[string]map[int32]error
+}
+
+func (bp *brokerProducer) run() {
+	var output chan<- *produceSet
+	Logger.Printf("producer/broker/%d starting up\n", bp.broker.ID())
+
+	for {
+		select {
+		case msg := <-bp.input:
+			if msg == nil {
+				bp.shutdown()
+				return
+			}
+
+			if msg.flags&syn == syn {
+				Logger.Printf("producer/broker/%d state change to [open] on %s/%d\n",
+					bp.broker.ID(), msg.Topic, msg.Partition)
+				if bp.currentRetries[msg.Topic] == nil {
+					bp.currentRetries[msg.Topic] = make(map[int32]error)
+				}
+				bp.currentRetries[msg.Topic][msg.Partition] = nil
+				bp.parent.inFlight.Done()
+				continue
+			}
+
+			if reason := bp.needsRetry(msg); reason != nil {
+				bp.parent.retryMessage(msg, reason)
+
+				if bp.closing == nil && msg.flags&fin == fin {
+					// we were retrying this partition but we can start processing again
+					delete(bp.currentRetries[msg.Topic], msg.Partition)
+					Logger.Printf("producer/broker/%d state change to [closed] on %s/%d\n",
+						bp.broker.ID(), msg.Topic, msg.Partition)
+				}
+
+				continue
+			}
+
+			if bp.buffer.wouldOverflow(msg) {
+				if err := bp.waitForSpace(msg); err != nil {
+					bp.parent.retryMessage(msg, err)
+					continue
+				}
+			}
+
+			if err := bp.buffer.add(msg); err != nil {
+				bp.parent.returnError(msg, err)
+				continue
+			}
+
+			if bp.parent.conf.Producer.Flush.Frequency > 0 && bp.timer == nil {
+				bp.timer = time.After(bp.parent.conf.Producer.Flush.Frequency)
+			}
+		case <-bp.timer:
+			bp.timerFired = true
+		case output <- bp.buffer:
+			bp.rollOver()
+		case response := <-bp.responses:
+			bp.handleResponse(response)
+		}
+
+		if bp.timerFired || bp.buffer.readyToFlush() {
+			output = bp.output
+		} else {
+			output = nil
+		}
+	}
+}
+
+func (bp *brokerProducer) shutdown() {
+	for !bp.buffer.empty() {
+		select {
+		case response := <-bp.responses:
+			bp.handleResponse(response)
+		case bp.output <- bp.buffer:
+			bp.rollOver()
+		}
+	}
+	close(bp.output)
+	for response := range bp.responses {
+		bp.handleResponse(response)
+	}
+
+	Logger.Printf("producer/broker/%d shut down\n", bp.broker.ID())
+}
+
+func (bp *brokerProducer) needsRetry(msg *ProducerMessage) error {
+	if bp.closing != nil {
+		return bp.closing
+	}
+
+	return bp.currentRetries[msg.Topic][msg.Partition]
+}
+
+func (bp *brokerProducer) waitForSpace(msg *ProducerMessage) error {
+	Logger.Printf("producer/broker/%d maximum request accumulated, waiting for space\n", bp.broker.ID())
+
+	for {
+		select {
+		case response := <-bp.responses:
+			bp.handleResponse(response)
+			// handling a response can change our state, so re-check some things
+			if reason := bp.needsRetry(msg); reason != nil {
+				return reason
+			} else if !bp.buffer.wouldOverflow(msg) {
+				return nil
+			}
+		case bp.output <- bp.buffer:
+			bp.rollOver()
+			return nil
+		}
+	}
+}
+
+func (bp *brokerProducer) rollOver() {
+	bp.timer = nil
+	bp.timerFired = false
+	bp.buffer = newProduceSet(bp.parent)
+}
+
+func (bp *brokerProducer) handleResponse(response *brokerProducerResponse) {
+	if response.err != nil {
+		bp.handleError(response.set, response.err)
+	} else {
+		bp.handleSuccess(response.set, response.res)
+	}
+
+	if bp.buffer.empty() {
+		bp.rollOver() // this can happen if the response invalidated our buffer
+	}
+}
+
+func (bp *brokerProducer) handleSuccess(sent *produceSet, response *ProduceResponse) {
+	// we iterate through the blocks in the request set, not the response, so that we notice
+	// if the response is missing a block completely
+	var retryTopics []string
+	sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+		if response == nil {
+			// this only happens when RequiredAcks is NoResponse, so we have to assume success
+			bp.parent.returnSuccesses(pSet.msgs)
+			return
+		}
+
+		block := response.GetBlock(topic, partition)
+		if block == nil {
+			bp.parent.returnErrors(pSet.msgs, ErrIncompleteResponse)
+			return
+		}
+
+		switch block.Err {
+		// Success
+		case ErrNoError:
+			if bp.parent.conf.Version.IsAtLeast(V0_10_0_0) && !block.Timestamp.IsZero() {
+				for _, msg := range pSet.msgs {
+					msg.Timestamp = block.Timestamp
+				}
+			}
+			for i, msg := range pSet.msgs {
+				msg.Offset = block.Offset + int64(i)
+			}
+			bp.parent.returnSuccesses(pSet.msgs)
+		// Duplicate
+		case ErrDuplicateSequenceNumber:
+			bp.parent.returnSuccesses(pSet.msgs)
+		// Retriable errors
+		case ErrInvalidMessage, ErrUnknownTopicOrPartition, ErrLeaderNotAvailable, ErrNotLeaderForPartition,
+			ErrRequestTimedOut, ErrNotEnoughReplicas, ErrNotEnoughReplicasAfterAppend:
+			retryTopics = append(retryTopics, topic)
+		// Other non-retriable errors
+		default:
+			bp.parent.returnErrors(pSet.msgs, block.Err)
+		}
+	})
+
+	if len(retryTopics) > 0 {
+		err := bp.parent.client.RefreshMetadata(retryTopics...)
+		if err != nil {
+			Logger.Printf("Failed refreshing metadata because of %v\n", err)
+		}
+
+		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+			block := response.GetBlock(topic, partition)
+			if block == nil {
+				// handled in the previous "eachPartition" loop
+				return
+			}
+
+			switch block.Err {
+			case ErrInvalidMessage, ErrUnknownTopicOrPartition, ErrLeaderNotAvailable, ErrNotLeaderForPartition,
+				ErrRequestTimedOut, ErrNotEnoughReplicas, ErrNotEnoughReplicasAfterAppend:
+				Logger.Printf("producer/broker/%d state change to [retrying] on %s/%d because %v\n",
+					bp.broker.ID(), topic, partition, block.Err)
+				if bp.currentRetries[topic] == nil {
+					bp.currentRetries[topic] = make(map[int32]error)
+				}
+				bp.currentRetries[topic][partition] = block.Err
+				// dropping the following messages has the side effect of incrementing their retry count
+				bp.parent.retryMessages(bp.buffer.dropPartition(topic, partition), block.Err)
+				bp.parent.retryBatch(topic, partition, pSet, block.Err)
+			}
+		})
+	}
+}
+
+func (p *asyncProducer) retryBatch(topic string, partition int32, pSet *partitionSet, kerr KError) {
+	Logger.Printf("Retrying batch for %v-%d because of %s\n", topic, partition, kerr)
+	produceSet := newProduceSet(p)
+	produceSet.msgs[topic] = make(map[int32]*partitionSet)
+	produceSet.msgs[topic][partition] = pSet
+	produceSet.bufferBytes += pSet.bufferBytes
+	produceSet.bufferCount += len(pSet.msgs)
+	for _, msg := range pSet.msgs {
+		if msg.retries >= p.conf.Producer.Retry.Max {
+			p.returnError(msg, kerr)
+			return
+		}
+		msg.retries++
+	}
+
+	// it's expected that a metadata refresh has been requested prior to calling retryBatch
+	leader, err := p.client.Leader(topic, partition)
+	if err != nil {
+		Logger.Printf("Failed retrying batch for %v-%d because of %v while looking up for new leader\n", topic, partition, err)
+		for _, msg := range pSet.msgs {
+			p.returnError(msg, kerr)
+		}
+		return
+	}
+	bp := p.getBrokerProducer(leader)
+	bp.output <- produceSet
+}
+
+func (bp *brokerProducer) handleError(sent *produceSet, err error) {
+	switch err.(type) {
+	case PacketEncodingError:
+		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+			bp.parent.returnErrors(pSet.msgs, err)
+		})
+	default:
+		Logger.Printf("producer/broker/%d state change to [closing] because %s\n", bp.broker.ID(), err)
+		bp.parent.abandonBrokerConnection(bp.broker)
+		_ = bp.broker.Close()
+		bp.closing = err
+		sent.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+			bp.parent.retryMessages(pSet.msgs, err)
+		})
+		bp.buffer.eachPartition(func(topic string, partition int32, pSet *partitionSet) {
+			bp.parent.retryMessages(pSet.msgs, err)
+		})
+		bp.rollOver()
+	}
+}
+
+// singleton
+// effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock
+// based on https://godoc.org/github.com/eapache/channels#InfiniteChannel
+func (p *asyncProducer) retryHandler() {
+	var msg *ProducerMessage
+	buf := queue.New()
+
+	for {
+		if buf.Length() == 0 {
+			msg = <-p.retries
+		} else {
+			select {
+			case msg = <-p.retries:
+			case p.input <- buf.Peek().(*ProducerMessage):
+				buf.Remove()
+				continue
+			}
+		}
+
+		if msg == nil {
+			return
+		}
+
+		buf.Add(msg)
+	}
+}
+
+// utility functions
+
+func (p *asyncProducer) shutdown() {
+	Logger.Println("Producer shutting down.")
+	p.inFlight.Add(1)
+	p.input <- &ProducerMessage{flags: shutdown}
+
+	p.inFlight.Wait()
+
+	if p.ownClient {
+		err := p.client.Close()
+		if err != nil {
+			Logger.Println("producer/shutdown failed to close the embedded client:", err)
+		}
+	}
+
+	close(p.input)
+	close(p.retries)
+	close(p.errors)
+	close(p.successes)
+}
+
+func (p *asyncProducer) returnError(msg *ProducerMessage, err error) {
+	msg.clear()
+	pErr := &ProducerError{Msg: msg, Err: err}
+	if p.conf.Producer.Return.Errors {
+		p.errors <- pErr
+	} else {
+		Logger.Println(pErr)
+	}
+	p.inFlight.Done()
+}
+
+func (p *asyncProducer) returnErrors(batch []*ProducerMessage, err error) {
+	for _, msg := range batch {
+		p.returnError(msg, err)
+	}
+}
+
+func (p *asyncProducer) returnSuccesses(batch []*ProducerMessage) {
+	for _, msg := range batch {
+		if p.conf.Producer.Return.Successes {
+			msg.clear()
+			p.successes <- msg
+		}
+		p.inFlight.Done()
+	}
+}
+
+func (p *asyncProducer) retryMessage(msg *ProducerMessage, err error) {
+	if msg.retries >= p.conf.Producer.Retry.Max {
+		p.returnError(msg, err)
+	} else {
+		msg.retries++
+		p.retries <- msg
+	}
+}
+
+func (p *asyncProducer) retryMessages(batch []*ProducerMessage, err error) {
+	for _, msg := range batch {
+		p.retryMessage(msg, err)
+	}
+}
+
+func (p *asyncProducer) getBrokerProducer(broker *Broker) *brokerProducer {
+	p.brokerLock.Lock()
+	defer p.brokerLock.Unlock()
+
+	bp := p.brokers[broker]
+
+	if bp == nil {
+		bp = p.newBrokerProducer(broker)
+		p.brokers[broker] = bp
+		p.brokerRefs[bp] = 0
+	}
+
+	p.brokerRefs[bp]++
+
+	return bp
+}
+
+func (p *asyncProducer) unrefBrokerProducer(broker *Broker, bp *brokerProducer) {
+	p.brokerLock.Lock()
+	defer p.brokerLock.Unlock()
+
+	p.brokerRefs[bp]--
+	if p.brokerRefs[bp] == 0 {
+		close(bp.input)
+		delete(p.brokerRefs, bp)
+
+		if p.brokers[broker] == bp {
+			delete(p.brokers, broker)
+		}
+	}
+}
+
+func (p *asyncProducer) abandonBrokerConnection(broker *Broker) {
+	p.brokerLock.Lock()
+	defer p.brokerLock.Unlock()
+
+	delete(p.brokers, broker)
+}

diff --git a/vendor/github.com/Shopify/sarama/balance_strategy.go b/vendor/github.com/Shopify/sarama/balance_strategy.go
new file mode 100644
index 0000000..e78988d
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/balance_strategy.go

@@ -0,0 +1,129 @@
+package sarama
+
+import (
+	"math"
+	"sort"
+)
+
+// BalanceStrategyPlan is the results of any BalanceStrategy.Plan attempt.
+// It contains an allocation of topic/partitions by memberID in the form of
+// a `memberID -> topic -> partitions` map.
+type BalanceStrategyPlan map[string]map[string][]int32
+
+// Add assigns a topic with a number partitions to a member.
+func (p BalanceStrategyPlan) Add(memberID, topic string, partitions ...int32) {
+	if len(partitions) == 0 {
+		return
+	}
+	if _, ok := p[memberID]; !ok {
+		p[memberID] = make(map[string][]int32, 1)
+	}
+	p[memberID][topic] = append(p[memberID][topic], partitions...)
+}
+
+// --------------------------------------------------------------------
+
+// BalanceStrategy is used to balance topics and partitions
+// across memebers of a consumer group
+type BalanceStrategy interface {
+	// Name uniquely identifies the strategy.
+	Name() string
+
+	// Plan accepts a map of `memberID -> metadata` and a map of `topic -> partitions`
+	// and returns a distribution plan.
+	Plan(members map[string]ConsumerGroupMemberMetadata, topics map[string][]int32) (BalanceStrategyPlan, error)
+}
+
+// --------------------------------------------------------------------
+
+// BalanceStrategyRange is the default and assigns partitions as ranges to consumer group members.
+// Example with one topic T with six partitions (0..5) and two members (M1, M2):
+//   M1: {T: [0, 1, 2]}
+//   M2: {T: [3, 4, 5]}
+var BalanceStrategyRange = &balanceStrategy{
+	name: "range",
+	coreFn: func(plan BalanceStrategyPlan, memberIDs []string, topic string, partitions []int32) {
+		step := float64(len(partitions)) / float64(len(memberIDs))
+
+		for i, memberID := range memberIDs {
+			pos := float64(i)
+			min := int(math.Floor(pos*step + 0.5))
+			max := int(math.Floor((pos+1)*step + 0.5))
+			plan.Add(memberID, topic, partitions[min:max]...)
+		}
+	},
+}
+
+// BalanceStrategyRoundRobin assigns partitions to members in alternating order.
+// Example with topic T with six partitions (0..5) and two members (M1, M2):
+//   M1: {T: [0, 2, 4]}
+//   M2: {T: [1, 3, 5]}
+var BalanceStrategyRoundRobin = &balanceStrategy{
+	name: "roundrobin",
+	coreFn: func(plan BalanceStrategyPlan, memberIDs []string, topic string, partitions []int32) {
+		for i, part := range partitions {
+			memberID := memberIDs[i%len(memberIDs)]
+			plan.Add(memberID, topic, part)
+		}
+	},
+}
+
+// --------------------------------------------------------------------
+
+type balanceStrategy struct {
+	name   string
+	coreFn func(plan BalanceStrategyPlan, memberIDs []string, topic string, partitions []int32)
+}
+
+// Name implements BalanceStrategy.
+func (s *balanceStrategy) Name() string { return s.name }
+
+// Balance implements BalanceStrategy.
+func (s *balanceStrategy) Plan(members map[string]ConsumerGroupMemberMetadata, topics map[string][]int32) (BalanceStrategyPlan, error) {
+	// Build members by topic map
+	mbt := make(map[string][]string)
+	for memberID, meta := range members {
+		for _, topic := range meta.Topics {
+			mbt[topic] = append(mbt[topic], memberID)
+		}
+	}
+
+	// Sort members for each topic
+	for topic, memberIDs := range mbt {
+		sort.Sort(&balanceStrategySortable{
+			topic:     topic,
+			memberIDs: memberIDs,
+		})
+	}
+
+	// Assemble plan
+	plan := make(BalanceStrategyPlan, len(members))
+	for topic, memberIDs := range mbt {
+		s.coreFn(plan, memberIDs, topic, topics[topic])
+	}
+	return plan, nil
+}
+
+type balanceStrategySortable struct {
+	topic     string
+	memberIDs []string
+}
+
+func (p balanceStrategySortable) Len() int { return len(p.memberIDs) }
+func (p balanceStrategySortable) Swap(i, j int) {
+	p.memberIDs[i], p.memberIDs[j] = p.memberIDs[j], p.memberIDs[i]
+}
+func (p balanceStrategySortable) Less(i, j int) bool {
+	return balanceStrategyHashValue(p.topic, p.memberIDs[i]) < balanceStrategyHashValue(p.topic, p.memberIDs[j])
+}
+
+func balanceStrategyHashValue(vv ...string) uint32 {
+	h := uint32(2166136261)
+	for _, s := range vv {
+		for _, c := range s {
+			h ^= uint32(c)
+			h *= 16777619
+		}
+	}
+	return h
+}

diff --git a/vendor/github.com/Shopify/sarama/broker.go b/vendor/github.com/Shopify/sarama/broker.go
new file mode 100644
index 0000000..6a33b80
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/broker.go

@@ -0,0 +1,895 @@
+package sarama
+
+import (
+	"crypto/tls"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"net"
+	"strconv"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+// Broker represents a single Kafka broker connection. All operations on this object are entirely concurrency-safe.
+type Broker struct {
+	id   int32
+	addr string
+	rack *string
+
+	conf          *Config
+	correlationID int32
+	conn          net.Conn
+	connErr       error
+	lock          sync.Mutex
+	opened        int32
+
+	responses chan responsePromise
+	done      chan bool
+
+	incomingByteRate       metrics.Meter
+	requestRate            metrics.Meter
+	requestSize            metrics.Histogram
+	requestLatency         metrics.Histogram
+	outgoingByteRate       metrics.Meter
+	responseRate           metrics.Meter
+	responseSize           metrics.Histogram
+	brokerIncomingByteRate metrics.Meter
+	brokerRequestRate      metrics.Meter
+	brokerRequestSize      metrics.Histogram
+	brokerRequestLatency   metrics.Histogram
+	brokerOutgoingByteRate metrics.Meter
+	brokerResponseRate     metrics.Meter
+	brokerResponseSize     metrics.Histogram
+}
+
+type responsePromise struct {
+	requestTime   time.Time
+	correlationID int32
+	packets       chan []byte
+	errors        chan error
+}
+
+// NewBroker creates and returns a Broker targeting the given host:port address.
+// This does not attempt to actually connect, you have to call Open() for that.
+func NewBroker(addr string) *Broker {
+	return &Broker{id: -1, addr: addr}
+}
+
+// Open tries to connect to the Broker if it is not already connected or connecting, but does not block
+// waiting for the connection to complete. This means that any subsequent operations on the broker will
+// block waiting for the connection to succeed or fail. To get the effect of a fully synchronous Open call,
+// follow it by a call to Connected(). The only errors Open will return directly are ConfigurationError or
+// AlreadyConnected. If conf is nil, the result of NewConfig() is used.
+func (b *Broker) Open(conf *Config) error {
+	if !atomic.CompareAndSwapInt32(&b.opened, 0, 1) {
+		return ErrAlreadyConnected
+	}
+
+	if conf == nil {
+		conf = NewConfig()
+	}
+
+	err := conf.Validate()
+	if err != nil {
+		return err
+	}
+
+	b.lock.Lock()
+
+	go withRecover(func() {
+		defer b.lock.Unlock()
+
+		dialer := net.Dialer{
+			Timeout:   conf.Net.DialTimeout,
+			KeepAlive: conf.Net.KeepAlive,
+			LocalAddr: conf.Net.LocalAddr,
+		}
+
+		if conf.Net.TLS.Enable {
+			b.conn, b.connErr = tls.DialWithDialer(&dialer, "tcp", b.addr, conf.Net.TLS.Config)
+		} else {
+			b.conn, b.connErr = dialer.Dial("tcp", b.addr)
+		}
+		if b.connErr != nil {
+			Logger.Printf("Failed to connect to broker %s: %s\n", b.addr, b.connErr)
+			b.conn = nil
+			atomic.StoreInt32(&b.opened, 0)
+			return
+		}
+		b.conn = newBufConn(b.conn)
+
+		b.conf = conf
+
+		// Create or reuse the global metrics shared between brokers
+		b.incomingByteRate = metrics.GetOrRegisterMeter("incoming-byte-rate", conf.MetricRegistry)
+		b.requestRate = metrics.GetOrRegisterMeter("request-rate", conf.MetricRegistry)
+		b.requestSize = getOrRegisterHistogram("request-size", conf.MetricRegistry)
+		b.requestLatency = getOrRegisterHistogram("request-latency-in-ms", conf.MetricRegistry)
+		b.outgoingByteRate = metrics.GetOrRegisterMeter("outgoing-byte-rate", conf.MetricRegistry)
+		b.responseRate = metrics.GetOrRegisterMeter("response-rate", conf.MetricRegistry)
+		b.responseSize = getOrRegisterHistogram("response-size", conf.MetricRegistry)
+		// Do not gather metrics for seeded broker (only used during bootstrap) because they share
+		// the same id (-1) and are already exposed through the global metrics above
+		if b.id >= 0 {
+			b.brokerIncomingByteRate = getOrRegisterBrokerMeter("incoming-byte-rate", b, conf.MetricRegistry)
+			b.brokerRequestRate = getOrRegisterBrokerMeter("request-rate", b, conf.MetricRegistry)
+			b.brokerRequestSize = getOrRegisterBrokerHistogram("request-size", b, conf.MetricRegistry)
+			b.brokerRequestLatency = getOrRegisterBrokerHistogram("request-latency-in-ms", b, conf.MetricRegistry)
+			b.brokerOutgoingByteRate = getOrRegisterBrokerMeter("outgoing-byte-rate", b, conf.MetricRegistry)
+			b.brokerResponseRate = getOrRegisterBrokerMeter("response-rate", b, conf.MetricRegistry)
+			b.brokerResponseSize = getOrRegisterBrokerHistogram("response-size", b, conf.MetricRegistry)
+		}
+
+		if conf.Net.SASL.Enable {
+			b.connErr = b.sendAndReceiveSASLPlainAuth()
+			if b.connErr != nil {
+				err = b.conn.Close()
+				if err == nil {
+					Logger.Printf("Closed connection to broker %s\n", b.addr)
+				} else {
+					Logger.Printf("Error while closing connection to broker %s: %s\n", b.addr, err)
+				}
+				b.conn = nil
+				atomic.StoreInt32(&b.opened, 0)
+				return
+			}
+		}
+
+		b.done = make(chan bool)
+		b.responses = make(chan responsePromise, b.conf.Net.MaxOpenRequests-1)
+
+		if b.id >= 0 {
+			Logger.Printf("Connected to broker at %s (registered as #%d)\n", b.addr, b.id)
+		} else {
+			Logger.Printf("Connected to broker at %s (unregistered)\n", b.addr)
+		}
+		go withRecover(b.responseReceiver)
+	})
+
+	return nil
+}
+
+// Connected returns true if the broker is connected and false otherwise. If the broker is not
+// connected but it had tried to connect, the error from that connection attempt is also returned.
+func (b *Broker) Connected() (bool, error) {
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	return b.conn != nil, b.connErr
+}
+
+func (b *Broker) Close() error {
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	if b.conn == nil {
+		return ErrNotConnected
+	}
+
+	close(b.responses)
+	<-b.done
+
+	err := b.conn.Close()
+
+	b.conn = nil
+	b.connErr = nil
+	b.done = nil
+	b.responses = nil
+
+	if b.id >= 0 {
+		b.conf.MetricRegistry.Unregister(getMetricNameForBroker("incoming-byte-rate", b))
+		b.conf.MetricRegistry.Unregister(getMetricNameForBroker("request-rate", b))
+		b.conf.MetricRegistry.Unregister(getMetricNameForBroker("outgoing-byte-rate", b))
+		b.conf.MetricRegistry.Unregister(getMetricNameForBroker("response-rate", b))
+	}
+
+	if err == nil {
+		Logger.Printf("Closed connection to broker %s\n", b.addr)
+	} else {
+		Logger.Printf("Error while closing connection to broker %s: %s\n", b.addr, err)
+	}
+
+	atomic.StoreInt32(&b.opened, 0)
+
+	return err
+}
+
+// ID returns the broker ID retrieved from Kafka's metadata, or -1 if that is not known.
+func (b *Broker) ID() int32 {
+	return b.id
+}
+
+// Addr returns the broker address as either retrieved from Kafka's metadata or passed to NewBroker.
+func (b *Broker) Addr() string {
+	return b.addr
+}
+
+// Rack returns the broker's rack as retrieved from Kafka's metadata or the
+// empty string if it is not known.  The returned value corresponds to the
+// broker's broker.rack configuration setting.  Requires protocol version to be
+// at least v0.10.0.0.
+func (b *Broker) Rack() string {
+	if b.rack == nil {
+		return ""
+	}
+	return *b.rack
+}
+
+func (b *Broker) GetMetadata(request *MetadataRequest) (*MetadataResponse, error) {
+	response := new(MetadataResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) GetConsumerMetadata(request *ConsumerMetadataRequest) (*ConsumerMetadataResponse, error) {
+	response := new(ConsumerMetadataResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) FindCoordinator(request *FindCoordinatorRequest) (*FindCoordinatorResponse, error) {
+	response := new(FindCoordinatorResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) GetAvailableOffsets(request *OffsetRequest) (*OffsetResponse, error) {
+	response := new(OffsetResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) Produce(request *ProduceRequest) (*ProduceResponse, error) {
+	var response *ProduceResponse
+	var err error
+
+	if request.RequiredAcks == NoResponse {
+		err = b.sendAndReceive(request, nil)
+	} else {
+		response = new(ProduceResponse)
+		err = b.sendAndReceive(request, response)
+	}
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) Fetch(request *FetchRequest) (*FetchResponse, error) {
+	response := new(FetchResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) CommitOffset(request *OffsetCommitRequest) (*OffsetCommitResponse, error) {
+	response := new(OffsetCommitResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) FetchOffset(request *OffsetFetchRequest) (*OffsetFetchResponse, error) {
+	response := new(OffsetFetchResponse)
+
+	err := b.sendAndReceive(request, response)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) JoinGroup(request *JoinGroupRequest) (*JoinGroupResponse, error) {
+	response := new(JoinGroupResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) SyncGroup(request *SyncGroupRequest) (*SyncGroupResponse, error) {
+	response := new(SyncGroupResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) LeaveGroup(request *LeaveGroupRequest) (*LeaveGroupResponse, error) {
+	response := new(LeaveGroupResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) Heartbeat(request *HeartbeatRequest) (*HeartbeatResponse, error) {
+	response := new(HeartbeatResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) ListGroups(request *ListGroupsRequest) (*ListGroupsResponse, error) {
+	response := new(ListGroupsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DescribeGroups(request *DescribeGroupsRequest) (*DescribeGroupsResponse, error) {
+	response := new(DescribeGroupsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) ApiVersions(request *ApiVersionsRequest) (*ApiVersionsResponse, error) {
+	response := new(ApiVersionsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) CreateTopics(request *CreateTopicsRequest) (*CreateTopicsResponse, error) {
+	response := new(CreateTopicsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DeleteTopics(request *DeleteTopicsRequest) (*DeleteTopicsResponse, error) {
+	response := new(DeleteTopicsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) CreatePartitions(request *CreatePartitionsRequest) (*CreatePartitionsResponse, error) {
+	response := new(CreatePartitionsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DeleteRecords(request *DeleteRecordsRequest) (*DeleteRecordsResponse, error) {
+	response := new(DeleteRecordsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DescribeAcls(request *DescribeAclsRequest) (*DescribeAclsResponse, error) {
+	response := new(DescribeAclsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) CreateAcls(request *CreateAclsRequest) (*CreateAclsResponse, error) {
+	response := new(CreateAclsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DeleteAcls(request *DeleteAclsRequest) (*DeleteAclsResponse, error) {
+	response := new(DeleteAclsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) InitProducerID(request *InitProducerIDRequest) (*InitProducerIDResponse, error) {
+	response := new(InitProducerIDResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) AddPartitionsToTxn(request *AddPartitionsToTxnRequest) (*AddPartitionsToTxnResponse, error) {
+	response := new(AddPartitionsToTxnResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) AddOffsetsToTxn(request *AddOffsetsToTxnRequest) (*AddOffsetsToTxnResponse, error) {
+	response := new(AddOffsetsToTxnResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) EndTxn(request *EndTxnRequest) (*EndTxnResponse, error) {
+	response := new(EndTxnResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) TxnOffsetCommit(request *TxnOffsetCommitRequest) (*TxnOffsetCommitResponse, error) {
+	response := new(TxnOffsetCommitResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DescribeConfigs(request *DescribeConfigsRequest) (*DescribeConfigsResponse, error) {
+	response := new(DescribeConfigsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) AlterConfigs(request *AlterConfigsRequest) (*AlterConfigsResponse, error) {
+	response := new(AlterConfigsResponse)
+
+	err := b.sendAndReceive(request, response)
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) DeleteGroups(request *DeleteGroupsRequest) (*DeleteGroupsResponse, error) {
+	response := new(DeleteGroupsResponse)
+
+	if err := b.sendAndReceive(request, response); err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
+func (b *Broker) send(rb protocolBody, promiseResponse bool) (*responsePromise, error) {
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	if b.conn == nil {
+		if b.connErr != nil {
+			return nil, b.connErr
+		}
+		return nil, ErrNotConnected
+	}
+
+	if !b.conf.Version.IsAtLeast(rb.requiredVersion()) {
+		return nil, ErrUnsupportedVersion
+	}
+
+	req := &request{correlationID: b.correlationID, clientID: b.conf.ClientID, body: rb}
+	buf, err := encode(req, b.conf.MetricRegistry)
+	if err != nil {
+		return nil, err
+	}
+
+	err = b.conn.SetWriteDeadline(time.Now().Add(b.conf.Net.WriteTimeout))
+	if err != nil {
+		return nil, err
+	}
+
+	requestTime := time.Now()
+	bytes, err := b.conn.Write(buf)
+	b.updateOutgoingCommunicationMetrics(bytes)
+	if err != nil {
+		return nil, err
+	}
+	b.correlationID++
+
+	if !promiseResponse {
+		// Record request latency without the response
+		b.updateRequestLatencyMetrics(time.Since(requestTime))
+		return nil, nil
+	}
+
+	promise := responsePromise{requestTime, req.correlationID, make(chan []byte), make(chan error)}
+	b.responses <- promise
+
+	return &promise, nil
+}
+
+func (b *Broker) sendAndReceive(req protocolBody, res versionedDecoder) error {
+	promise, err := b.send(req, res != nil)
+
+	if err != nil {
+		return err
+	}
+
+	if promise == nil {
+		return nil
+	}
+
+	select {
+	case buf := <-promise.packets:
+		return versionedDecode(buf, res, req.version())
+	case err = <-promise.errors:
+		return err
+	}
+}
+
+func (b *Broker) decode(pd packetDecoder, version int16) (err error) {
+	b.id, err = pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	host, err := pd.getString()
+	if err != nil {
+		return err
+	}
+
+	port, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	if version >= 1 {
+		b.rack, err = pd.getNullableString()
+		if err != nil {
+			return err
+		}
+	}
+
+	b.addr = net.JoinHostPort(host, fmt.Sprint(port))
+	if _, _, err := net.SplitHostPort(b.addr); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (b *Broker) encode(pe packetEncoder, version int16) (err error) {
+
+	host, portstr, err := net.SplitHostPort(b.addr)
+	if err != nil {
+		return err
+	}
+	port, err := strconv.Atoi(portstr)
+	if err != nil {
+		return err
+	}
+
+	pe.putInt32(b.id)
+
+	err = pe.putString(host)
+	if err != nil {
+		return err
+	}
+
+	pe.putInt32(int32(port))
+
+	if version >= 1 {
+		err = pe.putNullableString(b.rack)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (b *Broker) responseReceiver() {
+	var dead error
+	header := make([]byte, 8)
+	for response := range b.responses {
+		if dead != nil {
+			response.errors <- dead
+			continue
+		}
+
+		err := b.conn.SetReadDeadline(time.Now().Add(b.conf.Net.ReadTimeout))
+		if err != nil {
+			dead = err
+			response.errors <- err
+			continue
+		}
+
+		bytesReadHeader, err := io.ReadFull(b.conn, header)
+		requestLatency := time.Since(response.requestTime)
+		if err != nil {
+			b.updateIncomingCommunicationMetrics(bytesReadHeader, requestLatency)
+			dead = err
+			response.errors <- err
+			continue
+		}
+
+		decodedHeader := responseHeader{}
+		err = decode(header, &decodedHeader)
+		if err != nil {
+			b.updateIncomingCommunicationMetrics(bytesReadHeader, requestLatency)
+			dead = err
+			response.errors <- err
+			continue
+		}
+		if decodedHeader.correlationID != response.correlationID {
+			b.updateIncomingCommunicationMetrics(bytesReadHeader, requestLatency)
+			// TODO if decoded ID < cur ID, discard until we catch up
+			// TODO if decoded ID > cur ID, save it so when cur ID catches up we have a response
+			dead = PacketDecodingError{fmt.Sprintf("correlation ID didn't match, wanted %d, got %d", response.correlationID, decodedHeader.correlationID)}
+			response.errors <- dead
+			continue
+		}
+
+		buf := make([]byte, decodedHeader.length-4)
+		bytesReadBody, err := io.ReadFull(b.conn, buf)
+		b.updateIncomingCommunicationMetrics(bytesReadHeader+bytesReadBody, requestLatency)
+		if err != nil {
+			dead = err
+			response.errors <- err
+			continue
+		}
+
+		response.packets <- buf
+	}
+	close(b.done)
+}
+
+func (b *Broker) sendAndReceiveSASLPlainHandshake() error {
+	rb := &SaslHandshakeRequest{"PLAIN"}
+	req := &request{correlationID: b.correlationID, clientID: b.conf.ClientID, body: rb}
+	buf, err := encode(req, b.conf.MetricRegistry)
+	if err != nil {
+		return err
+	}
+
+	err = b.conn.SetWriteDeadline(time.Now().Add(b.conf.Net.WriteTimeout))
+	if err != nil {
+		return err
+	}
+
+	requestTime := time.Now()
+	bytes, err := b.conn.Write(buf)
+	b.updateOutgoingCommunicationMetrics(bytes)
+	if err != nil {
+		Logger.Printf("Failed to send SASL handshake %s: %s\n", b.addr, err.Error())
+		return err
+	}
+	b.correlationID++
+	//wait for the response
+	header := make([]byte, 8) // response header
+	_, err = io.ReadFull(b.conn, header)
+	if err != nil {
+		Logger.Printf("Failed to read SASL handshake header : %s\n", err.Error())
+		return err
+	}
+	length := binary.BigEndian.Uint32(header[:4])
+	payload := make([]byte, length-4)
+	n, err := io.ReadFull(b.conn, payload)
+	if err != nil {
+		Logger.Printf("Failed to read SASL handshake payload : %s\n", err.Error())
+		return err
+	}
+	b.updateIncomingCommunicationMetrics(n+8, time.Since(requestTime))
+	res := &SaslHandshakeResponse{}
+	err = versionedDecode(payload, res, 0)
+	if err != nil {
+		Logger.Printf("Failed to parse SASL handshake : %s\n", err.Error())
+		return err
+	}
+	if res.Err != ErrNoError {
+		Logger.Printf("Invalid SASL Mechanism : %s\n", res.Err.Error())
+		return res.Err
+	}
+	Logger.Print("Successful SASL handshake")
+	return nil
+}
+
+// Kafka 0.10.0 plans to support SASL Plain and Kerberos as per PR #812 (KIP-43)/(JIRA KAFKA-3149)
+// Some hosted kafka services such as IBM Message Hub already offer SASL/PLAIN auth with Kafka 0.9
+//
+// In SASL Plain, Kafka expects the auth header to be in the following format
+// Message format (from https://tools.ietf.org/html/rfc4616):
+//
+//   message   = [authzid] UTF8NUL authcid UTF8NUL passwd
+//   authcid   = 1*SAFE ; MUST accept up to 255 octets
+//   authzid   = 1*SAFE ; MUST accept up to 255 octets
+//   passwd    = 1*SAFE ; MUST accept up to 255 octets
+//   UTF8NUL   = %x00 ; UTF-8 encoded NUL character
+//
+//   SAFE      = UTF1 / UTF2 / UTF3 / UTF4
+//                  ;; any UTF-8 encoded Unicode character except NUL
+//
+// When credentials are valid, Kafka returns a 4 byte array of null characters.
+// When credentials are invalid, Kafka closes the connection. This does not seem to be the ideal way
+// of responding to bad credentials but thats how its being done today.
+func (b *Broker) sendAndReceiveSASLPlainAuth() error {
+	if b.conf.Net.SASL.Handshake {
+		handshakeErr := b.sendAndReceiveSASLPlainHandshake()
+		if handshakeErr != nil {
+			Logger.Printf("Error while performing SASL handshake %s\n", b.addr)
+			return handshakeErr
+		}
+	}
+	length := 1 + len(b.conf.Net.SASL.User) + 1 + len(b.conf.Net.SASL.Password)
+	authBytes := make([]byte, length+4) //4 byte length header + auth data
+	binary.BigEndian.PutUint32(authBytes, uint32(length))
+	copy(authBytes[4:], []byte("\x00"+b.conf.Net.SASL.User+"\x00"+b.conf.Net.SASL.Password))
+
+	err := b.conn.SetWriteDeadline(time.Now().Add(b.conf.Net.WriteTimeout))
+	if err != nil {
+		Logger.Printf("Failed to set write deadline when doing SASL auth with broker %s: %s\n", b.addr, err.Error())
+		return err
+	}
+
+	requestTime := time.Now()
+	bytesWritten, err := b.conn.Write(authBytes)
+	b.updateOutgoingCommunicationMetrics(bytesWritten)
+	if err != nil {
+		Logger.Printf("Failed to write SASL auth header to broker %s: %s\n", b.addr, err.Error())
+		return err
+	}
+
+	header := make([]byte, 4)
+	n, err := io.ReadFull(b.conn, header)
+	b.updateIncomingCommunicationMetrics(n, time.Since(requestTime))
+	// If the credentials are valid, we would get a 4 byte response filled with null characters.
+	// Otherwise, the broker closes the connection and we get an EOF
+	if err != nil {
+		Logger.Printf("Failed to read response while authenticating with SASL to broker %s: %s\n", b.addr, err.Error())
+		return err
+	}
+
+	Logger.Printf("SASL authentication successful with broker %s:%v - %v\n", b.addr, n, header)
+	return nil
+}
+
+func (b *Broker) updateIncomingCommunicationMetrics(bytes int, requestLatency time.Duration) {
+	b.updateRequestLatencyMetrics(requestLatency)
+	b.responseRate.Mark(1)
+	if b.brokerResponseRate != nil {
+		b.brokerResponseRate.Mark(1)
+	}
+	responseSize := int64(bytes)
+	b.incomingByteRate.Mark(responseSize)
+	if b.brokerIncomingByteRate != nil {
+		b.brokerIncomingByteRate.Mark(responseSize)
+	}
+	b.responseSize.Update(responseSize)
+	if b.brokerResponseSize != nil {
+		b.brokerResponseSize.Update(responseSize)
+	}
+}
+
+func (b *Broker) updateRequestLatencyMetrics(requestLatency time.Duration) {
+	requestLatencyInMs := int64(requestLatency / time.Millisecond)
+	b.requestLatency.Update(requestLatencyInMs)
+	if b.brokerRequestLatency != nil {
+		b.brokerRequestLatency.Update(requestLatencyInMs)
+	}
+}
+
+func (b *Broker) updateOutgoingCommunicationMetrics(bytes int) {
+	b.requestRate.Mark(1)
+	if b.brokerRequestRate != nil {
+		b.brokerRequestRate.Mark(1)
+	}
+	requestSize := int64(bytes)
+	b.outgoingByteRate.Mark(requestSize)
+	if b.brokerOutgoingByteRate != nil {
+		b.brokerOutgoingByteRate.Mark(requestSize)
+	}
+	b.requestSize.Update(requestSize)
+	if b.brokerRequestSize != nil {
+		b.brokerRequestSize.Update(requestSize)
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/client.go b/vendor/github.com/Shopify/sarama/client.go
new file mode 100644
index 0000000..79be5ce
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/client.go

@@ -0,0 +1,899 @@
+package sarama
+
+import (
+	"math/rand"
+	"sort"
+	"sync"
+	"time"
+)
+
+// Client is a generic Kafka client. It manages connections to one or more Kafka brokers.
+// You MUST call Close() on a client to avoid leaks, it will not be garbage-collected
+// automatically when it passes out of scope. It is safe to share a client amongst many
+// users, however Kafka will process requests from a single client strictly in serial,
+// so it is generally more efficient to use the default one client per producer/consumer.
+type Client interface {
+	// Config returns the Config struct of the client. This struct should not be
+	// altered after it has been created.
+	Config() *Config
+
+	// Controller returns the cluster controller broker. Requires Kafka 0.10 or higher.
+	Controller() (*Broker, error)
+
+	// Brokers returns the current set of active brokers as retrieved from cluster metadata.
+	Brokers() []*Broker
+
+	// Topics returns the set of available topics as retrieved from cluster metadata.
+	Topics() ([]string, error)
+
+	// Partitions returns the sorted list of all partition IDs for the given topic.
+	Partitions(topic string) ([]int32, error)
+
+	// WritablePartitions returns the sorted list of all writable partition IDs for
+	// the given topic, where "writable" means "having a valid leader accepting
+	// writes".
+	WritablePartitions(topic string) ([]int32, error)
+
+	// Leader returns the broker object that is the leader of the current
+	// topic/partition, as determined by querying the cluster metadata.
+	Leader(topic string, partitionID int32) (*Broker, error)
+
+	// Replicas returns the set of all replica IDs for the given partition.
+	Replicas(topic string, partitionID int32) ([]int32, error)
+
+	// InSyncReplicas returns the set of all in-sync replica IDs for the given
+	// partition. In-sync replicas are replicas which are fully caught up with
+	// the partition leader.
+	InSyncReplicas(topic string, partitionID int32) ([]int32, error)
+
+	// RefreshMetadata takes a list of topics and queries the cluster to refresh the
+	// available metadata for those topics. If no topics are provided, it will refresh
+	// metadata for all topics.
+	RefreshMetadata(topics ...string) error
+
+	// GetOffset queries the cluster to get the most recent available offset at the
+	// given time (in milliseconds) on the topic/partition combination.
+	// Time should be OffsetOldest for the earliest available offset,
+	// OffsetNewest for the offset of the message that will be produced next, or a time.
+	GetOffset(topic string, partitionID int32, time int64) (int64, error)
+
+	// Coordinator returns the coordinating broker for a consumer group. It will
+	// return a locally cached value if it's available. You can call
+	// RefreshCoordinator to update the cached value. This function only works on
+	// Kafka 0.8.2 and higher.
+	Coordinator(consumerGroup string) (*Broker, error)
+
+	// RefreshCoordinator retrieves the coordinator for a consumer group and stores it
+	// in local cache. This function only works on Kafka 0.8.2 and higher.
+	RefreshCoordinator(consumerGroup string) error
+
+	// InitProducerID retrieves information required for Idempotent Producer
+	InitProducerID() (*InitProducerIDResponse, error)
+
+	// Close shuts down all broker connections managed by this client. It is required
+	// to call this function before a client object passes out of scope, as it will
+	// otherwise leak memory. You must close any Producers or Consumers using a client
+	// before you close the client.
+	Close() error
+
+	// Closed returns true if the client has already had Close called on it
+	Closed() bool
+}
+
+const (
+	// OffsetNewest stands for the log head offset, i.e. the offset that will be
+	// assigned to the next message that will be produced to the partition. You
+	// can send this to a client's GetOffset method to get this offset, or when
+	// calling ConsumePartition to start consuming new messages.
+	OffsetNewest int64 = -1
+	// OffsetOldest stands for the oldest offset available on the broker for a
+	// partition. You can send this to a client's GetOffset method to get this
+	// offset, or when calling ConsumePartition to start consuming from the
+	// oldest offset that is still available on the broker.
+	OffsetOldest int64 = -2
+)
+
+type client struct {
+	conf           *Config
+	closer, closed chan none // for shutting down background metadata updater
+
+	// the broker addresses given to us through the constructor are not guaranteed to be returned in
+	// the cluster metadata (I *think* it only returns brokers who are currently leading partitions?)
+	// so we store them separately
+	seedBrokers []*Broker
+	deadSeeds   []*Broker
+
+	controllerID   int32                                   // cluster controller broker id
+	brokers        map[int32]*Broker                       // maps broker ids to brokers
+	metadata       map[string]map[int32]*PartitionMetadata // maps topics to partition ids to metadata
+	metadataTopics map[string]none                         // topics that need to collect metadata
+	coordinators   map[string]int32                        // Maps consumer group names to coordinating broker IDs
+
+	// If the number of partitions is large, we can get some churn calling cachedPartitions,
+	// so the result is cached.  It is important to update this value whenever metadata is changed
+	cachedPartitionsResults map[string][maxPartitionIndex][]int32
+
+	lock sync.RWMutex // protects access to the maps that hold cluster state.
+}
+
+// NewClient creates a new Client. It connects to one of the given broker addresses
+// and uses that broker to automatically fetch metadata on the rest of the kafka cluster. If metadata cannot
+// be retrieved from any of the given broker addresses, the client is not created.
+func NewClient(addrs []string, conf *Config) (Client, error) {
+	Logger.Println("Initializing new client")
+
+	if conf == nil {
+		conf = NewConfig()
+	}
+
+	if err := conf.Validate(); err != nil {
+		return nil, err
+	}
+
+	if len(addrs) < 1 {
+		return nil, ConfigurationError("You must provide at least one broker address")
+	}
+
+	client := &client{
+		conf:                    conf,
+		closer:                  make(chan none),
+		closed:                  make(chan none),
+		brokers:                 make(map[int32]*Broker),
+		metadata:                make(map[string]map[int32]*PartitionMetadata),
+		metadataTopics:          make(map[string]none),
+		cachedPartitionsResults: make(map[string][maxPartitionIndex][]int32),
+		coordinators:            make(map[string]int32),
+	}
+
+	random := rand.New(rand.NewSource(time.Now().UnixNano()))
+	for _, index := range random.Perm(len(addrs)) {
+		client.seedBrokers = append(client.seedBrokers, NewBroker(addrs[index]))
+	}
+
+	if conf.Metadata.Full {
+		// do an initial fetch of all cluster metadata by specifying an empty list of topics
+		err := client.RefreshMetadata()
+		switch err {
+		case nil:
+			break
+		case ErrLeaderNotAvailable, ErrReplicaNotAvailable, ErrTopicAuthorizationFailed, ErrClusterAuthorizationFailed:
+			// indicates that maybe part of the cluster is down, but is not fatal to creating the client
+			Logger.Println(err)
+		default:
+			close(client.closed) // we haven't started the background updater yet, so we have to do this manually
+			_ = client.Close()
+			return nil, err
+		}
+	}
+	go withRecover(client.backgroundMetadataUpdater)
+
+	Logger.Println("Successfully initialized new client")
+
+	return client, nil
+}
+
+func (client *client) Config() *Config {
+	return client.conf
+}
+
+func (client *client) Brokers() []*Broker {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+	brokers := make([]*Broker, 0, len(client.brokers))
+	for _, broker := range client.brokers {
+		brokers = append(brokers, broker)
+	}
+	return brokers
+}
+
+func (client *client) InitProducerID() (*InitProducerIDResponse, error) {
+	var err error
+	for broker := client.any(); broker != nil; broker = client.any() {
+
+		req := &InitProducerIDRequest{}
+
+		response, err := broker.InitProducerID(req)
+		switch err.(type) {
+		case nil:
+			return response, nil
+		default:
+			// some error, remove that broker and try again
+			Logger.Printf("Client got error from broker %d when issuing InitProducerID : %v\n", broker.ID(), err)
+			_ = broker.Close()
+			client.deregisterBroker(broker)
+		}
+	}
+	return nil, err
+}
+
+func (client *client) Close() error {
+	if client.Closed() {
+		// Chances are this is being called from a defer() and the error will go unobserved
+		// so we go ahead and log the event in this case.
+		Logger.Printf("Close() called on already closed client")
+		return ErrClosedClient
+	}
+
+	// shutdown and wait for the background thread before we take the lock, to avoid races
+	close(client.closer)
+	<-client.closed
+
+	client.lock.Lock()
+	defer client.lock.Unlock()
+	Logger.Println("Closing Client")
+
+	for _, broker := range client.brokers {
+		safeAsyncClose(broker)
+	}
+
+	for _, broker := range client.seedBrokers {
+		safeAsyncClose(broker)
+	}
+
+	client.brokers = nil
+	client.metadata = nil
+	client.metadataTopics = nil
+
+	return nil
+}
+
+func (client *client) Closed() bool {
+	return client.brokers == nil
+}
+
+func (client *client) Topics() ([]string, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	ret := make([]string, 0, len(client.metadata))
+	for topic := range client.metadata {
+		ret = append(ret, topic)
+	}
+
+	return ret, nil
+}
+
+func (client *client) MetadataTopics() ([]string, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	ret := make([]string, 0, len(client.metadataTopics))
+	for topic := range client.metadataTopics {
+		ret = append(ret, topic)
+	}
+
+	return ret, nil
+}
+
+func (client *client) Partitions(topic string) ([]int32, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	partitions := client.cachedPartitions(topic, allPartitions)
+
+	if len(partitions) == 0 {
+		err := client.RefreshMetadata(topic)
+		if err != nil {
+			return nil, err
+		}
+		partitions = client.cachedPartitions(topic, allPartitions)
+	}
+
+	if partitions == nil {
+		return nil, ErrUnknownTopicOrPartition
+	}
+
+	return partitions, nil
+}
+
+func (client *client) WritablePartitions(topic string) ([]int32, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	partitions := client.cachedPartitions(topic, writablePartitions)
+
+	// len==0 catches when it's nil (no such topic) and the odd case when every single
+	// partition is undergoing leader election simultaneously. Callers have to be able to handle
+	// this function returning an empty slice (which is a valid return value) but catching it
+	// here the first time (note we *don't* catch it below where we return ErrUnknownTopicOrPartition) triggers
+	// a metadata refresh as a nicety so callers can just try again and don't have to manually
+	// trigger a refresh (otherwise they'd just keep getting a stale cached copy).
+	if len(partitions) == 0 {
+		err := client.RefreshMetadata(topic)
+		if err != nil {
+			return nil, err
+		}
+		partitions = client.cachedPartitions(topic, writablePartitions)
+	}
+
+	if partitions == nil {
+		return nil, ErrUnknownTopicOrPartition
+	}
+
+	return partitions, nil
+}
+
+func (client *client) Replicas(topic string, partitionID int32) ([]int32, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	metadata := client.cachedMetadata(topic, partitionID)
+
+	if metadata == nil {
+		err := client.RefreshMetadata(topic)
+		if err != nil {
+			return nil, err
+		}
+		metadata = client.cachedMetadata(topic, partitionID)
+	}
+
+	if metadata == nil {
+		return nil, ErrUnknownTopicOrPartition
+	}
+
+	if metadata.Err == ErrReplicaNotAvailable {
+		return dupInt32Slice(metadata.Replicas), metadata.Err
+	}
+	return dupInt32Slice(metadata.Replicas), nil
+}
+
+func (client *client) InSyncReplicas(topic string, partitionID int32) ([]int32, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	metadata := client.cachedMetadata(topic, partitionID)
+
+	if metadata == nil {
+		err := client.RefreshMetadata(topic)
+		if err != nil {
+			return nil, err
+		}
+		metadata = client.cachedMetadata(topic, partitionID)
+	}
+
+	if metadata == nil {
+		return nil, ErrUnknownTopicOrPartition
+	}
+
+	if metadata.Err == ErrReplicaNotAvailable {
+		return dupInt32Slice(metadata.Isr), metadata.Err
+	}
+	return dupInt32Slice(metadata.Isr), nil
+}
+
+func (client *client) Leader(topic string, partitionID int32) (*Broker, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	leader, err := client.cachedLeader(topic, partitionID)
+
+	if leader == nil {
+		err = client.RefreshMetadata(topic)
+		if err != nil {
+			return nil, err
+		}
+		leader, err = client.cachedLeader(topic, partitionID)
+	}
+
+	return leader, err
+}
+
+func (client *client) RefreshMetadata(topics ...string) error {
+	if client.Closed() {
+		return ErrClosedClient
+	}
+
+	// Prior to 0.8.2, Kafka will throw exceptions on an empty topic and not return a proper
+	// error. This handles the case by returning an error instead of sending it
+	// off to Kafka. See: https://github.com/Shopify/sarama/pull/38#issuecomment-26362310
+	for _, topic := range topics {
+		if len(topic) == 0 {
+			return ErrInvalidTopic // this is the error that 0.8.2 and later correctly return
+		}
+	}
+
+	return client.tryRefreshMetadata(topics, client.conf.Metadata.Retry.Max)
+}
+
+func (client *client) GetOffset(topic string, partitionID int32, time int64) (int64, error) {
+	if client.Closed() {
+		return -1, ErrClosedClient
+	}
+
+	offset, err := client.getOffset(topic, partitionID, time)
+
+	if err != nil {
+		if err := client.RefreshMetadata(topic); err != nil {
+			return -1, err
+		}
+		return client.getOffset(topic, partitionID, time)
+	}
+
+	return offset, err
+}
+
+func (client *client) Controller() (*Broker, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	if !client.conf.Version.IsAtLeast(V0_10_0_0) {
+		return nil, ErrUnsupportedVersion
+	}
+
+	controller := client.cachedController()
+	if controller == nil {
+		if err := client.refreshMetadata(); err != nil {
+			return nil, err
+		}
+		controller = client.cachedController()
+	}
+
+	if controller == nil {
+		return nil, ErrControllerNotAvailable
+	}
+
+	_ = controller.Open(client.conf)
+	return controller, nil
+}
+
+func (client *client) Coordinator(consumerGroup string) (*Broker, error) {
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	coordinator := client.cachedCoordinator(consumerGroup)
+
+	if coordinator == nil {
+		if err := client.RefreshCoordinator(consumerGroup); err != nil {
+			return nil, err
+		}
+		coordinator = client.cachedCoordinator(consumerGroup)
+	}
+
+	if coordinator == nil {
+		return nil, ErrConsumerCoordinatorNotAvailable
+	}
+
+	_ = coordinator.Open(client.conf)
+	return coordinator, nil
+}
+
+func (client *client) RefreshCoordinator(consumerGroup string) error {
+	if client.Closed() {
+		return ErrClosedClient
+	}
+
+	response, err := client.getConsumerMetadata(consumerGroup, client.conf.Metadata.Retry.Max)
+	if err != nil {
+		return err
+	}
+
+	client.lock.Lock()
+	defer client.lock.Unlock()
+	client.registerBroker(response.Coordinator)
+	client.coordinators[consumerGroup] = response.Coordinator.ID()
+	return nil
+}
+
+// private broker management helpers
+
+// registerBroker makes sure a broker received by a Metadata or Coordinator request is registered
+// in the brokers map. It returns the broker that is registered, which may be the provided broker,
+// or a previously registered Broker instance. You must hold the write lock before calling this function.
+func (client *client) registerBroker(broker *Broker) {
+	if client.brokers[broker.ID()] == nil {
+		client.brokers[broker.ID()] = broker
+		Logger.Printf("client/brokers registered new broker #%d at %s", broker.ID(), broker.Addr())
+	} else if broker.Addr() != client.brokers[broker.ID()].Addr() {
+		safeAsyncClose(client.brokers[broker.ID()])
+		client.brokers[broker.ID()] = broker
+		Logger.Printf("client/brokers replaced registered broker #%d with %s", broker.ID(), broker.Addr())
+	}
+}
+
+// deregisterBroker removes a broker from the seedsBroker list, and if it's
+// not the seedbroker, removes it from brokers map completely.
+func (client *client) deregisterBroker(broker *Broker) {
+	client.lock.Lock()
+	defer client.lock.Unlock()
+
+	if len(client.seedBrokers) > 0 && broker == client.seedBrokers[0] {
+		client.deadSeeds = append(client.deadSeeds, broker)
+		client.seedBrokers = client.seedBrokers[1:]
+	} else {
+		// we do this so that our loop in `tryRefreshMetadata` doesn't go on forever,
+		// but we really shouldn't have to; once that loop is made better this case can be
+		// removed, and the function generally can be renamed from `deregisterBroker` to
+		// `nextSeedBroker` or something
+		Logger.Printf("client/brokers deregistered broker #%d at %s", broker.ID(), broker.Addr())
+		delete(client.brokers, broker.ID())
+	}
+}
+
+func (client *client) resurrectDeadBrokers() {
+	client.lock.Lock()
+	defer client.lock.Unlock()
+
+	Logger.Printf("client/brokers resurrecting %d dead seed brokers", len(client.deadSeeds))
+	client.seedBrokers = append(client.seedBrokers, client.deadSeeds...)
+	client.deadSeeds = nil
+}
+
+func (client *client) any() *Broker {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	if len(client.seedBrokers) > 0 {
+		_ = client.seedBrokers[0].Open(client.conf)
+		return client.seedBrokers[0]
+	}
+
+	// not guaranteed to be random *or* deterministic
+	for _, broker := range client.brokers {
+		_ = broker.Open(client.conf)
+		return broker
+	}
+
+	return nil
+}
+
+// private caching/lazy metadata helpers
+
+type partitionType int
+
+const (
+	allPartitions partitionType = iota
+	writablePartitions
+	// If you add any more types, update the partition cache in update()
+
+	// Ensure this is the last partition type value
+	maxPartitionIndex
+)
+
+func (client *client) cachedMetadata(topic string, partitionID int32) *PartitionMetadata {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	partitions := client.metadata[topic]
+	if partitions != nil {
+		return partitions[partitionID]
+	}
+
+	return nil
+}
+
+func (client *client) cachedPartitions(topic string, partitionSet partitionType) []int32 {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	partitions, exists := client.cachedPartitionsResults[topic]
+
+	if !exists {
+		return nil
+	}
+	return partitions[partitionSet]
+}
+
+func (client *client) setPartitionCache(topic string, partitionSet partitionType) []int32 {
+	partitions := client.metadata[topic]
+
+	if partitions == nil {
+		return nil
+	}
+
+	ret := make([]int32, 0, len(partitions))
+	for _, partition := range partitions {
+		if partitionSet == writablePartitions && partition.Err == ErrLeaderNotAvailable {
+			continue
+		}
+		ret = append(ret, partition.ID)
+	}
+
+	sort.Sort(int32Slice(ret))
+	return ret
+}
+
+func (client *client) cachedLeader(topic string, partitionID int32) (*Broker, error) {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	partitions := client.metadata[topic]
+	if partitions != nil {
+		metadata, ok := partitions[partitionID]
+		if ok {
+			if metadata.Err == ErrLeaderNotAvailable {
+				return nil, ErrLeaderNotAvailable
+			}
+			b := client.brokers[metadata.Leader]
+			if b == nil {
+				return nil, ErrLeaderNotAvailable
+			}
+			_ = b.Open(client.conf)
+			return b, nil
+		}
+	}
+
+	return nil, ErrUnknownTopicOrPartition
+}
+
+func (client *client) getOffset(topic string, partitionID int32, time int64) (int64, error) {
+	broker, err := client.Leader(topic, partitionID)
+	if err != nil {
+		return -1, err
+	}
+
+	request := &OffsetRequest{}
+	if client.conf.Version.IsAtLeast(V0_10_1_0) {
+		request.Version = 1
+	}
+	request.AddBlock(topic, partitionID, time, 1)
+
+	response, err := broker.GetAvailableOffsets(request)
+	if err != nil {
+		_ = broker.Close()
+		return -1, err
+	}
+
+	block := response.GetBlock(topic, partitionID)
+	if block == nil {
+		_ = broker.Close()
+		return -1, ErrIncompleteResponse
+	}
+	if block.Err != ErrNoError {
+		return -1, block.Err
+	}
+	if len(block.Offsets) != 1 {
+		return -1, ErrOffsetOutOfRange
+	}
+
+	return block.Offsets[0], nil
+}
+
+// core metadata update logic
+
+func (client *client) backgroundMetadataUpdater() {
+	defer close(client.closed)
+
+	if client.conf.Metadata.RefreshFrequency == time.Duration(0) {
+		return
+	}
+
+	ticker := time.NewTicker(client.conf.Metadata.RefreshFrequency)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			if err := client.refreshMetadata(); err != nil {
+				Logger.Println("Client background metadata update:", err)
+			}
+		case <-client.closer:
+			return
+		}
+	}
+}
+
+func (client *client) refreshMetadata() error {
+	topics := []string{}
+
+	if !client.conf.Metadata.Full {
+		if specificTopics, err := client.MetadataTopics(); err != nil {
+			return err
+		} else if len(specificTopics) == 0 {
+			return ErrNoTopicsToUpdateMetadata
+		} else {
+			topics = specificTopics
+		}
+	}
+
+	if err := client.RefreshMetadata(topics...); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (client *client) tryRefreshMetadata(topics []string, attemptsRemaining int) error {
+	retry := func(err error) error {
+		if attemptsRemaining > 0 {
+			Logger.Printf("client/metadata retrying after %dms... (%d attempts remaining)\n", client.conf.Metadata.Retry.Backoff/time.Millisecond, attemptsRemaining)
+			time.Sleep(client.conf.Metadata.Retry.Backoff)
+			return client.tryRefreshMetadata(topics, attemptsRemaining-1)
+		}
+		return err
+	}
+
+	for broker := client.any(); broker != nil; broker = client.any() {
+		if len(topics) > 0 {
+			Logger.Printf("client/metadata fetching metadata for %v from broker %s\n", topics, broker.addr)
+		} else {
+			Logger.Printf("client/metadata fetching metadata for all topics from broker %s\n", broker.addr)
+		}
+
+		req := &MetadataRequest{Topics: topics}
+		if client.conf.Version.IsAtLeast(V0_10_0_0) {
+			req.Version = 1
+		}
+		response, err := broker.GetMetadata(req)
+
+		switch err.(type) {
+		case nil:
+			allKnownMetaData := len(topics) == 0
+			// valid response, use it
+			shouldRetry, err := client.updateMetadata(response, allKnownMetaData)
+			if shouldRetry {
+				Logger.Println("client/metadata found some partitions to be leaderless")
+				return retry(err) // note: err can be nil
+			}
+			return err
+
+		case PacketEncodingError:
+			// didn't even send, return the error
+			return err
+		default:
+			// some other error, remove that broker and try again
+			Logger.Printf("client/metadata got error from broker %d while fetching metadata: %v\n", broker.ID(), err)
+			_ = broker.Close()
+			client.deregisterBroker(broker)
+		}
+	}
+
+	Logger.Println("client/metadata no available broker to send metadata request to")
+	client.resurrectDeadBrokers()
+	return retry(ErrOutOfBrokers)
+}
+
+// if no fatal error, returns a list of topics that need retrying due to ErrLeaderNotAvailable
+func (client *client) updateMetadata(data *MetadataResponse, allKnownMetaData bool) (retry bool, err error) {
+	client.lock.Lock()
+	defer client.lock.Unlock()
+
+	// For all the brokers we received:
+	// - if it is a new ID, save it
+	// - if it is an existing ID, but the address we have is stale, discard the old one and save it
+	// - otherwise ignore it, replacing our existing one would just bounce the connection
+	for _, broker := range data.Brokers {
+		client.registerBroker(broker)
+	}
+
+	client.controllerID = data.ControllerID
+
+	if allKnownMetaData {
+		client.metadata = make(map[string]map[int32]*PartitionMetadata)
+		client.metadataTopics = make(map[string]none)
+		client.cachedPartitionsResults = make(map[string][maxPartitionIndex][]int32)
+	}
+	for _, topic := range data.Topics {
+		// topics must be added firstly to `metadataTopics` to guarantee that all
+		// requested topics must be recorded to keep them trackable for periodically
+		// metadata refresh.
+		if _, exists := client.metadataTopics[topic.Name]; !exists {
+			client.metadataTopics[topic.Name] = none{}
+		}
+		delete(client.metadata, topic.Name)
+		delete(client.cachedPartitionsResults, topic.Name)
+
+		switch topic.Err {
+		case ErrNoError:
+			break
+		case ErrInvalidTopic, ErrTopicAuthorizationFailed: // don't retry, don't store partial results
+			err = topic.Err
+			continue
+		case ErrUnknownTopicOrPartition: // retry, do not store partial partition results
+			err = topic.Err
+			retry = true
+			continue
+		case ErrLeaderNotAvailable: // retry, but store partial partition results
+			retry = true
+			break
+		default: // don't retry, don't store partial results
+			Logger.Printf("Unexpected topic-level metadata error: %s", topic.Err)
+			err = topic.Err
+			continue
+		}
+
+		client.metadata[topic.Name] = make(map[int32]*PartitionMetadata, len(topic.Partitions))
+		for _, partition := range topic.Partitions {
+			client.metadata[topic.Name][partition.ID] = partition
+			if partition.Err == ErrLeaderNotAvailable {
+				retry = true
+			}
+		}
+
+		var partitionCache [maxPartitionIndex][]int32
+		partitionCache[allPartitions] = client.setPartitionCache(topic.Name, allPartitions)
+		partitionCache[writablePartitions] = client.setPartitionCache(topic.Name, writablePartitions)
+		client.cachedPartitionsResults[topic.Name] = partitionCache
+	}
+
+	return
+}
+
+func (client *client) cachedCoordinator(consumerGroup string) *Broker {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+	if coordinatorID, ok := client.coordinators[consumerGroup]; ok {
+		return client.brokers[coordinatorID]
+	}
+	return nil
+}
+
+func (client *client) cachedController() *Broker {
+	client.lock.RLock()
+	defer client.lock.RUnlock()
+
+	return client.brokers[client.controllerID]
+}
+
+func (client *client) getConsumerMetadata(consumerGroup string, attemptsRemaining int) (*FindCoordinatorResponse, error) {
+	retry := func(err error) (*FindCoordinatorResponse, error) {
+		if attemptsRemaining > 0 {
+			Logger.Printf("client/coordinator retrying after %dms... (%d attempts remaining)\n", client.conf.Metadata.Retry.Backoff/time.Millisecond, attemptsRemaining)
+			time.Sleep(client.conf.Metadata.Retry.Backoff)
+			return client.getConsumerMetadata(consumerGroup, attemptsRemaining-1)
+		}
+		return nil, err
+	}
+
+	for broker := client.any(); broker != nil; broker = client.any() {
+		Logger.Printf("client/coordinator requesting coordinator for consumergroup %s from %s\n", consumerGroup, broker.Addr())
+
+		request := new(FindCoordinatorRequest)
+		request.CoordinatorKey = consumerGroup
+		request.CoordinatorType = CoordinatorGroup
+
+		response, err := broker.FindCoordinator(request)
+
+		if err != nil {
+			Logger.Printf("client/coordinator request to broker %s failed: %s\n", broker.Addr(), err)
+
+			switch err.(type) {
+			case PacketEncodingError:
+				return nil, err
+			default:
+				_ = broker.Close()
+				client.deregisterBroker(broker)
+				continue
+			}
+		}
+
+		switch response.Err {
+		case ErrNoError:
+			Logger.Printf("client/coordinator coordinator for consumergroup %s is #%d (%s)\n", consumerGroup, response.Coordinator.ID(), response.Coordinator.Addr())
+			return response, nil
+
+		case ErrConsumerCoordinatorNotAvailable:
+			Logger.Printf("client/coordinator coordinator for consumer group %s is not available\n", consumerGroup)
+
+			// This is very ugly, but this scenario will only happen once per cluster.
+			// The __consumer_offsets topic only has to be created one time.
+			// The number of partitions not configurable, but partition 0 should always exist.
+			if _, err := client.Leader("__consumer_offsets", 0); err != nil {
+				Logger.Printf("client/coordinator the __consumer_offsets topic is not initialized completely yet. Waiting 2 seconds...\n")
+				time.Sleep(2 * time.Second)
+			}
+
+			return retry(ErrConsumerCoordinatorNotAvailable)
+		default:
+			return nil, response.Err
+		}
+	}
+
+	Logger.Println("client/coordinator no available broker to send consumer metadata request to")
+	client.resurrectDeadBrokers()
+	return retry(ErrOutOfBrokers)
+}

diff --git a/vendor/github.com/Shopify/sarama/config.go b/vendor/github.com/Shopify/sarama/config.go
new file mode 100644
index 0000000..ed20522
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/config.go

@@ -0,0 +1,581 @@
+package sarama
+
+import (
+	"compress/gzip"
+	"crypto/tls"
+	"fmt"
+	"io/ioutil"
+	"net"
+	"regexp"
+	"time"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+const defaultClientID = "sarama"
+
+var validID = regexp.MustCompile(`\A[A-Za-z0-9._-]+\z`)
+
+// Config is used to pass multiple configuration options to Sarama's constructors.
+type Config struct {
+	// Admin is the namespace for ClusterAdmin properties used by the administrative Kafka client.
+	Admin struct {
+		// The maximum duration the administrative Kafka client will wait for ClusterAdmin operations,
+		// including topics, brokers, configurations and ACLs (defaults to 3 seconds).
+		Timeout time.Duration
+	}
+
+	// Net is the namespace for network-level properties used by the Broker, and
+	// shared by the Client/Producer/Consumer.
+	Net struct {
+		// How many outstanding requests a connection is allowed to have before
+		// sending on it blocks (default 5).
+		MaxOpenRequests int
+
+		// All three of the below configurations are similar to the
+		// `socket.timeout.ms` setting in JVM kafka. All of them default
+		// to 30 seconds.
+		DialTimeout  time.Duration // How long to wait for the initial connection.
+		ReadTimeout  time.Duration // How long to wait for a response.
+		WriteTimeout time.Duration // How long to wait for a transmit.
+
+		TLS struct {
+			// Whether or not to use TLS when connecting to the broker
+			// (defaults to false).
+			Enable bool
+			// The TLS configuration to use for secure connections if
+			// enabled (defaults to nil).
+			Config *tls.Config
+		}
+
+		// SASL based authentication with broker. While there are multiple SASL authentication methods
+		// the current implementation is limited to plaintext (SASL/PLAIN) authentication
+		SASL struct {
+			// Whether or not to use SASL authentication when connecting to the broker
+			// (defaults to false).
+			Enable bool
+			// Whether or not to send the Kafka SASL handshake first if enabled
+			// (defaults to true). You should only set this to false if you're using
+			// a non-Kafka SASL proxy.
+			Handshake bool
+			//username and password for SASL/PLAIN authentication
+			User     string
+			Password string
+		}
+
+		// KeepAlive specifies the keep-alive period for an active network connection.
+		// If zero, keep-alives are disabled. (default is 0: disabled).
+		KeepAlive time.Duration
+
+		// LocalAddr is the local address to use when dialing an
+		// address. The address must be of a compatible type for the
+		// network being dialed.
+		// If nil, a local address is automatically chosen.
+		LocalAddr net.Addr
+	}
+
+	// Metadata is the namespace for metadata management properties used by the
+	// Client, and shared by the Producer/Consumer.
+	Metadata struct {
+		Retry struct {
+			// The total number of times to retry a metadata request when the
+			// cluster is in the middle of a leader election (default 3).
+			Max int
+			// How long to wait for leader election to occur before retrying
+			// (default 250ms). Similar to the JVM's `retry.backoff.ms`.
+			Backoff time.Duration
+		}
+		// How frequently to refresh the cluster metadata in the background.
+		// Defaults to 10 minutes. Set to 0 to disable. Similar to
+		// `topic.metadata.refresh.interval.ms` in the JVM version.
+		RefreshFrequency time.Duration
+
+		// Whether to maintain a full set of metadata for all topics, or just
+		// the minimal set that has been necessary so far. The full set is simpler
+		// and usually more convenient, but can take up a substantial amount of
+		// memory if you have many topics and partitions. Defaults to true.
+		Full bool
+	}
+
+	// Producer is the namespace for configuration related to producing messages,
+	// used by the Producer.
+	Producer struct {
+		// The maximum permitted size of a message (defaults to 1000000). Should be
+		// set equal to or smaller than the broker's `message.max.bytes`.
+		MaxMessageBytes int
+		// The level of acknowledgement reliability needed from the broker (defaults
+		// to WaitForLocal). Equivalent to the `request.required.acks` setting of the
+		// JVM producer.
+		RequiredAcks RequiredAcks
+		// The maximum duration the broker will wait the receipt of the number of
+		// RequiredAcks (defaults to 10 seconds). This is only relevant when
+		// RequiredAcks is set to WaitForAll or a number > 1. Only supports
+		// millisecond resolution, nanoseconds will be truncated. Equivalent to
+		// the JVM producer's `request.timeout.ms` setting.
+		Timeout time.Duration
+		// The type of compression to use on messages (defaults to no compression).
+		// Similar to `compression.codec` setting of the JVM producer.
+		Compression CompressionCodec
+		// The level of compression to use on messages. The meaning depends
+		// on the actual compression type used and defaults to default compression
+		// level for the codec.
+		CompressionLevel int
+		// Generates partitioners for choosing the partition to send messages to
+		// (defaults to hashing the message key). Similar to the `partitioner.class`
+		// setting for the JVM producer.
+		Partitioner PartitionerConstructor
+		// If enabled, the producer will ensure that exactly one copy of each message is
+		// written.
+		Idempotent bool
+
+		// Return specifies what channels will be populated. If they are set to true,
+		// you must read from the respective channels to prevent deadlock. If,
+		// however, this config is used to create a `SyncProducer`, both must be set
+		// to true and you shall not read from the channels since the producer does
+		// this internally.
+		Return struct {
+			// If enabled, successfully delivered messages will be returned on the
+			// Successes channel (default disabled).
+			Successes bool
+
+			// If enabled, messages that failed to deliver will be returned on the
+			// Errors channel, including error (default enabled).
+			Errors bool
+		}
+
+		// The following config options control how often messages are batched up and
+		// sent to the broker. By default, messages are sent as fast as possible, and
+		// all messages received while the current batch is in-flight are placed
+		// into the subsequent batch.
+		Flush struct {
+			// The best-effort number of bytes needed to trigger a flush. Use the
+			// global sarama.MaxRequestSize to set a hard upper limit.
+			Bytes int
+			// The best-effort number of messages needed to trigger a flush. Use
+			// `MaxMessages` to set a hard upper limit.
+			Messages int
+			// The best-effort frequency of flushes. Equivalent to
+			// `queue.buffering.max.ms` setting of JVM producer.
+			Frequency time.Duration
+			// The maximum number of messages the producer will send in a single
+			// broker request. Defaults to 0 for unlimited. Similar to
+			// `queue.buffering.max.messages` in the JVM producer.
+			MaxMessages int
+		}
+
+		Retry struct {
+			// The total number of times to retry sending a message (default 3).
+			// Similar to the `message.send.max.retries` setting of the JVM producer.
+			Max int
+			// How long to wait for the cluster to settle between retries
+			// (default 100ms). Similar to the `retry.backoff.ms` setting of the
+			// JVM producer.
+			Backoff time.Duration
+		}
+	}
+
+	// Consumer is the namespace for configuration related to consuming messages,
+	// used by the Consumer.
+	Consumer struct {
+
+		// Group is the namespace for configuring consumer group.
+		Group struct {
+			Session struct {
+				// The timeout used to detect consumer failures when using Kafka's group management facility.
+				// The consumer sends periodic heartbeats to indicate its liveness to the broker.
+				// If no heartbeats are received by the broker before the expiration of this session timeout,
+				// then the broker will remove this consumer from the group and initiate a rebalance.
+				// Note that the value must be in the allowable range as configured in the broker configuration
+				// by `group.min.session.timeout.ms` and `group.max.session.timeout.ms` (default 10s)
+				Timeout time.Duration
+			}
+			Heartbeat struct {
+				// The expected time between heartbeats to the consumer coordinator when using Kafka's group
+				// management facilities. Heartbeats are used to ensure that the consumer's session stays active and
+				// to facilitate rebalancing when new consumers join or leave the group.
+				// The value must be set lower than Consumer.Group.Session.Timeout, but typically should be set no
+				// higher than 1/3 of that value.
+				// It can be adjusted even lower to control the expected time for normal rebalances (default 3s)
+				Interval time.Duration
+			}
+			Rebalance struct {
+				// Strategy for allocating topic partitions to members (default BalanceStrategyRange)
+				Strategy BalanceStrategy
+				// The maximum allowed time for each worker to join the group once a rebalance has begun.
+				// This is basically a limit on the amount of time needed for all tasks to flush any pending
+				// data and commit offsets. If the timeout is exceeded, then the worker will be removed from
+				// the group, which will cause offset commit failures (default 60s).
+				Timeout time.Duration
+
+				Retry struct {
+					// When a new consumer joins a consumer group the set of consumers attempt to "rebalance"
+					// the load to assign partitions to each consumer. If the set of consumers changes while
+					// this assignment is taking place the rebalance will fail and retry. This setting controls
+					// the maximum number of attempts before giving up (default 4).
+					Max int
+					// Backoff time between retries during rebalance (default 2s)
+					Backoff time.Duration
+				}
+			}
+			Member struct {
+				// Custom metadata to include when joining the group. The user data for all joined members
+				// can be retrieved by sending a DescribeGroupRequest to the broker that is the
+				// coordinator for the group.
+				UserData []byte
+			}
+		}
+
+		Retry struct {
+			// How long to wait after a failing to read from a partition before
+			// trying again (default 2s).
+			Backoff time.Duration
+		}
+
+		// Fetch is the namespace for controlling how many bytes are retrieved by any
+		// given request.
+		Fetch struct {
+			// The minimum number of message bytes to fetch in a request - the broker
+			// will wait until at least this many are available. The default is 1,
+			// as 0 causes the consumer to spin when no messages are available.
+			// Equivalent to the JVM's `fetch.min.bytes`.
+			Min int32
+			// The default number of message bytes to fetch from the broker in each
+			// request (default 1MB). This should be larger than the majority of
+			// your messages, or else the consumer will spend a lot of time
+			// negotiating sizes and not actually consuming. Similar to the JVM's
+			// `fetch.message.max.bytes`.
+			Default int32
+			// The maximum number of message bytes to fetch from the broker in a
+			// single request. Messages larger than this will return
+			// ErrMessageTooLarge and will not be consumable, so you must be sure
+			// this is at least as large as your largest message. Defaults to 0
+			// (no limit). Similar to the JVM's `fetch.message.max.bytes`. The
+			// global `sarama.MaxResponseSize` still applies.
+			Max int32
+		}
+		// The maximum amount of time the broker will wait for Consumer.Fetch.Min
+		// bytes to become available before it returns fewer than that anyways. The
+		// default is 250ms, since 0 causes the consumer to spin when no events are
+		// available. 100-500ms is a reasonable range for most cases. Kafka only
+		// supports precision up to milliseconds; nanoseconds will be truncated.
+		// Equivalent to the JVM's `fetch.wait.max.ms`.
+		MaxWaitTime time.Duration
+
+		// The maximum amount of time the consumer expects a message takes to
+		// process for the user. If writing to the Messages channel takes longer
+		// than this, that partition will stop fetching more messages until it
+		// can proceed again.
+		// Note that, since the Messages channel is buffered, the actual grace time is
+		// (MaxProcessingTime * ChanneBufferSize). Defaults to 100ms.
+		// If a message is not written to the Messages channel between two ticks
+		// of the expiryTicker then a timeout is detected.
+		// Using a ticker instead of a timer to detect timeouts should typically
+		// result in many fewer calls to Timer functions which may result in a
+		// significant performance improvement if many messages are being sent
+		// and timeouts are infrequent.
+		// The disadvantage of using a ticker instead of a timer is that
+		// timeouts will be less accurate. That is, the effective timeout could
+		// be between `MaxProcessingTime` and `2 * MaxProcessingTime`. For
+		// example, if `MaxProcessingTime` is 100ms then a delay of 180ms
+		// between two messages being sent may not be recognized as a timeout.
+		MaxProcessingTime time.Duration
+
+		// Return specifies what channels will be populated. If they are set to true,
+		// you must read from them to prevent deadlock.
+		Return struct {
+			// If enabled, any errors that occurred while consuming are returned on
+			// the Errors channel (default disabled).
+			Errors bool
+		}
+
+		// Offsets specifies configuration for how and when to commit consumed
+		// offsets. This currently requires the manual use of an OffsetManager
+		// but will eventually be automated.
+		Offsets struct {
+			// How frequently to commit updated offsets. Defaults to 1s.
+			CommitInterval time.Duration
+
+			// The initial offset to use if no offset was previously committed.
+			// Should be OffsetNewest or OffsetOldest. Defaults to OffsetNewest.
+			Initial int64
+
+			// The retention duration for committed offsets. If zero, disabled
+			// (in which case the `offsets.retention.minutes` option on the
+			// broker will be used).  Kafka only supports precision up to
+			// milliseconds; nanoseconds will be truncated. Requires Kafka
+			// broker version 0.9.0 or later.
+			// (default is 0: disabled).
+			Retention time.Duration
+
+			Retry struct {
+				// The total number of times to retry failing commit
+				// requests during OffsetManager shutdown (default 3).
+				Max int
+			}
+		}
+	}
+
+	// A user-provided string sent with every request to the brokers for logging,
+	// debugging, and auditing purposes. Defaults to "sarama", but you should
+	// probably set it to something specific to your application.
+	ClientID string
+	// The number of events to buffer in internal and external channels. This
+	// permits the producer and consumer to continue processing some messages
+	// in the background while user code is working, greatly improving throughput.
+	// Defaults to 256.
+	ChannelBufferSize int
+	// The version of Kafka that Sarama will assume it is running against.
+	// Defaults to the oldest supported stable version. Since Kafka provides
+	// backwards-compatibility, setting it to a version older than you have
+	// will not break anything, although it may prevent you from using the
+	// latest features. Setting it to a version greater than you are actually
+	// running may lead to random breakage.
+	Version KafkaVersion
+	// The registry to define metrics into.
+	// Defaults to a local registry.
+	// If you want to disable metrics gathering, set "metrics.UseNilMetrics" to "true"
+	// prior to starting Sarama.
+	// See Examples on how to use the metrics registry
+	MetricRegistry metrics.Registry
+}
+
+// NewConfig returns a new configuration instance with sane defaults.
+func NewConfig() *Config {
+	c := &Config{}
+
+	c.Admin.Timeout = 3 * time.Second
+
+	c.Net.MaxOpenRequests = 5
+	c.Net.DialTimeout = 30 * time.Second
+	c.Net.ReadTimeout = 30 * time.Second
+	c.Net.WriteTimeout = 30 * time.Second
+	c.Net.SASL.Handshake = true
+
+	c.Metadata.Retry.Max = 3
+	c.Metadata.Retry.Backoff = 250 * time.Millisecond
+	c.Metadata.RefreshFrequency = 10 * time.Minute
+	c.Metadata.Full = true
+
+	c.Producer.MaxMessageBytes = 1000000
+	c.Producer.RequiredAcks = WaitForLocal
+	c.Producer.Timeout = 10 * time.Second
+	c.Producer.Partitioner = NewHashPartitioner
+	c.Producer.Retry.Max = 3
+	c.Producer.Retry.Backoff = 100 * time.Millisecond
+	c.Producer.Return.Errors = true
+	c.Producer.CompressionLevel = CompressionLevelDefault
+
+	c.Consumer.Fetch.Min = 1
+	c.Consumer.Fetch.Default = 1024 * 1024
+	c.Consumer.Retry.Backoff = 2 * time.Second
+	c.Consumer.MaxWaitTime = 250 * time.Millisecond
+	c.Consumer.MaxProcessingTime = 100 * time.Millisecond
+	c.Consumer.Return.Errors = false
+	c.Consumer.Offsets.CommitInterval = 1 * time.Second
+	c.Consumer.Offsets.Initial = OffsetNewest
+	c.Consumer.Offsets.Retry.Max = 3
+
+	c.Consumer.Group.Session.Timeout = 10 * time.Second
+	c.Consumer.Group.Heartbeat.Interval = 3 * time.Second
+	c.Consumer.Group.Rebalance.Strategy = BalanceStrategyRange
+	c.Consumer.Group.Rebalance.Timeout = 60 * time.Second
+	c.Consumer.Group.Rebalance.Retry.Max = 4
+	c.Consumer.Group.Rebalance.Retry.Backoff = 2 * time.Second
+
+	c.ClientID = defaultClientID
+	c.ChannelBufferSize = 256
+	c.Version = MinVersion
+	c.MetricRegistry = metrics.NewRegistry()
+
+	return c
+}
+
+// Validate checks a Config instance. It will return a
+// ConfigurationError if the specified values don't make sense.
+func (c *Config) Validate() error {
+	// some configuration values should be warned on but not fail completely, do those first
+	if c.Net.TLS.Enable == false && c.Net.TLS.Config != nil {
+		Logger.Println("Net.TLS is disabled but a non-nil configuration was provided.")
+	}
+	if c.Net.SASL.Enable == false {
+		if c.Net.SASL.User != "" {
+			Logger.Println("Net.SASL is disabled but a non-empty username was provided.")
+		}
+		if c.Net.SASL.Password != "" {
+			Logger.Println("Net.SASL is disabled but a non-empty password was provided.")
+		}
+	}
+	if c.Producer.RequiredAcks > 1 {
+		Logger.Println("Producer.RequiredAcks > 1 is deprecated and will raise an exception with kafka >= 0.8.2.0.")
+	}
+	if c.Producer.MaxMessageBytes >= int(MaxRequestSize) {
+		Logger.Println("Producer.MaxMessageBytes must be smaller than MaxRequestSize; it will be ignored.")
+	}
+	if c.Producer.Flush.Bytes >= int(MaxRequestSize) {
+		Logger.Println("Producer.Flush.Bytes must be smaller than MaxRequestSize; it will be ignored.")
+	}
+	if (c.Producer.Flush.Bytes > 0 || c.Producer.Flush.Messages > 0) && c.Producer.Flush.Frequency == 0 {
+		Logger.Println("Producer.Flush: Bytes or Messages are set, but Frequency is not; messages may not get flushed.")
+	}
+	if c.Producer.Timeout%time.Millisecond != 0 {
+		Logger.Println("Producer.Timeout only supports millisecond resolution; nanoseconds will be truncated.")
+	}
+	if c.Consumer.MaxWaitTime < 100*time.Millisecond {
+		Logger.Println("Consumer.MaxWaitTime is very low, which can cause high CPU and network usage. See documentation for details.")
+	}
+	if c.Consumer.MaxWaitTime%time.Millisecond != 0 {
+		Logger.Println("Consumer.MaxWaitTime only supports millisecond precision; nanoseconds will be truncated.")
+	}
+	if c.Consumer.Offsets.Retention%time.Millisecond != 0 {
+		Logger.Println("Consumer.Offsets.Retention only supports millisecond precision; nanoseconds will be truncated.")
+	}
+	if c.Consumer.Group.Session.Timeout%time.Millisecond != 0 {
+		Logger.Println("Consumer.Group.Session.Timeout only supports millisecond precision; nanoseconds will be truncated.")
+	}
+	if c.Consumer.Group.Heartbeat.Interval%time.Millisecond != 0 {
+		Logger.Println("Consumer.Group.Heartbeat.Interval only supports millisecond precision; nanoseconds will be truncated.")
+	}
+	if c.Consumer.Group.Rebalance.Timeout%time.Millisecond != 0 {
+		Logger.Println("Consumer.Group.Rebalance.Timeout only supports millisecond precision; nanoseconds will be truncated.")
+	}
+	if c.ClientID == defaultClientID {
+		Logger.Println("ClientID is the default of 'sarama', you should consider setting it to something application-specific.")
+	}
+
+	// validate Net values
+	switch {
+	case c.Net.MaxOpenRequests <= 0:
+		return ConfigurationError("Net.MaxOpenRequests must be > 0")
+	case c.Net.DialTimeout <= 0:
+		return ConfigurationError("Net.DialTimeout must be > 0")
+	case c.Net.ReadTimeout <= 0:
+		return ConfigurationError("Net.ReadTimeout must be > 0")
+	case c.Net.WriteTimeout <= 0:
+		return ConfigurationError("Net.WriteTimeout must be > 0")
+	case c.Net.KeepAlive < 0:
+		return ConfigurationError("Net.KeepAlive must be >= 0")
+	case c.Net.SASL.Enable == true && c.Net.SASL.User == "":
+		return ConfigurationError("Net.SASL.User must not be empty when SASL is enabled")
+	case c.Net.SASL.Enable == true && c.Net.SASL.Password == "":
+		return ConfigurationError("Net.SASL.Password must not be empty when SASL is enabled")
+	}
+
+	// validate the Admin values
+	switch {
+	case c.Admin.Timeout <= 0:
+		return ConfigurationError("Admin.Timeout must be > 0")
+	}
+
+	// validate the Metadata values
+	switch {
+	case c.Metadata.Retry.Max < 0:
+		return ConfigurationError("Metadata.Retry.Max must be >= 0")
+	case c.Metadata.Retry.Backoff < 0:
+		return ConfigurationError("Metadata.Retry.Backoff must be >= 0")
+	case c.Metadata.RefreshFrequency < 0:
+		return ConfigurationError("Metadata.RefreshFrequency must be >= 0")
+	}
+
+	// validate the Producer values
+	switch {
+	case c.Producer.MaxMessageBytes <= 0:
+		return ConfigurationError("Producer.MaxMessageBytes must be > 0")
+	case c.Producer.RequiredAcks < -1:
+		return ConfigurationError("Producer.RequiredAcks must be >= -1")
+	case c.Producer.Timeout <= 0:
+		return ConfigurationError("Producer.Timeout must be > 0")
+	case c.Producer.Partitioner == nil:
+		return ConfigurationError("Producer.Partitioner must not be nil")
+	case c.Producer.Flush.Bytes < 0:
+		return ConfigurationError("Producer.Flush.Bytes must be >= 0")
+	case c.Producer.Flush.Messages < 0:
+		return ConfigurationError("Producer.Flush.Messages must be >= 0")
+	case c.Producer.Flush.Frequency < 0:
+		return ConfigurationError("Producer.Flush.Frequency must be >= 0")
+	case c.Producer.Flush.MaxMessages < 0:
+		return ConfigurationError("Producer.Flush.MaxMessages must be >= 0")
+	case c.Producer.Flush.MaxMessages > 0 && c.Producer.Flush.MaxMessages < c.Producer.Flush.Messages:
+		return ConfigurationError("Producer.Flush.MaxMessages must be >= Producer.Flush.Messages when set")
+	case c.Producer.Retry.Max < 0:
+		return ConfigurationError("Producer.Retry.Max must be >= 0")
+	case c.Producer.Retry.Backoff < 0:
+		return ConfigurationError("Producer.Retry.Backoff must be >= 0")
+	}
+
+	if c.Producer.Compression == CompressionLZ4 && !c.Version.IsAtLeast(V0_10_0_0) {
+		return ConfigurationError("lz4 compression requires Version >= V0_10_0_0")
+	}
+
+	if c.Producer.Compression == CompressionGZIP {
+		if c.Producer.CompressionLevel != CompressionLevelDefault {
+			if _, err := gzip.NewWriterLevel(ioutil.Discard, c.Producer.CompressionLevel); err != nil {
+				return ConfigurationError(fmt.Sprintf("gzip compression does not work with level %d: %v", c.Producer.CompressionLevel, err))
+			}
+		}
+	}
+
+	if c.Producer.Idempotent {
+		if !c.Version.IsAtLeast(V0_11_0_0) {
+			return ConfigurationError("Idempotent producer requires Version >= V0_11_0_0")
+		}
+		if c.Producer.Retry.Max == 0 {
+			return ConfigurationError("Idempotent producer requires Producer.Retry.Max >= 1")
+		}
+		if c.Producer.RequiredAcks != WaitForAll {
+			return ConfigurationError("Idempotent producer requires Producer.RequiredAcks to be WaitForAll")
+		}
+		if c.Net.MaxOpenRequests > 1 {
+			return ConfigurationError("Idempotent producer requires Net.MaxOpenRequests to be 1")
+		}
+	}
+
+	// validate the Consumer values
+	switch {
+	case c.Consumer.Fetch.Min <= 0:
+		return ConfigurationError("Consumer.Fetch.Min must be > 0")
+	case c.Consumer.Fetch.Default <= 0:
+		return ConfigurationError("Consumer.Fetch.Default must be > 0")
+	case c.Consumer.Fetch.Max < 0:
+		return ConfigurationError("Consumer.Fetch.Max must be >= 0")
+	case c.Consumer.MaxWaitTime < 1*time.Millisecond:
+		return ConfigurationError("Consumer.MaxWaitTime must be >= 1ms")
+	case c.Consumer.MaxProcessingTime <= 0:
+		return ConfigurationError("Consumer.MaxProcessingTime must be > 0")
+	case c.Consumer.Retry.Backoff < 0:
+		return ConfigurationError("Consumer.Retry.Backoff must be >= 0")
+	case c.Consumer.Offsets.CommitInterval <= 0:
+		return ConfigurationError("Consumer.Offsets.CommitInterval must be > 0")
+	case c.Consumer.Offsets.Initial != OffsetOldest && c.Consumer.Offsets.Initial != OffsetNewest:
+		return ConfigurationError("Consumer.Offsets.Initial must be OffsetOldest or OffsetNewest")
+	case c.Consumer.Offsets.Retry.Max < 0:
+		return ConfigurationError("Consumer.Offsets.Retry.Max must be >= 0")
+	}
+
+	// validate the Consumer Group values
+	switch {
+	case c.Consumer.Group.Session.Timeout <= 2*time.Millisecond:
+		return ConfigurationError("Consumer.Group.Session.Timeout must be >= 2ms")
+	case c.Consumer.Group.Heartbeat.Interval < 1*time.Millisecond:
+		return ConfigurationError("Consumer.Group.Heartbeat.Interval must be >= 1ms")
+	case c.Consumer.Group.Heartbeat.Interval >= c.Consumer.Group.Session.Timeout:
+		return ConfigurationError("Consumer.Group.Heartbeat.Interval must be < Consumer.Group.Session.Timeout")
+	case c.Consumer.Group.Rebalance.Strategy == nil:
+		return ConfigurationError("Consumer.Group.Rebalance.Strategy must not be empty")
+	case c.Consumer.Group.Rebalance.Timeout <= time.Millisecond:
+		return ConfigurationError("Consumer.Group.Rebalance.Timeout must be >= 1ms")
+	case c.Consumer.Group.Rebalance.Retry.Max < 0:
+		return ConfigurationError("Consumer.Group.Rebalance.Retry.Max must be >= 0")
+	case c.Consumer.Group.Rebalance.Retry.Backoff < 0:
+		return ConfigurationError("Consumer.Group.Rebalance.Retry.Backoff must be >= 0")
+	}
+
+	// validate misc shared values
+	switch {
+	case c.ChannelBufferSize < 0:
+		return ConfigurationError("ChannelBufferSize must be >= 0")
+	case !validID.MatchString(c.ClientID):
+		return ConfigurationError("ClientID is invalid")
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/config_resource_type.go b/vendor/github.com/Shopify/sarama/config_resource_type.go
new file mode 100644
index 0000000..848cc9c
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/config_resource_type.go

@@ -0,0 +1,15 @@
+package sarama
+
+type ConfigResourceType int8
+
+// Taken from :
+// https://cwiki.apache.org/confluence/display/KAFKA/KIP-133%3A+Describe+and+Alter+Configs+Admin+APIs#KIP-133:DescribeandAlterConfigsAdminAPIs-WireFormattypes
+
+const (
+	UnknownResource ConfigResourceType = 0
+	AnyResource     ConfigResourceType = 1
+	TopicResource   ConfigResourceType = 2
+	GroupResource   ConfigResourceType = 3
+	ClusterResource ConfigResourceType = 4
+	BrokerResource  ConfigResourceType = 5
+)

diff --git a/vendor/github.com/Shopify/sarama/consumer.go b/vendor/github.com/Shopify/sarama/consumer.go
new file mode 100644
index 0000000..33d9d14
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/consumer.go

@@ -0,0 +1,807 @@
+package sarama
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+// ConsumerMessage encapsulates a Kafka message returned by the consumer.
+type ConsumerMessage struct {
+	Key, Value     []byte
+	Topic          string
+	Partition      int32
+	Offset         int64
+	Timestamp      time.Time       // only set if kafka is version 0.10+, inner message timestamp
+	BlockTimestamp time.Time       // only set if kafka is version 0.10+, outer (compressed) block timestamp
+	Headers        []*RecordHeader // only set if kafka is version 0.11+
+}
+
+// ConsumerError is what is provided to the user when an error occurs.
+// It wraps an error and includes the topic and partition.
+type ConsumerError struct {
+	Topic     string
+	Partition int32
+	Err       error
+}
+
+func (ce ConsumerError) Error() string {
+	return fmt.Sprintf("kafka: error while consuming %s/%d: %s", ce.Topic, ce.Partition, ce.Err)
+}
+
+// ConsumerErrors is a type that wraps a batch of errors and implements the Error interface.
+// It can be returned from the PartitionConsumer's Close methods to avoid the need to manually drain errors
+// when stopping.
+type ConsumerErrors []*ConsumerError
+
+func (ce ConsumerErrors) Error() string {
+	return fmt.Sprintf("kafka: %d errors while consuming", len(ce))
+}
+
+// Consumer manages PartitionConsumers which process Kafka messages from brokers. You MUST call Close()
+// on a consumer to avoid leaks, it will not be garbage-collected automatically when it passes out of
+// scope.
+//
+// Sarama's Consumer type does not currently support automatic consumer-group rebalancing and offset tracking.
+// For Zookeeper-based tracking (Kafka 0.8.2 and earlier), the https://github.com/wvanbergen/kafka library
+// builds on Sarama to add this support. For Kafka-based tracking (Kafka 0.9 and later), the
+// https://github.com/bsm/sarama-cluster library builds on Sarama to add this support.
+type Consumer interface {
+
+	// Topics returns the set of available topics as retrieved from the cluster
+	// metadata. This method is the same as Client.Topics(), and is provided for
+	// convenience.
+	Topics() ([]string, error)
+
+	// Partitions returns the sorted list of all partition IDs for the given topic.
+	// This method is the same as Client.Partitions(), and is provided for convenience.
+	Partitions(topic string) ([]int32, error)
+
+	// ConsumePartition creates a PartitionConsumer on the given topic/partition with
+	// the given offset. It will return an error if this Consumer is already consuming
+	// on the given topic/partition. Offset can be a literal offset, or OffsetNewest
+	// or OffsetOldest
+	ConsumePartition(topic string, partition int32, offset int64) (PartitionConsumer, error)
+
+	// HighWaterMarks returns the current high water marks for each topic and partition.
+	// Consistency between partitions is not guaranteed since high water marks are updated separately.
+	HighWaterMarks() map[string]map[int32]int64
+
+	// Close shuts down the consumer. It must be called after all child
+	// PartitionConsumers have already been closed.
+	Close() error
+}
+
+type consumer struct {
+	client    Client
+	conf      *Config
+	ownClient bool
+
+	lock            sync.Mutex
+	children        map[string]map[int32]*partitionConsumer
+	brokerConsumers map[*Broker]*brokerConsumer
+}
+
+// NewConsumer creates a new consumer using the given broker addresses and configuration.
+func NewConsumer(addrs []string, config *Config) (Consumer, error) {
+	client, err := NewClient(addrs, config)
+	if err != nil {
+		return nil, err
+	}
+
+	c, err := NewConsumerFromClient(client)
+	if err != nil {
+		return nil, err
+	}
+	c.(*consumer).ownClient = true
+	return c, nil
+}
+
+// NewConsumerFromClient creates a new consumer using the given client. It is still
+// necessary to call Close() on the underlying client when shutting down this consumer.
+func NewConsumerFromClient(client Client) (Consumer, error) {
+	// Check that we are not dealing with a closed Client before processing any other arguments
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	c := &consumer{
+		client:          client,
+		conf:            client.Config(),
+		children:        make(map[string]map[int32]*partitionConsumer),
+		brokerConsumers: make(map[*Broker]*brokerConsumer),
+	}
+
+	return c, nil
+}
+
+func (c *consumer) Close() error {
+	if c.ownClient {
+		return c.client.Close()
+	}
+	return nil
+}
+
+func (c *consumer) Topics() ([]string, error) {
+	return c.client.Topics()
+}
+
+func (c *consumer) Partitions(topic string) ([]int32, error) {
+	return c.client.Partitions(topic)
+}
+
+func (c *consumer) ConsumePartition(topic string, partition int32, offset int64) (PartitionConsumer, error) {
+	child := &partitionConsumer{
+		consumer:  c,
+		conf:      c.conf,
+		topic:     topic,
+		partition: partition,
+		messages:  make(chan *ConsumerMessage, c.conf.ChannelBufferSize),
+		errors:    make(chan *ConsumerError, c.conf.ChannelBufferSize),
+		feeder:    make(chan *FetchResponse, 1),
+		trigger:   make(chan none, 1),
+		dying:     make(chan none),
+		fetchSize: c.conf.Consumer.Fetch.Default,
+	}
+
+	if err := child.chooseStartingOffset(offset); err != nil {
+		return nil, err
+	}
+
+	var leader *Broker
+	var err error
+	if leader, err = c.client.Leader(child.topic, child.partition); err != nil {
+		return nil, err
+	}
+
+	if err := c.addChild(child); err != nil {
+		return nil, err
+	}
+
+	go withRecover(child.dispatcher)
+	go withRecover(child.responseFeeder)
+
+	child.broker = c.refBrokerConsumer(leader)
+	child.broker.input <- child
+
+	return child, nil
+}
+
+func (c *consumer) HighWaterMarks() map[string]map[int32]int64 {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	hwms := make(map[string]map[int32]int64)
+	for topic, p := range c.children {
+		hwm := make(map[int32]int64, len(p))
+		for partition, pc := range p {
+			hwm[partition] = pc.HighWaterMarkOffset()
+		}
+		hwms[topic] = hwm
+	}
+
+	return hwms
+}
+
+func (c *consumer) addChild(child *partitionConsumer) error {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	topicChildren := c.children[child.topic]
+	if topicChildren == nil {
+		topicChildren = make(map[int32]*partitionConsumer)
+		c.children[child.topic] = topicChildren
+	}
+
+	if topicChildren[child.partition] != nil {
+		return ConfigurationError("That topic/partition is already being consumed")
+	}
+
+	topicChildren[child.partition] = child
+	return nil
+}
+
+func (c *consumer) removeChild(child *partitionConsumer) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	delete(c.children[child.topic], child.partition)
+}
+
+func (c *consumer) refBrokerConsumer(broker *Broker) *brokerConsumer {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	bc := c.brokerConsumers[broker]
+	if bc == nil {
+		bc = c.newBrokerConsumer(broker)
+		c.brokerConsumers[broker] = bc
+	}
+
+	bc.refs++
+
+	return bc
+}
+
+func (c *consumer) unrefBrokerConsumer(brokerWorker *brokerConsumer) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	brokerWorker.refs--
+
+	if brokerWorker.refs == 0 {
+		close(brokerWorker.input)
+		if c.brokerConsumers[brokerWorker.broker] == brokerWorker {
+			delete(c.brokerConsumers, brokerWorker.broker)
+		}
+	}
+}
+
+func (c *consumer) abandonBrokerConsumer(brokerWorker *brokerConsumer) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	delete(c.brokerConsumers, brokerWorker.broker)
+}
+
+// PartitionConsumer
+
+// PartitionConsumer processes Kafka messages from a given topic and partition. You MUST call one of Close() or
+// AsyncClose() on a PartitionConsumer to avoid leaks; it will not be garbage-collected automatically when it passes out
+// of scope.
+//
+// The simplest way of using a PartitionConsumer is to loop over its Messages channel using a for/range
+// loop. The PartitionConsumer will only stop itself in one case: when the offset being consumed is reported
+// as out of range by the brokers. In this case you should decide what you want to do (try a different offset,
+// notify a human, etc) and handle it appropriately. For all other error cases, it will just keep retrying.
+// By default, it logs these errors to sarama.Logger; if you want to be notified directly of all errors, set
+// your config's Consumer.Return.Errors to true and read from the Errors channel, using a select statement
+// or a separate goroutine. Check out the Consumer examples to see implementations of these different approaches.
+//
+// To terminate such a for/range loop while the loop is executing, call AsyncClose. This will kick off the process of
+// consumer tear-down & return imediately. Continue to loop, servicing the Messages channel until the teardown process
+// AsyncClose initiated closes it (thus terminating the for/range loop). If you've already ceased reading Messages, call
+// Close; this will signal the PartitionConsumer's goroutines to begin shutting down (just like AsyncClose), but will
+// also drain the Messages channel, harvest all errors & return them once cleanup has completed.
+type PartitionConsumer interface {
+
+	// AsyncClose initiates a shutdown of the PartitionConsumer. This method will return immediately, after which you
+	// should continue to service the 'Messages' and 'Errors' channels until they are empty. It is required to call this
+	// function, or Close before a consumer object passes out of scope, as it will otherwise leak memory. You must call
+	// this before calling Close on the underlying client.
+	AsyncClose()
+
+	// Close stops the PartitionConsumer from fetching messages. It will initiate a shutdown just like AsyncClose, drain
+	// the Messages channel, harvest any errors & return them to the caller. Note that if you are continuing to service
+	// the Messages channel when this function is called, you will be competing with Close for messages; consider
+	// calling AsyncClose, instead. It is required to call this function (or AsyncClose) before a consumer object passes
+	// out of scope, as it will otherwise leak memory. You must call this before calling Close on the underlying client.
+	Close() error
+
+	// Messages returns the read channel for the messages that are returned by
+	// the broker.
+	Messages() <-chan *ConsumerMessage
+
+	// Errors returns a read channel of errors that occurred during consuming, if
+	// enabled. By default, errors are logged and not returned over this channel.
+	// If you want to implement any custom error handling, set your config's
+	// Consumer.Return.Errors setting to true, and read from this channel.
+	Errors() <-chan *ConsumerError
+
+	// HighWaterMarkOffset returns the high water mark offset of the partition,
+	// i.e. the offset that will be used for the next message that will be produced.
+	// You can use this to determine how far behind the processing is.
+	HighWaterMarkOffset() int64
+}
+
+type partitionConsumer struct {
+	highWaterMarkOffset int64 // must be at the top of the struct because https://golang.org/pkg/sync/atomic/#pkg-note-BUG
+	consumer            *consumer
+	conf                *Config
+	topic               string
+	partition           int32
+
+	broker   *brokerConsumer
+	messages chan *ConsumerMessage
+	errors   chan *ConsumerError
+	feeder   chan *FetchResponse
+
+	trigger, dying chan none
+	responseResult error
+	closeOnce      sync.Once
+
+	fetchSize int32
+	offset    int64
+}
+
+var errTimedOut = errors.New("timed out feeding messages to the user") // not user-facing
+
+func (child *partitionConsumer) sendError(err error) {
+	cErr := &ConsumerError{
+		Topic:     child.topic,
+		Partition: child.partition,
+		Err:       err,
+	}
+
+	if child.conf.Consumer.Return.Errors {
+		child.errors <- cErr
+	} else {
+		Logger.Println(cErr)
+	}
+}
+
+func (child *partitionConsumer) dispatcher() {
+	for range child.trigger {
+		select {
+		case <-child.dying:
+			close(child.trigger)
+		case <-time.After(child.conf.Consumer.Retry.Backoff):
+			if child.broker != nil {
+				child.consumer.unrefBrokerConsumer(child.broker)
+				child.broker = nil
+			}
+
+			Logger.Printf("consumer/%s/%d finding new broker\n", child.topic, child.partition)
+			if err := child.dispatch(); err != nil {
+				child.sendError(err)
+				child.trigger <- none{}
+			}
+		}
+	}
+
+	if child.broker != nil {
+		child.consumer.unrefBrokerConsumer(child.broker)
+	}
+	child.consumer.removeChild(child)
+	close(child.feeder)
+}
+
+func (child *partitionConsumer) dispatch() error {
+	if err := child.consumer.client.RefreshMetadata(child.topic); err != nil {
+		return err
+	}
+
+	var leader *Broker
+	var err error
+	if leader, err = child.consumer.client.Leader(child.topic, child.partition); err != nil {
+		return err
+	}
+
+	child.broker = child.consumer.refBrokerConsumer(leader)
+
+	child.broker.input <- child
+
+	return nil
+}
+
+func (child *partitionConsumer) chooseStartingOffset(offset int64) error {
+	newestOffset, err := child.consumer.client.GetOffset(child.topic, child.partition, OffsetNewest)
+	if err != nil {
+		return err
+	}
+	oldestOffset, err := child.consumer.client.GetOffset(child.topic, child.partition, OffsetOldest)
+	if err != nil {
+		return err
+	}
+
+	switch {
+	case offset == OffsetNewest:
+		child.offset = newestOffset
+	case offset == OffsetOldest:
+		child.offset = oldestOffset
+	case offset >= oldestOffset && offset <= newestOffset:
+		child.offset = offset
+	default:
+		return ErrOffsetOutOfRange
+	}
+
+	return nil
+}
+
+func (child *partitionConsumer) Messages() <-chan *ConsumerMessage {
+	return child.messages
+}
+
+func (child *partitionConsumer) Errors() <-chan *ConsumerError {
+	return child.errors
+}
+
+func (child *partitionConsumer) AsyncClose() {
+	// this triggers whatever broker owns this child to abandon it and close its trigger channel, which causes
+	// the dispatcher to exit its loop, which removes it from the consumer then closes its 'messages' and
+	// 'errors' channel (alternatively, if the child is already at the dispatcher for some reason, that will
+	// also just close itself)
+	child.closeOnce.Do(func() {
+		close(child.dying)
+	})
+}
+
+func (child *partitionConsumer) Close() error {
+	child.AsyncClose()
+
+	go withRecover(func() {
+		for range child.messages {
+			// drain
+		}
+	})
+
+	var errors ConsumerErrors
+	for err := range child.errors {
+		errors = append(errors, err)
+	}
+
+	if len(errors) > 0 {
+		return errors
+	}
+	return nil
+}
+
+func (child *partitionConsumer) HighWaterMarkOffset() int64 {
+	return atomic.LoadInt64(&child.highWaterMarkOffset)
+}
+
+func (child *partitionConsumer) responseFeeder() {
+	var msgs []*ConsumerMessage
+	expiryTicker := time.NewTicker(child.conf.Consumer.MaxProcessingTime)
+	firstAttempt := true
+
+feederLoop:
+	for response := range child.feeder {
+		msgs, child.responseResult = child.parseResponse(response)
+
+		for i, msg := range msgs {
+		messageSelect:
+			select {
+			case child.messages <- msg:
+				firstAttempt = true
+			case <-expiryTicker.C:
+				if !firstAttempt {
+					child.responseResult = errTimedOut
+					child.broker.acks.Done()
+					for _, msg = range msgs[i:] {
+						child.messages <- msg
+					}
+					child.broker.input <- child
+					continue feederLoop
+				} else {
+					// current message has not been sent, return to select
+					// statement
+					firstAttempt = false
+					goto messageSelect
+				}
+			}
+		}
+
+		child.broker.acks.Done()
+	}
+
+	expiryTicker.Stop()
+	close(child.messages)
+	close(child.errors)
+}
+
+func (child *partitionConsumer) parseMessages(msgSet *MessageSet) ([]*ConsumerMessage, error) {
+	var messages []*ConsumerMessage
+	for _, msgBlock := range msgSet.Messages {
+		for _, msg := range msgBlock.Messages() {
+			offset := msg.Offset
+			if msg.Msg.Version >= 1 {
+				baseOffset := msgBlock.Offset - msgBlock.Messages()[len(msgBlock.Messages())-1].Offset
+				offset += baseOffset
+			}
+			if offset < child.offset {
+				continue
+			}
+			messages = append(messages, &ConsumerMessage{
+				Topic:          child.topic,
+				Partition:      child.partition,
+				Key:            msg.Msg.Key,
+				Value:          msg.Msg.Value,
+				Offset:         offset,
+				Timestamp:      msg.Msg.Timestamp,
+				BlockTimestamp: msgBlock.Msg.Timestamp,
+			})
+			child.offset = offset + 1
+		}
+	}
+	if len(messages) == 0 {
+		return nil, ErrIncompleteResponse
+	}
+	return messages, nil
+}
+
+func (child *partitionConsumer) parseRecords(batch *RecordBatch) ([]*ConsumerMessage, error) {
+	var messages []*ConsumerMessage
+	for _, rec := range batch.Records {
+		offset := batch.FirstOffset + rec.OffsetDelta
+		if offset < child.offset {
+			continue
+		}
+		messages = append(messages, &ConsumerMessage{
+			Topic:     child.topic,
+			Partition: child.partition,
+			Key:       rec.Key,
+			Value:     rec.Value,
+			Offset:    offset,
+			Timestamp: batch.FirstTimestamp.Add(rec.TimestampDelta),
+			Headers:   rec.Headers,
+		})
+		child.offset = offset + 1
+	}
+	if len(messages) == 0 {
+		child.offset += 1
+	}
+	return messages, nil
+}
+
+func (child *partitionConsumer) parseResponse(response *FetchResponse) ([]*ConsumerMessage, error) {
+	block := response.GetBlock(child.topic, child.partition)
+	if block == nil {
+		return nil, ErrIncompleteResponse
+	}
+
+	if block.Err != ErrNoError {
+		return nil, block.Err
+	}
+
+	nRecs, err := block.numRecords()
+	if err != nil {
+		return nil, err
+	}
+	if nRecs == 0 {
+		partialTrailingMessage, err := block.isPartial()
+		if err != nil {
+			return nil, err
+		}
+		// We got no messages. If we got a trailing one then we need to ask for more data.
+		// Otherwise we just poll again and wait for one to be produced...
+		if partialTrailingMessage {
+			if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize == child.conf.Consumer.Fetch.Max {
+				// we can't ask for more data, we've hit the configured limit
+				child.sendError(ErrMessageTooLarge)
+				child.offset++ // skip this one so we can keep processing future messages
+			} else {
+				child.fetchSize *= 2
+				if child.conf.Consumer.Fetch.Max > 0 && child.fetchSize > child.conf.Consumer.Fetch.Max {
+					child.fetchSize = child.conf.Consumer.Fetch.Max
+				}
+			}
+		}
+
+		return nil, nil
+	}
+
+	// we got messages, reset our fetch size in case it was increased for a previous request
+	child.fetchSize = child.conf.Consumer.Fetch.Default
+	atomic.StoreInt64(&child.highWaterMarkOffset, block.HighWaterMarkOffset)
+
+	messages := []*ConsumerMessage{}
+	for _, records := range block.RecordsSet {
+		switch records.recordsType {
+		case legacyRecords:
+			messageSetMessages, err := child.parseMessages(records.MsgSet)
+			if err != nil {
+				return nil, err
+			}
+
+			messages = append(messages, messageSetMessages...)
+		case defaultRecords:
+			recordBatchMessages, err := child.parseRecords(records.RecordBatch)
+			if err != nil {
+				return nil, err
+			}
+			if control, err := records.isControl(); err != nil || control {
+				continue
+			}
+
+			messages = append(messages, recordBatchMessages...)
+		default:
+			return nil, fmt.Errorf("unknown records type: %v", records.recordsType)
+		}
+	}
+
+	return messages, nil
+}
+
+// brokerConsumer
+
+type brokerConsumer struct {
+	consumer         *consumer
+	broker           *Broker
+	input            chan *partitionConsumer
+	newSubscriptions chan []*partitionConsumer
+	wait             chan none
+	subscriptions    map[*partitionConsumer]none
+	acks             sync.WaitGroup
+	refs             int
+}
+
+func (c *consumer) newBrokerConsumer(broker *Broker) *brokerConsumer {
+	bc := &brokerConsumer{
+		consumer:         c,
+		broker:           broker,
+		input:            make(chan *partitionConsumer),
+		newSubscriptions: make(chan []*partitionConsumer),
+		wait:             make(chan none),
+		subscriptions:    make(map[*partitionConsumer]none),
+		refs:             0,
+	}
+
+	go withRecover(bc.subscriptionManager)
+	go withRecover(bc.subscriptionConsumer)
+
+	return bc
+}
+
+func (bc *brokerConsumer) subscriptionManager() {
+	var buffer []*partitionConsumer
+
+	// The subscriptionManager constantly accepts new subscriptions on `input` (even when the main subscriptionConsumer
+	// goroutine is in the middle of a network request) and batches it up. The main worker goroutine picks
+	// up a batch of new subscriptions between every network request by reading from `newSubscriptions`, so we give
+	// it nil if no new subscriptions are available. We also write to `wait` only when new subscriptions is available,
+	// so the main goroutine can block waiting for work if it has none.
+	for {
+		if len(buffer) > 0 {
+			select {
+			case event, ok := <-bc.input:
+				if !ok {
+					goto done
+				}
+				buffer = append(buffer, event)
+			case bc.newSubscriptions <- buffer:
+				buffer = nil
+			case bc.wait <- none{}:
+			}
+		} else {
+			select {
+			case event, ok := <-bc.input:
+				if !ok {
+					goto done
+				}
+				buffer = append(buffer, event)
+			case bc.newSubscriptions <- nil:
+			}
+		}
+	}
+
+done:
+	close(bc.wait)
+	if len(buffer) > 0 {
+		bc.newSubscriptions <- buffer
+	}
+	close(bc.newSubscriptions)
+}
+
+func (bc *brokerConsumer) subscriptionConsumer() {
+	<-bc.wait // wait for our first piece of work
+
+	// the subscriptionConsumer ensures we will get nil right away if no new subscriptions is available
+	for newSubscriptions := range bc.newSubscriptions {
+		bc.updateSubscriptions(newSubscriptions)
+
+		if len(bc.subscriptions) == 0 {
+			// We're about to be shut down or we're about to receive more subscriptions.
+			// Either way, the signal just hasn't propagated to our goroutine yet.
+			<-bc.wait
+			continue
+		}
+
+		response, err := bc.fetchNewMessages()
+
+		if err != nil {
+			Logger.Printf("consumer/broker/%d disconnecting due to error processing FetchRequest: %s\n", bc.broker.ID(), err)
+			bc.abort(err)
+			return
+		}
+
+		bc.acks.Add(len(bc.subscriptions))
+		for child := range bc.subscriptions {
+			child.feeder <- response
+		}
+		bc.acks.Wait()
+		bc.handleResponses()
+	}
+}
+
+func (bc *brokerConsumer) updateSubscriptions(newSubscriptions []*partitionConsumer) {
+	for _, child := range newSubscriptions {
+		bc.subscriptions[child] = none{}
+		Logger.Printf("consumer/broker/%d added subscription to %s/%d\n", bc.broker.ID(), child.topic, child.partition)
+	}
+
+	for child := range bc.subscriptions {
+		select {
+		case <-child.dying:
+			Logger.Printf("consumer/broker/%d closed dead subscription to %s/%d\n", bc.broker.ID(), child.topic, child.partition)
+			close(child.trigger)
+			delete(bc.subscriptions, child)
+		default:
+			break
+		}
+	}
+}
+
+func (bc *brokerConsumer) handleResponses() {
+	// handles the response codes left for us by our subscriptions, and abandons ones that have been closed
+	for child := range bc.subscriptions {
+		result := child.responseResult
+		child.responseResult = nil
+
+		switch result {
+		case nil:
+			break
+		case errTimedOut:
+			Logger.Printf("consumer/broker/%d abandoned subscription to %s/%d because consuming was taking too long\n",
+				bc.broker.ID(), child.topic, child.partition)
+			delete(bc.subscriptions, child)
+		case ErrOffsetOutOfRange:
+			// there's no point in retrying this it will just fail the same way again
+			// shut it down and force the user to choose what to do
+			child.sendError(result)
+			Logger.Printf("consumer/%s/%d shutting down because %s\n", child.topic, child.partition, result)
+			close(child.trigger)
+			delete(bc.subscriptions, child)
+		case ErrUnknownTopicOrPartition, ErrNotLeaderForPartition, ErrLeaderNotAvailable, ErrReplicaNotAvailable:
+			// not an error, but does need redispatching
+			Logger.Printf("consumer/broker/%d abandoned subscription to %s/%d because %s\n",
+				bc.broker.ID(), child.topic, child.partition, result)
+			child.trigger <- none{}
+			delete(bc.subscriptions, child)
+		default:
+			// dunno, tell the user and try redispatching
+			child.sendError(result)
+			Logger.Printf("consumer/broker/%d abandoned subscription to %s/%d because %s\n",
+				bc.broker.ID(), child.topic, child.partition, result)
+			child.trigger <- none{}
+			delete(bc.subscriptions, child)
+		}
+	}
+}
+
+func (bc *brokerConsumer) abort(err error) {
+	bc.consumer.abandonBrokerConsumer(bc)
+	_ = bc.broker.Close() // we don't care about the error this might return, we already have one
+
+	for child := range bc.subscriptions {
+		child.sendError(err)
+		child.trigger <- none{}
+	}
+
+	for newSubscriptions := range bc.newSubscriptions {
+		if len(newSubscriptions) == 0 {
+			<-bc.wait
+			continue
+		}
+		for _, child := range newSubscriptions {
+			child.sendError(err)
+			child.trigger <- none{}
+		}
+	}
+}
+
+func (bc *brokerConsumer) fetchNewMessages() (*FetchResponse, error) {
+	request := &FetchRequest{
+		MinBytes:    bc.consumer.conf.Consumer.Fetch.Min,
+		MaxWaitTime: int32(bc.consumer.conf.Consumer.MaxWaitTime / time.Millisecond),
+	}
+	if bc.consumer.conf.Version.IsAtLeast(V0_10_0_0) {
+		request.Version = 2
+	}
+	if bc.consumer.conf.Version.IsAtLeast(V0_10_1_0) {
+		request.Version = 3
+		request.MaxBytes = MaxResponseSize
+	}
+	if bc.consumer.conf.Version.IsAtLeast(V0_11_0_0) {
+		request.Version = 4
+		request.Isolation = ReadUncommitted // We don't support yet transactions.
+	}
+
+	for child := range bc.subscriptions {
+		request.AddBlock(child.topic, child.partition, child.offset, child.fetchSize)
+	}
+
+	return bc.broker.Fetch(request)
+}

diff --git a/vendor/github.com/Shopify/sarama/consumer_group.go b/vendor/github.com/Shopify/sarama/consumer_group.go
new file mode 100644
index 0000000..bb6a2c2
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/consumer_group.go

@@ -0,0 +1,774 @@
+package sarama
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sort"
+	"sync"
+	"time"
+)
+
+// ErrClosedConsumerGroup is the error returned when a method is called on a consumer group that has been closed.
+var ErrClosedConsumerGroup = errors.New("kafka: tried to use a consumer group that was closed")
+
+// ConsumerGroup is responsible for dividing up processing of topics and partitions
+// over a collection of processes (the members of the consumer group).
+type ConsumerGroup interface {
+	// Consume joins a cluster of consumers for a given list of topics and
+	// starts a blocking ConsumerGroupSession through the ConsumerGroupHandler.
+	//
+	// The life-cycle of a session is represented by the following steps:
+	//
+	// 1. The consumers join the group (as explained in https://kafka.apache.org/documentation/#intro_consumers)
+	//    and is assigned their "fair share" of partitions, aka 'claims'.
+	// 2. Before processing starts, the handler's Setup() hook is called to notify the user
+	//    of the claims and allow any necessary preparation or alteration of state.
+	// 3. For each of the assigned claims the handler's ConsumeClaim() function is then called
+	//    in a separate goroutine which requires it to be thread-safe. Any state must be carefully protected
+	//    from concurrent reads/writes.
+	// 4. The session will persist until one of the ConsumeClaim() functions exits. This can be either when the
+	//    parent context is cancelled or when a server-side rebalance cycle is initiated.
+	// 5. Once all the ConsumeClaim() loops have exited, the handler's Cleanup() hook is called
+	//    to allow the user to perform any final tasks before a rebalance.
+	// 6. Finally, marked offsets are committed one last time before claims are released.
+	//
+	// Please note, that once a rebalance is triggered, sessions must be completed within
+	// Config.Consumer.Group.Rebalance.Timeout. This means that ConsumeClaim() functions must exit
+	// as quickly as possible to allow time for Cleanup() and the final offset commit. If the timeout
+	// is exceeded, the consumer will be removed from the group by Kafka, which will cause offset
+	// commit failures.
+	Consume(ctx context.Context, topics []string, handler ConsumerGroupHandler) error
+
+	// Errors returns a read channel of errors that occurred during the consumer life-cycle.
+	// By default, errors are logged and not returned over this channel.
+	// If you want to implement any custom error handling, set your config's
+	// Consumer.Return.Errors setting to true, and read from this channel.
+	Errors() <-chan error
+
+	// Close stops the ConsumerGroup and detaches any running sessions. It is required to call
+	// this function before the object passes out of scope, as it will otherwise leak memory.
+	Close() error
+}
+
+type consumerGroup struct {
+	client    Client
+	ownClient bool
+
+	config   *Config
+	consumer Consumer
+	groupID  string
+	memberID string
+	errors   chan error
+
+	lock      sync.Mutex
+	closed    chan none
+	closeOnce sync.Once
+}
+
+// NewConsumerGroup creates a new consumer group the given broker addresses and configuration.
+func NewConsumerGroup(addrs []string, groupID string, config *Config) (ConsumerGroup, error) {
+	client, err := NewClient(addrs, config)
+	if err != nil {
+		return nil, err
+	}
+
+	c, err := NewConsumerGroupFromClient(groupID, client)
+	if err != nil {
+		_ = client.Close()
+		return nil, err
+	}
+
+	c.(*consumerGroup).ownClient = true
+	return c, nil
+}
+
+// NewConsumerGroupFromClient creates a new consumer group using the given client. It is still
+// necessary to call Close() on the underlying client when shutting down this consumer.
+// PLEASE NOTE: consumer groups can only re-use but not share clients.
+func NewConsumerGroupFromClient(groupID string, client Client) (ConsumerGroup, error) {
+	config := client.Config()
+	if !config.Version.IsAtLeast(V0_10_2_0) {
+		return nil, ConfigurationError("consumer groups require Version to be >= V0_10_2_0")
+	}
+
+	consumer, err := NewConsumerFromClient(client)
+	if err != nil {
+		return nil, err
+	}
+
+	return &consumerGroup{
+		client:   client,
+		consumer: consumer,
+		config:   config,
+		groupID:  groupID,
+		errors:   make(chan error, config.ChannelBufferSize),
+		closed:   make(chan none),
+	}, nil
+}
+
+// Errors implements ConsumerGroup.
+func (c *consumerGroup) Errors() <-chan error { return c.errors }
+
+// Close implements ConsumerGroup.
+func (c *consumerGroup) Close() (err error) {
+	c.closeOnce.Do(func() {
+		close(c.closed)
+
+		c.lock.Lock()
+		defer c.lock.Unlock()
+
+		// leave group
+		if e := c.leave(); e != nil {
+			err = e
+		}
+
+		// drain errors
+		go func() {
+			close(c.errors)
+		}()
+		for e := range c.errors {
+			err = e
+		}
+
+		if c.ownClient {
+			if e := c.client.Close(); e != nil {
+				err = e
+			}
+		}
+	})
+	return
+}
+
+// Consume implements ConsumerGroup.
+func (c *consumerGroup) Consume(ctx context.Context, topics []string, handler ConsumerGroupHandler) error {
+	// Ensure group is not closed
+	select {
+	case <-c.closed:
+		return ErrClosedConsumerGroup
+	default:
+	}
+
+	c.lock.Lock()
+	defer c.lock.Unlock()
+
+	// Quick exit when no topics are provided
+	if len(topics) == 0 {
+		return fmt.Errorf("no topics provided")
+	}
+
+	// Refresh metadata for requested topics
+	if err := c.client.RefreshMetadata(topics...); err != nil {
+		return err
+	}
+
+	// Get coordinator
+	coordinator, err := c.client.Coordinator(c.groupID)
+	if err != nil {
+		return err
+	}
+
+	// Init session
+	sess, err := c.newSession(ctx, coordinator, topics, handler, c.config.Consumer.Group.Rebalance.Retry.Max)
+	if err == ErrClosedClient {
+		return ErrClosedConsumerGroup
+	} else if err != nil {
+		return err
+	}
+
+	// Wait for session exit signal
+	<-sess.ctx.Done()
+
+	// Gracefully release session claims
+	return sess.release(true)
+}
+
+func (c *consumerGroup) newSession(ctx context.Context, coordinator *Broker, topics []string, handler ConsumerGroupHandler, retries int) (*consumerGroupSession, error) {
+	// Join consumer group
+	join, err := c.joinGroupRequest(coordinator, topics)
+	if err != nil {
+		_ = coordinator.Close()
+		return nil, err
+	}
+	switch join.Err {
+	case ErrNoError:
+		c.memberID = join.MemberId
+	case ErrUnknownMemberId, ErrIllegalGeneration: // reset member ID and retry immediately
+		c.memberID = ""
+		return c.newSession(ctx, coordinator, topics, handler, retries)
+	case ErrRebalanceInProgress: // retry after backoff
+		if retries <= 0 {
+			return nil, join.Err
+		}
+
+		select {
+		case <-c.closed:
+			return nil, ErrClosedConsumerGroup
+		case <-time.After(c.config.Consumer.Group.Rebalance.Retry.Backoff):
+		}
+
+		return c.newSession(ctx, coordinator, topics, handler, retries-1)
+	default:
+		return nil, join.Err
+	}
+
+	// Prepare distribution plan if we joined as the leader
+	var plan BalanceStrategyPlan
+	if join.LeaderId == join.MemberId {
+		members, err := join.GetMembers()
+		if err != nil {
+			return nil, err
+		}
+
+		plan, err = c.balance(members)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	// Sync consumer group
+	sync, err := c.syncGroupRequest(coordinator, plan, join.GenerationId)
+	if err != nil {
+		_ = coordinator.Close()
+		return nil, err
+	}
+	switch sync.Err {
+	case ErrNoError:
+	case ErrUnknownMemberId, ErrIllegalGeneration: // reset member ID and retry immediately
+		c.memberID = ""
+		return c.newSession(ctx, coordinator, topics, handler, retries)
+	case ErrRebalanceInProgress: // retry after backoff
+		if retries <= 0 {
+			return nil, sync.Err
+		}
+
+		select {
+		case <-c.closed:
+			return nil, ErrClosedConsumerGroup
+		case <-time.After(c.config.Consumer.Group.Rebalance.Retry.Backoff):
+		}
+
+		return c.newSession(ctx, coordinator, topics, handler, retries-1)
+	default:
+		return nil, sync.Err
+	}
+
+	// Retrieve and sort claims
+	var claims map[string][]int32
+	if len(sync.MemberAssignment) > 0 {
+		members, err := sync.GetMemberAssignment()
+		if err != nil {
+			return nil, err
+		}
+		claims = members.Topics
+
+		for _, partitions := range claims {
+			sort.Sort(int32Slice(partitions))
+		}
+	}
+
+	return newConsumerGroupSession(ctx, c, claims, join.MemberId, join.GenerationId, handler)
+}
+
+func (c *consumerGroup) joinGroupRequest(coordinator *Broker, topics []string) (*JoinGroupResponse, error) {
+	req := &JoinGroupRequest{
+		GroupId:        c.groupID,
+		MemberId:       c.memberID,
+		SessionTimeout: int32(c.config.Consumer.Group.Session.Timeout / time.Millisecond),
+		ProtocolType:   "consumer",
+	}
+	if c.config.Version.IsAtLeast(V0_10_1_0) {
+		req.Version = 1
+		req.RebalanceTimeout = int32(c.config.Consumer.Group.Rebalance.Timeout / time.Millisecond)
+	}
+
+	meta := &ConsumerGroupMemberMetadata{
+		Topics:   topics,
+		UserData: c.config.Consumer.Group.Member.UserData,
+	}
+	strategy := c.config.Consumer.Group.Rebalance.Strategy
+	if err := req.AddGroupProtocolMetadata(strategy.Name(), meta); err != nil {
+		return nil, err
+	}
+
+	return coordinator.JoinGroup(req)
+}
+
+func (c *consumerGroup) syncGroupRequest(coordinator *Broker, plan BalanceStrategyPlan, generationID int32) (*SyncGroupResponse, error) {
+	req := &SyncGroupRequest{
+		GroupId:      c.groupID,
+		MemberId:     c.memberID,
+		GenerationId: generationID,
+	}
+	for memberID, topics := range plan {
+		err := req.AddGroupAssignmentMember(memberID, &ConsumerGroupMemberAssignment{
+			Topics: topics,
+		})
+		if err != nil {
+			return nil, err
+		}
+	}
+	return coordinator.SyncGroup(req)
+}
+
+func (c *consumerGroup) heartbeatRequest(coordinator *Broker, memberID string, generationID int32) (*HeartbeatResponse, error) {
+	req := &HeartbeatRequest{
+		GroupId:      c.groupID,
+		MemberId:     memberID,
+		GenerationId: generationID,
+	}
+
+	return coordinator.Heartbeat(req)
+}
+
+func (c *consumerGroup) balance(members map[string]ConsumerGroupMemberMetadata) (BalanceStrategyPlan, error) {
+	topics := make(map[string][]int32)
+	for _, meta := range members {
+		for _, topic := range meta.Topics {
+			topics[topic] = nil
+		}
+	}
+
+	for topic := range topics {
+		partitions, err := c.client.Partitions(topic)
+		if err != nil {
+			return nil, err
+		}
+		topics[topic] = partitions
+	}
+
+	strategy := c.config.Consumer.Group.Rebalance.Strategy
+	return strategy.Plan(members, topics)
+}
+
+// Leaves the cluster, called by Close, protected by lock.
+func (c *consumerGroup) leave() error {
+	if c.memberID == "" {
+		return nil
+	}
+
+	coordinator, err := c.client.Coordinator(c.groupID)
+	if err != nil {
+		return err
+	}
+
+	resp, err := coordinator.LeaveGroup(&LeaveGroupRequest{
+		GroupId:  c.groupID,
+		MemberId: c.memberID,
+	})
+	if err != nil {
+		_ = coordinator.Close()
+		return err
+	}
+
+	// Unset memberID
+	c.memberID = ""
+
+	// Check response
+	switch resp.Err {
+	case ErrRebalanceInProgress, ErrUnknownMemberId, ErrNoError:
+		return nil
+	default:
+		return resp.Err
+	}
+}
+
+func (c *consumerGroup) handleError(err error, topic string, partition int32) {
+	select {
+	case <-c.closed:
+		return
+	default:
+	}
+
+	if _, ok := err.(*ConsumerError); !ok && topic != "" && partition > -1 {
+		err = &ConsumerError{
+			Topic:     topic,
+			Partition: partition,
+			Err:       err,
+		}
+	}
+
+	if c.config.Consumer.Return.Errors {
+		select {
+		case c.errors <- err:
+		default:
+		}
+	} else {
+		Logger.Println(err)
+	}
+}
+
+// --------------------------------------------------------------------
+
+// ConsumerGroupSession represents a consumer group member session.
+type ConsumerGroupSession interface {
+	// Claims returns information about the claimed partitions by topic.
+	Claims() map[string][]int32
+
+	// MemberID returns the cluster member ID.
+	MemberID() string
+
+	// GenerationID returns the current generation ID.
+	GenerationID() int32
+
+	// MarkOffset marks the provided offset, alongside a metadata string
+	// that represents the state of the partition consumer at that point in time. The
+	// metadata string can be used by another consumer to restore that state, so it
+	// can resume consumption.
+	//
+	// To follow upstream conventions, you are expected to mark the offset of the
+	// next message to read, not the last message read. Thus, when calling `MarkOffset`
+	// you should typically add one to the offset of the last consumed message.
+	//
+	// Note: calling MarkOffset does not necessarily commit the offset to the backend
+	// store immediately for efficiency reasons, and it may never be committed if
+	// your application crashes. This means that you may end up processing the same
+	// message twice, and your processing should ideally be idempotent.
+	MarkOffset(topic string, partition int32, offset int64, metadata string)
+
+	// ResetOffset resets to the provided offset, alongside a metadata string that
+	// represents the state of the partition consumer at that point in time. Reset
+	// acts as a counterpart to MarkOffset, the difference being that it allows to
+	// reset an offset to an earlier or smaller value, where MarkOffset only
+	// allows incrementing the offset. cf MarkOffset for more details.
+	ResetOffset(topic string, partition int32, offset int64, metadata string)
+
+	// MarkMessage marks a message as consumed.
+	MarkMessage(msg *ConsumerMessage, metadata string)
+
+	// Context returns the session context.
+	Context() context.Context
+}
+
+type consumerGroupSession struct {
+	parent       *consumerGroup
+	memberID     string
+	generationID int32
+	handler      ConsumerGroupHandler
+
+	claims  map[string][]int32
+	offsets *offsetManager
+	ctx     context.Context
+	cancel  func()
+
+	waitGroup       sync.WaitGroup
+	releaseOnce     sync.Once
+	hbDying, hbDead chan none
+}
+
+func newConsumerGroupSession(ctx context.Context, parent *consumerGroup, claims map[string][]int32, memberID string, generationID int32, handler ConsumerGroupHandler) (*consumerGroupSession, error) {
+	// init offset manager
+	offsets, err := newOffsetManagerFromClient(parent.groupID, memberID, generationID, parent.client)
+	if err != nil {
+		return nil, err
+	}
+
+	// init context
+	ctx, cancel := context.WithCancel(ctx)
+
+	// init session
+	sess := &consumerGroupSession{
+		parent:       parent,
+		memberID:     memberID,
+		generationID: generationID,
+		handler:      handler,
+		offsets:      offsets,
+		claims:       claims,
+		ctx:          ctx,
+		cancel:       cancel,
+		hbDying:      make(chan none),
+		hbDead:       make(chan none),
+	}
+
+	// start heartbeat loop
+	go sess.heartbeatLoop()
+
+	// create a POM for each claim
+	for topic, partitions := range claims {
+		for _, partition := range partitions {
+			pom, err := offsets.ManagePartition(topic, partition)
+			if err != nil {
+				_ = sess.release(false)
+				return nil, err
+			}
+
+			// handle POM errors
+			go func(topic string, partition int32) {
+				for err := range pom.Errors() {
+					sess.parent.handleError(err, topic, partition)
+				}
+			}(topic, partition)
+		}
+	}
+
+	// perform setup
+	if err := handler.Setup(sess); err != nil {
+		_ = sess.release(true)
+		return nil, err
+	}
+
+	// start consuming
+	for topic, partitions := range claims {
+		for _, partition := range partitions {
+			sess.waitGroup.Add(1)
+
+			go func(topic string, partition int32) {
+				defer sess.waitGroup.Done()
+
+				// cancel the as session as soon as the first
+				// goroutine exits
+				defer sess.cancel()
+
+				// consume a single topic/partition, blocking
+				sess.consume(topic, partition)
+			}(topic, partition)
+		}
+	}
+	return sess, nil
+}
+
+func (s *consumerGroupSession) Claims() map[string][]int32 { return s.claims }
+func (s *consumerGroupSession) MemberID() string           { return s.memberID }
+func (s *consumerGroupSession) GenerationID() int32        { return s.generationID }
+
+func (s *consumerGroupSession) MarkOffset(topic string, partition int32, offset int64, metadata string) {
+	if pom := s.offsets.findPOM(topic, partition); pom != nil {
+		pom.MarkOffset(offset, metadata)
+	}
+}
+
+func (s *consumerGroupSession) ResetOffset(topic string, partition int32, offset int64, metadata string) {
+	if pom := s.offsets.findPOM(topic, partition); pom != nil {
+		pom.ResetOffset(offset, metadata)
+	}
+}
+
+func (s *consumerGroupSession) MarkMessage(msg *ConsumerMessage, metadata string) {
+	s.MarkOffset(msg.Topic, msg.Partition, msg.Offset+1, metadata)
+}
+
+func (s *consumerGroupSession) Context() context.Context {
+	return s.ctx
+}
+
+func (s *consumerGroupSession) consume(topic string, partition int32) {
+	// quick exit if rebalance is due
+	select {
+	case <-s.ctx.Done():
+		return
+	case <-s.parent.closed:
+		return
+	default:
+	}
+
+	// get next offset
+	offset := s.parent.config.Consumer.Offsets.Initial
+	if pom := s.offsets.findPOM(topic, partition); pom != nil {
+		offset, _ = pom.NextOffset()
+	}
+
+	// create new claim
+	claim, err := newConsumerGroupClaim(s, topic, partition, offset)
+	if err != nil {
+		s.parent.handleError(err, topic, partition)
+		return
+	}
+
+	// handle errors
+	go func() {
+		for err := range claim.Errors() {
+			s.parent.handleError(err, topic, partition)
+		}
+	}()
+
+	// trigger close when session is done
+	go func() {
+		select {
+		case <-s.ctx.Done():
+		case <-s.parent.closed:
+		}
+		claim.AsyncClose()
+	}()
+
+	// start processing
+	if err := s.handler.ConsumeClaim(s, claim); err != nil {
+		s.parent.handleError(err, topic, partition)
+	}
+
+	// ensure consumer is closed & drained
+	claim.AsyncClose()
+	for _, err := range claim.waitClosed() {
+		s.parent.handleError(err, topic, partition)
+	}
+}
+
+func (s *consumerGroupSession) release(withCleanup bool) (err error) {
+	// signal release, stop heartbeat
+	s.cancel()
+
+	// wait for consumers to exit
+	s.waitGroup.Wait()
+
+	// perform release
+	s.releaseOnce.Do(func() {
+		if withCleanup {
+			if e := s.handler.Cleanup(s); e != nil {
+				s.parent.handleError(err, "", -1)
+				err = e
+			}
+		}
+
+		if e := s.offsets.Close(); e != nil {
+			err = e
+		}
+
+		close(s.hbDying)
+		<-s.hbDead
+	})
+
+	return
+}
+
+func (s *consumerGroupSession) heartbeatLoop() {
+	defer close(s.hbDead)
+	defer s.cancel() // trigger the end of the session on exit
+
+	pause := time.NewTicker(s.parent.config.Consumer.Group.Heartbeat.Interval)
+	defer pause.Stop()
+
+	retries := s.parent.config.Metadata.Retry.Max
+	for {
+		coordinator, err := s.parent.client.Coordinator(s.parent.groupID)
+		if err != nil {
+			if retries <= 0 {
+				s.parent.handleError(err, "", -1)
+				return
+			}
+
+			select {
+			case <-s.hbDying:
+				return
+			case <-time.After(s.parent.config.Metadata.Retry.Backoff):
+				retries--
+			}
+			continue
+		}
+
+		resp, err := s.parent.heartbeatRequest(coordinator, s.memberID, s.generationID)
+		if err != nil {
+			_ = coordinator.Close()
+			retries--
+			continue
+		}
+
+		switch resp.Err {
+		case ErrNoError:
+			retries = s.parent.config.Metadata.Retry.Max
+		case ErrRebalanceInProgress, ErrUnknownMemberId, ErrIllegalGeneration:
+			return
+		default:
+			s.parent.handleError(err, "", -1)
+			return
+		}
+
+		select {
+		case <-pause.C:
+		case <-s.hbDying:
+			return
+		}
+	}
+}
+
+// --------------------------------------------------------------------
+
+// ConsumerGroupHandler instances are used to handle individual topic/partition claims.
+// It also provides hooks for your consumer group session life-cycle and allow you to
+// trigger logic before or after the consume loop(s).
+//
+// PLEASE NOTE that handlers are likely be called from several goroutines concurrently,
+// ensure that all state is safely protected against race conditions.
+type ConsumerGroupHandler interface {
+	// Setup is run at the beginning of a new session, before ConsumeClaim.
+	Setup(ConsumerGroupSession) error
+
+	// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have exited
+	// but before the offsets are committed for the very last time.
+	Cleanup(ConsumerGroupSession) error
+
+	// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages().
+	// Once the Messages() channel is closed, the Handler must finish its processing
+	// loop and exit.
+	ConsumeClaim(ConsumerGroupSession, ConsumerGroupClaim) error
+}
+
+// ConsumerGroupClaim processes Kafka messages from a given topic and partition within a consumer group.
+type ConsumerGroupClaim interface {
+	// Topic returns the consumed topic name.
+	Topic() string
+
+	// Partition returns the consumed partition.
+	Partition() int32
+
+	// InitialOffset returns the initial offset that was used as a starting point for this claim.
+	InitialOffset() int64
+
+	// HighWaterMarkOffset returns the high water mark offset of the partition,
+	// i.e. the offset that will be used for the next message that will be produced.
+	// You can use this to determine how far behind the processing is.
+	HighWaterMarkOffset() int64
+
+	// Messages returns the read channel for the messages that are returned by
+	// the broker. The messages channel will be closed when a new rebalance cycle
+	// is due. You must finish processing and mark offsets within
+	// Config.Consumer.Group.Session.Timeout before the topic/partition is eventually
+	// re-assigned to another group member.
+	Messages() <-chan *ConsumerMessage
+}
+
+type consumerGroupClaim struct {
+	topic     string
+	partition int32
+	offset    int64
+	PartitionConsumer
+}
+
+func newConsumerGroupClaim(sess *consumerGroupSession, topic string, partition int32, offset int64) (*consumerGroupClaim, error) {
+	pcm, err := sess.parent.consumer.ConsumePartition(topic, partition, offset)
+	if err == ErrOffsetOutOfRange {
+		offset = sess.parent.config.Consumer.Offsets.Initial
+		pcm, err = sess.parent.consumer.ConsumePartition(topic, partition, offset)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	go func() {
+		for err := range pcm.Errors() {
+			sess.parent.handleError(err, topic, partition)
+		}
+	}()
+
+	return &consumerGroupClaim{
+		topic:             topic,
+		partition:         partition,
+		offset:            offset,
+		PartitionConsumer: pcm,
+	}, nil
+}
+
+func (c *consumerGroupClaim) Topic() string        { return c.topic }
+func (c *consumerGroupClaim) Partition() int32     { return c.partition }
+func (c *consumerGroupClaim) InitialOffset() int64 { return c.offset }
+
+// Drains messages and errors, ensures the claim is fully closed.
+func (c *consumerGroupClaim) waitClosed() (errs ConsumerErrors) {
+	go func() {
+		for range c.Messages() {
+		}
+	}()
+
+	for err := range c.Errors() {
+		errs = append(errs, err)
+	}
+	return
+}

diff --git a/vendor/github.com/Shopify/sarama/consumer_group_members.go b/vendor/github.com/Shopify/sarama/consumer_group_members.go
new file mode 100644
index 0000000..2d02cc3
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/consumer_group_members.go

@@ -0,0 +1,96 @@
+package sarama
+
+//ConsumerGroupMemberMetadata holds the metadata for consumer group
+type ConsumerGroupMemberMetadata struct {
+	Version  int16
+	Topics   []string
+	UserData []byte
+}
+
+func (m *ConsumerGroupMemberMetadata) encode(pe packetEncoder) error {
+	pe.putInt16(m.Version)
+
+	if err := pe.putStringArray(m.Topics); err != nil {
+		return err
+	}
+
+	if err := pe.putBytes(m.UserData); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (m *ConsumerGroupMemberMetadata) decode(pd packetDecoder) (err error) {
+	if m.Version, err = pd.getInt16(); err != nil {
+		return
+	}
+
+	if m.Topics, err = pd.getStringArray(); err != nil {
+		return
+	}
+
+	if m.UserData, err = pd.getBytes(); err != nil {
+		return
+	}
+
+	return nil
+}
+
+//ConsumerGroupMemberAssignment holds the member assignment for a consume group
+type ConsumerGroupMemberAssignment struct {
+	Version  int16
+	Topics   map[string][]int32
+	UserData []byte
+}
+
+func (m *ConsumerGroupMemberAssignment) encode(pe packetEncoder) error {
+	pe.putInt16(m.Version)
+
+	if err := pe.putArrayLength(len(m.Topics)); err != nil {
+		return err
+	}
+
+	for topic, partitions := range m.Topics {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putInt32Array(partitions); err != nil {
+			return err
+		}
+	}
+
+	if err := pe.putBytes(m.UserData); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (m *ConsumerGroupMemberAssignment) decode(pd packetDecoder) (err error) {
+	if m.Version, err = pd.getInt16(); err != nil {
+		return
+	}
+
+	var topicLen int
+	if topicLen, err = pd.getArrayLength(); err != nil {
+		return
+	}
+
+	m.Topics = make(map[string][]int32, topicLen)
+	for i := 0; i < topicLen; i++ {
+		var topic string
+		if topic, err = pd.getString(); err != nil {
+			return
+		}
+		if m.Topics[topic], err = pd.getInt32Array(); err != nil {
+			return
+		}
+	}
+
+	if m.UserData, err = pd.getBytes(); err != nil {
+		return
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/consumer_metadata_request.go b/vendor/github.com/Shopify/sarama/consumer_metadata_request.go
new file mode 100644
index 0000000..a8dcaef
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/consumer_metadata_request.go

@@ -0,0 +1,34 @@
+package sarama
+
+//ConsumerMetadataRequest is used for metadata requests
+type ConsumerMetadataRequest struct {
+	ConsumerGroup string
+}
+
+func (r *ConsumerMetadataRequest) encode(pe packetEncoder) error {
+	tmp := new(FindCoordinatorRequest)
+	tmp.CoordinatorKey = r.ConsumerGroup
+	tmp.CoordinatorType = CoordinatorGroup
+	return tmp.encode(pe)
+}
+
+func (r *ConsumerMetadataRequest) decode(pd packetDecoder, version int16) (err error) {
+	tmp := new(FindCoordinatorRequest)
+	if err := tmp.decode(pd, version); err != nil {
+		return err
+	}
+	r.ConsumerGroup = tmp.CoordinatorKey
+	return nil
+}
+
+func (r *ConsumerMetadataRequest) key() int16 {
+	return 10
+}
+
+func (r *ConsumerMetadataRequest) version() int16 {
+	return 0
+}
+
+func (r *ConsumerMetadataRequest) requiredVersion() KafkaVersion {
+	return V0_8_2_0
+}

diff --git a/vendor/github.com/Shopify/sarama/consumer_metadata_response.go b/vendor/github.com/Shopify/sarama/consumer_metadata_response.go
new file mode 100644
index 0000000..4d86e93
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/consumer_metadata_response.go

@@ -0,0 +1,78 @@
+package sarama
+
+import (
+	"net"
+	"strconv"
+)
+
+//ConsumerMetadataResponse holds the reponse for a consumer gorup meta data request
+type ConsumerMetadataResponse struct {
+	Err             KError
+	Coordinator     *Broker
+	CoordinatorID   int32  // deprecated: use Coordinator.ID()
+	CoordinatorHost string // deprecated: use Coordinator.Addr()
+	CoordinatorPort int32  // deprecated: use Coordinator.Addr()
+}
+
+func (r *ConsumerMetadataResponse) decode(pd packetDecoder, version int16) (err error) {
+	tmp := new(FindCoordinatorResponse)
+
+	if err := tmp.decode(pd, version); err != nil {
+		return err
+	}
+
+	r.Err = tmp.Err
+
+	r.Coordinator = tmp.Coordinator
+	if tmp.Coordinator == nil {
+		return nil
+	}
+
+	// this can all go away in 2.0, but we have to fill in deprecated fields to maintain
+	// backwards compatibility
+	host, portstr, err := net.SplitHostPort(r.Coordinator.Addr())
+	if err != nil {
+		return err
+	}
+	port, err := strconv.ParseInt(portstr, 10, 32)
+	if err != nil {
+		return err
+	}
+	r.CoordinatorID = r.Coordinator.ID()
+	r.CoordinatorHost = host
+	r.CoordinatorPort = int32(port)
+
+	return nil
+}
+
+func (r *ConsumerMetadataResponse) encode(pe packetEncoder) error {
+	if r.Coordinator == nil {
+		r.Coordinator = new(Broker)
+		r.Coordinator.id = r.CoordinatorID
+		r.Coordinator.addr = net.JoinHostPort(r.CoordinatorHost, strconv.Itoa(int(r.CoordinatorPort)))
+	}
+
+	tmp := &FindCoordinatorResponse{
+		Version:     0,
+		Err:         r.Err,
+		Coordinator: r.Coordinator,
+	}
+
+	if err := tmp.encode(pe); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *ConsumerMetadataResponse) key() int16 {
+	return 10
+}
+
+func (r *ConsumerMetadataResponse) version() int16 {
+	return 0
+}
+
+func (r *ConsumerMetadataResponse) requiredVersion() KafkaVersion {
+	return V0_8_2_0
+}

diff --git a/vendor/github.com/Shopify/sarama/crc32_field.go b/vendor/github.com/Shopify/sarama/crc32_field.go
new file mode 100644
index 0000000..1f14443
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/crc32_field.go

@@ -0,0 +1,69 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+)
+
+type crcPolynomial int8
+
+const (
+	crcIEEE crcPolynomial = iota
+	crcCastagnoli
+)
+
+var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
+
+// crc32Field implements the pushEncoder and pushDecoder interfaces for calculating CRC32s.
+type crc32Field struct {
+	startOffset int
+	polynomial  crcPolynomial
+}
+
+func (c *crc32Field) saveOffset(in int) {
+	c.startOffset = in
+}
+
+func (c *crc32Field) reserveLength() int {
+	return 4
+}
+
+func newCRC32Field(polynomial crcPolynomial) *crc32Field {
+	return &crc32Field{polynomial: polynomial}
+}
+
+func (c *crc32Field) run(curOffset int, buf []byte) error {
+	crc, err := c.crc(curOffset, buf)
+	if err != nil {
+		return err
+	}
+	binary.BigEndian.PutUint32(buf[c.startOffset:], crc)
+	return nil
+}
+
+func (c *crc32Field) check(curOffset int, buf []byte) error {
+	crc, err := c.crc(curOffset, buf)
+	if err != nil {
+		return err
+	}
+
+	expected := binary.BigEndian.Uint32(buf[c.startOffset:])
+	if crc != expected {
+		return PacketDecodingError{fmt.Sprintf("CRC didn't match expected %#x got %#x", expected, crc)}
+	}
+
+	return nil
+}
+func (c *crc32Field) crc(curOffset int, buf []byte) (uint32, error) {
+	var tab *crc32.Table
+	switch c.polynomial {
+	case crcIEEE:
+		tab = crc32.IEEETable
+	case crcCastagnoli:
+		tab = castagnoliTable
+	default:
+		return 0, PacketDecodingError{"invalid CRC type"}
+	}
+	return crc32.Checksum(buf[c.startOffset+4:curOffset], tab), nil
+}

diff --git a/vendor/github.com/Shopify/sarama/create_partitions_request.go b/vendor/github.com/Shopify/sarama/create_partitions_request.go
new file mode 100644
index 0000000..af321e9
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/create_partitions_request.go

@@ -0,0 +1,121 @@
+package sarama
+
+import "time"
+
+type CreatePartitionsRequest struct {
+	TopicPartitions map[string]*TopicPartition
+	Timeout         time.Duration
+	ValidateOnly    bool
+}
+
+func (c *CreatePartitionsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(c.TopicPartitions)); err != nil {
+		return err
+	}
+
+	for topic, partition := range c.TopicPartitions {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := partition.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	pe.putInt32(int32(c.Timeout / time.Millisecond))
+
+	pe.putBool(c.ValidateOnly)
+
+	return nil
+}
+
+func (c *CreatePartitionsRequest) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	c.TopicPartitions = make(map[string]*TopicPartition, n)
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		c.TopicPartitions[topic] = new(TopicPartition)
+		if err := c.TopicPartitions[topic].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	timeout, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	c.Timeout = time.Duration(timeout) * time.Millisecond
+
+	if c.ValidateOnly, err = pd.getBool(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *CreatePartitionsRequest) key() int16 {
+	return 37
+}
+
+func (r *CreatePartitionsRequest) version() int16 {
+	return 0
+}
+
+func (r *CreatePartitionsRequest) requiredVersion() KafkaVersion {
+	return V1_0_0_0
+}
+
+type TopicPartition struct {
+	Count      int32
+	Assignment [][]int32
+}
+
+func (t *TopicPartition) encode(pe packetEncoder) error {
+	pe.putInt32(t.Count)
+
+	if len(t.Assignment) == 0 {
+		pe.putInt32(-1)
+		return nil
+	}
+
+	if err := pe.putArrayLength(len(t.Assignment)); err != nil {
+		return err
+	}
+
+	for _, assign := range t.Assignment {
+		if err := pe.putInt32Array(assign); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (t *TopicPartition) decode(pd packetDecoder, version int16) (err error) {
+	if t.Count, err = pd.getInt32(); err != nil {
+		return err
+	}
+
+	n, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	if n <= 0 {
+		return nil
+	}
+	t.Assignment = make([][]int32, n)
+
+	for i := 0; i < int(n); i++ {
+		if t.Assignment[i], err = pd.getInt32Array(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/create_partitions_response.go b/vendor/github.com/Shopify/sarama/create_partitions_response.go
new file mode 100644
index 0000000..abd621c
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/create_partitions_response.go

@@ -0,0 +1,94 @@
+package sarama
+
+import "time"
+
+type CreatePartitionsResponse struct {
+	ThrottleTime         time.Duration
+	TopicPartitionErrors map[string]*TopicPartitionError
+}
+
+func (c *CreatePartitionsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(c.ThrottleTime / time.Millisecond))
+	if err := pe.putArrayLength(len(c.TopicPartitionErrors)); err != nil {
+		return err
+	}
+
+	for topic, partitionError := range c.TopicPartitionErrors {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := partitionError.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (c *CreatePartitionsResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	c.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	c.TopicPartitionErrors = make(map[string]*TopicPartitionError, n)
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		c.TopicPartitionErrors[topic] = new(TopicPartitionError)
+		if err := c.TopicPartitionErrors[topic].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *CreatePartitionsResponse) key() int16 {
+	return 37
+}
+
+func (r *CreatePartitionsResponse) version() int16 {
+	return 0
+}
+
+func (r *CreatePartitionsResponse) requiredVersion() KafkaVersion {
+	return V1_0_0_0
+}
+
+type TopicPartitionError struct {
+	Err    KError
+	ErrMsg *string
+}
+
+func (t *TopicPartitionError) encode(pe packetEncoder) error {
+	pe.putInt16(int16(t.Err))
+
+	if err := pe.putNullableString(t.ErrMsg); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (t *TopicPartitionError) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	t.Err = KError(kerr)
+
+	if t.ErrMsg, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/create_topics_request.go b/vendor/github.com/Shopify/sarama/create_topics_request.go
new file mode 100644
index 0000000..709c0a4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/create_topics_request.go

@@ -0,0 +1,174 @@
+package sarama
+
+import (
+	"time"
+)
+
+type CreateTopicsRequest struct {
+	Version int16
+
+	TopicDetails map[string]*TopicDetail
+	Timeout      time.Duration
+	ValidateOnly bool
+}
+
+func (c *CreateTopicsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(c.TopicDetails)); err != nil {
+		return err
+	}
+	for topic, detail := range c.TopicDetails {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := detail.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	pe.putInt32(int32(c.Timeout / time.Millisecond))
+
+	if c.Version >= 1 {
+		pe.putBool(c.ValidateOnly)
+	}
+
+	return nil
+}
+
+func (c *CreateTopicsRequest) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	c.TopicDetails = make(map[string]*TopicDetail, n)
+
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		c.TopicDetails[topic] = new(TopicDetail)
+		if err = c.TopicDetails[topic].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	timeout, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	c.Timeout = time.Duration(timeout) * time.Millisecond
+
+	if version >= 1 {
+		c.ValidateOnly, err = pd.getBool()
+		if err != nil {
+			return err
+		}
+
+		c.Version = version
+	}
+
+	return nil
+}
+
+func (c *CreateTopicsRequest) key() int16 {
+	return 19
+}
+
+func (c *CreateTopicsRequest) version() int16 {
+	return c.Version
+}
+
+func (c *CreateTopicsRequest) requiredVersion() KafkaVersion {
+	switch c.Version {
+	case 2:
+		return V1_0_0_0
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_10_1_0
+	}
+}
+
+type TopicDetail struct {
+	NumPartitions     int32
+	ReplicationFactor int16
+	ReplicaAssignment map[int32][]int32
+	ConfigEntries     map[string]*string
+}
+
+func (t *TopicDetail) encode(pe packetEncoder) error {
+	pe.putInt32(t.NumPartitions)
+	pe.putInt16(t.ReplicationFactor)
+
+	if err := pe.putArrayLength(len(t.ReplicaAssignment)); err != nil {
+		return err
+	}
+	for partition, assignment := range t.ReplicaAssignment {
+		pe.putInt32(partition)
+		if err := pe.putInt32Array(assignment); err != nil {
+			return err
+		}
+	}
+
+	if err := pe.putArrayLength(len(t.ConfigEntries)); err != nil {
+		return err
+	}
+	for configKey, configValue := range t.ConfigEntries {
+		if err := pe.putString(configKey); err != nil {
+			return err
+		}
+		if err := pe.putNullableString(configValue); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (t *TopicDetail) decode(pd packetDecoder, version int16) (err error) {
+	if t.NumPartitions, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if t.ReplicationFactor, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		t.ReplicaAssignment = make(map[int32][]int32, n)
+		for i := 0; i < n; i++ {
+			replica, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			if t.ReplicaAssignment[replica], err = pd.getInt32Array(); err != nil {
+				return err
+			}
+		}
+	}
+
+	n, err = pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		t.ConfigEntries = make(map[string]*string, n)
+		for i := 0; i < n; i++ {
+			configKey, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			if t.ConfigEntries[configKey], err = pd.getNullableString(); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/create_topics_response.go b/vendor/github.com/Shopify/sarama/create_topics_response.go
new file mode 100644
index 0000000..66207e0
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/create_topics_response.go

@@ -0,0 +1,112 @@
+package sarama
+
+import "time"
+
+type CreateTopicsResponse struct {
+	Version      int16
+	ThrottleTime time.Duration
+	TopicErrors  map[string]*TopicError
+}
+
+func (c *CreateTopicsResponse) encode(pe packetEncoder) error {
+	if c.Version >= 2 {
+		pe.putInt32(int32(c.ThrottleTime / time.Millisecond))
+	}
+
+	if err := pe.putArrayLength(len(c.TopicErrors)); err != nil {
+		return err
+	}
+	for topic, topicError := range c.TopicErrors {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := topicError.encode(pe, c.Version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (c *CreateTopicsResponse) decode(pd packetDecoder, version int16) (err error) {
+	c.Version = version
+
+	if version >= 2 {
+		throttleTime, err := pd.getInt32()
+		if err != nil {
+			return err
+		}
+		c.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	c.TopicErrors = make(map[string]*TopicError, n)
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		c.TopicErrors[topic] = new(TopicError)
+		if err := c.TopicErrors[topic].decode(pd, version); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (c *CreateTopicsResponse) key() int16 {
+	return 19
+}
+
+func (c *CreateTopicsResponse) version() int16 {
+	return c.Version
+}
+
+func (c *CreateTopicsResponse) requiredVersion() KafkaVersion {
+	switch c.Version {
+	case 2:
+		return V1_0_0_0
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_10_1_0
+	}
+}
+
+type TopicError struct {
+	Err    KError
+	ErrMsg *string
+}
+
+func (t *TopicError) encode(pe packetEncoder, version int16) error {
+	pe.putInt16(int16(t.Err))
+
+	if version >= 1 {
+		if err := pe.putNullableString(t.ErrMsg); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (t *TopicError) decode(pd packetDecoder, version int16) (err error) {
+	kErr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	t.Err = KError(kErr)
+
+	if version >= 1 {
+		if t.ErrMsg, err = pd.getNullableString(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_groups_request.go b/vendor/github.com/Shopify/sarama/delete_groups_request.go
new file mode 100644
index 0000000..305a324
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_groups_request.go

@@ -0,0 +1,30 @@
+package sarama
+
+type DeleteGroupsRequest struct {
+	Groups []string
+}
+
+func (r *DeleteGroupsRequest) encode(pe packetEncoder) error {
+	return pe.putStringArray(r.Groups)
+}
+
+func (r *DeleteGroupsRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Groups, err = pd.getStringArray()
+	return
+}
+
+func (r *DeleteGroupsRequest) key() int16 {
+	return 42
+}
+
+func (r *DeleteGroupsRequest) version() int16 {
+	return 0
+}
+
+func (r *DeleteGroupsRequest) requiredVersion() KafkaVersion {
+	return V1_1_0_0
+}
+
+func (r *DeleteGroupsRequest) AddGroup(group string) {
+	r.Groups = append(r.Groups, group)
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_groups_response.go b/vendor/github.com/Shopify/sarama/delete_groups_response.go
new file mode 100644
index 0000000..c067ebb
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_groups_response.go

@@ -0,0 +1,70 @@
+package sarama
+
+import (
+	"time"
+)
+
+type DeleteGroupsResponse struct {
+	ThrottleTime    time.Duration
+	GroupErrorCodes map[string]KError
+}
+
+func (r *DeleteGroupsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(r.ThrottleTime / time.Millisecond))
+
+	if err := pe.putArrayLength(len(r.GroupErrorCodes)); err != nil {
+		return err
+	}
+	for groupID, errorCode := range r.GroupErrorCodes {
+		if err := pe.putString(groupID); err != nil {
+			return err
+		}
+		pe.putInt16(int16(errorCode))
+	}
+
+	return nil
+}
+
+func (r *DeleteGroupsResponse) decode(pd packetDecoder, version int16) error {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	r.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	r.GroupErrorCodes = make(map[string]KError, n)
+	for i := 0; i < n; i++ {
+		groupID, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		errorCode, err := pd.getInt16()
+		if err != nil {
+			return err
+		}
+
+		r.GroupErrorCodes[groupID] = KError(errorCode)
+	}
+
+	return nil
+}
+
+func (r *DeleteGroupsResponse) key() int16 {
+	return 42
+}
+
+func (r *DeleteGroupsResponse) version() int16 {
+	return 0
+}
+
+func (r *DeleteGroupsResponse) requiredVersion() KafkaVersion {
+	return V1_1_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_records_request.go b/vendor/github.com/Shopify/sarama/delete_records_request.go
new file mode 100644
index 0000000..93efafd
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_records_request.go

@@ -0,0 +1,126 @@
+package sarama
+
+import (
+	"sort"
+	"time"
+)
+
+// request message format is:
+// [topic] timeout(int32)
+// where topic is:
+//  name(string) [partition]
+// where partition is:
+//  id(int32) offset(int64)
+
+type DeleteRecordsRequest struct {
+	Topics  map[string]*DeleteRecordsRequestTopic
+	Timeout time.Duration
+}
+
+func (d *DeleteRecordsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(d.Topics)); err != nil {
+		return err
+	}
+	keys := make([]string, 0, len(d.Topics))
+	for topic := range d.Topics {
+		keys = append(keys, topic)
+	}
+	sort.Strings(keys)
+	for _, topic := range keys {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := d.Topics[topic].encode(pe); err != nil {
+			return err
+		}
+	}
+	pe.putInt32(int32(d.Timeout / time.Millisecond))
+
+	return nil
+}
+
+func (d *DeleteRecordsRequest) decode(pd packetDecoder, version int16) error {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		d.Topics = make(map[string]*DeleteRecordsRequestTopic, n)
+		for i := 0; i < n; i++ {
+			topic, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			details := new(DeleteRecordsRequestTopic)
+			if err = details.decode(pd, version); err != nil {
+				return err
+			}
+			d.Topics[topic] = details
+		}
+	}
+
+	timeout, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	d.Timeout = time.Duration(timeout) * time.Millisecond
+
+	return nil
+}
+
+func (d *DeleteRecordsRequest) key() int16 {
+	return 21
+}
+
+func (d *DeleteRecordsRequest) version() int16 {
+	return 0
+}
+
+func (d *DeleteRecordsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type DeleteRecordsRequestTopic struct {
+	PartitionOffsets map[int32]int64 // partition => offset
+}
+
+func (t *DeleteRecordsRequestTopic) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(t.PartitionOffsets)); err != nil {
+		return err
+	}
+	keys := make([]int32, 0, len(t.PartitionOffsets))
+	for partition := range t.PartitionOffsets {
+		keys = append(keys, partition)
+	}
+	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
+	for _, partition := range keys {
+		pe.putInt32(partition)
+		pe.putInt64(t.PartitionOffsets[partition])
+	}
+	return nil
+}
+
+func (t *DeleteRecordsRequestTopic) decode(pd packetDecoder, version int16) error {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		t.PartitionOffsets = make(map[int32]int64, n)
+		for i := 0; i < n; i++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			offset, err := pd.getInt64()
+			if err != nil {
+				return err
+			}
+			t.PartitionOffsets[partition] = offset
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_records_response.go b/vendor/github.com/Shopify/sarama/delete_records_response.go
new file mode 100644
index 0000000..733a58b
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_records_response.go

@@ -0,0 +1,158 @@
+package sarama
+
+import (
+	"sort"
+	"time"
+)
+
+// response message format is:
+// throttleMs(int32) [topic]
+// where topic is:
+//  name(string) [partition]
+// where partition is:
+//  id(int32) low_watermark(int64) error_code(int16)
+
+type DeleteRecordsResponse struct {
+	Version      int16
+	ThrottleTime time.Duration
+	Topics       map[string]*DeleteRecordsResponseTopic
+}
+
+func (d *DeleteRecordsResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(d.ThrottleTime / time.Millisecond))
+
+	if err := pe.putArrayLength(len(d.Topics)); err != nil {
+		return err
+	}
+	keys := make([]string, 0, len(d.Topics))
+	for topic := range d.Topics {
+		keys = append(keys, topic)
+	}
+	sort.Strings(keys)
+	for _, topic := range keys {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := d.Topics[topic].encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (d *DeleteRecordsResponse) decode(pd packetDecoder, version int16) error {
+	d.Version = version
+
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	d.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		d.Topics = make(map[string]*DeleteRecordsResponseTopic, n)
+		for i := 0; i < n; i++ {
+			topic, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			details := new(DeleteRecordsResponseTopic)
+			if err = details.decode(pd, version); err != nil {
+				return err
+			}
+			d.Topics[topic] = details
+		}
+	}
+
+	return nil
+}
+
+func (d *DeleteRecordsResponse) key() int16 {
+	return 21
+}
+
+func (d *DeleteRecordsResponse) version() int16 {
+	return 0
+}
+
+func (d *DeleteRecordsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type DeleteRecordsResponseTopic struct {
+	Partitions map[int32]*DeleteRecordsResponsePartition
+}
+
+func (t *DeleteRecordsResponseTopic) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(t.Partitions)); err != nil {
+		return err
+	}
+	keys := make([]int32, 0, len(t.Partitions))
+	for partition := range t.Partitions {
+		keys = append(keys, partition)
+	}
+	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })
+	for _, partition := range keys {
+		pe.putInt32(partition)
+		if err := t.Partitions[partition].encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (t *DeleteRecordsResponseTopic) decode(pd packetDecoder, version int16) error {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if n > 0 {
+		t.Partitions = make(map[int32]*DeleteRecordsResponsePartition, n)
+		for i := 0; i < n; i++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			details := new(DeleteRecordsResponsePartition)
+			if err = details.decode(pd, version); err != nil {
+				return err
+			}
+			t.Partitions[partition] = details
+		}
+	}
+
+	return nil
+}
+
+type DeleteRecordsResponsePartition struct {
+	LowWatermark int64
+	Err          KError
+}
+
+func (t *DeleteRecordsResponsePartition) encode(pe packetEncoder) error {
+	pe.putInt64(t.LowWatermark)
+	pe.putInt16(int16(t.Err))
+	return nil
+}
+
+func (t *DeleteRecordsResponsePartition) decode(pd packetDecoder, version int16) error {
+	lowWatermark, err := pd.getInt64()
+	if err != nil {
+		return err
+	}
+	t.LowWatermark = lowWatermark
+
+	kErr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	t.Err = KError(kErr)
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_topics_request.go b/vendor/github.com/Shopify/sarama/delete_topics_request.go
new file mode 100644
index 0000000..911f67d
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_topics_request.go

@@ -0,0 +1,48 @@
+package sarama
+
+import "time"
+
+type DeleteTopicsRequest struct {
+	Version int16
+	Topics  []string
+	Timeout time.Duration
+}
+
+func (d *DeleteTopicsRequest) encode(pe packetEncoder) error {
+	if err := pe.putStringArray(d.Topics); err != nil {
+		return err
+	}
+	pe.putInt32(int32(d.Timeout / time.Millisecond))
+
+	return nil
+}
+
+func (d *DeleteTopicsRequest) decode(pd packetDecoder, version int16) (err error) {
+	if d.Topics, err = pd.getStringArray(); err != nil {
+		return err
+	}
+	timeout, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	d.Timeout = time.Duration(timeout) * time.Millisecond
+	d.Version = version
+	return nil
+}
+
+func (d *DeleteTopicsRequest) key() int16 {
+	return 20
+}
+
+func (d *DeleteTopicsRequest) version() int16 {
+	return d.Version
+}
+
+func (d *DeleteTopicsRequest) requiredVersion() KafkaVersion {
+	switch d.Version {
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_10_1_0
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/delete_topics_response.go b/vendor/github.com/Shopify/sarama/delete_topics_response.go
new file mode 100644
index 0000000..3422546
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/delete_topics_response.go

@@ -0,0 +1,78 @@
+package sarama
+
+import "time"
+
+type DeleteTopicsResponse struct {
+	Version         int16
+	ThrottleTime    time.Duration
+	TopicErrorCodes map[string]KError
+}
+
+func (d *DeleteTopicsResponse) encode(pe packetEncoder) error {
+	if d.Version >= 1 {
+		pe.putInt32(int32(d.ThrottleTime / time.Millisecond))
+	}
+
+	if err := pe.putArrayLength(len(d.TopicErrorCodes)); err != nil {
+		return err
+	}
+	for topic, errorCode := range d.TopicErrorCodes {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		pe.putInt16(int16(errorCode))
+	}
+
+	return nil
+}
+
+func (d *DeleteTopicsResponse) decode(pd packetDecoder, version int16) (err error) {
+	if version >= 1 {
+		throttleTime, err := pd.getInt32()
+		if err != nil {
+			return err
+		}
+		d.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+		d.Version = version
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	d.TopicErrorCodes = make(map[string]KError, n)
+
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		errorCode, err := pd.getInt16()
+		if err != nil {
+			return err
+		}
+
+		d.TopicErrorCodes[topic] = KError(errorCode)
+	}
+
+	return nil
+}
+
+func (d *DeleteTopicsResponse) key() int16 {
+	return 20
+}
+
+func (d *DeleteTopicsResponse) version() int16 {
+	return d.Version
+}
+
+func (d *DeleteTopicsResponse) requiredVersion() KafkaVersion {
+	switch d.Version {
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_10_1_0
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/describe_configs_request.go b/vendor/github.com/Shopify/sarama/describe_configs_request.go
new file mode 100644
index 0000000..7a7cffc
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/describe_configs_request.go

@@ -0,0 +1,91 @@
+package sarama
+
+type ConfigResource struct {
+	Type        ConfigResourceType
+	Name        string
+	ConfigNames []string
+}
+
+type DescribeConfigsRequest struct {
+	Resources []*ConfigResource
+}
+
+func (r *DescribeConfigsRequest) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(r.Resources)); err != nil {
+		return err
+	}
+
+	for _, c := range r.Resources {
+		pe.putInt8(int8(c.Type))
+		if err := pe.putString(c.Name); err != nil {
+			return err
+		}
+
+		if len(c.ConfigNames) == 0 {
+			pe.putInt32(-1)
+			continue
+		}
+		if err := pe.putStringArray(c.ConfigNames); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *DescribeConfigsRequest) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Resources = make([]*ConfigResource, n)
+
+	for i := 0; i < n; i++ {
+		r.Resources[i] = &ConfigResource{}
+		t, err := pd.getInt8()
+		if err != nil {
+			return err
+		}
+		r.Resources[i].Type = ConfigResourceType(t)
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		r.Resources[i].Name = name
+
+		confLength, err := pd.getArrayLength()
+
+		if err != nil {
+			return err
+		}
+
+		if confLength == -1 {
+			continue
+		}
+
+		cfnames := make([]string, confLength)
+		for i := 0; i < confLength; i++ {
+			s, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			cfnames[i] = s
+		}
+		r.Resources[i].ConfigNames = cfnames
+	}
+
+	return nil
+}
+
+func (r *DescribeConfigsRequest) key() int16 {
+	return 32
+}
+
+func (r *DescribeConfigsRequest) version() int16 {
+	return 0
+}
+
+func (r *DescribeConfigsRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/describe_configs_response.go b/vendor/github.com/Shopify/sarama/describe_configs_response.go
new file mode 100644
index 0000000..6e5d30e
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/describe_configs_response.go

@@ -0,0 +1,188 @@
+package sarama
+
+import "time"
+
+type DescribeConfigsResponse struct {
+	ThrottleTime time.Duration
+	Resources    []*ResourceResponse
+}
+
+type ResourceResponse struct {
+	ErrorCode int16
+	ErrorMsg  string
+	Type      ConfigResourceType
+	Name      string
+	Configs   []*ConfigEntry
+}
+
+type ConfigEntry struct {
+	Name      string
+	Value     string
+	ReadOnly  bool
+	Default   bool
+	Sensitive bool
+}
+
+func (r *DescribeConfigsResponse) encode(pe packetEncoder) (err error) {
+	pe.putInt32(int32(r.ThrottleTime / time.Millisecond))
+	if err = pe.putArrayLength(len(r.Resources)); err != nil {
+		return err
+	}
+
+	for _, c := range r.Resources {
+		if err = c.encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (r *DescribeConfigsResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	r.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Resources = make([]*ResourceResponse, n)
+	for i := 0; i < n; i++ {
+		rr := &ResourceResponse{}
+		if err := rr.decode(pd, version); err != nil {
+			return err
+		}
+		r.Resources[i] = rr
+	}
+
+	return nil
+}
+
+func (r *DescribeConfigsResponse) key() int16 {
+	return 32
+}
+
+func (r *DescribeConfigsResponse) version() int16 {
+	return 0
+}
+
+func (r *DescribeConfigsResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+func (r *ResourceResponse) encode(pe packetEncoder) (err error) {
+	pe.putInt16(r.ErrorCode)
+
+	if err = pe.putString(r.ErrorMsg); err != nil {
+		return err
+	}
+
+	pe.putInt8(int8(r.Type))
+
+	if err = pe.putString(r.Name); err != nil {
+		return err
+	}
+
+	if err = pe.putArrayLength(len(r.Configs)); err != nil {
+		return err
+	}
+
+	for _, c := range r.Configs {
+		if err = c.encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (r *ResourceResponse) decode(pd packetDecoder, version int16) (err error) {
+	ec, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	r.ErrorCode = ec
+
+	em, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	r.ErrorMsg = em
+
+	t, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	r.Type = ConfigResourceType(t)
+
+	name, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	r.Name = name
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Configs = make([]*ConfigEntry, n)
+	for i := 0; i < n; i++ {
+		c := &ConfigEntry{}
+		if err := c.decode(pd, version); err != nil {
+			return err
+		}
+		r.Configs[i] = c
+	}
+	return nil
+}
+
+func (r *ConfigEntry) encode(pe packetEncoder) (err error) {
+	if err = pe.putString(r.Name); err != nil {
+		return err
+	}
+
+	if err = pe.putString(r.Value); err != nil {
+		return err
+	}
+
+	pe.putBool(r.ReadOnly)
+	pe.putBool(r.Default)
+	pe.putBool(r.Sensitive)
+	return nil
+}
+
+func (r *ConfigEntry) decode(pd packetDecoder, version int16) (err error) {
+	name, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	r.Name = name
+
+	value, err := pd.getString()
+	if err != nil {
+		return err
+	}
+	r.Value = value
+
+	read, err := pd.getBool()
+	if err != nil {
+		return err
+	}
+	r.ReadOnly = read
+
+	de, err := pd.getBool()
+	if err != nil {
+		return err
+	}
+	r.Default = de
+
+	sensitive, err := pd.getBool()
+	if err != nil {
+		return err
+	}
+	r.Sensitive = sensitive
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/describe_groups_request.go b/vendor/github.com/Shopify/sarama/describe_groups_request.go
new file mode 100644
index 0000000..1fb3567
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/describe_groups_request.go

@@ -0,0 +1,30 @@
+package sarama
+
+type DescribeGroupsRequest struct {
+	Groups []string
+}
+
+func (r *DescribeGroupsRequest) encode(pe packetEncoder) error {
+	return pe.putStringArray(r.Groups)
+}
+
+func (r *DescribeGroupsRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Groups, err = pd.getStringArray()
+	return
+}
+
+func (r *DescribeGroupsRequest) key() int16 {
+	return 15
+}
+
+func (r *DescribeGroupsRequest) version() int16 {
+	return 0
+}
+
+func (r *DescribeGroupsRequest) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}
+
+func (r *DescribeGroupsRequest) AddGroup(group string) {
+	r.Groups = append(r.Groups, group)
+}

diff --git a/vendor/github.com/Shopify/sarama/describe_groups_response.go b/vendor/github.com/Shopify/sarama/describe_groups_response.go
new file mode 100644
index 0000000..542b3a9
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/describe_groups_response.go

@@ -0,0 +1,187 @@
+package sarama
+
+type DescribeGroupsResponse struct {
+	Groups []*GroupDescription
+}
+
+func (r *DescribeGroupsResponse) encode(pe packetEncoder) error {
+	if err := pe.putArrayLength(len(r.Groups)); err != nil {
+		return err
+	}
+
+	for _, groupDescription := range r.Groups {
+		if err := groupDescription.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *DescribeGroupsResponse) decode(pd packetDecoder, version int16) (err error) {
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Groups = make([]*GroupDescription, n)
+	for i := 0; i < n; i++ {
+		r.Groups[i] = new(GroupDescription)
+		if err := r.Groups[i].decode(pd); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *DescribeGroupsResponse) key() int16 {
+	return 15
+}
+
+func (r *DescribeGroupsResponse) version() int16 {
+	return 0
+}
+
+func (r *DescribeGroupsResponse) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}
+
+type GroupDescription struct {
+	Err          KError
+	GroupId      string
+	State        string
+	ProtocolType string
+	Protocol     string
+	Members      map[string]*GroupMemberDescription
+}
+
+func (gd *GroupDescription) encode(pe packetEncoder) error {
+	pe.putInt16(int16(gd.Err))
+
+	if err := pe.putString(gd.GroupId); err != nil {
+		return err
+	}
+	if err := pe.putString(gd.State); err != nil {
+		return err
+	}
+	if err := pe.putString(gd.ProtocolType); err != nil {
+		return err
+	}
+	if err := pe.putString(gd.Protocol); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(gd.Members)); err != nil {
+		return err
+	}
+
+	for memberId, groupMemberDescription := range gd.Members {
+		if err := pe.putString(memberId); err != nil {
+			return err
+		}
+		if err := groupMemberDescription.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (gd *GroupDescription) decode(pd packetDecoder) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	gd.Err = KError(kerr)
+
+	if gd.GroupId, err = pd.getString(); err != nil {
+		return
+	}
+	if gd.State, err = pd.getString(); err != nil {
+		return
+	}
+	if gd.ProtocolType, err = pd.getString(); err != nil {
+		return
+	}
+	if gd.Protocol, err = pd.getString(); err != nil {
+		return
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	gd.Members = make(map[string]*GroupMemberDescription)
+	for i := 0; i < n; i++ {
+		memberId, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		gd.Members[memberId] = new(GroupMemberDescription)
+		if err := gd.Members[memberId].decode(pd); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+type GroupMemberDescription struct {
+	ClientId         string
+	ClientHost       string
+	MemberMetadata   []byte
+	MemberAssignment []byte
+}
+
+func (gmd *GroupMemberDescription) encode(pe packetEncoder) error {
+	if err := pe.putString(gmd.ClientId); err != nil {
+		return err
+	}
+	if err := pe.putString(gmd.ClientHost); err != nil {
+		return err
+	}
+	if err := pe.putBytes(gmd.MemberMetadata); err != nil {
+		return err
+	}
+	if err := pe.putBytes(gmd.MemberAssignment); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (gmd *GroupMemberDescription) decode(pd packetDecoder) (err error) {
+	if gmd.ClientId, err = pd.getString(); err != nil {
+		return
+	}
+	if gmd.ClientHost, err = pd.getString(); err != nil {
+		return
+	}
+	if gmd.MemberMetadata, err = pd.getBytes(); err != nil {
+		return
+	}
+	if gmd.MemberAssignment, err = pd.getBytes(); err != nil {
+		return
+	}
+
+	return nil
+}
+
+func (gmd *GroupMemberDescription) GetMemberAssignment() (*ConsumerGroupMemberAssignment, error) {
+	assignment := new(ConsumerGroupMemberAssignment)
+	err := decode(gmd.MemberAssignment, assignment)
+	return assignment, err
+}
+
+func (gmd *GroupMemberDescription) GetMemberMetadata() (*ConsumerGroupMemberMetadata, error) {
+	metadata := new(ConsumerGroupMemberMetadata)
+	err := decode(gmd.MemberMetadata, metadata)
+	return metadata, err
+}

diff --git a/vendor/github.com/Shopify/sarama/dev.yml b/vendor/github.com/Shopify/sarama/dev.yml
new file mode 100644
index 0000000..97eed3a
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/dev.yml

@@ -0,0 +1,10 @@
+name: sarama
+
+up:
+  - go:
+      version: '1.11'
+
+commands:
+  test:
+    run: make test
+    desc: 'run unit tests'

diff --git a/vendor/github.com/Shopify/sarama/encoder_decoder.go b/vendor/github.com/Shopify/sarama/encoder_decoder.go
new file mode 100644
index 0000000..7ce3bc0
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/encoder_decoder.go

@@ -0,0 +1,89 @@
+package sarama
+
+import (
+	"fmt"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+// Encoder is the interface that wraps the basic Encode method.
+// Anything implementing Encoder can be turned into bytes using Kafka's encoding rules.
+type encoder interface {
+	encode(pe packetEncoder) error
+}
+
+// Encode takes an Encoder and turns it into bytes while potentially recording metrics.
+func encode(e encoder, metricRegistry metrics.Registry) ([]byte, error) {
+	if e == nil {
+		return nil, nil
+	}
+
+	var prepEnc prepEncoder
+	var realEnc realEncoder
+
+	err := e.encode(&prepEnc)
+	if err != nil {
+		return nil, err
+	}
+
+	if prepEnc.length < 0 || prepEnc.length > int(MaxRequestSize) {
+		return nil, PacketEncodingError{fmt.Sprintf("invalid request size (%d)", prepEnc.length)}
+	}
+
+	realEnc.raw = make([]byte, prepEnc.length)
+	realEnc.registry = metricRegistry
+	err = e.encode(&realEnc)
+	if err != nil {
+		return nil, err
+	}
+
+	return realEnc.raw, nil
+}
+
+// Decoder is the interface that wraps the basic Decode method.
+// Anything implementing Decoder can be extracted from bytes using Kafka's encoding rules.
+type decoder interface {
+	decode(pd packetDecoder) error
+}
+
+type versionedDecoder interface {
+	decode(pd packetDecoder, version int16) error
+}
+
+// Decode takes bytes and a Decoder and fills the fields of the decoder from the bytes,
+// interpreted using Kafka's encoding rules.
+func decode(buf []byte, in decoder) error {
+	if buf == nil {
+		return nil
+	}
+
+	helper := realDecoder{raw: buf}
+	err := in.decode(&helper)
+	if err != nil {
+		return err
+	}
+
+	if helper.off != len(buf) {
+		return PacketDecodingError{"invalid length"}
+	}
+
+	return nil
+}
+
+func versionedDecode(buf []byte, in versionedDecoder, version int16) error {
+	if buf == nil {
+		return nil
+	}
+
+	helper := realDecoder{raw: buf}
+	err := in.decode(&helper, version)
+	if err != nil {
+		return err
+	}
+
+	if helper.off != len(buf) {
+		return PacketDecodingError{"invalid length"}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/end_txn_request.go b/vendor/github.com/Shopify/sarama/end_txn_request.go
new file mode 100644
index 0000000..2cd9b50
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/end_txn_request.go

@@ -0,0 +1,50 @@
+package sarama
+
+type EndTxnRequest struct {
+	TransactionalID   string
+	ProducerID        int64
+	ProducerEpoch     int16
+	TransactionResult bool
+}
+
+func (a *EndTxnRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(a.TransactionalID); err != nil {
+		return err
+	}
+
+	pe.putInt64(a.ProducerID)
+
+	pe.putInt16(a.ProducerEpoch)
+
+	pe.putBool(a.TransactionResult)
+
+	return nil
+}
+
+func (a *EndTxnRequest) decode(pd packetDecoder, version int16) (err error) {
+	if a.TransactionalID, err = pd.getString(); err != nil {
+		return err
+	}
+	if a.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if a.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+	if a.TransactionResult, err = pd.getBool(); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (a *EndTxnRequest) key() int16 {
+	return 26
+}
+
+func (a *EndTxnRequest) version() int16 {
+	return 0
+}
+
+func (a *EndTxnRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/end_txn_response.go b/vendor/github.com/Shopify/sarama/end_txn_response.go
new file mode 100644
index 0000000..33b27e3
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/end_txn_response.go

@@ -0,0 +1,44 @@
+package sarama
+
+import (
+	"time"
+)
+
+type EndTxnResponse struct {
+	ThrottleTime time.Duration
+	Err          KError
+}
+
+func (e *EndTxnResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(e.ThrottleTime / time.Millisecond))
+	pe.putInt16(int16(e.Err))
+	return nil
+}
+
+func (e *EndTxnResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	e.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	e.Err = KError(kerr)
+
+	return nil
+}
+
+func (e *EndTxnResponse) key() int16 {
+	return 25
+}
+
+func (e *EndTxnResponse) version() int16 {
+	return 0
+}
+
+func (e *EndTxnResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/errors.go b/vendor/github.com/Shopify/sarama/errors.go
new file mode 100644
index 0000000..c11421d
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/errors.go

@@ -0,0 +1,317 @@
+package sarama
+
+import (
+	"errors"
+	"fmt"
+)
+
+// ErrOutOfBrokers is the error returned when the client has run out of brokers to talk to because all of them errored
+// or otherwise failed to respond.
+var ErrOutOfBrokers = errors.New("kafka: client has run out of available brokers to talk to (Is your cluster reachable?)")
+
+// ErrClosedClient is the error returned when a method is called on a client that has been closed.
+var ErrClosedClient = errors.New("kafka: tried to use a client that was closed")
+
+// ErrIncompleteResponse is the error returned when the server returns a syntactically valid response, but it does
+// not contain the expected information.
+var ErrIncompleteResponse = errors.New("kafka: response did not contain all the expected topic/partition blocks")
+
+// ErrInvalidPartition is the error returned when a partitioner returns an invalid partition index
+// (meaning one outside of the range [0...numPartitions-1]).
+var ErrInvalidPartition = errors.New("kafka: partitioner returned an invalid partition index")
+
+// ErrAlreadyConnected is the error returned when calling Open() on a Broker that is already connected or connecting.
+var ErrAlreadyConnected = errors.New("kafka: broker connection already initiated")
+
+// ErrNotConnected is the error returned when trying to send or call Close() on a Broker that is not connected.
+var ErrNotConnected = errors.New("kafka: broker not connected")
+
+// ErrInsufficientData is returned when decoding and the packet is truncated. This can be expected
+// when requesting messages, since as an optimization the server is allowed to return a partial message at the end
+// of the message set.
+var ErrInsufficientData = errors.New("kafka: insufficient data to decode packet, more bytes expected")
+
+// ErrShuttingDown is returned when a producer receives a message during shutdown.
+var ErrShuttingDown = errors.New("kafka: message received by producer in process of shutting down")
+
+// ErrMessageTooLarge is returned when the next message to consume is larger than the configured Consumer.Fetch.Max
+var ErrMessageTooLarge = errors.New("kafka: message is larger than Consumer.Fetch.Max")
+
+// ErrConsumerOffsetNotAdvanced is returned when a partition consumer didn't advance its offset after parsing
+// a RecordBatch.
+var ErrConsumerOffsetNotAdvanced = errors.New("kafka: consumer offset was not advanced after a RecordBatch")
+
+// ErrControllerNotAvailable is returned when server didn't give correct controller id. May be kafka server's version
+// is lower than 0.10.0.0.
+var ErrControllerNotAvailable = errors.New("kafka: controller is not available")
+
+// ErrNoTopicsToUpdateMetadata is returned when Meta.Full is set to false but no specific topics were found to update
+// the metadata.
+var ErrNoTopicsToUpdateMetadata = errors.New("kafka: no specific topics to update metadata")
+
+// PacketEncodingError is returned from a failure while encoding a Kafka packet. This can happen, for example,
+// if you try to encode a string over 2^15 characters in length, since Kafka's encoding rules do not permit that.
+type PacketEncodingError struct {
+	Info string
+}
+
+func (err PacketEncodingError) Error() string {
+	return fmt.Sprintf("kafka: error encoding packet: %s", err.Info)
+}
+
+// PacketDecodingError is returned when there was an error (other than truncated data) decoding the Kafka broker's response.
+// This can be a bad CRC or length field, or any other invalid value.
+type PacketDecodingError struct {
+	Info string
+}
+
+func (err PacketDecodingError) Error() string {
+	return fmt.Sprintf("kafka: error decoding packet: %s", err.Info)
+}
+
+// ConfigurationError is the type of error returned from a constructor (e.g. NewClient, or NewConsumer)
+// when the specified configuration is invalid.
+type ConfigurationError string
+
+func (err ConfigurationError) Error() string {
+	return "kafka: invalid configuration (" + string(err) + ")"
+}
+
+// KError is the type of error that can be returned directly by the Kafka broker.
+// See https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ErrorCodes
+type KError int16
+
+// Numeric error codes returned by the Kafka server.
+const (
+	ErrNoError                            KError = 0
+	ErrUnknown                            KError = -1
+	ErrOffsetOutOfRange                   KError = 1
+	ErrInvalidMessage                     KError = 2
+	ErrUnknownTopicOrPartition            KError = 3
+	ErrInvalidMessageSize                 KError = 4
+	ErrLeaderNotAvailable                 KError = 5
+	ErrNotLeaderForPartition              KError = 6
+	ErrRequestTimedOut                    KError = 7
+	ErrBrokerNotAvailable                 KError = 8
+	ErrReplicaNotAvailable                KError = 9
+	ErrMessageSizeTooLarge                KError = 10
+	ErrStaleControllerEpochCode           KError = 11
+	ErrOffsetMetadataTooLarge             KError = 12
+	ErrNetworkException                   KError = 13
+	ErrOffsetsLoadInProgress              KError = 14
+	ErrConsumerCoordinatorNotAvailable    KError = 15
+	ErrNotCoordinatorForConsumer          KError = 16
+	ErrInvalidTopic                       KError = 17
+	ErrMessageSetSizeTooLarge             KError = 18
+	ErrNotEnoughReplicas                  KError = 19
+	ErrNotEnoughReplicasAfterAppend       KError = 20
+	ErrInvalidRequiredAcks                KError = 21
+	ErrIllegalGeneration                  KError = 22
+	ErrInconsistentGroupProtocol          KError = 23
+	ErrInvalidGroupId                     KError = 24
+	ErrUnknownMemberId                    KError = 25
+	ErrInvalidSessionTimeout              KError = 26
+	ErrRebalanceInProgress                KError = 27
+	ErrInvalidCommitOffsetSize            KError = 28
+	ErrTopicAuthorizationFailed           KError = 29
+	ErrGroupAuthorizationFailed           KError = 30
+	ErrClusterAuthorizationFailed         KError = 31
+	ErrInvalidTimestamp                   KError = 32
+	ErrUnsupportedSASLMechanism           KError = 33
+	ErrIllegalSASLState                   KError = 34
+	ErrUnsupportedVersion                 KError = 35
+	ErrTopicAlreadyExists                 KError = 36
+	ErrInvalidPartitions                  KError = 37
+	ErrInvalidReplicationFactor           KError = 38
+	ErrInvalidReplicaAssignment           KError = 39
+	ErrInvalidConfig                      KError = 40
+	ErrNotController                      KError = 41
+	ErrInvalidRequest                     KError = 42
+	ErrUnsupportedForMessageFormat        KError = 43
+	ErrPolicyViolation                    KError = 44
+	ErrOutOfOrderSequenceNumber           KError = 45
+	ErrDuplicateSequenceNumber            KError = 46
+	ErrInvalidProducerEpoch               KError = 47
+	ErrInvalidTxnState                    KError = 48
+	ErrInvalidProducerIDMapping           KError = 49
+	ErrInvalidTransactionTimeout          KError = 50
+	ErrConcurrentTransactions             KError = 51
+	ErrTransactionCoordinatorFenced       KError = 52
+	ErrTransactionalIDAuthorizationFailed KError = 53
+	ErrSecurityDisabled                   KError = 54
+	ErrOperationNotAttempted              KError = 55
+	ErrKafkaStorageError                  KError = 56
+	ErrLogDirNotFound                     KError = 57
+	ErrSASLAuthenticationFailed           KError = 58
+	ErrUnknownProducerID                  KError = 59
+	ErrReassignmentInProgress             KError = 60
+	ErrDelegationTokenAuthDisabled        KError = 61
+	ErrDelegationTokenNotFound            KError = 62
+	ErrDelegationTokenOwnerMismatch       KError = 63
+	ErrDelegationTokenRequestNotAllowed   KError = 64
+	ErrDelegationTokenAuthorizationFailed KError = 65
+	ErrDelegationTokenExpired             KError = 66
+	ErrInvalidPrincipalType               KError = 67
+	ErrNonEmptyGroup                      KError = 68
+	ErrGroupIDNotFound                    KError = 69
+	ErrFetchSessionIDNotFound             KError = 70
+	ErrInvalidFetchSessionEpoch           KError = 71
+	ErrListenerNotFound                   KError = 72
+)
+
+func (err KError) Error() string {
+	// Error messages stolen/adapted from
+	// https://kafka.apache.org/protocol#protocol_error_codes
+	switch err {
+	case ErrNoError:
+		return "kafka server: Not an error, why are you printing me?"
+	case ErrUnknown:
+		return "kafka server: Unexpected (unknown?) server error."
+	case ErrOffsetOutOfRange:
+		return "kafka server: The requested offset is outside the range of offsets maintained by the server for the given topic/partition."
+	case ErrInvalidMessage:
+		return "kafka server: Message contents does not match its CRC."
+	case ErrUnknownTopicOrPartition:
+		return "kafka server: Request was for a topic or partition that does not exist on this broker."
+	case ErrInvalidMessageSize:
+		return "kafka server: The message has a negative size."
+	case ErrLeaderNotAvailable:
+		return "kafka server: In the middle of a leadership election, there is currently no leader for this partition and hence it is unavailable for writes."
+	case ErrNotLeaderForPartition:
+		return "kafka server: Tried to send a message to a replica that is not the leader for some partition. Your metadata is out of date."
+	case ErrRequestTimedOut:
+		return "kafka server: Request exceeded the user-specified time limit in the request."
+	case ErrBrokerNotAvailable:
+		return "kafka server: Broker not available. Not a client facing error, we should never receive this!!!"
+	case ErrReplicaNotAvailable:
+		return "kafka server: Replica information not available, one or more brokers are down."
+	case ErrMessageSizeTooLarge:
+		return "kafka server: Message was too large, server rejected it to avoid allocation error."
+	case ErrStaleControllerEpochCode:
+		return "kafka server: StaleControllerEpochCode (internal error code for broker-to-broker communication)."
+	case ErrOffsetMetadataTooLarge:
+		return "kafka server: Specified a string larger than the configured maximum for offset metadata."
+	case ErrNetworkException:
+		return "kafka server: The server disconnected before a response was received."
+	case ErrOffsetsLoadInProgress:
+		return "kafka server: The broker is still loading offsets after a leader change for that offset's topic partition."
+	case ErrConsumerCoordinatorNotAvailable:
+		return "kafka server: Offset's topic has not yet been created."
+	case ErrNotCoordinatorForConsumer:
+		return "kafka server: Request was for a consumer group that is not coordinated by this broker."
+	case ErrInvalidTopic:
+		return "kafka server: The request attempted to perform an operation on an invalid topic."
+	case ErrMessageSetSizeTooLarge:
+		return "kafka server: The request included message batch larger than the configured segment size on the server."
+	case ErrNotEnoughReplicas:
+		return "kafka server: Messages are rejected since there are fewer in-sync replicas than required."
+	case ErrNotEnoughReplicasAfterAppend:
+		return "kafka server: Messages are written to the log, but to fewer in-sync replicas than required."
+	case ErrInvalidRequiredAcks:
+		return "kafka server: The number of required acks is invalid (should be either -1, 0, or 1)."
+	case ErrIllegalGeneration:
+		return "kafka server: The provided generation id is not the current generation."
+	case ErrInconsistentGroupProtocol:
+		return "kafka server: The provider group protocol type is incompatible with the other members."
+	case ErrInvalidGroupId:
+		return "kafka server: The provided group id was empty."
+	case ErrUnknownMemberId:
+		return "kafka server: The provided member is not known in the current generation."
+	case ErrInvalidSessionTimeout:
+		return "kafka server: The provided session timeout is outside the allowed range."
+	case ErrRebalanceInProgress:
+		return "kafka server: A rebalance for the group is in progress. Please re-join the group."
+	case ErrInvalidCommitOffsetSize:
+		return "kafka server: The provided commit metadata was too large."
+	case ErrTopicAuthorizationFailed:
+		return "kafka server: The client is not authorized to access this topic."
+	case ErrGroupAuthorizationFailed:
+		return "kafka server: The client is not authorized to access this group."
+	case ErrClusterAuthorizationFailed:
+		return "kafka server: The client is not authorized to send this request type."
+	case ErrInvalidTimestamp:
+		return "kafka server: The timestamp of the message is out of acceptable range."
+	case ErrUnsupportedSASLMechanism:
+		return "kafka server: The broker does not support the requested SASL mechanism."
+	case ErrIllegalSASLState:
+		return "kafka server: Request is not valid given the current SASL state."
+	case ErrUnsupportedVersion:
+		return "kafka server: The version of API is not supported."
+	case ErrTopicAlreadyExists:
+		return "kafka server: Topic with this name already exists."
+	case ErrInvalidPartitions:
+		return "kafka server: Number of partitions is invalid."
+	case ErrInvalidReplicationFactor:
+		return "kafka server: Replication-factor is invalid."
+	case ErrInvalidReplicaAssignment:
+		return "kafka server: Replica assignment is invalid."
+	case ErrInvalidConfig:
+		return "kafka server: Configuration is invalid."
+	case ErrNotController:
+		return "kafka server: This is not the correct controller for this cluster."
+	case ErrInvalidRequest:
+		return "kafka server: This most likely occurs because of a request being malformed by the client library or the message was sent to an incompatible broker. See the broker logs for more details."
+	case ErrUnsupportedForMessageFormat:
+		return "kafka server: The requested operation is not supported by the message format version."
+	case ErrPolicyViolation:
+		return "kafka server: Request parameters do not satisfy the configured policy."
+	case ErrOutOfOrderSequenceNumber:
+		return "kafka server: The broker received an out of order sequence number."
+	case ErrDuplicateSequenceNumber:
+		return "kafka server: The broker received a duplicate sequence number."
+	case ErrInvalidProducerEpoch:
+		return "kafka server: Producer attempted an operation with an old epoch."
+	case ErrInvalidTxnState:
+		return "kafka server: The producer attempted a transactional operation in an invalid state."
+	case ErrInvalidProducerIDMapping:
+		return "kafka server: The producer attempted to use a producer id which is not currently assigned to its transactional id."
+	case ErrInvalidTransactionTimeout:
+		return "kafka server: The transaction timeout is larger than the maximum value allowed by the broker (as configured by max.transaction.timeout.ms)."
+	case ErrConcurrentTransactions:
+		return "kafka server: The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing."
+	case ErrTransactionCoordinatorFenced:
+		return "kafka server: The transaction coordinator sending a WriteTxnMarker is no longer the current coordinator for a given producer."
+	case ErrTransactionalIDAuthorizationFailed:
+		return "kafka server: Transactional ID authorization failed."
+	case ErrSecurityDisabled:
+		return "kafka server: Security features are disabled."
+	case ErrOperationNotAttempted:
+		return "kafka server: The broker did not attempt to execute this operation."
+	case ErrKafkaStorageError:
+		return "kafka server: Disk error when trying to access log file on the disk."
+	case ErrLogDirNotFound:
+		return "kafka server: The specified log directory is not found in the broker config."
+	case ErrSASLAuthenticationFailed:
+		return "kafka server: SASL Authentication failed."
+	case ErrUnknownProducerID:
+		return "kafka server: The broker could not locate the producer metadata associated with the Producer ID."
+	case ErrReassignmentInProgress:
+		return "kafka server: A partition reassignment is in progress."
+	case ErrDelegationTokenAuthDisabled:
+		return "kafka server: Delegation Token feature is not enabled."
+	case ErrDelegationTokenNotFound:
+		return "kafka server: Delegation Token is not found on server."
+	case ErrDelegationTokenOwnerMismatch:
+		return "kafka server: Specified Principal is not valid Owner/Renewer."
+	case ErrDelegationTokenRequestNotAllowed:
+		return "kafka server: Delegation Token requests are not allowed on PLAINTEXT/1-way SSL channels and on delegation token authenticated channels."
+	case ErrDelegationTokenAuthorizationFailed:
+		return "kafka server: Delegation Token authorization failed."
+	case ErrDelegationTokenExpired:
+		return "kafka server: Delegation Token is expired."
+	case ErrInvalidPrincipalType:
+		return "kafka server: Supplied principalType is not supported."
+	case ErrNonEmptyGroup:
+		return "kafka server: The group is not empty."
+	case ErrGroupIDNotFound:
+		return "kafka server: The group id does not exist."
+	case ErrFetchSessionIDNotFound:
+		return "kafka server: The fetch session ID was not found."
+	case ErrInvalidFetchSessionEpoch:
+		return "kafka server: The fetch session epoch is invalid."
+	case ErrListenerNotFound:
+		return "kafka server: There is no listener on the leader broker that matches the listener on which metadata request was processed."
+	}
+
+	return fmt.Sprintf("Unknown error, how did this happen? Error code = %d", err)
+}

diff --git a/vendor/github.com/Shopify/sarama/fetch_request.go b/vendor/github.com/Shopify/sarama/fetch_request.go
new file mode 100644
index 0000000..462ab8a
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/fetch_request.go

@@ -0,0 +1,170 @@
+package sarama
+
+type fetchRequestBlock struct {
+	fetchOffset int64
+	maxBytes    int32
+}
+
+func (b *fetchRequestBlock) encode(pe packetEncoder) error {
+	pe.putInt64(b.fetchOffset)
+	pe.putInt32(b.maxBytes)
+	return nil
+}
+
+func (b *fetchRequestBlock) decode(pd packetDecoder) (err error) {
+	if b.fetchOffset, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if b.maxBytes, err = pd.getInt32(); err != nil {
+		return err
+	}
+	return nil
+}
+
+// FetchRequest (API key 1) will fetch Kafka messages. Version 3 introduced the MaxBytes field. See
+// https://issues.apache.org/jira/browse/KAFKA-2063 for a discussion of the issues leading up to that.  The KIP is at
+// https://cwiki.apache.org/confluence/display/KAFKA/KIP-74%3A+Add+Fetch+Response+Size+Limit+in+Bytes
+type FetchRequest struct {
+	MaxWaitTime int32
+	MinBytes    int32
+	MaxBytes    int32
+	Version     int16
+	Isolation   IsolationLevel
+	blocks      map[string]map[int32]*fetchRequestBlock
+}
+
+type IsolationLevel int8
+
+const (
+	ReadUncommitted IsolationLevel = 0
+	ReadCommitted   IsolationLevel = 1
+)
+
+func (r *FetchRequest) encode(pe packetEncoder) (err error) {
+	pe.putInt32(-1) // replica ID is always -1 for clients
+	pe.putInt32(r.MaxWaitTime)
+	pe.putInt32(r.MinBytes)
+	if r.Version >= 3 {
+		pe.putInt32(r.MaxBytes)
+	}
+	if r.Version >= 4 {
+		pe.putInt8(int8(r.Isolation))
+	}
+	err = pe.putArrayLength(len(r.blocks))
+	if err != nil {
+		return err
+	}
+	for topic, blocks := range r.blocks {
+		err = pe.putString(topic)
+		if err != nil {
+			return err
+		}
+		err = pe.putArrayLength(len(blocks))
+		if err != nil {
+			return err
+		}
+		for partition, block := range blocks {
+			pe.putInt32(partition)
+			err = block.encode(pe)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *FetchRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+	if _, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if r.MaxWaitTime, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if r.MinBytes, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if r.Version >= 3 {
+		if r.MaxBytes, err = pd.getInt32(); err != nil {
+			return err
+		}
+	}
+	if r.Version >= 4 {
+		isolation, err := pd.getInt8()
+		if err != nil {
+			return err
+		}
+		r.Isolation = IsolationLevel(isolation)
+	}
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+	r.blocks = make(map[string]map[int32]*fetchRequestBlock)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.blocks[topic] = make(map[int32]*fetchRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			fetchBlock := &fetchRequestBlock{}
+			if err = fetchBlock.decode(pd); err != nil {
+				return err
+			}
+			r.blocks[topic][partition] = fetchBlock
+		}
+	}
+	return nil
+}
+
+func (r *FetchRequest) key() int16 {
+	return 1
+}
+
+func (r *FetchRequest) version() int16 {
+	return r.Version
+}
+
+func (r *FetchRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_9_0_0
+	case 2:
+		return V0_10_0_0
+	case 3:
+		return V0_10_1_0
+	case 4:
+		return V0_11_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *FetchRequest) AddBlock(topic string, partitionID int32, fetchOffset int64, maxBytes int32) {
+	if r.blocks == nil {
+		r.blocks = make(map[string]map[int32]*fetchRequestBlock)
+	}
+
+	if r.blocks[topic] == nil {
+		r.blocks[topic] = make(map[int32]*fetchRequestBlock)
+	}
+
+	tmp := new(fetchRequestBlock)
+	tmp.maxBytes = maxBytes
+	tmp.fetchOffset = fetchOffset
+
+	r.blocks[topic][partitionID] = tmp
+}

diff --git a/vendor/github.com/Shopify/sarama/fetch_response.go b/vendor/github.com/Shopify/sarama/fetch_response.go
new file mode 100644
index 0000000..dade1c4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/fetch_response.go

@@ -0,0 +1,396 @@
+package sarama
+
+import (
+	"time"
+)
+
+type AbortedTransaction struct {
+	ProducerID  int64
+	FirstOffset int64
+}
+
+func (t *AbortedTransaction) decode(pd packetDecoder) (err error) {
+	if t.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	if t.FirstOffset, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (t *AbortedTransaction) encode(pe packetEncoder) (err error) {
+	pe.putInt64(t.ProducerID)
+	pe.putInt64(t.FirstOffset)
+
+	return nil
+}
+
+type FetchResponseBlock struct {
+	Err                 KError
+	HighWaterMarkOffset int64
+	LastStableOffset    int64
+	AbortedTransactions []*AbortedTransaction
+	Records             *Records // deprecated: use FetchResponseBlock.Records
+	RecordsSet          []*Records
+	Partial             bool
+}
+
+func (b *FetchResponseBlock) decode(pd packetDecoder, version int16) (err error) {
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	b.Err = KError(tmp)
+
+	b.HighWaterMarkOffset, err = pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	if version >= 4 {
+		b.LastStableOffset, err = pd.getInt64()
+		if err != nil {
+			return err
+		}
+
+		numTransact, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		if numTransact >= 0 {
+			b.AbortedTransactions = make([]*AbortedTransaction, numTransact)
+		}
+
+		for i := 0; i < numTransact; i++ {
+			transact := new(AbortedTransaction)
+			if err = transact.decode(pd); err != nil {
+				return err
+			}
+			b.AbortedTransactions[i] = transact
+		}
+	}
+
+	recordsSize, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	recordsDecoder, err := pd.getSubset(int(recordsSize))
+	if err != nil {
+		return err
+	}
+
+	b.RecordsSet = []*Records{}
+
+	for recordsDecoder.remaining() > 0 {
+		records := &Records{}
+		if err := records.decode(recordsDecoder); err != nil {
+			// If we have at least one decoded records, this is not an error
+			if err == ErrInsufficientData {
+				if len(b.RecordsSet) == 0 {
+					b.Partial = true
+				}
+				break
+			}
+			return err
+		}
+
+		partial, err := records.isPartial()
+		if err != nil {
+			return err
+		}
+
+		n, err := records.numRecords()
+		if err != nil {
+			return err
+		}
+
+		if n > 0 || (partial && len(b.RecordsSet) == 0) {
+			b.RecordsSet = append(b.RecordsSet, records)
+
+			if b.Records == nil {
+				b.Records = records
+			}
+		}
+
+		overflow, err := records.isOverflow()
+		if err != nil {
+			return err
+		}
+
+		if partial || overflow {
+			break
+		}
+	}
+
+	return nil
+}
+
+func (b *FetchResponseBlock) numRecords() (int, error) {
+	sum := 0
+
+	for _, records := range b.RecordsSet {
+		count, err := records.numRecords()
+		if err != nil {
+			return 0, err
+		}
+
+		sum += count
+	}
+
+	return sum, nil
+}
+
+func (b *FetchResponseBlock) isPartial() (bool, error) {
+	if b.Partial {
+		return true, nil
+	}
+
+	if len(b.RecordsSet) == 1 {
+		return b.RecordsSet[0].isPartial()
+	}
+
+	return false, nil
+}
+
+func (b *FetchResponseBlock) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt16(int16(b.Err))
+
+	pe.putInt64(b.HighWaterMarkOffset)
+
+	if version >= 4 {
+		pe.putInt64(b.LastStableOffset)
+
+		if err = pe.putArrayLength(len(b.AbortedTransactions)); err != nil {
+			return err
+		}
+		for _, transact := range b.AbortedTransactions {
+			if err = transact.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+
+	pe.push(&lengthField{})
+	for _, records := range b.RecordsSet {
+		err = records.encode(pe)
+		if err != nil {
+			return err
+		}
+	}
+	return pe.pop()
+}
+
+type FetchResponse struct {
+	Blocks       map[string]map[int32]*FetchResponseBlock
+	ThrottleTime time.Duration
+	Version      int16 // v1 requires 0.9+, v2 requires 0.10+
+}
+
+func (r *FetchResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if r.Version >= 1 {
+		throttle, err := pd.getInt32()
+		if err != nil {
+			return err
+		}
+		r.ThrottleTime = time.Duration(throttle) * time.Millisecond
+	}
+
+	numTopics, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Blocks = make(map[string]map[int32]*FetchResponseBlock, numTopics)
+	for i := 0; i < numTopics; i++ {
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		numBlocks, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		r.Blocks[name] = make(map[int32]*FetchResponseBlock, numBlocks)
+
+		for j := 0; j < numBlocks; j++ {
+			id, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+
+			block := new(FetchResponseBlock)
+			err = block.decode(pd, version)
+			if err != nil {
+				return err
+			}
+			r.Blocks[name][id] = block
+		}
+	}
+
+	return nil
+}
+
+func (r *FetchResponse) encode(pe packetEncoder) (err error) {
+	if r.Version >= 1 {
+		pe.putInt32(int32(r.ThrottleTime / time.Millisecond))
+	}
+
+	err = pe.putArrayLength(len(r.Blocks))
+	if err != nil {
+		return err
+	}
+
+	for topic, partitions := range r.Blocks {
+		err = pe.putString(topic)
+		if err != nil {
+			return err
+		}
+
+		err = pe.putArrayLength(len(partitions))
+		if err != nil {
+			return err
+		}
+
+		for id, block := range partitions {
+			pe.putInt32(id)
+			err = block.encode(pe, r.Version)
+			if err != nil {
+				return err
+			}
+		}
+
+	}
+	return nil
+}
+
+func (r *FetchResponse) key() int16 {
+	return 1
+}
+
+func (r *FetchResponse) version() int16 {
+	return r.Version
+}
+
+func (r *FetchResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_9_0_0
+	case 2:
+		return V0_10_0_0
+	case 3:
+		return V0_10_1_0
+	case 4:
+		return V0_11_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *FetchResponse) GetBlock(topic string, partition int32) *FetchResponseBlock {
+	if r.Blocks == nil {
+		return nil
+	}
+
+	if r.Blocks[topic] == nil {
+		return nil
+	}
+
+	return r.Blocks[topic][partition]
+}
+
+func (r *FetchResponse) AddError(topic string, partition int32, err KError) {
+	if r.Blocks == nil {
+		r.Blocks = make(map[string]map[int32]*FetchResponseBlock)
+	}
+	partitions, ok := r.Blocks[topic]
+	if !ok {
+		partitions = make(map[int32]*FetchResponseBlock)
+		r.Blocks[topic] = partitions
+	}
+	frb, ok := partitions[partition]
+	if !ok {
+		frb = new(FetchResponseBlock)
+		partitions[partition] = frb
+	}
+	frb.Err = err
+}
+
+func (r *FetchResponse) getOrCreateBlock(topic string, partition int32) *FetchResponseBlock {
+	if r.Blocks == nil {
+		r.Blocks = make(map[string]map[int32]*FetchResponseBlock)
+	}
+	partitions, ok := r.Blocks[topic]
+	if !ok {
+		partitions = make(map[int32]*FetchResponseBlock)
+		r.Blocks[topic] = partitions
+	}
+	frb, ok := partitions[partition]
+	if !ok {
+		frb = new(FetchResponseBlock)
+		partitions[partition] = frb
+	}
+
+	return frb
+}
+
+func encodeKV(key, value Encoder) ([]byte, []byte) {
+	var kb []byte
+	var vb []byte
+	if key != nil {
+		kb, _ = key.Encode()
+	}
+	if value != nil {
+		vb, _ = value.Encode()
+	}
+
+	return kb, vb
+}
+
+func (r *FetchResponse) AddMessage(topic string, partition int32, key, value Encoder, offset int64) {
+	frb := r.getOrCreateBlock(topic, partition)
+	kb, vb := encodeKV(key, value)
+	msg := &Message{Key: kb, Value: vb}
+	msgBlock := &MessageBlock{Msg: msg, Offset: offset}
+	if len(frb.RecordsSet) == 0 {
+		records := newLegacyRecords(&MessageSet{})
+		frb.RecordsSet = []*Records{&records}
+	}
+	set := frb.RecordsSet[0].MsgSet
+	set.Messages = append(set.Messages, msgBlock)
+}
+
+func (r *FetchResponse) AddRecord(topic string, partition int32, key, value Encoder, offset int64) {
+	frb := r.getOrCreateBlock(topic, partition)
+	kb, vb := encodeKV(key, value)
+	rec := &Record{Key: kb, Value: vb, OffsetDelta: offset}
+	if len(frb.RecordsSet) == 0 {
+		records := newDefaultRecords(&RecordBatch{Version: 2})
+		frb.RecordsSet = []*Records{&records}
+	}
+	batch := frb.RecordsSet[0].RecordBatch
+	batch.addRecord(rec)
+}
+
+func (r *FetchResponse) SetLastOffsetDelta(topic string, partition int32, offset int32) {
+	frb := r.getOrCreateBlock(topic, partition)
+	if len(frb.RecordsSet) == 0 {
+		records := newDefaultRecords(&RecordBatch{Version: 2})
+		frb.RecordsSet = []*Records{&records}
+	}
+	batch := frb.RecordsSet[0].RecordBatch
+	batch.LastOffsetDelta = offset
+}
+
+func (r *FetchResponse) SetLastStableOffset(topic string, partition int32, offset int64) {
+	frb := r.getOrCreateBlock(topic, partition)
+	frb.LastStableOffset = offset
+}

diff --git a/vendor/github.com/Shopify/sarama/find_coordinator_request.go b/vendor/github.com/Shopify/sarama/find_coordinator_request.go
new file mode 100644
index 0000000..0ab5cb5
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/find_coordinator_request.go

@@ -0,0 +1,61 @@
+package sarama
+
+type CoordinatorType int8
+
+const (
+	CoordinatorGroup       CoordinatorType = 0
+	CoordinatorTransaction CoordinatorType = 1
+)
+
+type FindCoordinatorRequest struct {
+	Version         int16
+	CoordinatorKey  string
+	CoordinatorType CoordinatorType
+}
+
+func (f *FindCoordinatorRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(f.CoordinatorKey); err != nil {
+		return err
+	}
+
+	if f.Version >= 1 {
+		pe.putInt8(int8(f.CoordinatorType))
+	}
+
+	return nil
+}
+
+func (f *FindCoordinatorRequest) decode(pd packetDecoder, version int16) (err error) {
+	if f.CoordinatorKey, err = pd.getString(); err != nil {
+		return err
+	}
+
+	if version >= 1 {
+		f.Version = version
+		coordinatorType, err := pd.getInt8()
+		if err != nil {
+			return err
+		}
+
+		f.CoordinatorType = CoordinatorType(coordinatorType)
+	}
+
+	return nil
+}
+
+func (f *FindCoordinatorRequest) key() int16 {
+	return 10
+}
+
+func (f *FindCoordinatorRequest) version() int16 {
+	return f.Version
+}
+
+func (f *FindCoordinatorRequest) requiredVersion() KafkaVersion {
+	switch f.Version {
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_8_2_0
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/find_coordinator_response.go b/vendor/github.com/Shopify/sarama/find_coordinator_response.go
new file mode 100644
index 0000000..9c900e8
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/find_coordinator_response.go

@@ -0,0 +1,92 @@
+package sarama
+
+import (
+	"time"
+)
+
+var NoNode = &Broker{id: -1, addr: ":-1"}
+
+type FindCoordinatorResponse struct {
+	Version      int16
+	ThrottleTime time.Duration
+	Err          KError
+	ErrMsg       *string
+	Coordinator  *Broker
+}
+
+func (f *FindCoordinatorResponse) decode(pd packetDecoder, version int16) (err error) {
+	if version >= 1 {
+		f.Version = version
+
+		throttleTime, err := pd.getInt32()
+		if err != nil {
+			return err
+		}
+		f.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+	}
+
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	f.Err = KError(tmp)
+
+	if version >= 1 {
+		if f.ErrMsg, err = pd.getNullableString(); err != nil {
+			return err
+		}
+	}
+
+	coordinator := new(Broker)
+	// The version is hardcoded to 0, as version 1 of the Broker-decode
+	// contains the rack-field which is not present in the FindCoordinatorResponse.
+	if err := coordinator.decode(pd, 0); err != nil {
+		return err
+	}
+	if coordinator.addr == ":0" {
+		return nil
+	}
+	f.Coordinator = coordinator
+
+	return nil
+}
+
+func (f *FindCoordinatorResponse) encode(pe packetEncoder) error {
+	if f.Version >= 1 {
+		pe.putInt32(int32(f.ThrottleTime / time.Millisecond))
+	}
+
+	pe.putInt16(int16(f.Err))
+
+	if f.Version >= 1 {
+		if err := pe.putNullableString(f.ErrMsg); err != nil {
+			return err
+		}
+	}
+
+	coordinator := f.Coordinator
+	if coordinator == nil {
+		coordinator = NoNode
+	}
+	if err := coordinator.encode(pe, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (f *FindCoordinatorResponse) key() int16 {
+	return 10
+}
+
+func (f *FindCoordinatorResponse) version() int16 {
+	return f.Version
+}
+
+func (f *FindCoordinatorResponse) requiredVersion() KafkaVersion {
+	switch f.Version {
+	case 1:
+		return V0_11_0_0
+	default:
+		return V0_8_2_0
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/heartbeat_request.go b/vendor/github.com/Shopify/sarama/heartbeat_request.go
new file mode 100644
index 0000000..ce49c47
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/heartbeat_request.go

@@ -0,0 +1,47 @@
+package sarama
+
+type HeartbeatRequest struct {
+	GroupId      string
+	GenerationId int32
+	MemberId     string
+}
+
+func (r *HeartbeatRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(r.GroupId); err != nil {
+		return err
+	}
+
+	pe.putInt32(r.GenerationId)
+
+	if err := pe.putString(r.MemberId); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *HeartbeatRequest) decode(pd packetDecoder, version int16) (err error) {
+	if r.GroupId, err = pd.getString(); err != nil {
+		return
+	}
+	if r.GenerationId, err = pd.getInt32(); err != nil {
+		return
+	}
+	if r.MemberId, err = pd.getString(); err != nil {
+		return
+	}
+
+	return nil
+}
+
+func (r *HeartbeatRequest) key() int16 {
+	return 12
+}
+
+func (r *HeartbeatRequest) version() int16 {
+	return 0
+}
+
+func (r *HeartbeatRequest) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/heartbeat_response.go b/vendor/github.com/Shopify/sarama/heartbeat_response.go
new file mode 100644
index 0000000..766f5fd
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/heartbeat_response.go

@@ -0,0 +1,32 @@
+package sarama
+
+type HeartbeatResponse struct {
+	Err KError
+}
+
+func (r *HeartbeatResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+	return nil
+}
+
+func (r *HeartbeatResponse) decode(pd packetDecoder, version int16) error {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	r.Err = KError(kerr)
+
+	return nil
+}
+
+func (r *HeartbeatResponse) key() int16 {
+	return 12
+}
+
+func (r *HeartbeatResponse) version() int16 {
+	return 0
+}
+
+func (r *HeartbeatResponse) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/init_producer_id_request.go b/vendor/github.com/Shopify/sarama/init_producer_id_request.go
new file mode 100644
index 0000000..8ceb6c2
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/init_producer_id_request.go

@@ -0,0 +1,43 @@
+package sarama
+
+import "time"
+
+type InitProducerIDRequest struct {
+	TransactionalID    *string
+	TransactionTimeout time.Duration
+}
+
+func (i *InitProducerIDRequest) encode(pe packetEncoder) error {
+	if err := pe.putNullableString(i.TransactionalID); err != nil {
+		return err
+	}
+	pe.putInt32(int32(i.TransactionTimeout / time.Millisecond))
+
+	return nil
+}
+
+func (i *InitProducerIDRequest) decode(pd packetDecoder, version int16) (err error) {
+	if i.TransactionalID, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	timeout, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	i.TransactionTimeout = time.Duration(timeout) * time.Millisecond
+
+	return nil
+}
+
+func (i *InitProducerIDRequest) key() int16 {
+	return 22
+}
+
+func (i *InitProducerIDRequest) version() int16 {
+	return 0
+}
+
+func (i *InitProducerIDRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/init_producer_id_response.go b/vendor/github.com/Shopify/sarama/init_producer_id_response.go
new file mode 100644
index 0000000..1b32eb0
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/init_producer_id_response.go

@@ -0,0 +1,55 @@
+package sarama
+
+import "time"
+
+type InitProducerIDResponse struct {
+	ThrottleTime  time.Duration
+	Err           KError
+	ProducerID    int64
+	ProducerEpoch int16
+}
+
+func (i *InitProducerIDResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(i.ThrottleTime / time.Millisecond))
+	pe.putInt16(int16(i.Err))
+	pe.putInt64(i.ProducerID)
+	pe.putInt16(i.ProducerEpoch)
+
+	return nil
+}
+
+func (i *InitProducerIDResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	i.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	i.Err = KError(kerr)
+
+	if i.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	if i.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (i *InitProducerIDResponse) key() int16 {
+	return 22
+}
+
+func (i *InitProducerIDResponse) version() int16 {
+	return 0
+}
+
+func (i *InitProducerIDResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/join_group_request.go b/vendor/github.com/Shopify/sarama/join_group_request.go
new file mode 100644
index 0000000..97e9299
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/join_group_request.go

@@ -0,0 +1,163 @@
+package sarama
+
+type GroupProtocol struct {
+	Name     string
+	Metadata []byte
+}
+
+func (p *GroupProtocol) decode(pd packetDecoder) (err error) {
+	p.Name, err = pd.getString()
+	if err != nil {
+		return err
+	}
+	p.Metadata, err = pd.getBytes()
+	return err
+}
+
+func (p *GroupProtocol) encode(pe packetEncoder) (err error) {
+	if err := pe.putString(p.Name); err != nil {
+		return err
+	}
+	if err := pe.putBytes(p.Metadata); err != nil {
+		return err
+	}
+	return nil
+}
+
+type JoinGroupRequest struct {
+	Version               int16
+	GroupId               string
+	SessionTimeout        int32
+	RebalanceTimeout      int32
+	MemberId              string
+	ProtocolType          string
+	GroupProtocols        map[string][]byte // deprecated; use OrderedGroupProtocols
+	OrderedGroupProtocols []*GroupProtocol
+}
+
+func (r *JoinGroupRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(r.GroupId); err != nil {
+		return err
+	}
+	pe.putInt32(r.SessionTimeout)
+	if r.Version >= 1 {
+		pe.putInt32(r.RebalanceTimeout)
+	}
+	if err := pe.putString(r.MemberId); err != nil {
+		return err
+	}
+	if err := pe.putString(r.ProtocolType); err != nil {
+		return err
+	}
+
+	if len(r.GroupProtocols) > 0 {
+		if len(r.OrderedGroupProtocols) > 0 {
+			return PacketDecodingError{"cannot specify both GroupProtocols and OrderedGroupProtocols on JoinGroupRequest"}
+		}
+
+		if err := pe.putArrayLength(len(r.GroupProtocols)); err != nil {
+			return err
+		}
+		for name, metadata := range r.GroupProtocols {
+			if err := pe.putString(name); err != nil {
+				return err
+			}
+			if err := pe.putBytes(metadata); err != nil {
+				return err
+			}
+		}
+	} else {
+		if err := pe.putArrayLength(len(r.OrderedGroupProtocols)); err != nil {
+			return err
+		}
+		for _, protocol := range r.OrderedGroupProtocols {
+			if err := protocol.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (r *JoinGroupRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if r.GroupId, err = pd.getString(); err != nil {
+		return
+	}
+
+	if r.SessionTimeout, err = pd.getInt32(); err != nil {
+		return
+	}
+
+	if version >= 1 {
+		if r.RebalanceTimeout, err = pd.getInt32(); err != nil {
+			return err
+		}
+	}
+
+	if r.MemberId, err = pd.getString(); err != nil {
+		return
+	}
+
+	if r.ProtocolType, err = pd.getString(); err != nil {
+		return
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	r.GroupProtocols = make(map[string][]byte)
+	for i := 0; i < n; i++ {
+		protocol := &GroupProtocol{}
+		if err := protocol.decode(pd); err != nil {
+			return err
+		}
+		r.GroupProtocols[protocol.Name] = protocol.Metadata
+		r.OrderedGroupProtocols = append(r.OrderedGroupProtocols, protocol)
+	}
+
+	return nil
+}
+
+func (r *JoinGroupRequest) key() int16 {
+	return 11
+}
+
+func (r *JoinGroupRequest) version() int16 {
+	return r.Version
+}
+
+func (r *JoinGroupRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 2:
+		return V0_11_0_0
+	case 1:
+		return V0_10_1_0
+	default:
+		return V0_9_0_0
+	}
+}
+
+func (r *JoinGroupRequest) AddGroupProtocol(name string, metadata []byte) {
+	r.OrderedGroupProtocols = append(r.OrderedGroupProtocols, &GroupProtocol{
+		Name:     name,
+		Metadata: metadata,
+	})
+}
+
+func (r *JoinGroupRequest) AddGroupProtocolMetadata(name string, metadata *ConsumerGroupMemberMetadata) error {
+	bin, err := encode(metadata, nil)
+	if err != nil {
+		return err
+	}
+
+	r.AddGroupProtocol(name, bin)
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/join_group_response.go b/vendor/github.com/Shopify/sarama/join_group_response.go
new file mode 100644
index 0000000..5752acc
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/join_group_response.go

@@ -0,0 +1,135 @@
+package sarama
+
+type JoinGroupResponse struct {
+	Version       int16
+	ThrottleTime  int32
+	Err           KError
+	GenerationId  int32
+	GroupProtocol string
+	LeaderId      string
+	MemberId      string
+	Members       map[string][]byte
+}
+
+func (r *JoinGroupResponse) GetMembers() (map[string]ConsumerGroupMemberMetadata, error) {
+	members := make(map[string]ConsumerGroupMemberMetadata, len(r.Members))
+	for id, bin := range r.Members {
+		meta := new(ConsumerGroupMemberMetadata)
+		if err := decode(bin, meta); err != nil {
+			return nil, err
+		}
+		members[id] = *meta
+	}
+	return members, nil
+}
+
+func (r *JoinGroupResponse) encode(pe packetEncoder) error {
+	if r.Version >= 2 {
+		pe.putInt32(r.ThrottleTime)
+	}
+	pe.putInt16(int16(r.Err))
+	pe.putInt32(r.GenerationId)
+
+	if err := pe.putString(r.GroupProtocol); err != nil {
+		return err
+	}
+	if err := pe.putString(r.LeaderId); err != nil {
+		return err
+	}
+	if err := pe.putString(r.MemberId); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(r.Members)); err != nil {
+		return err
+	}
+
+	for memberId, memberMetadata := range r.Members {
+		if err := pe.putString(memberId); err != nil {
+			return err
+		}
+
+		if err := pe.putBytes(memberMetadata); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *JoinGroupResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if version >= 2 {
+		if r.ThrottleTime, err = pd.getInt32(); err != nil {
+			return
+		}
+	}
+
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	r.Err = KError(kerr)
+
+	if r.GenerationId, err = pd.getInt32(); err != nil {
+		return
+	}
+
+	if r.GroupProtocol, err = pd.getString(); err != nil {
+		return
+	}
+
+	if r.LeaderId, err = pd.getString(); err != nil {
+		return
+	}
+
+	if r.MemberId, err = pd.getString(); err != nil {
+		return
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	r.Members = make(map[string][]byte)
+	for i := 0; i < n; i++ {
+		memberId, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		memberMetadata, err := pd.getBytes()
+		if err != nil {
+			return err
+		}
+
+		r.Members[memberId] = memberMetadata
+	}
+
+	return nil
+}
+
+func (r *JoinGroupResponse) key() int16 {
+	return 11
+}
+
+func (r *JoinGroupResponse) version() int16 {
+	return r.Version
+}
+
+func (r *JoinGroupResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 2:
+		return V0_11_0_0
+	case 1:
+		return V0_10_1_0
+	default:
+		return V0_9_0_0
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/leave_group_request.go b/vendor/github.com/Shopify/sarama/leave_group_request.go
new file mode 100644
index 0000000..e177427
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/leave_group_request.go

@@ -0,0 +1,40 @@
+package sarama
+
+type LeaveGroupRequest struct {
+	GroupId  string
+	MemberId string
+}
+
+func (r *LeaveGroupRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(r.GroupId); err != nil {
+		return err
+	}
+	if err := pe.putString(r.MemberId); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *LeaveGroupRequest) decode(pd packetDecoder, version int16) (err error) {
+	if r.GroupId, err = pd.getString(); err != nil {
+		return
+	}
+	if r.MemberId, err = pd.getString(); err != nil {
+		return
+	}
+
+	return nil
+}
+
+func (r *LeaveGroupRequest) key() int16 {
+	return 13
+}
+
+func (r *LeaveGroupRequest) version() int16 {
+	return 0
+}
+
+func (r *LeaveGroupRequest) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/leave_group_response.go b/vendor/github.com/Shopify/sarama/leave_group_response.go
new file mode 100644
index 0000000..d60c626
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/leave_group_response.go

@@ -0,0 +1,32 @@
+package sarama
+
+type LeaveGroupResponse struct {
+	Err KError
+}
+
+func (r *LeaveGroupResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+	return nil
+}
+
+func (r *LeaveGroupResponse) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	r.Err = KError(kerr)
+
+	return nil
+}
+
+func (r *LeaveGroupResponse) key() int16 {
+	return 13
+}
+
+func (r *LeaveGroupResponse) version() int16 {
+	return 0
+}
+
+func (r *LeaveGroupResponse) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/length_field.go b/vendor/github.com/Shopify/sarama/length_field.go
new file mode 100644
index 0000000..da199a7
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/length_field.go

@@ -0,0 +1,82 @@
+package sarama
+
+import "encoding/binary"
+
+// LengthField implements the PushEncoder and PushDecoder interfaces for calculating 4-byte lengths.
+type lengthField struct {
+	startOffset int
+	length      int32
+}
+
+func (l *lengthField) decode(pd packetDecoder) error {
+	var err error
+	l.length, err = pd.getInt32()
+	if err != nil {
+		return err
+	}
+	if l.length > int32(pd.remaining()) {
+		return ErrInsufficientData
+	}
+	return nil
+}
+
+func (l *lengthField) saveOffset(in int) {
+	l.startOffset = in
+}
+
+func (l *lengthField) reserveLength() int {
+	return 4
+}
+
+func (l *lengthField) run(curOffset int, buf []byte) error {
+	binary.BigEndian.PutUint32(buf[l.startOffset:], uint32(curOffset-l.startOffset-4))
+	return nil
+}
+
+func (l *lengthField) check(curOffset int, buf []byte) error {
+	if int32(curOffset-l.startOffset-4) != l.length {
+		return PacketDecodingError{"length field invalid"}
+	}
+
+	return nil
+}
+
+type varintLengthField struct {
+	startOffset int
+	length      int64
+}
+
+func (l *varintLengthField) decode(pd packetDecoder) error {
+	var err error
+	l.length, err = pd.getVarint()
+	return err
+}
+
+func (l *varintLengthField) saveOffset(in int) {
+	l.startOffset = in
+}
+
+func (l *varintLengthField) adjustLength(currOffset int) int {
+	oldFieldSize := l.reserveLength()
+	l.length = int64(currOffset - l.startOffset - oldFieldSize)
+
+	return l.reserveLength() - oldFieldSize
+}
+
+func (l *varintLengthField) reserveLength() int {
+	var tmp [binary.MaxVarintLen64]byte
+	return binary.PutVarint(tmp[:], l.length)
+}
+
+func (l *varintLengthField) run(curOffset int, buf []byte) error {
+	binary.PutVarint(buf[l.startOffset:], l.length)
+	return nil
+}
+
+func (l *varintLengthField) check(curOffset int, buf []byte) error {
+	if int64(curOffset-l.startOffset-l.reserveLength()) != l.length {
+		return PacketDecodingError{"length field invalid"}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/list_groups_request.go b/vendor/github.com/Shopify/sarama/list_groups_request.go
new file mode 100644
index 0000000..3b16abf
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/list_groups_request.go

@@ -0,0 +1,24 @@
+package sarama
+
+type ListGroupsRequest struct {
+}
+
+func (r *ListGroupsRequest) encode(pe packetEncoder) error {
+	return nil
+}
+
+func (r *ListGroupsRequest) decode(pd packetDecoder, version int16) (err error) {
+	return nil
+}
+
+func (r *ListGroupsRequest) key() int16 {
+	return 16
+}
+
+func (r *ListGroupsRequest) version() int16 {
+	return 0
+}
+
+func (r *ListGroupsRequest) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/list_groups_response.go b/vendor/github.com/Shopify/sarama/list_groups_response.go
new file mode 100644
index 0000000..56115d4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/list_groups_response.go

@@ -0,0 +1,69 @@
+package sarama
+
+type ListGroupsResponse struct {
+	Err    KError
+	Groups map[string]string
+}
+
+func (r *ListGroupsResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+
+	if err := pe.putArrayLength(len(r.Groups)); err != nil {
+		return err
+	}
+	for groupId, protocolType := range r.Groups {
+		if err := pe.putString(groupId); err != nil {
+			return err
+		}
+		if err := pe.putString(protocolType); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *ListGroupsResponse) decode(pd packetDecoder, version int16) error {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	r.Err = KError(kerr)
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	r.Groups = make(map[string]string)
+	for i := 0; i < n; i++ {
+		groupId, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		protocolType, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		r.Groups[groupId] = protocolType
+	}
+
+	return nil
+}
+
+func (r *ListGroupsResponse) key() int16 {
+	return 16
+}
+
+func (r *ListGroupsResponse) version() int16 {
+	return 0
+}
+
+func (r *ListGroupsResponse) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/message.go b/vendor/github.com/Shopify/sarama/message.go
new file mode 100644
index 0000000..44d5cc9
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/message.go

@@ -0,0 +1,239 @@
+package sarama
+
+import (
+	"bytes"
+	"compress/gzip"
+	"fmt"
+	"io/ioutil"
+	"time"
+
+	"github.com/eapache/go-xerial-snappy"
+	"github.com/pierrec/lz4"
+)
+
+// CompressionCodec represents the various compression codecs recognized by Kafka in messages.
+type CompressionCodec int8
+
+// The lowest 3 bits contain the compression codec used for the message
+const compressionCodecMask int8 = 0x07
+
+const (
+	CompressionNone   CompressionCodec = 0
+	CompressionGZIP   CompressionCodec = 1
+	CompressionSnappy CompressionCodec = 2
+	CompressionLZ4    CompressionCodec = 3
+	CompressionZSTD   CompressionCodec = 4
+)
+
+func (cc CompressionCodec) String() string {
+	return []string{
+		"none",
+		"gzip",
+		"snappy",
+		"lz4",
+	}[int(cc)]
+}
+
+// CompressionLevelDefault is the constant to use in CompressionLevel
+// to have the default compression level for any codec. The value is picked
+// that we don't use any existing compression levels.
+const CompressionLevelDefault = -1000
+
+type Message struct {
+	Codec            CompressionCodec // codec used to compress the message contents
+	CompressionLevel int              // compression level
+	Key              []byte           // the message key, may be nil
+	Value            []byte           // the message contents
+	Set              *MessageSet      // the message set a message might wrap
+	Version          int8             // v1 requires Kafka 0.10
+	Timestamp        time.Time        // the timestamp of the message (version 1+ only)
+
+	compressedCache []byte
+	compressedSize  int // used for computing the compression ratio metrics
+}
+
+func (m *Message) encode(pe packetEncoder) error {
+	pe.push(newCRC32Field(crcIEEE))
+
+	pe.putInt8(m.Version)
+
+	attributes := int8(m.Codec) & compressionCodecMask
+	pe.putInt8(attributes)
+
+	if m.Version >= 1 {
+		if err := (Timestamp{&m.Timestamp}).encode(pe); err != nil {
+			return err
+		}
+	}
+
+	err := pe.putBytes(m.Key)
+	if err != nil {
+		return err
+	}
+
+	var payload []byte
+
+	if m.compressedCache != nil {
+		payload = m.compressedCache
+		m.compressedCache = nil
+	} else if m.Value != nil {
+		switch m.Codec {
+		case CompressionNone:
+			payload = m.Value
+		case CompressionGZIP:
+			var buf bytes.Buffer
+			var writer *gzip.Writer
+			if m.CompressionLevel != CompressionLevelDefault {
+				writer, err = gzip.NewWriterLevel(&buf, m.CompressionLevel)
+				if err != nil {
+					return err
+				}
+			} else {
+				writer = gzip.NewWriter(&buf)
+			}
+			if _, err = writer.Write(m.Value); err != nil {
+				return err
+			}
+			if err = writer.Close(); err != nil {
+				return err
+			}
+			m.compressedCache = buf.Bytes()
+			payload = m.compressedCache
+		case CompressionSnappy:
+			tmp := snappy.Encode(m.Value)
+			m.compressedCache = tmp
+			payload = m.compressedCache
+		case CompressionLZ4:
+			var buf bytes.Buffer
+			writer := lz4.NewWriter(&buf)
+			if _, err = writer.Write(m.Value); err != nil {
+				return err
+			}
+			if err = writer.Close(); err != nil {
+				return err
+			}
+			m.compressedCache = buf.Bytes()
+			payload = m.compressedCache
+		case CompressionZSTD:
+			c, err := zstdCompressLevel(nil, m.Value, m.CompressionLevel)
+			if err != nil {
+				return err
+			}
+			m.compressedCache = c
+			payload = m.compressedCache
+		default:
+			return PacketEncodingError{fmt.Sprintf("unsupported compression codec (%d)", m.Codec)}
+		}
+		// Keep in mind the compressed payload size for metric gathering
+		m.compressedSize = len(payload)
+	}
+
+	if err = pe.putBytes(payload); err != nil {
+		return err
+	}
+
+	return pe.pop()
+}
+
+func (m *Message) decode(pd packetDecoder) (err error) {
+	err = pd.push(newCRC32Field(crcIEEE))
+	if err != nil {
+		return err
+	}
+
+	m.Version, err = pd.getInt8()
+	if err != nil {
+		return err
+	}
+
+	if m.Version > 1 {
+		return PacketDecodingError{fmt.Sprintf("unknown magic byte (%v)", m.Version)}
+	}
+
+	attribute, err := pd.getInt8()
+	if err != nil {
+		return err
+	}
+	m.Codec = CompressionCodec(attribute & compressionCodecMask)
+
+	if m.Version == 1 {
+		if err := (Timestamp{&m.Timestamp}).decode(pd); err != nil {
+			return err
+		}
+	}
+
+	m.Key, err = pd.getBytes()
+	if err != nil {
+		return err
+	}
+
+	m.Value, err = pd.getBytes()
+	if err != nil {
+		return err
+	}
+
+	// Required for deep equal assertion during tests but might be useful
+	// for future metrics about the compression ratio in fetch requests
+	m.compressedSize = len(m.Value)
+
+	switch m.Codec {
+	case CompressionNone:
+		// nothing to do
+	case CompressionGZIP:
+		if m.Value == nil {
+			break
+		}
+		reader, err := gzip.NewReader(bytes.NewReader(m.Value))
+		if err != nil {
+			return err
+		}
+		if m.Value, err = ioutil.ReadAll(reader); err != nil {
+			return err
+		}
+		if err := m.decodeSet(); err != nil {
+			return err
+		}
+	case CompressionSnappy:
+		if m.Value == nil {
+			break
+		}
+		if m.Value, err = snappy.Decode(m.Value); err != nil {
+			return err
+		}
+		if err := m.decodeSet(); err != nil {
+			return err
+		}
+	case CompressionLZ4:
+		if m.Value == nil {
+			break
+		}
+		reader := lz4.NewReader(bytes.NewReader(m.Value))
+		if m.Value, err = ioutil.ReadAll(reader); err != nil {
+			return err
+		}
+		if err := m.decodeSet(); err != nil {
+			return err
+		}
+	case CompressionZSTD:
+		if m.Value == nil {
+			break
+		}
+		if m.Value, err = zstdDecompress(nil, m.Value); err != nil {
+			return err
+		}
+		if err := m.decodeSet(); err != nil {
+			return err
+		}
+	default:
+		return PacketDecodingError{fmt.Sprintf("invalid compression specified (%d)", m.Codec)}
+	}
+
+	return pd.pop()
+}
+
+// decodes a message set from a previousy encoded bulk-message
+func (m *Message) decodeSet() (err error) {
+	pd := realDecoder{raw: m.Value}
+	m.Set = &MessageSet{}
+	return m.Set.decode(&pd)
+}

diff --git a/vendor/github.com/Shopify/sarama/message_set.go b/vendor/github.com/Shopify/sarama/message_set.go
new file mode 100644
index 0000000..600c7c4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/message_set.go

@@ -0,0 +1,108 @@
+package sarama
+
+type MessageBlock struct {
+	Offset int64
+	Msg    *Message
+}
+
+// Messages convenience helper which returns either all the
+// messages that are wrapped in this block
+func (msb *MessageBlock) Messages() []*MessageBlock {
+	if msb.Msg.Set != nil {
+		return msb.Msg.Set.Messages
+	}
+	return []*MessageBlock{msb}
+}
+
+func (msb *MessageBlock) encode(pe packetEncoder) error {
+	pe.putInt64(msb.Offset)
+	pe.push(&lengthField{})
+	err := msb.Msg.encode(pe)
+	if err != nil {
+		return err
+	}
+	return pe.pop()
+}
+
+func (msb *MessageBlock) decode(pd packetDecoder) (err error) {
+	if msb.Offset, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	if err = pd.push(&lengthField{}); err != nil {
+		return err
+	}
+
+	msb.Msg = new(Message)
+	if err = msb.Msg.decode(pd); err != nil {
+		return err
+	}
+
+	if err = pd.pop(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+type MessageSet struct {
+	PartialTrailingMessage bool // whether the set on the wire contained an incomplete trailing MessageBlock
+	OverflowMessage        bool // whether the set on the wire contained an overflow message
+	Messages               []*MessageBlock
+}
+
+func (ms *MessageSet) encode(pe packetEncoder) error {
+	for i := range ms.Messages {
+		err := ms.Messages[i].encode(pe)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (ms *MessageSet) decode(pd packetDecoder) (err error) {
+	ms.Messages = nil
+
+	for pd.remaining() > 0 {
+		magic, err := magicValue(pd)
+		if err != nil {
+			if err == ErrInsufficientData {
+				ms.PartialTrailingMessage = true
+				return nil
+			}
+			return err
+		}
+
+		if magic > 1 {
+			return nil
+		}
+
+		msb := new(MessageBlock)
+		err = msb.decode(pd)
+		switch err {
+		case nil:
+			ms.Messages = append(ms.Messages, msb)
+		case ErrInsufficientData:
+			// As an optimization the server is allowed to return a partial message at the
+			// end of the message set. Clients should handle this case. So we just ignore such things.
+			if msb.Offset == -1 {
+				// This is an overflow message caused by chunked down conversion
+				ms.OverflowMessage = true
+			} else {
+				ms.PartialTrailingMessage = true
+			}
+			return nil
+		default:
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (ms *MessageSet) addMessage(msg *Message) {
+	block := new(MessageBlock)
+	block.Msg = msg
+	ms.Messages = append(ms.Messages, block)
+}

diff --git a/vendor/github.com/Shopify/sarama/metadata_request.go b/vendor/github.com/Shopify/sarama/metadata_request.go
new file mode 100644
index 0000000..17dc428
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/metadata_request.go

@@ -0,0 +1,88 @@
+package sarama
+
+type MetadataRequest struct {
+	Version                int16
+	Topics                 []string
+	AllowAutoTopicCreation bool
+}
+
+func (r *MetadataRequest) encode(pe packetEncoder) error {
+	if r.Version < 0 || r.Version > 5 {
+		return PacketEncodingError{"invalid or unsupported MetadataRequest version field"}
+	}
+	if r.Version == 0 || len(r.Topics) > 0 {
+		err := pe.putArrayLength(len(r.Topics))
+		if err != nil {
+			return err
+		}
+
+		for i := range r.Topics {
+			err = pe.putString(r.Topics[i])
+			if err != nil {
+				return err
+			}
+		}
+	} else {
+		pe.putInt32(-1)
+	}
+	if r.Version > 3 {
+		pe.putBool(r.AllowAutoTopicCreation)
+	}
+	return nil
+}
+
+func (r *MetadataRequest) decode(pd packetDecoder, version int16) error {
+	r.Version = version
+	size, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	if size < 0 {
+		return nil
+	} else {
+		topicCount := size
+		if topicCount == 0 {
+			return nil
+		}
+
+		r.Topics = make([]string, topicCount)
+		for i := range r.Topics {
+			topic, err := pd.getString()
+			if err != nil {
+				return err
+			}
+			r.Topics[i] = topic
+		}
+	}
+	if r.Version > 3 {
+		autoCreation, err := pd.getBool()
+		if err != nil {
+			return err
+		}
+		r.AllowAutoTopicCreation = autoCreation
+	}
+	return nil
+}
+
+func (r *MetadataRequest) key() int16 {
+	return 3
+}
+
+func (r *MetadataRequest) version() int16 {
+	return r.Version
+}
+
+func (r *MetadataRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_10_0_0
+	case 2:
+		return V0_10_1_0
+	case 3, 4:
+		return V0_11_0_0
+	case 5:
+		return V1_0_0_0
+	default:
+		return MinVersion
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/metadata_response.go b/vendor/github.com/Shopify/sarama/metadata_response.go
new file mode 100644
index 0000000..c402d05
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/metadata_response.go

@@ -0,0 +1,321 @@
+package sarama
+
+type PartitionMetadata struct {
+	Err             KError
+	ID              int32
+	Leader          int32
+	Replicas        []int32
+	Isr             []int32
+	OfflineReplicas []int32
+}
+
+func (pm *PartitionMetadata) decode(pd packetDecoder, version int16) (err error) {
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	pm.Err = KError(tmp)
+
+	pm.ID, err = pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	pm.Leader, err = pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	pm.Replicas, err = pd.getInt32Array()
+	if err != nil {
+		return err
+	}
+
+	pm.Isr, err = pd.getInt32Array()
+	if err != nil {
+		return err
+	}
+
+	if version >= 5 {
+		pm.OfflineReplicas, err = pd.getInt32Array()
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (pm *PartitionMetadata) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt16(int16(pm.Err))
+	pe.putInt32(pm.ID)
+	pe.putInt32(pm.Leader)
+
+	err = pe.putInt32Array(pm.Replicas)
+	if err != nil {
+		return err
+	}
+
+	err = pe.putInt32Array(pm.Isr)
+	if err != nil {
+		return err
+	}
+
+	if version >= 5 {
+		err = pe.putInt32Array(pm.OfflineReplicas)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+type TopicMetadata struct {
+	Err        KError
+	Name       string
+	IsInternal bool // Only valid for Version >= 1
+	Partitions []*PartitionMetadata
+}
+
+func (tm *TopicMetadata) decode(pd packetDecoder, version int16) (err error) {
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	tm.Err = KError(tmp)
+
+	tm.Name, err = pd.getString()
+	if err != nil {
+		return err
+	}
+
+	if version >= 1 {
+		tm.IsInternal, err = pd.getBool()
+		if err != nil {
+			return err
+		}
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	tm.Partitions = make([]*PartitionMetadata, n)
+	for i := 0; i < n; i++ {
+		tm.Partitions[i] = new(PartitionMetadata)
+		err = tm.Partitions[i].decode(pd, version)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (tm *TopicMetadata) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt16(int16(tm.Err))
+
+	err = pe.putString(tm.Name)
+	if err != nil {
+		return err
+	}
+
+	if version >= 1 {
+		pe.putBool(tm.IsInternal)
+	}
+
+	err = pe.putArrayLength(len(tm.Partitions))
+	if err != nil {
+		return err
+	}
+
+	for _, pm := range tm.Partitions {
+		err = pm.encode(pe, version)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+type MetadataResponse struct {
+	Version        int16
+	ThrottleTimeMs int32
+	Brokers        []*Broker
+	ClusterID      *string
+	ControllerID   int32
+	Topics         []*TopicMetadata
+}
+
+func (r *MetadataResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if version >= 3 {
+		r.ThrottleTimeMs, err = pd.getInt32()
+		if err != nil {
+			return err
+		}
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Brokers = make([]*Broker, n)
+	for i := 0; i < n; i++ {
+		r.Brokers[i] = new(Broker)
+		err = r.Brokers[i].decode(pd, version)
+		if err != nil {
+			return err
+		}
+	}
+
+	if version >= 2 {
+		r.ClusterID, err = pd.getNullableString()
+		if err != nil {
+			return err
+		}
+	}
+
+	if version >= 1 {
+		r.ControllerID, err = pd.getInt32()
+		if err != nil {
+			return err
+		}
+	} else {
+		r.ControllerID = -1
+	}
+
+	n, err = pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Topics = make([]*TopicMetadata, n)
+	for i := 0; i < n; i++ {
+		r.Topics[i] = new(TopicMetadata)
+		err = r.Topics[i].decode(pd, version)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *MetadataResponse) encode(pe packetEncoder) error {
+	if r.Version >= 3 {
+		pe.putInt32(r.ThrottleTimeMs)
+	}
+
+	err := pe.putArrayLength(len(r.Brokers))
+	if err != nil {
+		return err
+	}
+	for _, broker := range r.Brokers {
+		err = broker.encode(pe, r.Version)
+		if err != nil {
+			return err
+		}
+	}
+
+	if r.Version >= 2 {
+		err := pe.putNullableString(r.ClusterID)
+		if err != nil {
+			return err
+		}
+	}
+
+	if r.Version >= 1 {
+		pe.putInt32(r.ControllerID)
+	}
+
+	err = pe.putArrayLength(len(r.Topics))
+	if err != nil {
+		return err
+	}
+	for _, tm := range r.Topics {
+		err = tm.encode(pe, r.Version)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *MetadataResponse) key() int16 {
+	return 3
+}
+
+func (r *MetadataResponse) version() int16 {
+	return r.Version
+}
+
+func (r *MetadataResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_10_0_0
+	case 2:
+		return V0_10_1_0
+	case 3, 4:
+		return V0_11_0_0
+	case 5:
+		return V1_0_0_0
+	default:
+		return MinVersion
+	}
+}
+
+// testing API
+
+func (r *MetadataResponse) AddBroker(addr string, id int32) {
+	r.Brokers = append(r.Brokers, &Broker{id: id, addr: addr})
+}
+
+func (r *MetadataResponse) AddTopic(topic string, err KError) *TopicMetadata {
+	var tmatch *TopicMetadata
+
+	for _, tm := range r.Topics {
+		if tm.Name == topic {
+			tmatch = tm
+			goto foundTopic
+		}
+	}
+
+	tmatch = new(TopicMetadata)
+	tmatch.Name = topic
+	r.Topics = append(r.Topics, tmatch)
+
+foundTopic:
+
+	tmatch.Err = err
+	return tmatch
+}
+
+func (r *MetadataResponse) AddTopicPartition(topic string, partition, brokerID int32, replicas, isr []int32, err KError) {
+	tmatch := r.AddTopic(topic, ErrNoError)
+	var pmatch *PartitionMetadata
+
+	for _, pm := range tmatch.Partitions {
+		if pm.ID == partition {
+			pmatch = pm
+			goto foundPartition
+		}
+	}
+
+	pmatch = new(PartitionMetadata)
+	pmatch.ID = partition
+	tmatch.Partitions = append(tmatch.Partitions, pmatch)
+
+foundPartition:
+
+	pmatch.Leader = brokerID
+	pmatch.Replicas = replicas
+	pmatch.Isr = isr
+	pmatch.Err = err
+
+}

diff --git a/vendor/github.com/Shopify/sarama/metrics.go b/vendor/github.com/Shopify/sarama/metrics.go
new file mode 100644
index 0000000..4869708
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/metrics.go

@@ -0,0 +1,51 @@
+package sarama
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+// Use exponentially decaying reservoir for sampling histograms with the same defaults as the Java library:
+// 1028 elements, which offers a 99.9% confidence level with a 5% margin of error assuming a normal distribution,
+// and an alpha factor of 0.015, which heavily biases the reservoir to the past 5 minutes of measurements.
+// See https://github.com/dropwizard/metrics/blob/v3.1.0/metrics-core/src/main/java/com/codahale/metrics/ExponentiallyDecayingReservoir.java#L38
+const (
+	metricsReservoirSize = 1028
+	metricsAlphaFactor   = 0.015
+)
+
+func getOrRegisterHistogram(name string, r metrics.Registry) metrics.Histogram {
+	return r.GetOrRegister(name, func() metrics.Histogram {
+		return metrics.NewHistogram(metrics.NewExpDecaySample(metricsReservoirSize, metricsAlphaFactor))
+	}).(metrics.Histogram)
+}
+
+func getMetricNameForBroker(name string, broker *Broker) string {
+	// Use broker id like the Java client as it does not contain '.' or ':' characters that
+	// can be interpreted as special character by monitoring tool (e.g. Graphite)
+	return fmt.Sprintf(name+"-for-broker-%d", broker.ID())
+}
+
+func getOrRegisterBrokerMeter(name string, broker *Broker, r metrics.Registry) metrics.Meter {
+	return metrics.GetOrRegisterMeter(getMetricNameForBroker(name, broker), r)
+}
+
+func getOrRegisterBrokerHistogram(name string, broker *Broker, r metrics.Registry) metrics.Histogram {
+	return getOrRegisterHistogram(getMetricNameForBroker(name, broker), r)
+}
+
+func getMetricNameForTopic(name string, topic string) string {
+	// Convert dot to _ since reporters like Graphite typically use dot to represent hierarchy
+	// cf. KAFKA-1902 and KAFKA-2337
+	return fmt.Sprintf(name+"-for-topic-%s", strings.Replace(topic, ".", "_", -1))
+}
+
+func getOrRegisterTopicMeter(name string, topic string, r metrics.Registry) metrics.Meter {
+	return metrics.GetOrRegisterMeter(getMetricNameForTopic(name, topic), r)
+}
+
+func getOrRegisterTopicHistogram(name string, topic string, r metrics.Registry) metrics.Histogram {
+	return getOrRegisterHistogram(getMetricNameForTopic(name, topic), r)
+}

diff --git a/vendor/github.com/Shopify/sarama/mockbroker.go b/vendor/github.com/Shopify/sarama/mockbroker.go
new file mode 100644
index 0000000..55ef1e2
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/mockbroker.go

@@ -0,0 +1,330 @@
+package sarama
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"net"
+	"reflect"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/davecgh/go-spew/spew"
+)
+
+const (
+	expectationTimeout = 500 * time.Millisecond
+)
+
+type requestHandlerFunc func(req *request) (res encoder)
+
+// RequestNotifierFunc is invoked when a mock broker processes a request successfully
+// and will provides the number of bytes read and written.
+type RequestNotifierFunc func(bytesRead, bytesWritten int)
+
+// MockBroker is a mock Kafka broker that is used in unit tests. It is exposed
+// to facilitate testing of higher level or specialized consumers and producers
+// built on top of Sarama. Note that it does not 'mimic' the Kafka API protocol,
+// but rather provides a facility to do that. It takes care of the TCP
+// transport, request unmarshaling, response marshaling, and makes it the test
+// writer responsibility to program correct according to the Kafka API protocol
+// MockBroker behaviour.
+//
+// MockBroker is implemented as a TCP server listening on a kernel-selected
+// localhost port that can accept many connections. It reads Kafka requests
+// from that connection and returns responses programmed by the SetHandlerByMap
+// function. If a MockBroker receives a request that it has no programmed
+// response for, then it returns nothing and the request times out.
+//
+// A set of MockRequest builders to define mappings used by MockBroker is
+// provided by Sarama. But users can develop MockRequests of their own and use
+// them along with or instead of the standard ones.
+//
+// When running tests with MockBroker it is strongly recommended to specify
+// a timeout to `go test` so that if the broker hangs waiting for a response,
+// the test panics.
+//
+// It is not necessary to prefix message length or correlation ID to your
+// response bytes, the server does that automatically as a convenience.
+type MockBroker struct {
+	brokerID     int32
+	port         int32
+	closing      chan none
+	stopper      chan none
+	expectations chan encoder
+	listener     net.Listener
+	t            TestReporter
+	latency      time.Duration
+	handler      requestHandlerFunc
+	notifier     RequestNotifierFunc
+	history      []RequestResponse
+	lock         sync.Mutex
+}
+
+// RequestResponse represents a Request/Response pair processed by MockBroker.
+type RequestResponse struct {
+	Request  protocolBody
+	Response encoder
+}
+
+// SetLatency makes broker pause for the specified period every time before
+// replying.
+func (b *MockBroker) SetLatency(latency time.Duration) {
+	b.latency = latency
+}
+
+// SetHandlerByMap defines mapping of Request types to MockResponses. When a
+// request is received by the broker, it looks up the request type in the map
+// and uses the found MockResponse instance to generate an appropriate reply.
+// If the request type is not found in the map then nothing is sent.
+func (b *MockBroker) SetHandlerByMap(handlerMap map[string]MockResponse) {
+	b.setHandler(func(req *request) (res encoder) {
+		reqTypeName := reflect.TypeOf(req.body).Elem().Name()
+		mockResponse := handlerMap[reqTypeName]
+		if mockResponse == nil {
+			return nil
+		}
+		return mockResponse.For(req.body)
+	})
+}
+
+// SetNotifier set a function that will get invoked whenever a request has been
+// processed successfully and will provide the number of bytes read and written
+func (b *MockBroker) SetNotifier(notifier RequestNotifierFunc) {
+	b.lock.Lock()
+	b.notifier = notifier
+	b.lock.Unlock()
+}
+
+// BrokerID returns broker ID assigned to the broker.
+func (b *MockBroker) BrokerID() int32 {
+	return b.brokerID
+}
+
+// History returns a slice of RequestResponse pairs in the order they were
+// processed by the broker. Note that in case of multiple connections to the
+// broker the order expected by a test can be different from the order recorded
+// in the history, unless some synchronization is implemented in the test.
+func (b *MockBroker) History() []RequestResponse {
+	b.lock.Lock()
+	history := make([]RequestResponse, len(b.history))
+	copy(history, b.history)
+	b.lock.Unlock()
+	return history
+}
+
+// Port returns the TCP port number the broker is listening for requests on.
+func (b *MockBroker) Port() int32 {
+	return b.port
+}
+
+// Addr returns the broker connection string in the form "<address>:<port>".
+func (b *MockBroker) Addr() string {
+	return b.listener.Addr().String()
+}
+
+// Close terminates the broker blocking until it stops internal goroutines and
+// releases all resources.
+func (b *MockBroker) Close() {
+	close(b.expectations)
+	if len(b.expectations) > 0 {
+		buf := bytes.NewBufferString(fmt.Sprintf("mockbroker/%d: not all expectations were satisfied! Still waiting on:\n", b.BrokerID()))
+		for e := range b.expectations {
+			_, _ = buf.WriteString(spew.Sdump(e))
+		}
+		b.t.Error(buf.String())
+	}
+	close(b.closing)
+	<-b.stopper
+}
+
+// setHandler sets the specified function as the request handler. Whenever
+// a mock broker reads a request from the wire it passes the request to the
+// function and sends back whatever the handler function returns.
+func (b *MockBroker) setHandler(handler requestHandlerFunc) {
+	b.lock.Lock()
+	b.handler = handler
+	b.lock.Unlock()
+}
+
+func (b *MockBroker) serverLoop() {
+	defer close(b.stopper)
+	var err error
+	var conn net.Conn
+
+	go func() {
+		<-b.closing
+		err := b.listener.Close()
+		if err != nil {
+			b.t.Error(err)
+		}
+	}()
+
+	wg := &sync.WaitGroup{}
+	i := 0
+	for conn, err = b.listener.Accept(); err == nil; conn, err = b.listener.Accept() {
+		wg.Add(1)
+		go b.handleRequests(conn, i, wg)
+		i++
+	}
+	wg.Wait()
+	Logger.Printf("*** mockbroker/%d: listener closed, err=%v", b.BrokerID(), err)
+}
+
+func (b *MockBroker) handleRequests(conn net.Conn, idx int, wg *sync.WaitGroup) {
+	defer wg.Done()
+	defer func() {
+		_ = conn.Close()
+	}()
+	Logger.Printf("*** mockbroker/%d/%d: connection opened", b.BrokerID(), idx)
+	var err error
+
+	abort := make(chan none)
+	defer close(abort)
+	go func() {
+		select {
+		case <-b.closing:
+			_ = conn.Close()
+		case <-abort:
+		}
+	}()
+
+	resHeader := make([]byte, 8)
+	for {
+		req, bytesRead, err := decodeRequest(conn)
+		if err != nil {
+			Logger.Printf("*** mockbroker/%d/%d: invalid request: err=%+v, %+v", b.brokerID, idx, err, spew.Sdump(req))
+			b.serverError(err)
+			break
+		}
+
+		if b.latency > 0 {
+			time.Sleep(b.latency)
+		}
+
+		b.lock.Lock()
+		res := b.handler(req)
+		b.history = append(b.history, RequestResponse{req.body, res})
+		b.lock.Unlock()
+
+		if res == nil {
+			Logger.Printf("*** mockbroker/%d/%d: ignored %v", b.brokerID, idx, spew.Sdump(req))
+			continue
+		}
+		Logger.Printf("*** mockbroker/%d/%d: served %v -> %v", b.brokerID, idx, req, res)
+
+		encodedRes, err := encode(res, nil)
+		if err != nil {
+			b.serverError(err)
+			break
+		}
+		if len(encodedRes) == 0 {
+			b.lock.Lock()
+			if b.notifier != nil {
+				b.notifier(bytesRead, 0)
+			}
+			b.lock.Unlock()
+			continue
+		}
+
+		binary.BigEndian.PutUint32(resHeader, uint32(len(encodedRes)+4))
+		binary.BigEndian.PutUint32(resHeader[4:], uint32(req.correlationID))
+		if _, err = conn.Write(resHeader); err != nil {
+			b.serverError(err)
+			break
+		}
+		if _, err = conn.Write(encodedRes); err != nil {
+			b.serverError(err)
+			break
+		}
+
+		b.lock.Lock()
+		if b.notifier != nil {
+			b.notifier(bytesRead, len(resHeader)+len(encodedRes))
+		}
+		b.lock.Unlock()
+	}
+	Logger.Printf("*** mockbroker/%d/%d: connection closed, err=%v", b.BrokerID(), idx, err)
+}
+
+func (b *MockBroker) defaultRequestHandler(req *request) (res encoder) {
+	select {
+	case res, ok := <-b.expectations:
+		if !ok {
+			return nil
+		}
+		return res
+	case <-time.After(expectationTimeout):
+		return nil
+	}
+}
+
+func (b *MockBroker) serverError(err error) {
+	isConnectionClosedError := false
+	if _, ok := err.(*net.OpError); ok {
+		isConnectionClosedError = true
+	} else if err == io.EOF {
+		isConnectionClosedError = true
+	} else if err.Error() == "use of closed network connection" {
+		isConnectionClosedError = true
+	}
+
+	if isConnectionClosedError {
+		return
+	}
+
+	b.t.Errorf(err.Error())
+}
+
+// NewMockBroker launches a fake Kafka broker. It takes a TestReporter as provided by the
+// test framework and a channel of responses to use.  If an error occurs it is
+// simply logged to the TestReporter and the broker exits.
+func NewMockBroker(t TestReporter, brokerID int32) *MockBroker {
+	return NewMockBrokerAddr(t, brokerID, "localhost:0")
+}
+
+// NewMockBrokerAddr behaves like newMockBroker but listens on the address you give
+// it rather than just some ephemeral port.
+func NewMockBrokerAddr(t TestReporter, brokerID int32, addr string) *MockBroker {
+	listener, err := net.Listen("tcp", addr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	return NewMockBrokerListener(t, brokerID, listener)
+}
+
+// NewMockBrokerListener behaves like newMockBrokerAddr but accepts connections on the listener specified.
+func NewMockBrokerListener(t TestReporter, brokerID int32, listener net.Listener) *MockBroker {
+	var err error
+
+	broker := &MockBroker{
+		closing:      make(chan none),
+		stopper:      make(chan none),
+		t:            t,
+		brokerID:     brokerID,
+		expectations: make(chan encoder, 512),
+		listener:     listener,
+	}
+	broker.handler = broker.defaultRequestHandler
+
+	Logger.Printf("*** mockbroker/%d listening on %s\n", brokerID, broker.listener.Addr().String())
+	_, portStr, err := net.SplitHostPort(broker.listener.Addr().String())
+	if err != nil {
+		t.Fatal(err)
+	}
+	tmp, err := strconv.ParseInt(portStr, 10, 32)
+	if err != nil {
+		t.Fatal(err)
+	}
+	broker.port = int32(tmp)
+
+	go broker.serverLoop()
+
+	return broker
+}
+
+func (b *MockBroker) Returns(e encoder) {
+	b.expectations <- e
+}

diff --git a/vendor/github.com/Shopify/sarama/mockresponses.go b/vendor/github.com/Shopify/sarama/mockresponses.go
new file mode 100644
index 0000000..fe55200
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/mockresponses.go

@@ -0,0 +1,727 @@
+package sarama
+
+import (
+	"fmt"
+)
+
+// TestReporter has methods matching go's testing.T to avoid importing
+// `testing` in the main part of the library.
+type TestReporter interface {
+	Error(...interface{})
+	Errorf(string, ...interface{})
+	Fatal(...interface{})
+	Fatalf(string, ...interface{})
+}
+
+// MockResponse is a response builder interface it defines one method that
+// allows generating a response based on a request body. MockResponses are used
+// to program behavior of MockBroker in tests.
+type MockResponse interface {
+	For(reqBody versionedDecoder) (res encoder)
+}
+
+// MockWrapper is a mock response builder that returns a particular concrete
+// response regardless of the actual request passed to the `For` method.
+type MockWrapper struct {
+	res encoder
+}
+
+func (mw *MockWrapper) For(reqBody versionedDecoder) (res encoder) {
+	return mw.res
+}
+
+func NewMockWrapper(res encoder) *MockWrapper {
+	return &MockWrapper{res: res}
+}
+
+// MockSequence is a mock response builder that is created from a sequence of
+// concrete responses. Every time when a `MockBroker` calls its `For` method
+// the next response from the sequence is returned. When the end of the
+// sequence is reached the last element from the sequence is returned.
+type MockSequence struct {
+	responses []MockResponse
+}
+
+func NewMockSequence(responses ...interface{}) *MockSequence {
+	ms := &MockSequence{}
+	ms.responses = make([]MockResponse, len(responses))
+	for i, res := range responses {
+		switch res := res.(type) {
+		case MockResponse:
+			ms.responses[i] = res
+		case encoder:
+			ms.responses[i] = NewMockWrapper(res)
+		default:
+			panic(fmt.Sprintf("Unexpected response type: %T", res))
+		}
+	}
+	return ms
+}
+
+func (mc *MockSequence) For(reqBody versionedDecoder) (res encoder) {
+	res = mc.responses[0].For(reqBody)
+	if len(mc.responses) > 1 {
+		mc.responses = mc.responses[1:]
+	}
+	return res
+}
+
+// MockMetadataResponse is a `MetadataResponse` builder.
+type MockMetadataResponse struct {
+	controllerID int32
+	leaders      map[string]map[int32]int32
+	brokers      map[string]int32
+	t            TestReporter
+}
+
+func NewMockMetadataResponse(t TestReporter) *MockMetadataResponse {
+	return &MockMetadataResponse{
+		leaders: make(map[string]map[int32]int32),
+		brokers: make(map[string]int32),
+		t:       t,
+	}
+}
+
+func (mmr *MockMetadataResponse) SetLeader(topic string, partition, brokerID int32) *MockMetadataResponse {
+	partitions := mmr.leaders[topic]
+	if partitions == nil {
+		partitions = make(map[int32]int32)
+		mmr.leaders[topic] = partitions
+	}
+	partitions[partition] = brokerID
+	return mmr
+}
+
+func (mmr *MockMetadataResponse) SetBroker(addr string, brokerID int32) *MockMetadataResponse {
+	mmr.brokers[addr] = brokerID
+	return mmr
+}
+
+func (mmr *MockMetadataResponse) SetController(brokerID int32) *MockMetadataResponse {
+	mmr.controllerID = brokerID
+	return mmr
+}
+
+func (mmr *MockMetadataResponse) For(reqBody versionedDecoder) encoder {
+	metadataRequest := reqBody.(*MetadataRequest)
+	metadataResponse := &MetadataResponse{
+		Version:      metadataRequest.version(),
+		ControllerID: mmr.controllerID,
+	}
+	for addr, brokerID := range mmr.brokers {
+		metadataResponse.AddBroker(addr, brokerID)
+	}
+	if len(metadataRequest.Topics) == 0 {
+		for topic, partitions := range mmr.leaders {
+			for partition, brokerID := range partitions {
+				metadataResponse.AddTopicPartition(topic, partition, brokerID, nil, nil, ErrNoError)
+			}
+		}
+		return metadataResponse
+	}
+	for _, topic := range metadataRequest.Topics {
+		for partition, brokerID := range mmr.leaders[topic] {
+			metadataResponse.AddTopicPartition(topic, partition, brokerID, nil, nil, ErrNoError)
+		}
+	}
+	return metadataResponse
+}
+
+// MockOffsetResponse is an `OffsetResponse` builder.
+type MockOffsetResponse struct {
+	offsets map[string]map[int32]map[int64]int64
+	t       TestReporter
+	version int16
+}
+
+func NewMockOffsetResponse(t TestReporter) *MockOffsetResponse {
+	return &MockOffsetResponse{
+		offsets: make(map[string]map[int32]map[int64]int64),
+		t:       t,
+	}
+}
+
+func (mor *MockOffsetResponse) SetVersion(version int16) *MockOffsetResponse {
+	mor.version = version
+	return mor
+}
+
+func (mor *MockOffsetResponse) SetOffset(topic string, partition int32, time, offset int64) *MockOffsetResponse {
+	partitions := mor.offsets[topic]
+	if partitions == nil {
+		partitions = make(map[int32]map[int64]int64)
+		mor.offsets[topic] = partitions
+	}
+	times := partitions[partition]
+	if times == nil {
+		times = make(map[int64]int64)
+		partitions[partition] = times
+	}
+	times[time] = offset
+	return mor
+}
+
+func (mor *MockOffsetResponse) For(reqBody versionedDecoder) encoder {
+	offsetRequest := reqBody.(*OffsetRequest)
+	offsetResponse := &OffsetResponse{Version: mor.version}
+	for topic, partitions := range offsetRequest.blocks {
+		for partition, block := range partitions {
+			offset := mor.getOffset(topic, partition, block.time)
+			offsetResponse.AddTopicPartition(topic, partition, offset)
+		}
+	}
+	return offsetResponse
+}
+
+func (mor *MockOffsetResponse) getOffset(topic string, partition int32, time int64) int64 {
+	partitions := mor.offsets[topic]
+	if partitions == nil {
+		mor.t.Errorf("missing topic: %s", topic)
+	}
+	times := partitions[partition]
+	if times == nil {
+		mor.t.Errorf("missing partition: %d", partition)
+	}
+	offset, ok := times[time]
+	if !ok {
+		mor.t.Errorf("missing time: %d", time)
+	}
+	return offset
+}
+
+// MockFetchResponse is a `FetchResponse` builder.
+type MockFetchResponse struct {
+	messages       map[string]map[int32]map[int64]Encoder
+	highWaterMarks map[string]map[int32]int64
+	t              TestReporter
+	batchSize      int
+	version        int16
+}
+
+func NewMockFetchResponse(t TestReporter, batchSize int) *MockFetchResponse {
+	return &MockFetchResponse{
+		messages:       make(map[string]map[int32]map[int64]Encoder),
+		highWaterMarks: make(map[string]map[int32]int64),
+		t:              t,
+		batchSize:      batchSize,
+	}
+}
+
+func (mfr *MockFetchResponse) SetVersion(version int16) *MockFetchResponse {
+	mfr.version = version
+	return mfr
+}
+
+func (mfr *MockFetchResponse) SetMessage(topic string, partition int32, offset int64, msg Encoder) *MockFetchResponse {
+	partitions := mfr.messages[topic]
+	if partitions == nil {
+		partitions = make(map[int32]map[int64]Encoder)
+		mfr.messages[topic] = partitions
+	}
+	messages := partitions[partition]
+	if messages == nil {
+		messages = make(map[int64]Encoder)
+		partitions[partition] = messages
+	}
+	messages[offset] = msg
+	return mfr
+}
+
+func (mfr *MockFetchResponse) SetHighWaterMark(topic string, partition int32, offset int64) *MockFetchResponse {
+	partitions := mfr.highWaterMarks[topic]
+	if partitions == nil {
+		partitions = make(map[int32]int64)
+		mfr.highWaterMarks[topic] = partitions
+	}
+	partitions[partition] = offset
+	return mfr
+}
+
+func (mfr *MockFetchResponse) For(reqBody versionedDecoder) encoder {
+	fetchRequest := reqBody.(*FetchRequest)
+	res := &FetchResponse{
+		Version: mfr.version,
+	}
+	for topic, partitions := range fetchRequest.blocks {
+		for partition, block := range partitions {
+			initialOffset := block.fetchOffset
+			offset := initialOffset
+			maxOffset := initialOffset + int64(mfr.getMessageCount(topic, partition))
+			for i := 0; i < mfr.batchSize && offset < maxOffset; {
+				msg := mfr.getMessage(topic, partition, offset)
+				if msg != nil {
+					res.AddMessage(topic, partition, nil, msg, offset)
+					i++
+				}
+				offset++
+			}
+			fb := res.GetBlock(topic, partition)
+			if fb == nil {
+				res.AddError(topic, partition, ErrNoError)
+				fb = res.GetBlock(topic, partition)
+			}
+			fb.HighWaterMarkOffset = mfr.getHighWaterMark(topic, partition)
+		}
+	}
+	return res
+}
+
+func (mfr *MockFetchResponse) getMessage(topic string, partition int32, offset int64) Encoder {
+	partitions := mfr.messages[topic]
+	if partitions == nil {
+		return nil
+	}
+	messages := partitions[partition]
+	if messages == nil {
+		return nil
+	}
+	return messages[offset]
+}
+
+func (mfr *MockFetchResponse) getMessageCount(topic string, partition int32) int {
+	partitions := mfr.messages[topic]
+	if partitions == nil {
+		return 0
+	}
+	messages := partitions[partition]
+	if messages == nil {
+		return 0
+	}
+	return len(messages)
+}
+
+func (mfr *MockFetchResponse) getHighWaterMark(topic string, partition int32) int64 {
+	partitions := mfr.highWaterMarks[topic]
+	if partitions == nil {
+		return 0
+	}
+	return partitions[partition]
+}
+
+// MockConsumerMetadataResponse is a `ConsumerMetadataResponse` builder.
+type MockConsumerMetadataResponse struct {
+	coordinators map[string]interface{}
+	t            TestReporter
+}
+
+func NewMockConsumerMetadataResponse(t TestReporter) *MockConsumerMetadataResponse {
+	return &MockConsumerMetadataResponse{
+		coordinators: make(map[string]interface{}),
+		t:            t,
+	}
+}
+
+func (mr *MockConsumerMetadataResponse) SetCoordinator(group string, broker *MockBroker) *MockConsumerMetadataResponse {
+	mr.coordinators[group] = broker
+	return mr
+}
+
+func (mr *MockConsumerMetadataResponse) SetError(group string, kerror KError) *MockConsumerMetadataResponse {
+	mr.coordinators[group] = kerror
+	return mr
+}
+
+func (mr *MockConsumerMetadataResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*ConsumerMetadataRequest)
+	group := req.ConsumerGroup
+	res := &ConsumerMetadataResponse{}
+	v := mr.coordinators[group]
+	switch v := v.(type) {
+	case *MockBroker:
+		res.Coordinator = &Broker{id: v.BrokerID(), addr: v.Addr()}
+	case KError:
+		res.Err = v
+	}
+	return res
+}
+
+// MockFindCoordinatorResponse is a `FindCoordinatorResponse` builder.
+type MockFindCoordinatorResponse struct {
+	groupCoordinators map[string]interface{}
+	transCoordinators map[string]interface{}
+	t                 TestReporter
+}
+
+func NewMockFindCoordinatorResponse(t TestReporter) *MockFindCoordinatorResponse {
+	return &MockFindCoordinatorResponse{
+		groupCoordinators: make(map[string]interface{}),
+		transCoordinators: make(map[string]interface{}),
+		t:                 t,
+	}
+}
+
+func (mr *MockFindCoordinatorResponse) SetCoordinator(coordinatorType CoordinatorType, group string, broker *MockBroker) *MockFindCoordinatorResponse {
+	switch coordinatorType {
+	case CoordinatorGroup:
+		mr.groupCoordinators[group] = broker
+	case CoordinatorTransaction:
+		mr.transCoordinators[group] = broker
+	}
+	return mr
+}
+
+func (mr *MockFindCoordinatorResponse) SetError(coordinatorType CoordinatorType, group string, kerror KError) *MockFindCoordinatorResponse {
+	switch coordinatorType {
+	case CoordinatorGroup:
+		mr.groupCoordinators[group] = kerror
+	case CoordinatorTransaction:
+		mr.transCoordinators[group] = kerror
+	}
+	return mr
+}
+
+func (mr *MockFindCoordinatorResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*FindCoordinatorRequest)
+	res := &FindCoordinatorResponse{}
+	var v interface{}
+	switch req.CoordinatorType {
+	case CoordinatorGroup:
+		v = mr.groupCoordinators[req.CoordinatorKey]
+	case CoordinatorTransaction:
+		v = mr.transCoordinators[req.CoordinatorKey]
+	}
+	switch v := v.(type) {
+	case *MockBroker:
+		res.Coordinator = &Broker{id: v.BrokerID(), addr: v.Addr()}
+	case KError:
+		res.Err = v
+	}
+	return res
+}
+
+// MockOffsetCommitResponse is a `OffsetCommitResponse` builder.
+type MockOffsetCommitResponse struct {
+	errors map[string]map[string]map[int32]KError
+	t      TestReporter
+}
+
+func NewMockOffsetCommitResponse(t TestReporter) *MockOffsetCommitResponse {
+	return &MockOffsetCommitResponse{t: t}
+}
+
+func (mr *MockOffsetCommitResponse) SetError(group, topic string, partition int32, kerror KError) *MockOffsetCommitResponse {
+	if mr.errors == nil {
+		mr.errors = make(map[string]map[string]map[int32]KError)
+	}
+	topics := mr.errors[group]
+	if topics == nil {
+		topics = make(map[string]map[int32]KError)
+		mr.errors[group] = topics
+	}
+	partitions := topics[topic]
+	if partitions == nil {
+		partitions = make(map[int32]KError)
+		topics[topic] = partitions
+	}
+	partitions[partition] = kerror
+	return mr
+}
+
+func (mr *MockOffsetCommitResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*OffsetCommitRequest)
+	group := req.ConsumerGroup
+	res := &OffsetCommitResponse{}
+	for topic, partitions := range req.blocks {
+		for partition := range partitions {
+			res.AddError(topic, partition, mr.getError(group, topic, partition))
+		}
+	}
+	return res
+}
+
+func (mr *MockOffsetCommitResponse) getError(group, topic string, partition int32) KError {
+	topics := mr.errors[group]
+	if topics == nil {
+		return ErrNoError
+	}
+	partitions := topics[topic]
+	if partitions == nil {
+		return ErrNoError
+	}
+	kerror, ok := partitions[partition]
+	if !ok {
+		return ErrNoError
+	}
+	return kerror
+}
+
+// MockProduceResponse is a `ProduceResponse` builder.
+type MockProduceResponse struct {
+	version int16
+	errors  map[string]map[int32]KError
+	t       TestReporter
+}
+
+func NewMockProduceResponse(t TestReporter) *MockProduceResponse {
+	return &MockProduceResponse{t: t}
+}
+
+func (mr *MockProduceResponse) SetVersion(version int16) *MockProduceResponse {
+	mr.version = version
+	return mr
+}
+
+func (mr *MockProduceResponse) SetError(topic string, partition int32, kerror KError) *MockProduceResponse {
+	if mr.errors == nil {
+		mr.errors = make(map[string]map[int32]KError)
+	}
+	partitions := mr.errors[topic]
+	if partitions == nil {
+		partitions = make(map[int32]KError)
+		mr.errors[topic] = partitions
+	}
+	partitions[partition] = kerror
+	return mr
+}
+
+func (mr *MockProduceResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*ProduceRequest)
+	res := &ProduceResponse{
+		Version: mr.version,
+	}
+	for topic, partitions := range req.records {
+		for partition := range partitions {
+			res.AddTopicPartition(topic, partition, mr.getError(topic, partition))
+		}
+	}
+	return res
+}
+
+func (mr *MockProduceResponse) getError(topic string, partition int32) KError {
+	partitions := mr.errors[topic]
+	if partitions == nil {
+		return ErrNoError
+	}
+	kerror, ok := partitions[partition]
+	if !ok {
+		return ErrNoError
+	}
+	return kerror
+}
+
+// MockOffsetFetchResponse is a `OffsetFetchResponse` builder.
+type MockOffsetFetchResponse struct {
+	offsets map[string]map[string]map[int32]*OffsetFetchResponseBlock
+	t       TestReporter
+}
+
+func NewMockOffsetFetchResponse(t TestReporter) *MockOffsetFetchResponse {
+	return &MockOffsetFetchResponse{t: t}
+}
+
+func (mr *MockOffsetFetchResponse) SetOffset(group, topic string, partition int32, offset int64, metadata string, kerror KError) *MockOffsetFetchResponse {
+	if mr.offsets == nil {
+		mr.offsets = make(map[string]map[string]map[int32]*OffsetFetchResponseBlock)
+	}
+	topics := mr.offsets[group]
+	if topics == nil {
+		topics = make(map[string]map[int32]*OffsetFetchResponseBlock)
+		mr.offsets[group] = topics
+	}
+	partitions := topics[topic]
+	if partitions == nil {
+		partitions = make(map[int32]*OffsetFetchResponseBlock)
+		topics[topic] = partitions
+	}
+	partitions[partition] = &OffsetFetchResponseBlock{offset, 0, metadata, kerror}
+	return mr
+}
+
+func (mr *MockOffsetFetchResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*OffsetFetchRequest)
+	group := req.ConsumerGroup
+	res := &OffsetFetchResponse{}
+	for topic, partitions := range mr.offsets[group] {
+		for partition, block := range partitions {
+			res.AddBlock(topic, partition, block)
+		}
+	}
+	return res
+}
+
+type MockCreateTopicsResponse struct {
+	t TestReporter
+}
+
+func NewMockCreateTopicsResponse(t TestReporter) *MockCreateTopicsResponse {
+	return &MockCreateTopicsResponse{t: t}
+}
+
+func (mr *MockCreateTopicsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*CreateTopicsRequest)
+	res := &CreateTopicsResponse{}
+	res.TopicErrors = make(map[string]*TopicError)
+
+	for topic, _ := range req.TopicDetails {
+		res.TopicErrors[topic] = &TopicError{Err: ErrNoError}
+	}
+	return res
+}
+
+type MockDeleteTopicsResponse struct {
+	t TestReporter
+}
+
+func NewMockDeleteTopicsResponse(t TestReporter) *MockDeleteTopicsResponse {
+	return &MockDeleteTopicsResponse{t: t}
+}
+
+func (mr *MockDeleteTopicsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*DeleteTopicsRequest)
+	res := &DeleteTopicsResponse{}
+	res.TopicErrorCodes = make(map[string]KError)
+
+	for _, topic := range req.Topics {
+		res.TopicErrorCodes[topic] = ErrNoError
+	}
+	return res
+}
+
+type MockCreatePartitionsResponse struct {
+	t TestReporter
+}
+
+func NewMockCreatePartitionsResponse(t TestReporter) *MockCreatePartitionsResponse {
+	return &MockCreatePartitionsResponse{t: t}
+}
+
+func (mr *MockCreatePartitionsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*CreatePartitionsRequest)
+	res := &CreatePartitionsResponse{}
+	res.TopicPartitionErrors = make(map[string]*TopicPartitionError)
+
+	for topic, _ := range req.TopicPartitions {
+		res.TopicPartitionErrors[topic] = &TopicPartitionError{Err: ErrNoError}
+	}
+	return res
+}
+
+type MockDeleteRecordsResponse struct {
+	t TestReporter
+}
+
+func NewMockDeleteRecordsResponse(t TestReporter) *MockDeleteRecordsResponse {
+	return &MockDeleteRecordsResponse{t: t}
+}
+
+func (mr *MockDeleteRecordsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*DeleteRecordsRequest)
+	res := &DeleteRecordsResponse{}
+	res.Topics = make(map[string]*DeleteRecordsResponseTopic)
+
+	for topic, deleteRecordRequestTopic := range req.Topics {
+		partitions := make(map[int32]*DeleteRecordsResponsePartition)
+		for partition, _ := range deleteRecordRequestTopic.PartitionOffsets {
+			partitions[partition] = &DeleteRecordsResponsePartition{Err: ErrNoError}
+		}
+		res.Topics[topic] = &DeleteRecordsResponseTopic{Partitions: partitions}
+	}
+	return res
+}
+
+type MockDescribeConfigsResponse struct {
+	t TestReporter
+}
+
+func NewMockDescribeConfigsResponse(t TestReporter) *MockDescribeConfigsResponse {
+	return &MockDescribeConfigsResponse{t: t}
+}
+
+func (mr *MockDescribeConfigsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*DescribeConfigsRequest)
+	res := &DescribeConfigsResponse{}
+
+	var configEntries []*ConfigEntry
+	configEntries = append(configEntries, &ConfigEntry{Name: "my_topic",
+		Value:     "my_topic",
+		ReadOnly:  true,
+		Default:   true,
+		Sensitive: false,
+	})
+
+	for _, r := range req.Resources {
+		res.Resources = append(res.Resources, &ResourceResponse{Name: r.Name, Configs: configEntries})
+	}
+	return res
+}
+
+type MockAlterConfigsResponse struct {
+	t TestReporter
+}
+
+func NewMockAlterConfigsResponse(t TestReporter) *MockAlterConfigsResponse {
+	return &MockAlterConfigsResponse{t: t}
+}
+
+func (mr *MockAlterConfigsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*AlterConfigsRequest)
+	res := &AlterConfigsResponse{}
+
+	for _, r := range req.Resources {
+		res.Resources = append(res.Resources, &AlterConfigsResourceResponse{Name: r.Name,
+			Type:     TopicResource,
+			ErrorMsg: "",
+		})
+	}
+	return res
+}
+
+type MockCreateAclsResponse struct {
+	t TestReporter
+}
+
+func NewMockCreateAclsResponse(t TestReporter) *MockCreateAclsResponse {
+	return &MockCreateAclsResponse{t: t}
+}
+
+func (mr *MockCreateAclsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*CreateAclsRequest)
+	res := &CreateAclsResponse{}
+
+	for range req.AclCreations {
+		res.AclCreationResponses = append(res.AclCreationResponses, &AclCreationResponse{Err: ErrNoError})
+	}
+	return res
+}
+
+type MockListAclsResponse struct {
+	t TestReporter
+}
+
+func NewMockListAclsResponse(t TestReporter) *MockListAclsResponse {
+	return &MockListAclsResponse{t: t}
+}
+
+func (mr *MockListAclsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*DescribeAclsRequest)
+	res := &DescribeAclsResponse{}
+
+	res.Err = ErrNoError
+	acl := &ResourceAcls{}
+	acl.Resource.ResourceName = *req.ResourceName
+	acl.Resource.ResourceType = req.ResourceType
+	acl.Acls = append(acl.Acls, &Acl{})
+	res.ResourceAcls = append(res.ResourceAcls, acl)
+
+	return res
+}
+
+type MockDeleteAclsResponse struct {
+	t TestReporter
+}
+
+func NewMockDeleteAclsResponse(t TestReporter) *MockDeleteAclsResponse {
+	return &MockDeleteAclsResponse{t: t}
+}
+
+func (mr *MockDeleteAclsResponse) For(reqBody versionedDecoder) encoder {
+	req := reqBody.(*DeleteAclsRequest)
+	res := &DeleteAclsResponse{}
+
+	for range req.Filters {
+		response := &FilterResponse{Err: ErrNoError}
+		response.MatchingAcls = append(response.MatchingAcls, &MatchingAcl{Err: ErrNoError})
+		res.FilterResponses = append(res.FilterResponses, response)
+	}
+	return res
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_commit_request.go b/vendor/github.com/Shopify/sarama/offset_commit_request.go
new file mode 100644
index 0000000..1ec583e
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_commit_request.go

@@ -0,0 +1,210 @@
+package sarama
+
+import "errors"
+
+// ReceiveTime is a special value for the timestamp field of Offset Commit Requests which
+// tells the broker to set the timestamp to the time at which the request was received.
+// The timestamp is only used if message version 1 is used, which requires kafka 0.8.2.
+const ReceiveTime int64 = -1
+
+// GroupGenerationUndefined is a special value for the group generation field of
+// Offset Commit Requests that should be used when a consumer group does not rely
+// on Kafka for partition management.
+const GroupGenerationUndefined = -1
+
+type offsetCommitRequestBlock struct {
+	offset    int64
+	timestamp int64
+	metadata  string
+}
+
+func (b *offsetCommitRequestBlock) encode(pe packetEncoder, version int16) error {
+	pe.putInt64(b.offset)
+	if version == 1 {
+		pe.putInt64(b.timestamp)
+	} else if b.timestamp != 0 {
+		Logger.Println("Non-zero timestamp specified for OffsetCommitRequest not v1, it will be ignored")
+	}
+
+	return pe.putString(b.metadata)
+}
+
+func (b *offsetCommitRequestBlock) decode(pd packetDecoder, version int16) (err error) {
+	if b.offset, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if version == 1 {
+		if b.timestamp, err = pd.getInt64(); err != nil {
+			return err
+		}
+	}
+	b.metadata, err = pd.getString()
+	return err
+}
+
+type OffsetCommitRequest struct {
+	ConsumerGroup           string
+	ConsumerGroupGeneration int32  // v1 or later
+	ConsumerID              string // v1 or later
+	RetentionTime           int64  // v2 or later
+
+	// Version can be:
+	// - 0 (kafka 0.8.1 and later)
+	// - 1 (kafka 0.8.2 and later)
+	// - 2 (kafka 0.9.0 and later)
+	// - 3 (kafka 0.11.0 and later)
+	// - 4 (kafka 2.0.0 and later)
+	Version int16
+	blocks  map[string]map[int32]*offsetCommitRequestBlock
+}
+
+func (r *OffsetCommitRequest) encode(pe packetEncoder) error {
+	if r.Version < 0 || r.Version > 4 {
+		return PacketEncodingError{"invalid or unsupported OffsetCommitRequest version field"}
+	}
+
+	if err := pe.putString(r.ConsumerGroup); err != nil {
+		return err
+	}
+
+	if r.Version >= 1 {
+		pe.putInt32(r.ConsumerGroupGeneration)
+		if err := pe.putString(r.ConsumerID); err != nil {
+			return err
+		}
+	} else {
+		if r.ConsumerGroupGeneration != 0 {
+			Logger.Println("Non-zero ConsumerGroupGeneration specified for OffsetCommitRequest v0, it will be ignored")
+		}
+		if r.ConsumerID != "" {
+			Logger.Println("Non-empty ConsumerID specified for OffsetCommitRequest v0, it will be ignored")
+		}
+	}
+
+	if r.Version >= 2 {
+		pe.putInt64(r.RetentionTime)
+	} else if r.RetentionTime != 0 {
+		Logger.Println("Non-zero RetentionTime specified for OffsetCommitRequest version <2, it will be ignored")
+	}
+
+	if err := pe.putArrayLength(len(r.blocks)); err != nil {
+		return err
+	}
+	for topic, partitions := range r.blocks {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(partitions)); err != nil {
+			return err
+		}
+		for partition, block := range partitions {
+			pe.putInt32(partition)
+			if err := block.encode(pe, r.Version); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *OffsetCommitRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if r.ConsumerGroup, err = pd.getString(); err != nil {
+		return err
+	}
+
+	if r.Version >= 1 {
+		if r.ConsumerGroupGeneration, err = pd.getInt32(); err != nil {
+			return err
+		}
+		if r.ConsumerID, err = pd.getString(); err != nil {
+			return err
+		}
+	}
+
+	if r.Version >= 2 {
+		if r.RetentionTime, err = pd.getInt64(); err != nil {
+			return err
+		}
+	}
+
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+	r.blocks = make(map[string]map[int32]*offsetCommitRequestBlock)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.blocks[topic] = make(map[int32]*offsetCommitRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			block := &offsetCommitRequestBlock{}
+			if err := block.decode(pd, r.Version); err != nil {
+				return err
+			}
+			r.blocks[topic][partition] = block
+		}
+	}
+	return nil
+}
+
+func (r *OffsetCommitRequest) key() int16 {
+	return 8
+}
+
+func (r *OffsetCommitRequest) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetCommitRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_8_2_0
+	case 2:
+		return V0_9_0_0
+	case 3:
+		return V0_11_0_0
+	case 4:
+		return V2_0_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *OffsetCommitRequest) AddBlock(topic string, partitionID int32, offset int64, timestamp int64, metadata string) {
+	if r.blocks == nil {
+		r.blocks = make(map[string]map[int32]*offsetCommitRequestBlock)
+	}
+
+	if r.blocks[topic] == nil {
+		r.blocks[topic] = make(map[int32]*offsetCommitRequestBlock)
+	}
+
+	r.blocks[topic][partitionID] = &offsetCommitRequestBlock{offset, timestamp, metadata}
+}
+
+func (r *OffsetCommitRequest) Offset(topic string, partitionID int32) (int64, string, error) {
+	partitions := r.blocks[topic]
+	if partitions == nil {
+		return 0, "", errors.New("No such offset")
+	}
+	block := partitions[partitionID]
+	if block == nil {
+		return 0, "", errors.New("No such offset")
+	}
+	return block.offset, block.metadata, nil
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_commit_response.go b/vendor/github.com/Shopify/sarama/offset_commit_response.go
new file mode 100644
index 0000000..e842298
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_commit_response.go

@@ -0,0 +1,110 @@
+package sarama
+
+type OffsetCommitResponse struct {
+	Version        int16
+	ThrottleTimeMs int32
+	Errors         map[string]map[int32]KError
+}
+
+func (r *OffsetCommitResponse) AddError(topic string, partition int32, kerror KError) {
+	if r.Errors == nil {
+		r.Errors = make(map[string]map[int32]KError)
+	}
+	partitions := r.Errors[topic]
+	if partitions == nil {
+		partitions = make(map[int32]KError)
+		r.Errors[topic] = partitions
+	}
+	partitions[partition] = kerror
+}
+
+func (r *OffsetCommitResponse) encode(pe packetEncoder) error {
+	if r.Version >= 3 {
+		pe.putInt32(r.ThrottleTimeMs)
+	}
+	if err := pe.putArrayLength(len(r.Errors)); err != nil {
+		return err
+	}
+	for topic, partitions := range r.Errors {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(partitions)); err != nil {
+			return err
+		}
+		for partition, kerror := range partitions {
+			pe.putInt32(partition)
+			pe.putInt16(int16(kerror))
+		}
+	}
+	return nil
+}
+
+func (r *OffsetCommitResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if version >= 3 {
+		r.ThrottleTimeMs, err = pd.getInt32()
+		if err != nil {
+			return err
+		}
+	}
+
+	numTopics, err := pd.getArrayLength()
+	if err != nil || numTopics == 0 {
+		return err
+	}
+
+	r.Errors = make(map[string]map[int32]KError, numTopics)
+	for i := 0; i < numTopics; i++ {
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		numErrors, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		r.Errors[name] = make(map[int32]KError, numErrors)
+
+		for j := 0; j < numErrors; j++ {
+			id, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+
+			tmp, err := pd.getInt16()
+			if err != nil {
+				return err
+			}
+			r.Errors[name][id] = KError(tmp)
+		}
+	}
+
+	return nil
+}
+
+func (r *OffsetCommitResponse) key() int16 {
+	return 8
+}
+
+func (r *OffsetCommitResponse) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetCommitResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_8_2_0
+	case 2:
+		return V0_9_0_0
+	case 3:
+		return V0_11_0_0
+	case 4:
+		return V2_0_0_0
+	default:
+		return MinVersion
+	}
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_fetch_request.go b/vendor/github.com/Shopify/sarama/offset_fetch_request.go
new file mode 100644
index 0000000..6860824
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_fetch_request.go

@@ -0,0 +1,100 @@
+package sarama
+
+type OffsetFetchRequest struct {
+	Version       int16
+	ConsumerGroup string
+	partitions    map[string][]int32
+}
+
+func (r *OffsetFetchRequest) encode(pe packetEncoder) (err error) {
+	if r.Version < 0 || r.Version > 5 {
+		return PacketEncodingError{"invalid or unsupported OffsetFetchRequest version field"}
+	}
+
+	if err = pe.putString(r.ConsumerGroup); err != nil {
+		return err
+	}
+
+	if r.Version >= 2 && r.partitions == nil {
+		pe.putInt32(-1)
+	} else {
+		if err = pe.putArrayLength(len(r.partitions)); err != nil {
+			return err
+		}
+		for topic, partitions := range r.partitions {
+			if err = pe.putString(topic); err != nil {
+				return err
+			}
+			if err = pe.putInt32Array(partitions); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *OffsetFetchRequest) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+	if r.ConsumerGroup, err = pd.getString(); err != nil {
+		return err
+	}
+	partitionCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if (partitionCount == 0 && version < 2) || partitionCount < 0 {
+		return nil
+	}
+	r.partitions = make(map[string][]int32)
+	for i := 0; i < partitionCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitions, err := pd.getInt32Array()
+		if err != nil {
+			return err
+		}
+		r.partitions[topic] = partitions
+	}
+	return nil
+}
+
+func (r *OffsetFetchRequest) key() int16 {
+	return 9
+}
+
+func (r *OffsetFetchRequest) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetFetchRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_8_2_0
+	case 2:
+		return V0_10_2_0
+	case 3:
+		return V0_11_0_0
+	case 4:
+		return V2_0_0_0
+	case 5:
+		return V2_1_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *OffsetFetchRequest) ZeroPartitions() {
+	if r.partitions == nil && r.Version >= 2 {
+		r.partitions = make(map[string][]int32)
+	}
+}
+
+func (r *OffsetFetchRequest) AddPartition(topic string, partitionID int32) {
+	if r.partitions == nil {
+		r.partitions = make(map[string][]int32)
+	}
+
+	r.partitions[topic] = append(r.partitions[topic], partitionID)
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_fetch_response.go b/vendor/github.com/Shopify/sarama/offset_fetch_response.go
new file mode 100644
index 0000000..9e25702
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_fetch_response.go

@@ -0,0 +1,197 @@
+package sarama
+
+type OffsetFetchResponseBlock struct {
+	Offset      int64
+	LeaderEpoch int32
+	Metadata    string
+	Err         KError
+}
+
+func (b *OffsetFetchResponseBlock) decode(pd packetDecoder, version int16) (err error) {
+	b.Offset, err = pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	if version >= 5 {
+		b.LeaderEpoch, err = pd.getInt32()
+		if err != nil {
+			return err
+		}
+	}
+
+	b.Metadata, err = pd.getString()
+	if err != nil {
+		return err
+	}
+
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	b.Err = KError(tmp)
+
+	return nil
+}
+
+func (b *OffsetFetchResponseBlock) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt64(b.Offset)
+
+	if version >= 5 {
+		pe.putInt32(b.LeaderEpoch)
+	}
+
+	err = pe.putString(b.Metadata)
+	if err != nil {
+		return err
+	}
+
+	pe.putInt16(int16(b.Err))
+
+	return nil
+}
+
+type OffsetFetchResponse struct {
+	Version        int16
+	ThrottleTimeMs int32
+	Blocks         map[string]map[int32]*OffsetFetchResponseBlock
+	Err            KError
+}
+
+func (r *OffsetFetchResponse) encode(pe packetEncoder) error {
+	if r.Version >= 3 {
+		pe.putInt32(r.ThrottleTimeMs)
+	}
+
+	if err := pe.putArrayLength(len(r.Blocks)); err != nil {
+		return err
+	}
+	for topic, partitions := range r.Blocks {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(partitions)); err != nil {
+			return err
+		}
+		for partition, block := range partitions {
+			pe.putInt32(partition)
+			if err := block.encode(pe, r.Version); err != nil {
+				return err
+			}
+		}
+	}
+	if r.Version >= 2 {
+		pe.putInt16(int16(r.Err))
+	}
+	return nil
+}
+
+func (r *OffsetFetchResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	if version >= 3 {
+		r.ThrottleTimeMs, err = pd.getInt32()
+		if err != nil {
+			return err
+		}
+	}
+
+	numTopics, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	if numTopics > 0 {
+		r.Blocks = make(map[string]map[int32]*OffsetFetchResponseBlock, numTopics)
+		for i := 0; i < numTopics; i++ {
+			name, err := pd.getString()
+			if err != nil {
+				return err
+			}
+
+			numBlocks, err := pd.getArrayLength()
+			if err != nil {
+				return err
+			}
+
+			if numBlocks == 0 {
+				r.Blocks[name] = nil
+				continue
+			}
+			r.Blocks[name] = make(map[int32]*OffsetFetchResponseBlock, numBlocks)
+
+			for j := 0; j < numBlocks; j++ {
+				id, err := pd.getInt32()
+				if err != nil {
+					return err
+				}
+
+				block := new(OffsetFetchResponseBlock)
+				err = block.decode(pd, version)
+				if err != nil {
+					return err
+				}
+				r.Blocks[name][id] = block
+			}
+		}
+	}
+
+	if version >= 2 {
+		kerr, err := pd.getInt16()
+		if err != nil {
+			return err
+		}
+		r.Err = KError(kerr)
+	}
+
+	return nil
+}
+
+func (r *OffsetFetchResponse) key() int16 {
+	return 9
+}
+
+func (r *OffsetFetchResponse) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetFetchResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_8_2_0
+	case 2:
+		return V0_10_2_0
+	case 3:
+		return V0_11_0_0
+	case 4:
+		return V2_0_0_0
+	case 5:
+		return V2_1_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *OffsetFetchResponse) GetBlock(topic string, partition int32) *OffsetFetchResponseBlock {
+	if r.Blocks == nil {
+		return nil
+	}
+
+	if r.Blocks[topic] == nil {
+		return nil
+	}
+
+	return r.Blocks[topic][partition]
+}
+
+func (r *OffsetFetchResponse) AddBlock(topic string, partition int32, block *OffsetFetchResponseBlock) {
+	if r.Blocks == nil {
+		r.Blocks = make(map[string]map[int32]*OffsetFetchResponseBlock)
+	}
+	partitions := r.Blocks[topic]
+	if partitions == nil {
+		partitions = make(map[int32]*OffsetFetchResponseBlock)
+		r.Blocks[topic] = partitions
+	}
+	partitions[partition] = block
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_manager.go b/vendor/github.com/Shopify/sarama/offset_manager.go
new file mode 100644
index 0000000..8ea857f
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_manager.go

@@ -0,0 +1,572 @@
+package sarama
+
+import (
+	"sync"
+	"time"
+)
+
+// Offset Manager
+
+// OffsetManager uses Kafka to store and fetch consumed partition offsets.
+type OffsetManager interface {
+	// ManagePartition creates a PartitionOffsetManager on the given topic/partition.
+	// It will return an error if this OffsetManager is already managing the given
+	// topic/partition.
+	ManagePartition(topic string, partition int32) (PartitionOffsetManager, error)
+
+	// Close stops the OffsetManager from managing offsets. It is required to call
+	// this function before an OffsetManager object passes out of scope, as it
+	// will otherwise leak memory. You must call this after all the
+	// PartitionOffsetManagers are closed.
+	Close() error
+}
+
+type offsetManager struct {
+	client Client
+	conf   *Config
+	group  string
+	ticker *time.Ticker
+
+	memberID   string
+	generation int32
+
+	broker     *Broker
+	brokerLock sync.RWMutex
+
+	poms     map[string]map[int32]*partitionOffsetManager
+	pomsLock sync.RWMutex
+
+	closeOnce sync.Once
+	closing   chan none
+	closed    chan none
+}
+
+// NewOffsetManagerFromClient creates a new OffsetManager from the given client.
+// It is still necessary to call Close() on the underlying client when finished with the partition manager.
+func NewOffsetManagerFromClient(group string, client Client) (OffsetManager, error) {
+	return newOffsetManagerFromClient(group, "", GroupGenerationUndefined, client)
+}
+
+func newOffsetManagerFromClient(group, memberID string, generation int32, client Client) (*offsetManager, error) {
+	// Check that we are not dealing with a closed Client before processing any other arguments
+	if client.Closed() {
+		return nil, ErrClosedClient
+	}
+
+	conf := client.Config()
+	om := &offsetManager{
+		client: client,
+		conf:   conf,
+		group:  group,
+		ticker: time.NewTicker(conf.Consumer.Offsets.CommitInterval),
+		poms:   make(map[string]map[int32]*partitionOffsetManager),
+
+		memberID:   memberID,
+		generation: generation,
+
+		closing: make(chan none),
+		closed:  make(chan none),
+	}
+	go withRecover(om.mainLoop)
+
+	return om, nil
+}
+
+func (om *offsetManager) ManagePartition(topic string, partition int32) (PartitionOffsetManager, error) {
+	pom, err := om.newPartitionOffsetManager(topic, partition)
+	if err != nil {
+		return nil, err
+	}
+
+	om.pomsLock.Lock()
+	defer om.pomsLock.Unlock()
+
+	topicManagers := om.poms[topic]
+	if topicManagers == nil {
+		topicManagers = make(map[int32]*partitionOffsetManager)
+		om.poms[topic] = topicManagers
+	}
+
+	if topicManagers[partition] != nil {
+		return nil, ConfigurationError("That topic/partition is already being managed")
+	}
+
+	topicManagers[partition] = pom
+	return pom, nil
+}
+
+func (om *offsetManager) Close() error {
+	om.closeOnce.Do(func() {
+		// exit the mainLoop
+		close(om.closing)
+		<-om.closed
+
+		// mark all POMs as closed
+		om.asyncClosePOMs()
+
+		// flush one last time
+		for attempt := 0; attempt <= om.conf.Consumer.Offsets.Retry.Max; attempt++ {
+			om.flushToBroker()
+			if om.releasePOMs(false) == 0 {
+				break
+			}
+		}
+
+		om.releasePOMs(true)
+		om.brokerLock.Lock()
+		om.broker = nil
+		om.brokerLock.Unlock()
+	})
+	return nil
+}
+
+func (om *offsetManager) fetchInitialOffset(topic string, partition int32, retries int) (int64, string, error) {
+	broker, err := om.coordinator()
+	if err != nil {
+		if retries <= 0 {
+			return 0, "", err
+		}
+		return om.fetchInitialOffset(topic, partition, retries-1)
+	}
+
+	req := new(OffsetFetchRequest)
+	req.Version = 1
+	req.ConsumerGroup = om.group
+	req.AddPartition(topic, partition)
+
+	resp, err := broker.FetchOffset(req)
+	if err != nil {
+		if retries <= 0 {
+			return 0, "", err
+		}
+		om.releaseCoordinator(broker)
+		return om.fetchInitialOffset(topic, partition, retries-1)
+	}
+
+	block := resp.GetBlock(topic, partition)
+	if block == nil {
+		return 0, "", ErrIncompleteResponse
+	}
+
+	switch block.Err {
+	case ErrNoError:
+		return block.Offset, block.Metadata, nil
+	case ErrNotCoordinatorForConsumer:
+		if retries <= 0 {
+			return 0, "", block.Err
+		}
+		om.releaseCoordinator(broker)
+		return om.fetchInitialOffset(topic, partition, retries-1)
+	case ErrOffsetsLoadInProgress:
+		if retries <= 0 {
+			return 0, "", block.Err
+		}
+		select {
+		case <-om.closing:
+			return 0, "", block.Err
+		case <-time.After(om.conf.Metadata.Retry.Backoff):
+		}
+		return om.fetchInitialOffset(topic, partition, retries-1)
+	default:
+		return 0, "", block.Err
+	}
+}
+
+func (om *offsetManager) coordinator() (*Broker, error) {
+	om.brokerLock.RLock()
+	broker := om.broker
+	om.brokerLock.RUnlock()
+
+	if broker != nil {
+		return broker, nil
+	}
+
+	om.brokerLock.Lock()
+	defer om.brokerLock.Unlock()
+
+	if broker := om.broker; broker != nil {
+		return broker, nil
+	}
+
+	if err := om.client.RefreshCoordinator(om.group); err != nil {
+		return nil, err
+	}
+
+	broker, err := om.client.Coordinator(om.group)
+	if err != nil {
+		return nil, err
+	}
+
+	om.broker = broker
+	return broker, nil
+}
+
+func (om *offsetManager) releaseCoordinator(b *Broker) {
+	om.brokerLock.Lock()
+	if om.broker == b {
+		om.broker = nil
+	}
+	om.brokerLock.Unlock()
+}
+
+func (om *offsetManager) mainLoop() {
+	defer om.ticker.Stop()
+	defer close(om.closed)
+
+	for {
+		select {
+		case <-om.ticker.C:
+			om.flushToBroker()
+			om.releasePOMs(false)
+		case <-om.closing:
+			return
+		}
+	}
+}
+
+func (om *offsetManager) flushToBroker() {
+	req := om.constructRequest()
+	if req == nil {
+		return
+	}
+
+	broker, err := om.coordinator()
+	if err != nil {
+		om.handleError(err)
+		return
+	}
+
+	resp, err := broker.CommitOffset(req)
+	if err != nil {
+		om.handleError(err)
+		om.releaseCoordinator(broker)
+		_ = broker.Close()
+		return
+	}
+
+	om.handleResponse(broker, req, resp)
+}
+
+func (om *offsetManager) constructRequest() *OffsetCommitRequest {
+	var r *OffsetCommitRequest
+	var perPartitionTimestamp int64
+	if om.conf.Consumer.Offsets.Retention == 0 {
+		perPartitionTimestamp = ReceiveTime
+		r = &OffsetCommitRequest{
+			Version:                 1,
+			ConsumerGroup:           om.group,
+			ConsumerID:              om.memberID,
+			ConsumerGroupGeneration: om.generation,
+		}
+	} else {
+		r = &OffsetCommitRequest{
+			Version:                 2,
+			RetentionTime:           int64(om.conf.Consumer.Offsets.Retention / time.Millisecond),
+			ConsumerGroup:           om.group,
+			ConsumerID:              om.memberID,
+			ConsumerGroupGeneration: om.generation,
+		}
+
+	}
+
+	om.pomsLock.RLock()
+	defer om.pomsLock.RUnlock()
+
+	for _, topicManagers := range om.poms {
+		for _, pom := range topicManagers {
+			pom.lock.Lock()
+			if pom.dirty {
+				r.AddBlock(pom.topic, pom.partition, pom.offset, perPartitionTimestamp, pom.metadata)
+			}
+			pom.lock.Unlock()
+		}
+	}
+
+	if len(r.blocks) > 0 {
+		return r
+	}
+
+	return nil
+}
+
+func (om *offsetManager) handleResponse(broker *Broker, req *OffsetCommitRequest, resp *OffsetCommitResponse) {
+	om.pomsLock.RLock()
+	defer om.pomsLock.RUnlock()
+
+	for _, topicManagers := range om.poms {
+		for _, pom := range topicManagers {
+			if req.blocks[pom.topic] == nil || req.blocks[pom.topic][pom.partition] == nil {
+				continue
+			}
+
+			var err KError
+			var ok bool
+
+			if resp.Errors[pom.topic] == nil {
+				pom.handleError(ErrIncompleteResponse)
+				continue
+			}
+			if err, ok = resp.Errors[pom.topic][pom.partition]; !ok {
+				pom.handleError(ErrIncompleteResponse)
+				continue
+			}
+
+			switch err {
+			case ErrNoError:
+				block := req.blocks[pom.topic][pom.partition]
+				pom.updateCommitted(block.offset, block.metadata)
+			case ErrNotLeaderForPartition, ErrLeaderNotAvailable,
+				ErrConsumerCoordinatorNotAvailable, ErrNotCoordinatorForConsumer:
+				// not a critical error, we just need to redispatch
+				om.releaseCoordinator(broker)
+			case ErrOffsetMetadataTooLarge, ErrInvalidCommitOffsetSize:
+				// nothing we can do about this, just tell the user and carry on
+				pom.handleError(err)
+			case ErrOffsetsLoadInProgress:
+				// nothing wrong but we didn't commit, we'll get it next time round
+				break
+			case ErrUnknownTopicOrPartition:
+				// let the user know *and* try redispatching - if topic-auto-create is
+				// enabled, redispatching should trigger a metadata req and create the
+				// topic; if not then re-dispatching won't help, but we've let the user
+				// know and it shouldn't hurt either (see https://github.com/Shopify/sarama/issues/706)
+				fallthrough
+			default:
+				// dunno, tell the user and try redispatching
+				pom.handleError(err)
+				om.releaseCoordinator(broker)
+			}
+		}
+	}
+}
+
+func (om *offsetManager) handleError(err error) {
+	om.pomsLock.RLock()
+	defer om.pomsLock.RUnlock()
+
+	for _, topicManagers := range om.poms {
+		for _, pom := range topicManagers {
+			pom.handleError(err)
+		}
+	}
+}
+
+func (om *offsetManager) asyncClosePOMs() {
+	om.pomsLock.RLock()
+	defer om.pomsLock.RUnlock()
+
+	for _, topicManagers := range om.poms {
+		for _, pom := range topicManagers {
+			pom.AsyncClose()
+		}
+	}
+}
+
+// Releases/removes closed POMs once they are clean (or when forced)
+func (om *offsetManager) releasePOMs(force bool) (remaining int) {
+	om.pomsLock.Lock()
+	defer om.pomsLock.Unlock()
+
+	for topic, topicManagers := range om.poms {
+		for partition, pom := range topicManagers {
+			pom.lock.Lock()
+			releaseDue := pom.done && (force || !pom.dirty)
+			pom.lock.Unlock()
+
+			if releaseDue {
+				pom.release()
+
+				delete(om.poms[topic], partition)
+				if len(om.poms[topic]) == 0 {
+					delete(om.poms, topic)
+				}
+			}
+		}
+		remaining += len(om.poms[topic])
+	}
+	return
+}
+
+func (om *offsetManager) findPOM(topic string, partition int32) *partitionOffsetManager {
+	om.pomsLock.RLock()
+	defer om.pomsLock.RUnlock()
+
+	if partitions, ok := om.poms[topic]; ok {
+		if pom, ok := partitions[partition]; ok {
+			return pom
+		}
+	}
+	return nil
+}
+
+// Partition Offset Manager
+
+// PartitionOffsetManager uses Kafka to store and fetch consumed partition offsets. You MUST call Close()
+// on a partition offset manager to avoid leaks, it will not be garbage-collected automatically when it passes
+// out of scope.
+type PartitionOffsetManager interface {
+	// NextOffset returns the next offset that should be consumed for the managed
+	// partition, accompanied by metadata which can be used to reconstruct the state
+	// of the partition consumer when it resumes. NextOffset() will return
+	// `config.Consumer.Offsets.Initial` and an empty metadata string if no offset
+	// was committed for this partition yet.
+	NextOffset() (int64, string)
+
+	// MarkOffset marks the provided offset, alongside a metadata string
+	// that represents the state of the partition consumer at that point in time. The
+	// metadata string can be used by another consumer to restore that state, so it
+	// can resume consumption.
+	//
+	// To follow upstream conventions, you are expected to mark the offset of the
+	// next message to read, not the last message read. Thus, when calling `MarkOffset`
+	// you should typically add one to the offset of the last consumed message.
+	//
+	// Note: calling MarkOffset does not necessarily commit the offset to the backend
+	// store immediately for efficiency reasons, and it may never be committed if
+	// your application crashes. This means that you may end up processing the same
+	// message twice, and your processing should ideally be idempotent.
+	MarkOffset(offset int64, metadata string)
+
+	// ResetOffset resets to the provided offset, alongside a metadata string that
+	// represents the state of the partition consumer at that point in time. Reset
+	// acts as a counterpart to MarkOffset, the difference being that it allows to
+	// reset an offset to an earlier or smaller value, where MarkOffset only
+	// allows incrementing the offset. cf MarkOffset for more details.
+	ResetOffset(offset int64, metadata string)
+
+	// Errors returns a read channel of errors that occur during offset management, if
+	// enabled. By default, errors are logged and not returned over this channel. If
+	// you want to implement any custom error handling, set your config's
+	// Consumer.Return.Errors setting to true, and read from this channel.
+	Errors() <-chan *ConsumerError
+
+	// AsyncClose initiates a shutdown of the PartitionOffsetManager. This method will
+	// return immediately, after which you should wait until the 'errors' channel has
+	// been drained and closed. It is required to call this function, or Close before
+	// a consumer object passes out of scope, as it will otherwise leak memory. You
+	// must call this before calling Close on the underlying client.
+	AsyncClose()
+
+	// Close stops the PartitionOffsetManager from managing offsets. It is required to
+	// call this function (or AsyncClose) before a PartitionOffsetManager object
+	// passes out of scope, as it will otherwise leak memory. You must call this
+	// before calling Close on the underlying client.
+	Close() error
+}
+
+type partitionOffsetManager struct {
+	parent    *offsetManager
+	topic     string
+	partition int32
+
+	lock     sync.Mutex
+	offset   int64
+	metadata string
+	dirty    bool
+	done     bool
+
+	releaseOnce sync.Once
+	errors      chan *ConsumerError
+}
+
+func (om *offsetManager) newPartitionOffsetManager(topic string, partition int32) (*partitionOffsetManager, error) {
+	offset, metadata, err := om.fetchInitialOffset(topic, partition, om.conf.Metadata.Retry.Max)
+	if err != nil {
+		return nil, err
+	}
+
+	return &partitionOffsetManager{
+		parent:    om,
+		topic:     topic,
+		partition: partition,
+		errors:    make(chan *ConsumerError, om.conf.ChannelBufferSize),
+		offset:    offset,
+		metadata:  metadata,
+	}, nil
+}
+
+func (pom *partitionOffsetManager) Errors() <-chan *ConsumerError {
+	return pom.errors
+}
+
+func (pom *partitionOffsetManager) MarkOffset(offset int64, metadata string) {
+	pom.lock.Lock()
+	defer pom.lock.Unlock()
+
+	if offset > pom.offset {
+		pom.offset = offset
+		pom.metadata = metadata
+		pom.dirty = true
+	}
+}
+
+func (pom *partitionOffsetManager) ResetOffset(offset int64, metadata string) {
+	pom.lock.Lock()
+	defer pom.lock.Unlock()
+
+	if offset <= pom.offset {
+		pom.offset = offset
+		pom.metadata = metadata
+		pom.dirty = true
+	}
+}
+
+func (pom *partitionOffsetManager) updateCommitted(offset int64, metadata string) {
+	pom.lock.Lock()
+	defer pom.lock.Unlock()
+
+	if pom.offset == offset && pom.metadata == metadata {
+		pom.dirty = false
+	}
+}
+
+func (pom *partitionOffsetManager) NextOffset() (int64, string) {
+	pom.lock.Lock()
+	defer pom.lock.Unlock()
+
+	if pom.offset >= 0 {
+		return pom.offset, pom.metadata
+	}
+
+	return pom.parent.conf.Consumer.Offsets.Initial, ""
+}
+
+func (pom *partitionOffsetManager) AsyncClose() {
+	pom.lock.Lock()
+	pom.done = true
+	pom.lock.Unlock()
+}
+
+func (pom *partitionOffsetManager) Close() error {
+	pom.AsyncClose()
+
+	var errors ConsumerErrors
+	for err := range pom.errors {
+		errors = append(errors, err)
+	}
+
+	if len(errors) > 0 {
+		return errors
+	}
+	return nil
+}
+
+func (pom *partitionOffsetManager) handleError(err error) {
+	cErr := &ConsumerError{
+		Topic:     pom.topic,
+		Partition: pom.partition,
+		Err:       err,
+	}
+
+	if pom.parent.conf.Consumer.Return.Errors {
+		pom.errors <- cErr
+	} else {
+		Logger.Println(cErr)
+	}
+}
+
+func (pom *partitionOffsetManager) release() {
+	pom.releaseOnce.Do(func() {
+		go close(pom.errors)
+	})
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_request.go b/vendor/github.com/Shopify/sarama/offset_request.go
new file mode 100644
index 0000000..4c5df75
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_request.go

@@ -0,0 +1,132 @@
+package sarama
+
+type offsetRequestBlock struct {
+	time       int64
+	maxOffsets int32 // Only used in version 0
+}
+
+func (b *offsetRequestBlock) encode(pe packetEncoder, version int16) error {
+	pe.putInt64(int64(b.time))
+	if version == 0 {
+		pe.putInt32(b.maxOffsets)
+	}
+
+	return nil
+}
+
+func (b *offsetRequestBlock) decode(pd packetDecoder, version int16) (err error) {
+	if b.time, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if version == 0 {
+		if b.maxOffsets, err = pd.getInt32(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+type OffsetRequest struct {
+	Version int16
+	blocks  map[string]map[int32]*offsetRequestBlock
+}
+
+func (r *OffsetRequest) encode(pe packetEncoder) error {
+	pe.putInt32(-1) // replica ID is always -1 for clients
+	err := pe.putArrayLength(len(r.blocks))
+	if err != nil {
+		return err
+	}
+	for topic, partitions := range r.blocks {
+		err = pe.putString(topic)
+		if err != nil {
+			return err
+		}
+		err = pe.putArrayLength(len(partitions))
+		if err != nil {
+			return err
+		}
+		for partition, block := range partitions {
+			pe.putInt32(partition)
+			if err = block.encode(pe, r.Version); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func (r *OffsetRequest) decode(pd packetDecoder, version int16) error {
+	r.Version = version
+
+	// Ignore replica ID
+	if _, err := pd.getInt32(); err != nil {
+		return err
+	}
+	blockCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if blockCount == 0 {
+		return nil
+	}
+	r.blocks = make(map[string]map[int32]*offsetRequestBlock)
+	for i := 0; i < blockCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.blocks[topic] = make(map[int32]*offsetRequestBlock)
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			block := &offsetRequestBlock{}
+			if err := block.decode(pd, version); err != nil {
+				return err
+			}
+			r.blocks[topic][partition] = block
+		}
+	}
+	return nil
+}
+
+func (r *OffsetRequest) key() int16 {
+	return 2
+}
+
+func (r *OffsetRequest) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_10_1_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *OffsetRequest) AddBlock(topic string, partitionID int32, time int64, maxOffsets int32) {
+	if r.blocks == nil {
+		r.blocks = make(map[string]map[int32]*offsetRequestBlock)
+	}
+
+	if r.blocks[topic] == nil {
+		r.blocks[topic] = make(map[int32]*offsetRequestBlock)
+	}
+
+	tmp := new(offsetRequestBlock)
+	tmp.time = time
+	if r.Version == 0 {
+		tmp.maxOffsets = maxOffsets
+	}
+
+	r.blocks[topic][partitionID] = tmp
+}

diff --git a/vendor/github.com/Shopify/sarama/offset_response.go b/vendor/github.com/Shopify/sarama/offset_response.go
new file mode 100644
index 0000000..8b2193f
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/offset_response.go

@@ -0,0 +1,174 @@
+package sarama
+
+type OffsetResponseBlock struct {
+	Err       KError
+	Offsets   []int64 // Version 0
+	Offset    int64   // Version 1
+	Timestamp int64   // Version 1
+}
+
+func (b *OffsetResponseBlock) decode(pd packetDecoder, version int16) (err error) {
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	b.Err = KError(tmp)
+
+	if version == 0 {
+		b.Offsets, err = pd.getInt64Array()
+
+		return err
+	}
+
+	b.Timestamp, err = pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	b.Offset, err = pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	// For backwards compatibility put the offset in the offsets array too
+	b.Offsets = []int64{b.Offset}
+
+	return nil
+}
+
+func (b *OffsetResponseBlock) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt16(int16(b.Err))
+
+	if version == 0 {
+		return pe.putInt64Array(b.Offsets)
+	}
+
+	pe.putInt64(b.Timestamp)
+	pe.putInt64(b.Offset)
+
+	return nil
+}
+
+type OffsetResponse struct {
+	Version int16
+	Blocks  map[string]map[int32]*OffsetResponseBlock
+}
+
+func (r *OffsetResponse) decode(pd packetDecoder, version int16) (err error) {
+	numTopics, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Blocks = make(map[string]map[int32]*OffsetResponseBlock, numTopics)
+	for i := 0; i < numTopics; i++ {
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		numBlocks, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		r.Blocks[name] = make(map[int32]*OffsetResponseBlock, numBlocks)
+
+		for j := 0; j < numBlocks; j++ {
+			id, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+
+			block := new(OffsetResponseBlock)
+			err = block.decode(pd, version)
+			if err != nil {
+				return err
+			}
+			r.Blocks[name][id] = block
+		}
+	}
+
+	return nil
+}
+
+func (r *OffsetResponse) GetBlock(topic string, partition int32) *OffsetResponseBlock {
+	if r.Blocks == nil {
+		return nil
+	}
+
+	if r.Blocks[topic] == nil {
+		return nil
+	}
+
+	return r.Blocks[topic][partition]
+}
+
+/*
+// [0 0 0 1 ntopics
+0 8 109 121 95 116 111 112 105 99 topic
+0 0 0 1 npartitions
+0 0 0 0 id
+0 0
+
+0 0 0 1 0 0 0 0
+0 1 1 1 0 0 0 1
+0 8 109 121 95 116 111 112
+105 99 0 0 0 1 0 0
+0 0 0 0 0 0 0 1
+0 0 0 0 0 1 1 1] <nil>
+
+*/
+func (r *OffsetResponse) encode(pe packetEncoder) (err error) {
+	if err = pe.putArrayLength(len(r.Blocks)); err != nil {
+		return err
+	}
+
+	for topic, partitions := range r.Blocks {
+		if err = pe.putString(topic); err != nil {
+			return err
+		}
+		if err = pe.putArrayLength(len(partitions)); err != nil {
+			return err
+		}
+		for partition, block := range partitions {
+			pe.putInt32(partition)
+			if err = block.encode(pe, r.version()); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (r *OffsetResponse) key() int16 {
+	return 2
+}
+
+func (r *OffsetResponse) version() int16 {
+	return r.Version
+}
+
+func (r *OffsetResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_10_1_0
+	default:
+		return MinVersion
+	}
+}
+
+// testing API
+
+func (r *OffsetResponse) AddTopicPartition(topic string, partition int32, offset int64) {
+	if r.Blocks == nil {
+		r.Blocks = make(map[string]map[int32]*OffsetResponseBlock)
+	}
+	byTopic, ok := r.Blocks[topic]
+	if !ok {
+		byTopic = make(map[int32]*OffsetResponseBlock)
+		r.Blocks[topic] = byTopic
+	}
+	byTopic[partition] = &OffsetResponseBlock{Offsets: []int64{offset}, Offset: offset}
+}

diff --git a/vendor/github.com/Shopify/sarama/packet_decoder.go b/vendor/github.com/Shopify/sarama/packet_decoder.go
new file mode 100644
index 0000000..74805cc
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/packet_decoder.go

@@ -0,0 +1,60 @@
+package sarama
+
+// PacketDecoder is the interface providing helpers for reading with Kafka's encoding rules.
+// Types implementing Decoder only need to worry about calling methods like GetString,
+// not about how a string is represented in Kafka.
+type packetDecoder interface {
+	// Primitives
+	getInt8() (int8, error)
+	getInt16() (int16, error)
+	getInt32() (int32, error)
+	getInt64() (int64, error)
+	getVarint() (int64, error)
+	getArrayLength() (int, error)
+	getBool() (bool, error)
+
+	// Collections
+	getBytes() ([]byte, error)
+	getVarintBytes() ([]byte, error)
+	getRawBytes(length int) ([]byte, error)
+	getString() (string, error)
+	getNullableString() (*string, error)
+	getInt32Array() ([]int32, error)
+	getInt64Array() ([]int64, error)
+	getStringArray() ([]string, error)
+
+	// Subsets
+	remaining() int
+	getSubset(length int) (packetDecoder, error)
+	peek(offset, length int) (packetDecoder, error) // similar to getSubset, but it doesn't advance the offset
+
+	// Stacks, see PushDecoder
+	push(in pushDecoder) error
+	pop() error
+}
+
+// PushDecoder is the interface for decoding fields like CRCs and lengths where the validity
+// of the field depends on what is after it in the packet. Start them with PacketDecoder.Push() where
+// the actual value is located in the packet, then PacketDecoder.Pop() them when all the bytes they
+// depend upon have been decoded.
+type pushDecoder interface {
+	// Saves the offset into the input buffer as the location to actually read the calculated value when able.
+	saveOffset(in int)
+
+	// Returns the length of data to reserve for the input of this encoder (eg 4 bytes for a CRC32).
+	reserveLength() int
+
+	// Indicates that all required data is now available to calculate and check the field.
+	// SaveOffset is guaranteed to have been called first. The implementation should read ReserveLength() bytes
+	// of data from the saved offset, and verify it based on the data between the saved offset and curOffset.
+	check(curOffset int, buf []byte) error
+}
+
+// dynamicPushDecoder extends the interface of pushDecoder for uses cases where the length of the
+// fields itself is unknown until its value was decoded (for instance varint encoded length
+// fields).
+// During push, dynamicPushDecoder.decode() method will be called instead of reserveLength()
+type dynamicPushDecoder interface {
+	pushDecoder
+	decoder
+}

diff --git a/vendor/github.com/Shopify/sarama/packet_encoder.go b/vendor/github.com/Shopify/sarama/packet_encoder.go
new file mode 100644
index 0000000..67b8dae
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/packet_encoder.go

@@ -0,0 +1,65 @@
+package sarama
+
+import "github.com/rcrowley/go-metrics"
+
+// PacketEncoder is the interface providing helpers for writing with Kafka's encoding rules.
+// Types implementing Encoder only need to worry about calling methods like PutString,
+// not about how a string is represented in Kafka.
+type packetEncoder interface {
+	// Primitives
+	putInt8(in int8)
+	putInt16(in int16)
+	putInt32(in int32)
+	putInt64(in int64)
+	putVarint(in int64)
+	putArrayLength(in int) error
+	putBool(in bool)
+
+	// Collections
+	putBytes(in []byte) error
+	putVarintBytes(in []byte) error
+	putRawBytes(in []byte) error
+	putString(in string) error
+	putNullableString(in *string) error
+	putStringArray(in []string) error
+	putInt32Array(in []int32) error
+	putInt64Array(in []int64) error
+
+	// Provide the current offset to record the batch size metric
+	offset() int
+
+	// Stacks, see PushEncoder
+	push(in pushEncoder)
+	pop() error
+
+	// To record metrics when provided
+	metricRegistry() metrics.Registry
+}
+
+// PushEncoder is the interface for encoding fields like CRCs and lengths where the value
+// of the field depends on what is encoded after it in the packet. Start them with PacketEncoder.Push() where
+// the actual value is located in the packet, then PacketEncoder.Pop() them when all the bytes they
+// depend upon have been written.
+type pushEncoder interface {
+	// Saves the offset into the input buffer as the location to actually write the calculated value when able.
+	saveOffset(in int)
+
+	// Returns the length of data to reserve for the output of this encoder (eg 4 bytes for a CRC32).
+	reserveLength() int
+
+	// Indicates that all required data is now available to calculate and write the field.
+	// SaveOffset is guaranteed to have been called first. The implementation should write ReserveLength() bytes
+	// of data to the saved offset, based on the data between the saved offset and curOffset.
+	run(curOffset int, buf []byte) error
+}
+
+// dynamicPushEncoder extends the interface of pushEncoder for uses cases where the length of the
+// fields itself is unknown until its value was computed (for instance varint encoded length
+// fields).
+type dynamicPushEncoder interface {
+	pushEncoder
+
+	// Called during pop() to adjust the length of the field.
+	// It should return the difference in bytes between the last computed length and current length.
+	adjustLength(currOffset int) int
+}

diff --git a/vendor/github.com/Shopify/sarama/partitioner.go b/vendor/github.com/Shopify/sarama/partitioner.go
new file mode 100644
index 0000000..6a708e7
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/partitioner.go

@@ -0,0 +1,217 @@
+package sarama
+
+import (
+	"hash"
+	"hash/fnv"
+	"math/rand"
+	"time"
+)
+
+// Partitioner is anything that, given a Kafka message and a number of partitions indexed [0...numPartitions-1],
+// decides to which partition to send the message. RandomPartitioner, RoundRobinPartitioner and HashPartitioner are provided
+// as simple default implementations.
+type Partitioner interface {
+	// Partition takes a message and partition count and chooses a partition
+	Partition(message *ProducerMessage, numPartitions int32) (int32, error)
+
+	// RequiresConsistency indicates to the user of the partitioner whether the
+	// mapping of key->partition is consistent or not. Specifically, if a
+	// partitioner requires consistency then it must be allowed to choose from all
+	// partitions (even ones known to be unavailable), and its choice must be
+	// respected by the caller. The obvious example is the HashPartitioner.
+	RequiresConsistency() bool
+}
+
+// DynamicConsistencyPartitioner can optionally be implemented by Partitioners
+// in order to allow more flexibility than is originally allowed by the
+// RequiresConsistency method in the Partitioner interface. This allows
+// partitioners to require consistency sometimes, but not all times. It's useful
+// for, e.g., the HashPartitioner, which does not require consistency if the
+// message key is nil.
+type DynamicConsistencyPartitioner interface {
+	Partitioner
+
+	// MessageRequiresConsistency is similar to Partitioner.RequiresConsistency,
+	// but takes in the message being partitioned so that the partitioner can
+	// make a per-message determination.
+	MessageRequiresConsistency(message *ProducerMessage) bool
+}
+
+// PartitionerConstructor is the type for a function capable of constructing new Partitioners.
+type PartitionerConstructor func(topic string) Partitioner
+
+type manualPartitioner struct{}
+
+// HashPartitionOption lets you modify default values of the partitioner
+type HashPartitionerOption func(*hashPartitioner)
+
+// WithAbsFirst means that the partitioner handles absolute values
+// in the same way as the reference Java implementation
+func WithAbsFirst() HashPartitionerOption {
+	return func(hp *hashPartitioner) {
+		hp.referenceAbs = true
+	}
+}
+
+// WithCustomHashFunction lets you specify what hash function to use for the partitioning
+func WithCustomHashFunction(hasher func() hash.Hash32) HashPartitionerOption {
+	return func(hp *hashPartitioner) {
+		hp.hasher = hasher()
+	}
+}
+
+// WithCustomFallbackPartitioner lets you specify what HashPartitioner should be used in case a Distribution Key is empty
+func WithCustomFallbackPartitioner(randomHP *hashPartitioner) HashPartitionerOption {
+	return func(hp *hashPartitioner) {
+		hp.random = hp
+	}
+}
+
+// NewManualPartitioner returns a Partitioner which uses the partition manually set in the provided
+// ProducerMessage's Partition field as the partition to produce to.
+func NewManualPartitioner(topic string) Partitioner {
+	return new(manualPartitioner)
+}
+
+func (p *manualPartitioner) Partition(message *ProducerMessage, numPartitions int32) (int32, error) {
+	return message.Partition, nil
+}
+
+func (p *manualPartitioner) RequiresConsistency() bool {
+	return true
+}
+
+type randomPartitioner struct {
+	generator *rand.Rand
+}
+
+// NewRandomPartitioner returns a Partitioner which chooses a random partition each time.
+func NewRandomPartitioner(topic string) Partitioner {
+	p := new(randomPartitioner)
+	p.generator = rand.New(rand.NewSource(time.Now().UTC().UnixNano()))
+	return p
+}
+
+func (p *randomPartitioner) Partition(message *ProducerMessage, numPartitions int32) (int32, error) {
+	return int32(p.generator.Intn(int(numPartitions))), nil
+}
+
+func (p *randomPartitioner) RequiresConsistency() bool {
+	return false
+}
+
+type roundRobinPartitioner struct {
+	partition int32
+}
+
+// NewRoundRobinPartitioner returns a Partitioner which walks through the available partitions one at a time.
+func NewRoundRobinPartitioner(topic string) Partitioner {
+	return &roundRobinPartitioner{}
+}
+
+func (p *roundRobinPartitioner) Partition(message *ProducerMessage, numPartitions int32) (int32, error) {
+	if p.partition >= numPartitions {
+		p.partition = 0
+	}
+	ret := p.partition
+	p.partition++
+	return ret, nil
+}
+
+func (p *roundRobinPartitioner) RequiresConsistency() bool {
+	return false
+}
+
+type hashPartitioner struct {
+	random       Partitioner
+	hasher       hash.Hash32
+	referenceAbs bool
+}
+
+// NewCustomHashPartitioner is a wrapper around NewHashPartitioner, allowing the use of custom hasher.
+// The argument is a function providing the instance, implementing the hash.Hash32 interface. This is to ensure that
+// each partition dispatcher gets its own hasher, to avoid concurrency issues by sharing an instance.
+func NewCustomHashPartitioner(hasher func() hash.Hash32) PartitionerConstructor {
+	return func(topic string) Partitioner {
+		p := new(hashPartitioner)
+		p.random = NewRandomPartitioner(topic)
+		p.hasher = hasher()
+		p.referenceAbs = false
+		return p
+	}
+}
+
+// NewCustomPartitioner creates a default Partitioner but lets you specify the behavior of each component via options
+func NewCustomPartitioner(options ...HashPartitionerOption) PartitionerConstructor {
+	return func(topic string) Partitioner {
+		p := new(hashPartitioner)
+		p.random = NewRandomPartitioner(topic)
+		p.hasher = fnv.New32a()
+		p.referenceAbs = false
+		for _, option := range options {
+			option(p)
+		}
+		return p
+	}
+}
+
+// NewHashPartitioner returns a Partitioner which behaves as follows. If the message's key is nil then a
+// random partition is chosen. Otherwise the FNV-1a hash of the encoded bytes of the message key is used,
+// modulus the number of partitions. This ensures that messages with the same key always end up on the
+// same partition.
+func NewHashPartitioner(topic string) Partitioner {
+	p := new(hashPartitioner)
+	p.random = NewRandomPartitioner(topic)
+	p.hasher = fnv.New32a()
+	p.referenceAbs = false
+	return p
+}
+
+// NewReferenceHashPartitioner is like NewHashPartitioner except that it handles absolute values
+// in the same way as the reference Java implementation. NewHashPartitioner was supposed to do
+// that but it had a mistake and now there are people depending on both behaviours. This will
+// all go away on the next major version bump.
+func NewReferenceHashPartitioner(topic string) Partitioner {
+	p := new(hashPartitioner)
+	p.random = NewRandomPartitioner(topic)
+	p.hasher = fnv.New32a()
+	p.referenceAbs = true
+	return p
+}
+
+func (p *hashPartitioner) Partition(message *ProducerMessage, numPartitions int32) (int32, error) {
+	if message.Key == nil {
+		return p.random.Partition(message, numPartitions)
+	}
+	bytes, err := message.Key.Encode()
+	if err != nil {
+		return -1, err
+	}
+	p.hasher.Reset()
+	_, err = p.hasher.Write(bytes)
+	if err != nil {
+		return -1, err
+	}
+	var partition int32
+	// Turns out we were doing our absolute value in a subtly different way from the upstream
+	// implementation, but now we need to maintain backwards compat for people who started using
+	// the old version; if referenceAbs is set we are compatible with the reference java client
+	// but not past Sarama versions
+	if p.referenceAbs {
+		partition = (int32(p.hasher.Sum32()) & 0x7fffffff) % numPartitions
+	} else {
+		partition = int32(p.hasher.Sum32()) % numPartitions
+		if partition < 0 {
+			partition = -partition
+		}
+	}
+	return partition, nil
+}
+
+func (p *hashPartitioner) RequiresConsistency() bool {
+	return true
+}
+
+func (p *hashPartitioner) MessageRequiresConsistency(message *ProducerMessage) bool {
+	return message.Key != nil
+}

diff --git a/vendor/github.com/Shopify/sarama/prep_encoder.go b/vendor/github.com/Shopify/sarama/prep_encoder.go
new file mode 100644
index 0000000..b633cd1
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/prep_encoder.go

@@ -0,0 +1,153 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+type prepEncoder struct {
+	stack  []pushEncoder
+	length int
+}
+
+// primitives
+
+func (pe *prepEncoder) putInt8(in int8) {
+	pe.length++
+}
+
+func (pe *prepEncoder) putInt16(in int16) {
+	pe.length += 2
+}
+
+func (pe *prepEncoder) putInt32(in int32) {
+	pe.length += 4
+}
+
+func (pe *prepEncoder) putInt64(in int64) {
+	pe.length += 8
+}
+
+func (pe *prepEncoder) putVarint(in int64) {
+	var buf [binary.MaxVarintLen64]byte
+	pe.length += binary.PutVarint(buf[:], in)
+}
+
+func (pe *prepEncoder) putArrayLength(in int) error {
+	if in > math.MaxInt32 {
+		return PacketEncodingError{fmt.Sprintf("array too long (%d)", in)}
+	}
+	pe.length += 4
+	return nil
+}
+
+func (pe *prepEncoder) putBool(in bool) {
+	pe.length++
+}
+
+// arrays
+
+func (pe *prepEncoder) putBytes(in []byte) error {
+	pe.length += 4
+	if in == nil {
+		return nil
+	}
+	return pe.putRawBytes(in)
+}
+
+func (pe *prepEncoder) putVarintBytes(in []byte) error {
+	if in == nil {
+		pe.putVarint(-1)
+		return nil
+	}
+	pe.putVarint(int64(len(in)))
+	return pe.putRawBytes(in)
+}
+
+func (pe *prepEncoder) putRawBytes(in []byte) error {
+	if len(in) > math.MaxInt32 {
+		return PacketEncodingError{fmt.Sprintf("byteslice too long (%d)", len(in))}
+	}
+	pe.length += len(in)
+	return nil
+}
+
+func (pe *prepEncoder) putNullableString(in *string) error {
+	if in == nil {
+		pe.length += 2
+		return nil
+	}
+	return pe.putString(*in)
+}
+
+func (pe *prepEncoder) putString(in string) error {
+	pe.length += 2
+	if len(in) > math.MaxInt16 {
+		return PacketEncodingError{fmt.Sprintf("string too long (%d)", len(in))}
+	}
+	pe.length += len(in)
+	return nil
+}
+
+func (pe *prepEncoder) putStringArray(in []string) error {
+	err := pe.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+
+	for _, str := range in {
+		if err := pe.putString(str); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (pe *prepEncoder) putInt32Array(in []int32) error {
+	err := pe.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+	pe.length += 4 * len(in)
+	return nil
+}
+
+func (pe *prepEncoder) putInt64Array(in []int64) error {
+	err := pe.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+	pe.length += 8 * len(in)
+	return nil
+}
+
+func (pe *prepEncoder) offset() int {
+	return pe.length
+}
+
+// stackable
+
+func (pe *prepEncoder) push(in pushEncoder) {
+	in.saveOffset(pe.length)
+	pe.length += in.reserveLength()
+	pe.stack = append(pe.stack, in)
+}
+
+func (pe *prepEncoder) pop() error {
+	in := pe.stack[len(pe.stack)-1]
+	pe.stack = pe.stack[:len(pe.stack)-1]
+	if dpe, ok := in.(dynamicPushEncoder); ok {
+		pe.length += dpe.adjustLength(pe.length)
+	}
+
+	return nil
+}
+
+// we do not record metrics during the prep encoder pass
+func (pe *prepEncoder) metricRegistry() metrics.Registry {
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/produce_request.go b/vendor/github.com/Shopify/sarama/produce_request.go
new file mode 100644
index 0000000..0c755d0
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/produce_request.go

@@ -0,0 +1,252 @@
+package sarama
+
+import "github.com/rcrowley/go-metrics"
+
+// RequiredAcks is used in Produce Requests to tell the broker how many replica acknowledgements
+// it must see before responding. Any of the constants defined here are valid. On broker versions
+// prior to 0.8.2.0 any other positive int16 is also valid (the broker will wait for that many
+// acknowledgements) but in 0.8.2.0 and later this will raise an exception (it has been replaced
+// by setting the `min.isr` value in the brokers configuration).
+type RequiredAcks int16
+
+const (
+	// NoResponse doesn't send any response, the TCP ACK is all you get.
+	NoResponse RequiredAcks = 0
+	// WaitForLocal waits for only the local commit to succeed before responding.
+	WaitForLocal RequiredAcks = 1
+	// WaitForAll waits for all in-sync replicas to commit before responding.
+	// The minimum number of in-sync replicas is configured on the broker via
+	// the `min.insync.replicas` configuration key.
+	WaitForAll RequiredAcks = -1
+)
+
+type ProduceRequest struct {
+	TransactionalID *string
+	RequiredAcks    RequiredAcks
+	Timeout         int32
+	Version         int16 // v1 requires Kafka 0.9, v2 requires Kafka 0.10, v3 requires Kafka 0.11
+	records         map[string]map[int32]Records
+}
+
+func updateMsgSetMetrics(msgSet *MessageSet, compressionRatioMetric metrics.Histogram,
+	topicCompressionRatioMetric metrics.Histogram) int64 {
+	var topicRecordCount int64
+	for _, messageBlock := range msgSet.Messages {
+		// Is this a fake "message" wrapping real messages?
+		if messageBlock.Msg.Set != nil {
+			topicRecordCount += int64(len(messageBlock.Msg.Set.Messages))
+		} else {
+			// A single uncompressed message
+			topicRecordCount++
+		}
+		// Better be safe than sorry when computing the compression ratio
+		if messageBlock.Msg.compressedSize != 0 {
+			compressionRatio := float64(len(messageBlock.Msg.Value)) /
+				float64(messageBlock.Msg.compressedSize)
+			// Histogram do not support decimal values, let's multiple it by 100 for better precision
+			intCompressionRatio := int64(100 * compressionRatio)
+			compressionRatioMetric.Update(intCompressionRatio)
+			topicCompressionRatioMetric.Update(intCompressionRatio)
+		}
+	}
+	return topicRecordCount
+}
+
+func updateBatchMetrics(recordBatch *RecordBatch, compressionRatioMetric metrics.Histogram,
+	topicCompressionRatioMetric metrics.Histogram) int64 {
+	if recordBatch.compressedRecords != nil {
+		compressionRatio := int64(float64(recordBatch.recordsLen) / float64(len(recordBatch.compressedRecords)) * 100)
+		compressionRatioMetric.Update(compressionRatio)
+		topicCompressionRatioMetric.Update(compressionRatio)
+	}
+
+	return int64(len(recordBatch.Records))
+}
+
+func (r *ProduceRequest) encode(pe packetEncoder) error {
+	if r.Version >= 3 {
+		if err := pe.putNullableString(r.TransactionalID); err != nil {
+			return err
+		}
+	}
+	pe.putInt16(int16(r.RequiredAcks))
+	pe.putInt32(r.Timeout)
+	metricRegistry := pe.metricRegistry()
+	var batchSizeMetric metrics.Histogram
+	var compressionRatioMetric metrics.Histogram
+	if metricRegistry != nil {
+		batchSizeMetric = getOrRegisterHistogram("batch-size", metricRegistry)
+		compressionRatioMetric = getOrRegisterHistogram("compression-ratio", metricRegistry)
+	}
+	totalRecordCount := int64(0)
+
+	err := pe.putArrayLength(len(r.records))
+	if err != nil {
+		return err
+	}
+
+	for topic, partitions := range r.records {
+		err = pe.putString(topic)
+		if err != nil {
+			return err
+		}
+		err = pe.putArrayLength(len(partitions))
+		if err != nil {
+			return err
+		}
+		topicRecordCount := int64(0)
+		var topicCompressionRatioMetric metrics.Histogram
+		if metricRegistry != nil {
+			topicCompressionRatioMetric = getOrRegisterTopicHistogram("compression-ratio", topic, metricRegistry)
+		}
+		for id, records := range partitions {
+			startOffset := pe.offset()
+			pe.putInt32(id)
+			pe.push(&lengthField{})
+			err = records.encode(pe)
+			if err != nil {
+				return err
+			}
+			err = pe.pop()
+			if err != nil {
+				return err
+			}
+			if metricRegistry != nil {
+				if r.Version >= 3 {
+					topicRecordCount += updateBatchMetrics(records.RecordBatch, compressionRatioMetric, topicCompressionRatioMetric)
+				} else {
+					topicRecordCount += updateMsgSetMetrics(records.MsgSet, compressionRatioMetric, topicCompressionRatioMetric)
+				}
+				batchSize := int64(pe.offset() - startOffset)
+				batchSizeMetric.Update(batchSize)
+				getOrRegisterTopicHistogram("batch-size", topic, metricRegistry).Update(batchSize)
+			}
+		}
+		if topicRecordCount > 0 {
+			getOrRegisterTopicMeter("record-send-rate", topic, metricRegistry).Mark(topicRecordCount)
+			getOrRegisterTopicHistogram("records-per-request", topic, metricRegistry).Update(topicRecordCount)
+			totalRecordCount += topicRecordCount
+		}
+	}
+	if totalRecordCount > 0 {
+		metrics.GetOrRegisterMeter("record-send-rate", metricRegistry).Mark(totalRecordCount)
+		getOrRegisterHistogram("records-per-request", metricRegistry).Update(totalRecordCount)
+	}
+
+	return nil
+}
+
+func (r *ProduceRequest) decode(pd packetDecoder, version int16) error {
+	r.Version = version
+
+	if version >= 3 {
+		id, err := pd.getNullableString()
+		if err != nil {
+			return err
+		}
+		r.TransactionalID = id
+	}
+	requiredAcks, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	r.RequiredAcks = RequiredAcks(requiredAcks)
+	if r.Timeout, err = pd.getInt32(); err != nil {
+		return err
+	}
+	topicCount, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if topicCount == 0 {
+		return nil
+	}
+
+	r.records = make(map[string]map[int32]Records)
+	for i := 0; i < topicCount; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		partitionCount, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+		r.records[topic] = make(map[int32]Records)
+
+		for j := 0; j < partitionCount; j++ {
+			partition, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			size, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+			recordsDecoder, err := pd.getSubset(int(size))
+			if err != nil {
+				return err
+			}
+			var records Records
+			if err := records.decode(recordsDecoder); err != nil {
+				return err
+			}
+			r.records[topic][partition] = records
+		}
+	}
+
+	return nil
+}
+
+func (r *ProduceRequest) key() int16 {
+	return 0
+}
+
+func (r *ProduceRequest) version() int16 {
+	return r.Version
+}
+
+func (r *ProduceRequest) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_9_0_0
+	case 2:
+		return V0_10_0_0
+	case 3:
+		return V0_11_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *ProduceRequest) ensureRecords(topic string, partition int32) {
+	if r.records == nil {
+		r.records = make(map[string]map[int32]Records)
+	}
+
+	if r.records[topic] == nil {
+		r.records[topic] = make(map[int32]Records)
+	}
+}
+
+func (r *ProduceRequest) AddMessage(topic string, partition int32, msg *Message) {
+	r.ensureRecords(topic, partition)
+	set := r.records[topic][partition].MsgSet
+
+	if set == nil {
+		set = new(MessageSet)
+		r.records[topic][partition] = newLegacyRecords(set)
+	}
+
+	set.addMessage(msg)
+}
+
+func (r *ProduceRequest) AddSet(topic string, partition int32, set *MessageSet) {
+	r.ensureRecords(topic, partition)
+	r.records[topic][partition] = newLegacyRecords(set)
+}
+
+func (r *ProduceRequest) AddBatch(topic string, partition int32, batch *RecordBatch) {
+	r.ensureRecords(topic, partition)
+	r.records[topic][partition] = newDefaultRecords(batch)
+}

diff --git a/vendor/github.com/Shopify/sarama/produce_response.go b/vendor/github.com/Shopify/sarama/produce_response.go
new file mode 100644
index 0000000..4c5cd35
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/produce_response.go

@@ -0,0 +1,189 @@
+package sarama
+
+import (
+	"fmt"
+	"time"
+)
+
+type ProduceResponseBlock struct {
+	Err    KError
+	Offset int64
+	// only provided if Version >= 2 and the broker is configured with `LogAppendTime`
+	Timestamp time.Time
+}
+
+func (b *ProduceResponseBlock) decode(pd packetDecoder, version int16) (err error) {
+	tmp, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	b.Err = KError(tmp)
+
+	b.Offset, err = pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	if version >= 2 {
+		if millis, err := pd.getInt64(); err != nil {
+			return err
+		} else if millis != -1 {
+			b.Timestamp = time.Unix(millis/1000, (millis%1000)*int64(time.Millisecond))
+		}
+	}
+
+	return nil
+}
+
+func (b *ProduceResponseBlock) encode(pe packetEncoder, version int16) (err error) {
+	pe.putInt16(int16(b.Err))
+	pe.putInt64(b.Offset)
+
+	if version >= 2 {
+		timestamp := int64(-1)
+		if !b.Timestamp.Before(time.Unix(0, 0)) {
+			timestamp = b.Timestamp.UnixNano() / int64(time.Millisecond)
+		} else if !b.Timestamp.IsZero() {
+			return PacketEncodingError{fmt.Sprintf("invalid timestamp (%v)", b.Timestamp)}
+		}
+		pe.putInt64(timestamp)
+	}
+
+	return nil
+}
+
+type ProduceResponse struct {
+	Blocks       map[string]map[int32]*ProduceResponseBlock
+	Version      int16
+	ThrottleTime time.Duration // only provided if Version >= 1
+}
+
+func (r *ProduceResponse) decode(pd packetDecoder, version int16) (err error) {
+	r.Version = version
+
+	numTopics, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	r.Blocks = make(map[string]map[int32]*ProduceResponseBlock, numTopics)
+	for i := 0; i < numTopics; i++ {
+		name, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		numBlocks, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		r.Blocks[name] = make(map[int32]*ProduceResponseBlock, numBlocks)
+
+		for j := 0; j < numBlocks; j++ {
+			id, err := pd.getInt32()
+			if err != nil {
+				return err
+			}
+
+			block := new(ProduceResponseBlock)
+			err = block.decode(pd, version)
+			if err != nil {
+				return err
+			}
+			r.Blocks[name][id] = block
+		}
+	}
+
+	if r.Version >= 1 {
+		millis, err := pd.getInt32()
+		if err != nil {
+			return err
+		}
+
+		r.ThrottleTime = time.Duration(millis) * time.Millisecond
+	}
+
+	return nil
+}
+
+func (r *ProduceResponse) encode(pe packetEncoder) error {
+	err := pe.putArrayLength(len(r.Blocks))
+	if err != nil {
+		return err
+	}
+	for topic, partitions := range r.Blocks {
+		err = pe.putString(topic)
+		if err != nil {
+			return err
+		}
+		err = pe.putArrayLength(len(partitions))
+		if err != nil {
+			return err
+		}
+		for id, prb := range partitions {
+			pe.putInt32(id)
+			err = prb.encode(pe, r.Version)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	if r.Version >= 1 {
+		pe.putInt32(int32(r.ThrottleTime / time.Millisecond))
+	}
+	return nil
+}
+
+func (r *ProduceResponse) key() int16 {
+	return 0
+}
+
+func (r *ProduceResponse) version() int16 {
+	return r.Version
+}
+
+func (r *ProduceResponse) requiredVersion() KafkaVersion {
+	switch r.Version {
+	case 1:
+		return V0_9_0_0
+	case 2:
+		return V0_10_0_0
+	case 3:
+		return V0_11_0_0
+	default:
+		return MinVersion
+	}
+}
+
+func (r *ProduceResponse) GetBlock(topic string, partition int32) *ProduceResponseBlock {
+	if r.Blocks == nil {
+		return nil
+	}
+
+	if r.Blocks[topic] == nil {
+		return nil
+	}
+
+	return r.Blocks[topic][partition]
+}
+
+// Testing API
+
+func (r *ProduceResponse) AddTopicPartition(topic string, partition int32, err KError) {
+	if r.Blocks == nil {
+		r.Blocks = make(map[string]map[int32]*ProduceResponseBlock)
+	}
+	byTopic, ok := r.Blocks[topic]
+	if !ok {
+		byTopic = make(map[int32]*ProduceResponseBlock)
+		r.Blocks[topic] = byTopic
+	}
+	block := &ProduceResponseBlock{
+		Err: err,
+	}
+	if r.Version >= 2 {
+		block.Timestamp = time.Now()
+	}
+	byTopic[partition] = block
+}

diff --git a/vendor/github.com/Shopify/sarama/produce_set.go b/vendor/github.com/Shopify/sarama/produce_set.go
new file mode 100644
index 0000000..219ec5f
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/produce_set.go

@@ -0,0 +1,259 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"errors"
+	"time"
+)
+
+type partitionSet struct {
+	msgs          []*ProducerMessage
+	recordsToSend Records
+	bufferBytes   int
+}
+
+type produceSet struct {
+	parent *asyncProducer
+	msgs   map[string]map[int32]*partitionSet
+
+	bufferBytes int
+	bufferCount int
+}
+
+func newProduceSet(parent *asyncProducer) *produceSet {
+	return &produceSet{
+		msgs:   make(map[string]map[int32]*partitionSet),
+		parent: parent,
+	}
+}
+
+func (ps *produceSet) add(msg *ProducerMessage) error {
+	var err error
+	var key, val []byte
+
+	if msg.Key != nil {
+		if key, err = msg.Key.Encode(); err != nil {
+			return err
+		}
+	}
+
+	if msg.Value != nil {
+		if val, err = msg.Value.Encode(); err != nil {
+			return err
+		}
+	}
+
+	timestamp := msg.Timestamp
+	if msg.Timestamp.IsZero() {
+		timestamp = time.Now()
+	}
+
+	partitions := ps.msgs[msg.Topic]
+	if partitions == nil {
+		partitions = make(map[int32]*partitionSet)
+		ps.msgs[msg.Topic] = partitions
+	}
+
+	var size int
+
+	set := partitions[msg.Partition]
+	if set == nil {
+		if ps.parent.conf.Version.IsAtLeast(V0_11_0_0) {
+			batch := &RecordBatch{
+				FirstTimestamp:   timestamp,
+				Version:          2,
+				Codec:            ps.parent.conf.Producer.Compression,
+				CompressionLevel: ps.parent.conf.Producer.CompressionLevel,
+				ProducerID:       ps.parent.txnmgr.producerID,
+				ProducerEpoch:    ps.parent.txnmgr.producerEpoch,
+			}
+			if ps.parent.conf.Producer.Idempotent {
+				batch.FirstSequence = msg.sequenceNumber
+			}
+			set = &partitionSet{recordsToSend: newDefaultRecords(batch)}
+			size = recordBatchOverhead
+		} else {
+			set = &partitionSet{recordsToSend: newLegacyRecords(new(MessageSet))}
+		}
+		partitions[msg.Partition] = set
+	}
+	set.msgs = append(set.msgs, msg)
+
+	if ps.parent.conf.Version.IsAtLeast(V0_11_0_0) {
+		if ps.parent.conf.Producer.Idempotent && msg.sequenceNumber < set.recordsToSend.RecordBatch.FirstSequence {
+			return errors.New("Assertion failed: Message out of sequence added to a batch")
+		}
+		// We are being conservative here to avoid having to prep encode the record
+		size += maximumRecordOverhead
+		rec := &Record{
+			Key:            key,
+			Value:          val,
+			TimestampDelta: timestamp.Sub(set.recordsToSend.RecordBatch.FirstTimestamp),
+		}
+		size += len(key) + len(val)
+		if len(msg.Headers) > 0 {
+			rec.Headers = make([]*RecordHeader, len(msg.Headers))
+			for i := range msg.Headers {
+				rec.Headers[i] = &msg.Headers[i]
+				size += len(rec.Headers[i].Key) + len(rec.Headers[i].Value) + 2*binary.MaxVarintLen32
+			}
+		}
+		set.recordsToSend.RecordBatch.addRecord(rec)
+	} else {
+		msgToSend := &Message{Codec: CompressionNone, Key: key, Value: val}
+		if ps.parent.conf.Version.IsAtLeast(V0_10_0_0) {
+			msgToSend.Timestamp = timestamp
+			msgToSend.Version = 1
+		}
+		set.recordsToSend.MsgSet.addMessage(msgToSend)
+		size = producerMessageOverhead + len(key) + len(val)
+	}
+
+	set.bufferBytes += size
+	ps.bufferBytes += size
+	ps.bufferCount++
+
+	return nil
+}
+
+func (ps *produceSet) buildRequest() *ProduceRequest {
+	req := &ProduceRequest{
+		RequiredAcks: ps.parent.conf.Producer.RequiredAcks,
+		Timeout:      int32(ps.parent.conf.Producer.Timeout / time.Millisecond),
+	}
+	if ps.parent.conf.Version.IsAtLeast(V0_10_0_0) {
+		req.Version = 2
+	}
+	if ps.parent.conf.Version.IsAtLeast(V0_11_0_0) {
+		req.Version = 3
+	}
+
+	for topic, partitionSets := range ps.msgs {
+		for partition, set := range partitionSets {
+			if req.Version >= 3 {
+				// If the API version we're hitting is 3 or greater, we need to calculate
+				// offsets for each record in the batch relative to FirstOffset.
+				// Additionally, we must set LastOffsetDelta to the value of the last offset
+				// in the batch. Since the OffsetDelta of the first record is 0, we know that the
+				// final record of any batch will have an offset of (# of records in batch) - 1.
+				// (See https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-Messagesets
+				//  under the RecordBatch section for details.)
+				rb := set.recordsToSend.RecordBatch
+				if len(rb.Records) > 0 {
+					rb.LastOffsetDelta = int32(len(rb.Records) - 1)
+					for i, record := range rb.Records {
+						record.OffsetDelta = int64(i)
+					}
+				}
+				req.AddBatch(topic, partition, rb)
+				continue
+			}
+			if ps.parent.conf.Producer.Compression == CompressionNone {
+				req.AddSet(topic, partition, set.recordsToSend.MsgSet)
+			} else {
+				// When compression is enabled, the entire set for each partition is compressed
+				// and sent as the payload of a single fake "message" with the appropriate codec
+				// set and no key. When the server sees a message with a compression codec, it
+				// decompresses the payload and treats the result as its message set.
+
+				if ps.parent.conf.Version.IsAtLeast(V0_10_0_0) {
+					// If our version is 0.10 or later, assign relative offsets
+					// to the inner messages. This lets the broker avoid
+					// recompressing the message set.
+					// (See https://cwiki.apache.org/confluence/display/KAFKA/KIP-31+-+Move+to+relative+offsets+in+compressed+message+sets
+					// for details on relative offsets.)
+					for i, msg := range set.recordsToSend.MsgSet.Messages {
+						msg.Offset = int64(i)
+					}
+				}
+				payload, err := encode(set.recordsToSend.MsgSet, ps.parent.conf.MetricRegistry)
+				if err != nil {
+					Logger.Println(err) // if this happens, it's basically our fault.
+					panic(err)
+				}
+				compMsg := &Message{
+					Codec:            ps.parent.conf.Producer.Compression,
+					CompressionLevel: ps.parent.conf.Producer.CompressionLevel,
+					Key:              nil,
+					Value:            payload,
+					Set:              set.recordsToSend.MsgSet, // Provide the underlying message set for accurate metrics
+				}
+				if ps.parent.conf.Version.IsAtLeast(V0_10_0_0) {
+					compMsg.Version = 1
+					compMsg.Timestamp = set.recordsToSend.MsgSet.Messages[0].Msg.Timestamp
+				}
+				req.AddMessage(topic, partition, compMsg)
+			}
+		}
+	}
+
+	return req
+}
+
+func (ps *produceSet) eachPartition(cb func(topic string, partition int32, pSet *partitionSet)) {
+	for topic, partitionSet := range ps.msgs {
+		for partition, set := range partitionSet {
+			cb(topic, partition, set)
+		}
+	}
+}
+
+func (ps *produceSet) dropPartition(topic string, partition int32) []*ProducerMessage {
+	if ps.msgs[topic] == nil {
+		return nil
+	}
+	set := ps.msgs[topic][partition]
+	if set == nil {
+		return nil
+	}
+	ps.bufferBytes -= set.bufferBytes
+	ps.bufferCount -= len(set.msgs)
+	delete(ps.msgs[topic], partition)
+	return set.msgs
+}
+
+func (ps *produceSet) wouldOverflow(msg *ProducerMessage) bool {
+	version := 1
+	if ps.parent.conf.Version.IsAtLeast(V0_11_0_0) {
+		version = 2
+	}
+
+	switch {
+	// Would we overflow our maximum possible size-on-the-wire? 10KiB is arbitrary overhead for safety.
+	case ps.bufferBytes+msg.byteSize(version) >= int(MaxRequestSize-(10*1024)):
+		return true
+	// Would we overflow the size-limit of a compressed message-batch for this partition?
+	case ps.parent.conf.Producer.Compression != CompressionNone &&
+		ps.msgs[msg.Topic] != nil && ps.msgs[msg.Topic][msg.Partition] != nil &&
+		ps.msgs[msg.Topic][msg.Partition].bufferBytes+msg.byteSize(version) >= ps.parent.conf.Producer.MaxMessageBytes:
+		return true
+	// Would we overflow simply in number of messages?
+	case ps.parent.conf.Producer.Flush.MaxMessages > 0 && ps.bufferCount >= ps.parent.conf.Producer.Flush.MaxMessages:
+		return true
+	default:
+		return false
+	}
+}
+
+func (ps *produceSet) readyToFlush() bool {
+	switch {
+	// If we don't have any messages, nothing else matters
+	case ps.empty():
+		return false
+	// If all three config values are 0, we always flush as-fast-as-possible
+	case ps.parent.conf.Producer.Flush.Frequency == 0 && ps.parent.conf.Producer.Flush.Bytes == 0 && ps.parent.conf.Producer.Flush.Messages == 0:
+		return true
+	// If we've passed the message trigger-point
+	case ps.parent.conf.Producer.Flush.Messages > 0 && ps.bufferCount >= ps.parent.conf.Producer.Flush.Messages:
+		return true
+	// If we've passed the byte trigger-point
+	case ps.parent.conf.Producer.Flush.Bytes > 0 && ps.bufferBytes >= ps.parent.conf.Producer.Flush.Bytes:
+		return true
+	default:
+		return false
+	}
+}
+
+func (ps *produceSet) empty() bool {
+	return ps.bufferCount == 0
+}

diff --git a/vendor/github.com/Shopify/sarama/real_decoder.go b/vendor/github.com/Shopify/sarama/real_decoder.go
new file mode 100644
index 0000000..23045e7
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/real_decoder.go

@@ -0,0 +1,324 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"math"
+)
+
+var errInvalidArrayLength = PacketDecodingError{"invalid array length"}
+var errInvalidByteSliceLength = PacketDecodingError{"invalid byteslice length"}
+var errInvalidByteSliceLengthType = PacketDecodingError{"invalid byteslice length type"}
+var errInvalidStringLength = PacketDecodingError{"invalid string length"}
+var errInvalidSubsetSize = PacketDecodingError{"invalid subset size"}
+var errVarintOverflow = PacketDecodingError{"varint overflow"}
+var errInvalidBool = PacketDecodingError{"invalid bool"}
+
+type realDecoder struct {
+	raw   []byte
+	off   int
+	stack []pushDecoder
+}
+
+// primitives
+
+func (rd *realDecoder) getInt8() (int8, error) {
+	if rd.remaining() < 1 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	tmp := int8(rd.raw[rd.off])
+	rd.off++
+	return tmp, nil
+}
+
+func (rd *realDecoder) getInt16() (int16, error) {
+	if rd.remaining() < 2 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	tmp := int16(binary.BigEndian.Uint16(rd.raw[rd.off:]))
+	rd.off += 2
+	return tmp, nil
+}
+
+func (rd *realDecoder) getInt32() (int32, error) {
+	if rd.remaining() < 4 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	tmp := int32(binary.BigEndian.Uint32(rd.raw[rd.off:]))
+	rd.off += 4
+	return tmp, nil
+}
+
+func (rd *realDecoder) getInt64() (int64, error) {
+	if rd.remaining() < 8 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	tmp := int64(binary.BigEndian.Uint64(rd.raw[rd.off:]))
+	rd.off += 8
+	return tmp, nil
+}
+
+func (rd *realDecoder) getVarint() (int64, error) {
+	tmp, n := binary.Varint(rd.raw[rd.off:])
+	if n == 0 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	if n < 0 {
+		rd.off -= n
+		return -1, errVarintOverflow
+	}
+	rd.off += n
+	return tmp, nil
+}
+
+func (rd *realDecoder) getArrayLength() (int, error) {
+	if rd.remaining() < 4 {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	}
+	tmp := int(int32(binary.BigEndian.Uint32(rd.raw[rd.off:])))
+	rd.off += 4
+	if tmp > rd.remaining() {
+		rd.off = len(rd.raw)
+		return -1, ErrInsufficientData
+	} else if tmp > 2*math.MaxUint16 {
+		return -1, errInvalidArrayLength
+	}
+	return tmp, nil
+}
+
+func (rd *realDecoder) getBool() (bool, error) {
+	b, err := rd.getInt8()
+	if err != nil || b == 0 {
+		return false, err
+	}
+	if b != 1 {
+		return false, errInvalidBool
+	}
+	return true, nil
+}
+
+// collections
+
+func (rd *realDecoder) getBytes() ([]byte, error) {
+	tmp, err := rd.getInt32()
+	if err != nil {
+		return nil, err
+	}
+	if tmp == -1 {
+		return nil, nil
+	}
+
+	return rd.getRawBytes(int(tmp))
+}
+
+func (rd *realDecoder) getVarintBytes() ([]byte, error) {
+	tmp, err := rd.getVarint()
+	if err != nil {
+		return nil, err
+	}
+	if tmp == -1 {
+		return nil, nil
+	}
+
+	return rd.getRawBytes(int(tmp))
+}
+
+func (rd *realDecoder) getStringLength() (int, error) {
+	length, err := rd.getInt16()
+	if err != nil {
+		return 0, err
+	}
+
+	n := int(length)
+
+	switch {
+	case n < -1:
+		return 0, errInvalidStringLength
+	case n > rd.remaining():
+		rd.off = len(rd.raw)
+		return 0, ErrInsufficientData
+	}
+
+	return n, nil
+}
+
+func (rd *realDecoder) getString() (string, error) {
+	n, err := rd.getStringLength()
+	if err != nil || n == -1 {
+		return "", err
+	}
+
+	tmpStr := string(rd.raw[rd.off : rd.off+n])
+	rd.off += n
+	return tmpStr, nil
+}
+
+func (rd *realDecoder) getNullableString() (*string, error) {
+	n, err := rd.getStringLength()
+	if err != nil || n == -1 {
+		return nil, err
+	}
+
+	tmpStr := string(rd.raw[rd.off : rd.off+n])
+	rd.off += n
+	return &tmpStr, err
+}
+
+func (rd *realDecoder) getInt32Array() ([]int32, error) {
+	if rd.remaining() < 4 {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+	n := int(binary.BigEndian.Uint32(rd.raw[rd.off:]))
+	rd.off += 4
+
+	if rd.remaining() < 4*n {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+
+	if n == 0 {
+		return nil, nil
+	}
+
+	if n < 0 {
+		return nil, errInvalidArrayLength
+	}
+
+	ret := make([]int32, n)
+	for i := range ret {
+		ret[i] = int32(binary.BigEndian.Uint32(rd.raw[rd.off:]))
+		rd.off += 4
+	}
+	return ret, nil
+}
+
+func (rd *realDecoder) getInt64Array() ([]int64, error) {
+	if rd.remaining() < 4 {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+	n := int(binary.BigEndian.Uint32(rd.raw[rd.off:]))
+	rd.off += 4
+
+	if rd.remaining() < 8*n {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+
+	if n == 0 {
+		return nil, nil
+	}
+
+	if n < 0 {
+		return nil, errInvalidArrayLength
+	}
+
+	ret := make([]int64, n)
+	for i := range ret {
+		ret[i] = int64(binary.BigEndian.Uint64(rd.raw[rd.off:]))
+		rd.off += 8
+	}
+	return ret, nil
+}
+
+func (rd *realDecoder) getStringArray() ([]string, error) {
+	if rd.remaining() < 4 {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+	n := int(binary.BigEndian.Uint32(rd.raw[rd.off:]))
+	rd.off += 4
+
+	if n == 0 {
+		return nil, nil
+	}
+
+	if n < 0 {
+		return nil, errInvalidArrayLength
+	}
+
+	ret := make([]string, n)
+	for i := range ret {
+		str, err := rd.getString()
+		if err != nil {
+			return nil, err
+		}
+
+		ret[i] = str
+	}
+	return ret, nil
+}
+
+// subsets
+
+func (rd *realDecoder) remaining() int {
+	return len(rd.raw) - rd.off
+}
+
+func (rd *realDecoder) getSubset(length int) (packetDecoder, error) {
+	buf, err := rd.getRawBytes(length)
+	if err != nil {
+		return nil, err
+	}
+	return &realDecoder{raw: buf}, nil
+}
+
+func (rd *realDecoder) getRawBytes(length int) ([]byte, error) {
+	if length < 0 {
+		return nil, errInvalidByteSliceLength
+	} else if length > rd.remaining() {
+		rd.off = len(rd.raw)
+		return nil, ErrInsufficientData
+	}
+
+	start := rd.off
+	rd.off += length
+	return rd.raw[start:rd.off], nil
+}
+
+func (rd *realDecoder) peek(offset, length int) (packetDecoder, error) {
+	if rd.remaining() < offset+length {
+		return nil, ErrInsufficientData
+	}
+	off := rd.off + offset
+	return &realDecoder{raw: rd.raw[off : off+length]}, nil
+}
+
+// stacks
+
+func (rd *realDecoder) push(in pushDecoder) error {
+	in.saveOffset(rd.off)
+
+	var reserve int
+	if dpd, ok := in.(dynamicPushDecoder); ok {
+		if err := dpd.decode(rd); err != nil {
+			return err
+		}
+	} else {
+		reserve = in.reserveLength()
+		if rd.remaining() < reserve {
+			rd.off = len(rd.raw)
+			return ErrInsufficientData
+		}
+	}
+
+	rd.stack = append(rd.stack, in)
+
+	rd.off += reserve
+
+	return nil
+}
+
+func (rd *realDecoder) pop() error {
+	// this is go's ugly pop pattern (the inverse of append)
+	in := rd.stack[len(rd.stack)-1]
+	rd.stack = rd.stack[:len(rd.stack)-1]
+
+	return in.check(rd.off, rd.raw)
+}

diff --git a/vendor/github.com/Shopify/sarama/real_encoder.go b/vendor/github.com/Shopify/sarama/real_encoder.go
new file mode 100644
index 0000000..3c75387
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/real_encoder.go

@@ -0,0 +1,156 @@
+package sarama
+
+import (
+	"encoding/binary"
+
+	"github.com/rcrowley/go-metrics"
+)
+
+type realEncoder struct {
+	raw      []byte
+	off      int
+	stack    []pushEncoder
+	registry metrics.Registry
+}
+
+// primitives
+
+func (re *realEncoder) putInt8(in int8) {
+	re.raw[re.off] = byte(in)
+	re.off++
+}
+
+func (re *realEncoder) putInt16(in int16) {
+	binary.BigEndian.PutUint16(re.raw[re.off:], uint16(in))
+	re.off += 2
+}
+
+func (re *realEncoder) putInt32(in int32) {
+	binary.BigEndian.PutUint32(re.raw[re.off:], uint32(in))
+	re.off += 4
+}
+
+func (re *realEncoder) putInt64(in int64) {
+	binary.BigEndian.PutUint64(re.raw[re.off:], uint64(in))
+	re.off += 8
+}
+
+func (re *realEncoder) putVarint(in int64) {
+	re.off += binary.PutVarint(re.raw[re.off:], in)
+}
+
+func (re *realEncoder) putArrayLength(in int) error {
+	re.putInt32(int32(in))
+	return nil
+}
+
+func (re *realEncoder) putBool(in bool) {
+	if in {
+		re.putInt8(1)
+		return
+	}
+	re.putInt8(0)
+}
+
+// collection
+
+func (re *realEncoder) putRawBytes(in []byte) error {
+	copy(re.raw[re.off:], in)
+	re.off += len(in)
+	return nil
+}
+
+func (re *realEncoder) putBytes(in []byte) error {
+	if in == nil {
+		re.putInt32(-1)
+		return nil
+	}
+	re.putInt32(int32(len(in)))
+	return re.putRawBytes(in)
+}
+
+func (re *realEncoder) putVarintBytes(in []byte) error {
+	if in == nil {
+		re.putVarint(-1)
+		return nil
+	}
+	re.putVarint(int64(len(in)))
+	return re.putRawBytes(in)
+}
+
+func (re *realEncoder) putString(in string) error {
+	re.putInt16(int16(len(in)))
+	copy(re.raw[re.off:], in)
+	re.off += len(in)
+	return nil
+}
+
+func (re *realEncoder) putNullableString(in *string) error {
+	if in == nil {
+		re.putInt16(-1)
+		return nil
+	}
+	return re.putString(*in)
+}
+
+func (re *realEncoder) putStringArray(in []string) error {
+	err := re.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+
+	for _, val := range in {
+		if err := re.putString(val); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (re *realEncoder) putInt32Array(in []int32) error {
+	err := re.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+	for _, val := range in {
+		re.putInt32(val)
+	}
+	return nil
+}
+
+func (re *realEncoder) putInt64Array(in []int64) error {
+	err := re.putArrayLength(len(in))
+	if err != nil {
+		return err
+	}
+	for _, val := range in {
+		re.putInt64(val)
+	}
+	return nil
+}
+
+func (re *realEncoder) offset() int {
+	return re.off
+}
+
+// stacks
+
+func (re *realEncoder) push(in pushEncoder) {
+	in.saveOffset(re.off)
+	re.off += in.reserveLength()
+	re.stack = append(re.stack, in)
+}
+
+func (re *realEncoder) pop() error {
+	// this is go's ugly pop pattern (the inverse of append)
+	in := re.stack[len(re.stack)-1]
+	re.stack = re.stack[:len(re.stack)-1]
+
+	return in.run(re.off, re.raw)
+}
+
+// we do record metrics during the real encoder pass
+func (re *realEncoder) metricRegistry() metrics.Registry {
+	return re.registry
+}

diff --git a/vendor/github.com/Shopify/sarama/record.go b/vendor/github.com/Shopify/sarama/record.go
new file mode 100644
index 0000000..cded308
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/record.go

@@ -0,0 +1,113 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"time"
+)
+
+const (
+	controlMask           = 0x20
+	maximumRecordOverhead = 5*binary.MaxVarintLen32 + binary.MaxVarintLen64 + 1
+)
+
+type RecordHeader struct {
+	Key   []byte
+	Value []byte
+}
+
+func (h *RecordHeader) encode(pe packetEncoder) error {
+	if err := pe.putVarintBytes(h.Key); err != nil {
+		return err
+	}
+	return pe.putVarintBytes(h.Value)
+}
+
+func (h *RecordHeader) decode(pd packetDecoder) (err error) {
+	if h.Key, err = pd.getVarintBytes(); err != nil {
+		return err
+	}
+
+	if h.Value, err = pd.getVarintBytes(); err != nil {
+		return err
+	}
+	return nil
+}
+
+type Record struct {
+	Attributes     int8
+	TimestampDelta time.Duration
+	OffsetDelta    int64
+	Key            []byte
+	Value          []byte
+	Headers        []*RecordHeader
+
+	length varintLengthField
+}
+
+func (r *Record) encode(pe packetEncoder) error {
+	pe.push(&r.length)
+	pe.putInt8(r.Attributes)
+	pe.putVarint(int64(r.TimestampDelta / time.Millisecond))
+	pe.putVarint(r.OffsetDelta)
+	if err := pe.putVarintBytes(r.Key); err != nil {
+		return err
+	}
+	if err := pe.putVarintBytes(r.Value); err != nil {
+		return err
+	}
+	pe.putVarint(int64(len(r.Headers)))
+
+	for _, h := range r.Headers {
+		if err := h.encode(pe); err != nil {
+			return err
+		}
+	}
+
+	return pe.pop()
+}
+
+func (r *Record) decode(pd packetDecoder) (err error) {
+	if err = pd.push(&r.length); err != nil {
+		return err
+	}
+
+	if r.Attributes, err = pd.getInt8(); err != nil {
+		return err
+	}
+
+	timestamp, err := pd.getVarint()
+	if err != nil {
+		return err
+	}
+	r.TimestampDelta = time.Duration(timestamp) * time.Millisecond
+
+	if r.OffsetDelta, err = pd.getVarint(); err != nil {
+		return err
+	}
+
+	if r.Key, err = pd.getVarintBytes(); err != nil {
+		return err
+	}
+
+	if r.Value, err = pd.getVarintBytes(); err != nil {
+		return err
+	}
+
+	numHeaders, err := pd.getVarint()
+	if err != nil {
+		return err
+	}
+
+	if numHeaders >= 0 {
+		r.Headers = make([]*RecordHeader, numHeaders)
+	}
+	for i := int64(0); i < numHeaders; i++ {
+		hdr := new(RecordHeader)
+		if err := hdr.decode(pd); err != nil {
+			return err
+		}
+		r.Headers[i] = hdr
+	}
+
+	return pd.pop()
+}

diff --git a/vendor/github.com/Shopify/sarama/record_batch.go b/vendor/github.com/Shopify/sarama/record_batch.go
new file mode 100644
index 0000000..5444557
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/record_batch.go

@@ -0,0 +1,278 @@
+package sarama
+
+import (
+	"bytes"
+	"compress/gzip"
+	"fmt"
+	"io/ioutil"
+	"time"
+
+	"github.com/eapache/go-xerial-snappy"
+	"github.com/pierrec/lz4"
+)
+
+const recordBatchOverhead = 49
+
+type recordsArray []*Record
+
+func (e recordsArray) encode(pe packetEncoder) error {
+	for _, r := range e {
+		if err := r.encode(pe); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (e recordsArray) decode(pd packetDecoder) error {
+	for i := range e {
+		rec := &Record{}
+		if err := rec.decode(pd); err != nil {
+			return err
+		}
+		e[i] = rec
+	}
+	return nil
+}
+
+type RecordBatch struct {
+	FirstOffset           int64
+	PartitionLeaderEpoch  int32
+	Version               int8
+	Codec                 CompressionCodec
+	CompressionLevel      int
+	Control               bool
+	LastOffsetDelta       int32
+	FirstTimestamp        time.Time
+	MaxTimestamp          time.Time
+	ProducerID            int64
+	ProducerEpoch         int16
+	FirstSequence         int32
+	Records               []*Record
+	PartialTrailingRecord bool
+
+	compressedRecords []byte
+	recordsLen        int // uncompressed records size
+}
+
+func (b *RecordBatch) encode(pe packetEncoder) error {
+	if b.Version != 2 {
+		return PacketEncodingError{fmt.Sprintf("unsupported compression codec (%d)", b.Codec)}
+	}
+	pe.putInt64(b.FirstOffset)
+	pe.push(&lengthField{})
+	pe.putInt32(b.PartitionLeaderEpoch)
+	pe.putInt8(b.Version)
+	pe.push(newCRC32Field(crcCastagnoli))
+	pe.putInt16(b.computeAttributes())
+	pe.putInt32(b.LastOffsetDelta)
+
+	if err := (Timestamp{&b.FirstTimestamp}).encode(pe); err != nil {
+		return err
+	}
+
+	if err := (Timestamp{&b.MaxTimestamp}).encode(pe); err != nil {
+		return err
+	}
+
+	pe.putInt64(b.ProducerID)
+	pe.putInt16(b.ProducerEpoch)
+	pe.putInt32(b.FirstSequence)
+
+	if err := pe.putArrayLength(len(b.Records)); err != nil {
+		return err
+	}
+
+	if b.compressedRecords == nil {
+		if err := b.encodeRecords(pe); err != nil {
+			return err
+		}
+	}
+	if err := pe.putRawBytes(b.compressedRecords); err != nil {
+		return err
+	}
+
+	if err := pe.pop(); err != nil {
+		return err
+	}
+	return pe.pop()
+}
+
+func (b *RecordBatch) decode(pd packetDecoder) (err error) {
+	if b.FirstOffset, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	batchLen, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+
+	if b.PartitionLeaderEpoch, err = pd.getInt32(); err != nil {
+		return err
+	}
+
+	if b.Version, err = pd.getInt8(); err != nil {
+		return err
+	}
+
+	if err = pd.push(&crc32Field{polynomial: crcCastagnoli}); err != nil {
+		return err
+	}
+
+	attributes, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+	b.Codec = CompressionCodec(int8(attributes) & compressionCodecMask)
+	b.Control = attributes&controlMask == controlMask
+
+	if b.LastOffsetDelta, err = pd.getInt32(); err != nil {
+		return err
+	}
+
+	if err = (Timestamp{&b.FirstTimestamp}).decode(pd); err != nil {
+		return err
+	}
+
+	if err = (Timestamp{&b.MaxTimestamp}).decode(pd); err != nil {
+		return err
+	}
+
+	if b.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+
+	if b.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	if b.FirstSequence, err = pd.getInt32(); err != nil {
+		return err
+	}
+
+	numRecs, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if numRecs >= 0 {
+		b.Records = make([]*Record, numRecs)
+	}
+
+	bufSize := int(batchLen) - recordBatchOverhead
+	recBuffer, err := pd.getRawBytes(bufSize)
+	if err != nil {
+		if err == ErrInsufficientData {
+			b.PartialTrailingRecord = true
+			b.Records = nil
+			return nil
+		}
+		return err
+	}
+
+	if err = pd.pop(); err != nil {
+		return err
+	}
+
+	switch b.Codec {
+	case CompressionNone:
+	case CompressionGZIP:
+		reader, err := gzip.NewReader(bytes.NewReader(recBuffer))
+		if err != nil {
+			return err
+		}
+		if recBuffer, err = ioutil.ReadAll(reader); err != nil {
+			return err
+		}
+	case CompressionSnappy:
+		if recBuffer, err = snappy.Decode(recBuffer); err != nil {
+			return err
+		}
+	case CompressionLZ4:
+		reader := lz4.NewReader(bytes.NewReader(recBuffer))
+		if recBuffer, err = ioutil.ReadAll(reader); err != nil {
+			return err
+		}
+	case CompressionZSTD:
+		if recBuffer, err = zstdDecompress(nil, recBuffer); err != nil {
+			return err
+		}
+	default:
+		return PacketDecodingError{fmt.Sprintf("invalid compression specified (%d)", b.Codec)}
+	}
+
+	b.recordsLen = len(recBuffer)
+	err = decode(recBuffer, recordsArray(b.Records))
+	if err == ErrInsufficientData {
+		b.PartialTrailingRecord = true
+		b.Records = nil
+		return nil
+	}
+	return err
+}
+
+func (b *RecordBatch) encodeRecords(pe packetEncoder) error {
+	var raw []byte
+	var err error
+	if raw, err = encode(recordsArray(b.Records), pe.metricRegistry()); err != nil {
+		return err
+	}
+	b.recordsLen = len(raw)
+
+	switch b.Codec {
+	case CompressionNone:
+		b.compressedRecords = raw
+	case CompressionGZIP:
+		var buf bytes.Buffer
+		var writer *gzip.Writer
+		if b.CompressionLevel != CompressionLevelDefault {
+			writer, err = gzip.NewWriterLevel(&buf, b.CompressionLevel)
+			if err != nil {
+				return err
+			}
+		} else {
+			writer = gzip.NewWriter(&buf)
+		}
+		if _, err := writer.Write(raw); err != nil {
+			return err
+		}
+		if err := writer.Close(); err != nil {
+			return err
+		}
+		b.compressedRecords = buf.Bytes()
+	case CompressionSnappy:
+		b.compressedRecords = snappy.Encode(raw)
+	case CompressionLZ4:
+		var buf bytes.Buffer
+		writer := lz4.NewWriter(&buf)
+		if _, err := writer.Write(raw); err != nil {
+			return err
+		}
+		if err := writer.Close(); err != nil {
+			return err
+		}
+		b.compressedRecords = buf.Bytes()
+	case CompressionZSTD:
+		c, err := zstdCompressLevel(nil, raw, b.CompressionLevel)
+		if err != nil {
+			return err
+		}
+		b.compressedRecords = c
+	default:
+		return PacketEncodingError{fmt.Sprintf("unsupported compression codec (%d)", b.Codec)}
+	}
+
+	return nil
+}
+
+func (b *RecordBatch) computeAttributes() int16 {
+	attr := int16(b.Codec) & int16(compressionCodecMask)
+	if b.Control {
+		attr |= controlMask
+	}
+	return attr
+}
+
+func (b *RecordBatch) addRecord(r *Record) {
+	b.Records = append(b.Records, r)
+}

diff --git a/vendor/github.com/Shopify/sarama/records.go b/vendor/github.com/Shopify/sarama/records.go
new file mode 100644
index 0000000..192f592
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/records.go

@@ -0,0 +1,194 @@
+package sarama
+
+import "fmt"
+
+const (
+	unknownRecords = iota
+	legacyRecords
+	defaultRecords
+
+	magicOffset = 16
+	magicLength = 1
+)
+
+// Records implements a union type containing either a RecordBatch or a legacy MessageSet.
+type Records struct {
+	recordsType int
+	MsgSet      *MessageSet
+	RecordBatch *RecordBatch
+}
+
+func newLegacyRecords(msgSet *MessageSet) Records {
+	return Records{recordsType: legacyRecords, MsgSet: msgSet}
+}
+
+func newDefaultRecords(batch *RecordBatch) Records {
+	return Records{recordsType: defaultRecords, RecordBatch: batch}
+}
+
+// setTypeFromFields sets type of Records depending on which of MsgSet or RecordBatch is not nil.
+// The first return value indicates whether both fields are nil (and the type is not set).
+// If both fields are not nil, it returns an error.
+func (r *Records) setTypeFromFields() (bool, error) {
+	if r.MsgSet == nil && r.RecordBatch == nil {
+		return true, nil
+	}
+	if r.MsgSet != nil && r.RecordBatch != nil {
+		return false, fmt.Errorf("both MsgSet and RecordBatch are set, but record type is unknown")
+	}
+	r.recordsType = defaultRecords
+	if r.MsgSet != nil {
+		r.recordsType = legacyRecords
+	}
+	return false, nil
+}
+
+func (r *Records) encode(pe packetEncoder) error {
+	if r.recordsType == unknownRecords {
+		if empty, err := r.setTypeFromFields(); err != nil || empty {
+			return err
+		}
+	}
+
+	switch r.recordsType {
+	case legacyRecords:
+		if r.MsgSet == nil {
+			return nil
+		}
+		return r.MsgSet.encode(pe)
+	case defaultRecords:
+		if r.RecordBatch == nil {
+			return nil
+		}
+		return r.RecordBatch.encode(pe)
+	}
+
+	return fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func (r *Records) setTypeFromMagic(pd packetDecoder) error {
+	magic, err := magicValue(pd)
+	if err != nil {
+		return err
+	}
+
+	r.recordsType = defaultRecords
+	if magic < 2 {
+		r.recordsType = legacyRecords
+	}
+
+	return nil
+}
+
+func (r *Records) decode(pd packetDecoder) error {
+	if r.recordsType == unknownRecords {
+		if err := r.setTypeFromMagic(pd); err != nil {
+			return err
+		}
+	}
+
+	switch r.recordsType {
+	case legacyRecords:
+		r.MsgSet = &MessageSet{}
+		return r.MsgSet.decode(pd)
+	case defaultRecords:
+		r.RecordBatch = &RecordBatch{}
+		return r.RecordBatch.decode(pd)
+	}
+	return fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func (r *Records) numRecords() (int, error) {
+	if r.recordsType == unknownRecords {
+		if empty, err := r.setTypeFromFields(); err != nil || empty {
+			return 0, err
+		}
+	}
+
+	switch r.recordsType {
+	case legacyRecords:
+		if r.MsgSet == nil {
+			return 0, nil
+		}
+		return len(r.MsgSet.Messages), nil
+	case defaultRecords:
+		if r.RecordBatch == nil {
+			return 0, nil
+		}
+		return len(r.RecordBatch.Records), nil
+	}
+	return 0, fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func (r *Records) isPartial() (bool, error) {
+	if r.recordsType == unknownRecords {
+		if empty, err := r.setTypeFromFields(); err != nil || empty {
+			return false, err
+		}
+	}
+
+	switch r.recordsType {
+	case unknownRecords:
+		return false, nil
+	case legacyRecords:
+		if r.MsgSet == nil {
+			return false, nil
+		}
+		return r.MsgSet.PartialTrailingMessage, nil
+	case defaultRecords:
+		if r.RecordBatch == nil {
+			return false, nil
+		}
+		return r.RecordBatch.PartialTrailingRecord, nil
+	}
+	return false, fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func (r *Records) isControl() (bool, error) {
+	if r.recordsType == unknownRecords {
+		if empty, err := r.setTypeFromFields(); err != nil || empty {
+			return false, err
+		}
+	}
+
+	switch r.recordsType {
+	case legacyRecords:
+		return false, nil
+	case defaultRecords:
+		if r.RecordBatch == nil {
+			return false, nil
+		}
+		return r.RecordBatch.Control, nil
+	}
+	return false, fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func (r *Records) isOverflow() (bool, error) {
+	if r.recordsType == unknownRecords {
+		if empty, err := r.setTypeFromFields(); err != nil || empty {
+			return false, err
+		}
+	}
+
+	switch r.recordsType {
+	case unknownRecords:
+		return false, nil
+	case legacyRecords:
+		if r.MsgSet == nil {
+			return false, nil
+		}
+		return r.MsgSet.OverflowMessage, nil
+	case defaultRecords:
+		return false, nil
+	}
+	return false, fmt.Errorf("unknown records type: %v", r.recordsType)
+}
+
+func magicValue(pd packetDecoder) (int8, error) {
+	dec, err := pd.peek(magicOffset, magicLength)
+	if err != nil {
+		return 0, err
+	}
+
+	return dec.getInt8()
+}

diff --git a/vendor/github.com/Shopify/sarama/request.go b/vendor/github.com/Shopify/sarama/request.go
new file mode 100644
index 0000000..4d211a1
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/request.go

@@ -0,0 +1,149 @@
+package sarama
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+type protocolBody interface {
+	encoder
+	versionedDecoder
+	key() int16
+	version() int16
+	requiredVersion() KafkaVersion
+}
+
+type request struct {
+	correlationID int32
+	clientID      string
+	body          protocolBody
+}
+
+func (r *request) encode(pe packetEncoder) (err error) {
+	pe.push(&lengthField{})
+	pe.putInt16(r.body.key())
+	pe.putInt16(r.body.version())
+	pe.putInt32(r.correlationID)
+	err = pe.putString(r.clientID)
+	if err != nil {
+		return err
+	}
+	err = r.body.encode(pe)
+	if err != nil {
+		return err
+	}
+	return pe.pop()
+}
+
+func (r *request) decode(pd packetDecoder) (err error) {
+	var key int16
+	if key, err = pd.getInt16(); err != nil {
+		return err
+	}
+	var version int16
+	if version, err = pd.getInt16(); err != nil {
+		return err
+	}
+	if r.correlationID, err = pd.getInt32(); err != nil {
+		return err
+	}
+	r.clientID, err = pd.getString()
+
+	r.body = allocateBody(key, version)
+	if r.body == nil {
+		return PacketDecodingError{fmt.Sprintf("unknown request key (%d)", key)}
+	}
+	return r.body.decode(pd, version)
+}
+
+func decodeRequest(r io.Reader) (req *request, bytesRead int, err error) {
+	lengthBytes := make([]byte, 4)
+	if _, err := io.ReadFull(r, lengthBytes); err != nil {
+		return nil, bytesRead, err
+	}
+	bytesRead += len(lengthBytes)
+
+	length := int32(binary.BigEndian.Uint32(lengthBytes))
+	if length <= 4 || length > MaxRequestSize {
+		return nil, bytesRead, PacketDecodingError{fmt.Sprintf("message of length %d too large or too small", length)}
+	}
+
+	encodedReq := make([]byte, length)
+	if _, err := io.ReadFull(r, encodedReq); err != nil {
+		return nil, bytesRead, err
+	}
+	bytesRead += len(encodedReq)
+
+	req = &request{}
+	if err := decode(encodedReq, req); err != nil {
+		return nil, bytesRead, err
+	}
+	return req, bytesRead, nil
+}
+
+func allocateBody(key, version int16) protocolBody {
+	switch key {
+	case 0:
+		return &ProduceRequest{}
+	case 1:
+		return &FetchRequest{}
+	case 2:
+		return &OffsetRequest{Version: version}
+	case 3:
+		return &MetadataRequest{}
+	case 8:
+		return &OffsetCommitRequest{Version: version}
+	case 9:
+		return &OffsetFetchRequest{}
+	case 10:
+		return &FindCoordinatorRequest{}
+	case 11:
+		return &JoinGroupRequest{}
+	case 12:
+		return &HeartbeatRequest{}
+	case 13:
+		return &LeaveGroupRequest{}
+	case 14:
+		return &SyncGroupRequest{}
+	case 15:
+		return &DescribeGroupsRequest{}
+	case 16:
+		return &ListGroupsRequest{}
+	case 17:
+		return &SaslHandshakeRequest{}
+	case 18:
+		return &ApiVersionsRequest{}
+	case 19:
+		return &CreateTopicsRequest{}
+	case 20:
+		return &DeleteTopicsRequest{}
+	case 21:
+		return &DeleteRecordsRequest{}
+	case 22:
+		return &InitProducerIDRequest{}
+	case 24:
+		return &AddPartitionsToTxnRequest{}
+	case 25:
+		return &AddOffsetsToTxnRequest{}
+	case 26:
+		return &EndTxnRequest{}
+	case 28:
+		return &TxnOffsetCommitRequest{}
+	case 29:
+		return &DescribeAclsRequest{}
+	case 30:
+		return &CreateAclsRequest{}
+	case 31:
+		return &DeleteAclsRequest{}
+	case 32:
+		return &DescribeConfigsRequest{}
+	case 33:
+		return &AlterConfigsRequest{}
+	case 37:
+		return &CreatePartitionsRequest{}
+	case 42:
+		return &DeleteGroupsRequest{}
+	}
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/response_header.go b/vendor/github.com/Shopify/sarama/response_header.go
new file mode 100644
index 0000000..f3f4d27
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/response_header.go

@@ -0,0 +1,21 @@
+package sarama
+
+import "fmt"
+
+type responseHeader struct {
+	length        int32
+	correlationID int32
+}
+
+func (r *responseHeader) decode(pd packetDecoder) (err error) {
+	r.length, err = pd.getInt32()
+	if err != nil {
+		return err
+	}
+	if r.length <= 4 || r.length > MaxResponseSize {
+		return PacketDecodingError{fmt.Sprintf("message of length %d too large or too small", r.length)}
+	}
+
+	r.correlationID, err = pd.getInt32()
+	return err
+}

diff --git a/vendor/github.com/Shopify/sarama/sarama.go b/vendor/github.com/Shopify/sarama/sarama.go
new file mode 100644
index 0000000..7d5dc60
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sarama.go

@@ -0,0 +1,99 @@
+/*
+Package sarama is a pure Go client library for dealing with Apache Kafka (versions 0.8 and later). It includes a high-level
+API for easily producing and consuming messages, and a low-level API for controlling bytes on the wire when the high-level
+API is insufficient. Usage examples for the high-level APIs are provided inline with their full documentation.
+
+To produce messages, use either the AsyncProducer or the SyncProducer. The AsyncProducer accepts messages on a channel
+and produces them asynchronously in the background as efficiently as possible; it is preferred in most cases.
+The SyncProducer provides a method which will block until Kafka acknowledges the message as produced. This can be
+useful but comes with two caveats: it will generally be less efficient, and the actual durability guarantees
+depend on the configured value of `Producer.RequiredAcks`. There are configurations where a message acknowledged by the
+SyncProducer can still sometimes be lost.
+
+To consume messages, use the Consumer. Note that Sarama's Consumer implementation does not currently support automatic
+consumer-group rebalancing and offset tracking. For Zookeeper-based tracking (Kafka 0.8.2 and earlier), the
+https://github.com/wvanbergen/kafka library builds on Sarama to add this support. For Kafka-based tracking (Kafka 0.9
+and later), the https://github.com/bsm/sarama-cluster library builds on Sarama to add this support.
+
+For lower-level needs, the Broker and Request/Response objects permit precise control over each connection
+and message sent on the wire; the Client provides higher-level metadata management that is shared between
+the producers and the consumer. The Request/Response objects and properties are mostly undocumented, as they line up
+exactly with the protocol fields documented by Kafka at
+https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
+
+Metrics are exposed through https://github.com/rcrowley/go-metrics library in a local registry.
+
+Broker related metrics:
+
+	+----------------------------------------------+------------+---------------------------------------------------------------+
+	| Name                                         | Type       | Description                                                   |
+	+----------------------------------------------+------------+---------------------------------------------------------------+
+	| incoming-byte-rate                           | meter      | Bytes/second read off all brokers                             |
+	| incoming-byte-rate-for-broker-<broker-id>    | meter      | Bytes/second read off a given broker                          |
+	| outgoing-byte-rate                           | meter      | Bytes/second written off all brokers                          |
+	| outgoing-byte-rate-for-broker-<broker-id>    | meter      | Bytes/second written off a given broker                       |
+	| request-rate                                 | meter      | Requests/second sent to all brokers                           |
+	| request-rate-for-broker-<broker-id>          | meter      | Requests/second sent to a given broker                        |
+	| request-size                                 | histogram  | Distribution of the request size in bytes for all brokers     |
+	| request-size-for-broker-<broker-id>          | histogram  | Distribution of the request size in bytes for a given broker  |
+	| request-latency-in-ms                        | histogram  | Distribution of the request latency in ms for all brokers     |
+	| request-latency-in-ms-for-broker-<broker-id> | histogram  | Distribution of the request latency in ms for a given broker  |
+	| response-rate                                | meter      | Responses/second received from all brokers                    |
+	| response-rate-for-broker-<broker-id>         | meter      | Responses/second received from a given broker                 |
+	| response-size                                | histogram  | Distribution of the response size in bytes for all brokers    |
+	| response-size-for-broker-<broker-id>         | histogram  | Distribution of the response size in bytes for a given broker |
+	+----------------------------------------------+------------+---------------------------------------------------------------+
+
+Note that we do not gather specific metrics for seed brokers but they are part of the "all brokers" metrics.
+
+Producer related metrics:
+
+	+-------------------------------------------+------------+--------------------------------------------------------------------------------------+
+	| Name                                      | Type       | Description                                                                          |
+	+-------------------------------------------+------------+--------------------------------------------------------------------------------------+
+	| batch-size                                | histogram  | Distribution of the number of bytes sent per partition per request for all topics    |
+	| batch-size-for-topic-<topic>              | histogram  | Distribution of the number of bytes sent per partition per request for a given topic |
+	| record-send-rate                          | meter      | Records/second sent to all topics                                                    |
+	| record-send-rate-for-topic-<topic>        | meter      | Records/second sent to a given topic                                                 |
+	| records-per-request                       | histogram  | Distribution of the number of records sent per request for all topics                |
+	| records-per-request-for-topic-<topic>     | histogram  | Distribution of the number of records sent per request for a given topic             |
+	| compression-ratio                         | histogram  | Distribution of the compression ratio times 100 of record batches for all topics     |
+	| compression-ratio-for-topic-<topic>       | histogram  | Distribution of the compression ratio times 100 of record batches for a given topic  |
+	+-------------------------------------------+------------+--------------------------------------------------------------------------------------+
+
+*/
+package sarama
+
+import (
+	"io/ioutil"
+	"log"
+)
+
+// Logger is the instance of a StdLogger interface that Sarama writes connection
+// management events to. By default it is set to discard all log messages via ioutil.Discard,
+// but you can set it to redirect wherever you want.
+var Logger StdLogger = log.New(ioutil.Discard, "[Sarama] ", log.LstdFlags)
+
+// StdLogger is used to log error messages.
+type StdLogger interface {
+	Print(v ...interface{})
+	Printf(format string, v ...interface{})
+	Println(v ...interface{})
+}
+
+// PanicHandler is called for recovering from panics spawned internally to the library (and thus
+// not recoverable by the caller's goroutine). Defaults to nil, which means panics are not recovered.
+var PanicHandler func(interface{})
+
+// MaxRequestSize is the maximum size (in bytes) of any request that Sarama will attempt to send. Trying
+// to send a request larger than this will result in an PacketEncodingError. The default of 100 MiB is aligned
+// with Kafka's default `socket.request.max.bytes`, which is the largest request the broker will attempt
+// to process.
+var MaxRequestSize int32 = 100 * 1024 * 1024
+
+// MaxResponseSize is the maximum size (in bytes) of any response that Sarama will attempt to parse. If
+// a broker returns a response message larger than this value, Sarama will return a PacketDecodingError to
+// protect the client from running out of memory. Please note that brokers do not have any natural limit on
+// the size of responses they send. In particular, they can send arbitrarily large fetch responses to consumers
+// (see https://issues.apache.org/jira/browse/KAFKA-2063).
+var MaxResponseSize int32 = 100 * 1024 * 1024

diff --git a/vendor/github.com/Shopify/sarama/sasl_handshake_request.go b/vendor/github.com/Shopify/sarama/sasl_handshake_request.go
new file mode 100644
index 0000000..fbbc894
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sasl_handshake_request.go

@@ -0,0 +1,33 @@
+package sarama
+
+type SaslHandshakeRequest struct {
+	Mechanism string
+}
+
+func (r *SaslHandshakeRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(r.Mechanism); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *SaslHandshakeRequest) decode(pd packetDecoder, version int16) (err error) {
+	if r.Mechanism, err = pd.getString(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *SaslHandshakeRequest) key() int16 {
+	return 17
+}
+
+func (r *SaslHandshakeRequest) version() int16 {
+	return 0
+}
+
+func (r *SaslHandshakeRequest) requiredVersion() KafkaVersion {
+	return V0_10_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/sasl_handshake_response.go b/vendor/github.com/Shopify/sarama/sasl_handshake_response.go
new file mode 100644
index 0000000..ef290d4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sasl_handshake_response.go

@@ -0,0 +1,38 @@
+package sarama
+
+type SaslHandshakeResponse struct {
+	Err               KError
+	EnabledMechanisms []string
+}
+
+func (r *SaslHandshakeResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+	return pe.putStringArray(r.EnabledMechanisms)
+}
+
+func (r *SaslHandshakeResponse) decode(pd packetDecoder, version int16) error {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	r.Err = KError(kerr)
+
+	if r.EnabledMechanisms, err = pd.getStringArray(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (r *SaslHandshakeResponse) key() int16 {
+	return 17
+}
+
+func (r *SaslHandshakeResponse) version() int16 {
+	return 0
+}
+
+func (r *SaslHandshakeResponse) requiredVersion() KafkaVersion {
+	return V0_10_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/sync_group_request.go b/vendor/github.com/Shopify/sarama/sync_group_request.go
new file mode 100644
index 0000000..fe20708
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sync_group_request.go

@@ -0,0 +1,100 @@
+package sarama
+
+type SyncGroupRequest struct {
+	GroupId          string
+	GenerationId     int32
+	MemberId         string
+	GroupAssignments map[string][]byte
+}
+
+func (r *SyncGroupRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(r.GroupId); err != nil {
+		return err
+	}
+
+	pe.putInt32(r.GenerationId)
+
+	if err := pe.putString(r.MemberId); err != nil {
+		return err
+	}
+
+	if err := pe.putArrayLength(len(r.GroupAssignments)); err != nil {
+		return err
+	}
+	for memberId, memberAssignment := range r.GroupAssignments {
+		if err := pe.putString(memberId); err != nil {
+			return err
+		}
+		if err := pe.putBytes(memberAssignment); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (r *SyncGroupRequest) decode(pd packetDecoder, version int16) (err error) {
+	if r.GroupId, err = pd.getString(); err != nil {
+		return
+	}
+	if r.GenerationId, err = pd.getInt32(); err != nil {
+		return
+	}
+	if r.MemberId, err = pd.getString(); err != nil {
+		return
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return nil
+	}
+
+	r.GroupAssignments = make(map[string][]byte)
+	for i := 0; i < n; i++ {
+		memberId, err := pd.getString()
+		if err != nil {
+			return err
+		}
+		memberAssignment, err := pd.getBytes()
+		if err != nil {
+			return err
+		}
+
+		r.GroupAssignments[memberId] = memberAssignment
+	}
+
+	return nil
+}
+
+func (r *SyncGroupRequest) key() int16 {
+	return 14
+}
+
+func (r *SyncGroupRequest) version() int16 {
+	return 0
+}
+
+func (r *SyncGroupRequest) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}
+
+func (r *SyncGroupRequest) AddGroupAssignment(memberId string, memberAssignment []byte) {
+	if r.GroupAssignments == nil {
+		r.GroupAssignments = make(map[string][]byte)
+	}
+
+	r.GroupAssignments[memberId] = memberAssignment
+}
+
+func (r *SyncGroupRequest) AddGroupAssignmentMember(memberId string, memberAssignment *ConsumerGroupMemberAssignment) error {
+	bin, err := encode(memberAssignment, nil)
+	if err != nil {
+		return err
+	}
+
+	r.AddGroupAssignment(memberId, bin)
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/sync_group_response.go b/vendor/github.com/Shopify/sarama/sync_group_response.go
new file mode 100644
index 0000000..194b382
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sync_group_response.go

@@ -0,0 +1,41 @@
+package sarama
+
+type SyncGroupResponse struct {
+	Err              KError
+	MemberAssignment []byte
+}
+
+func (r *SyncGroupResponse) GetMemberAssignment() (*ConsumerGroupMemberAssignment, error) {
+	assignment := new(ConsumerGroupMemberAssignment)
+	err := decode(r.MemberAssignment, assignment)
+	return assignment, err
+}
+
+func (r *SyncGroupResponse) encode(pe packetEncoder) error {
+	pe.putInt16(int16(r.Err))
+	return pe.putBytes(r.MemberAssignment)
+}
+
+func (r *SyncGroupResponse) decode(pd packetDecoder, version int16) (err error) {
+	kerr, err := pd.getInt16()
+	if err != nil {
+		return err
+	}
+
+	r.Err = KError(kerr)
+
+	r.MemberAssignment, err = pd.getBytes()
+	return
+}
+
+func (r *SyncGroupResponse) key() int16 {
+	return 14
+}
+
+func (r *SyncGroupResponse) version() int16 {
+	return 0
+}
+
+func (r *SyncGroupResponse) requiredVersion() KafkaVersion {
+	return V0_9_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/sync_producer.go b/vendor/github.com/Shopify/sarama/sync_producer.go
new file mode 100644
index 0000000..021c5a0
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/sync_producer.go

@@ -0,0 +1,149 @@
+package sarama
+
+import "sync"
+
+// SyncProducer publishes Kafka messages, blocking until they have been acknowledged. It routes messages to the correct
+// broker, refreshing metadata as appropriate, and parses responses for errors. You must call Close() on a producer
+// to avoid leaks, it may not be garbage-collected automatically when it passes out of scope.
+//
+// The SyncProducer comes with two caveats: it will generally be less efficient than the AsyncProducer, and the actual
+// durability guarantee provided when a message is acknowledged depend on the configured value of `Producer.RequiredAcks`.
+// There are configurations where a message acknowledged by the SyncProducer can still sometimes be lost.
+//
+// For implementation reasons, the SyncProducer requires `Producer.Return.Errors` and `Producer.Return.Successes` to
+// be set to true in its configuration.
+type SyncProducer interface {
+
+	// SendMessage produces a given message, and returns only when it either has
+	// succeeded or failed to produce. It will return the partition and the offset
+	// of the produced message, or an error if the message failed to produce.
+	SendMessage(msg *ProducerMessage) (partition int32, offset int64, err error)
+
+	// SendMessages produces a given set of messages, and returns only when all
+	// messages in the set have either succeeded or failed. Note that messages
+	// can succeed and fail individually; if some succeed and some fail,
+	// SendMessages will return an error.
+	SendMessages(msgs []*ProducerMessage) error
+
+	// Close shuts down the producer and waits for any buffered messages to be
+	// flushed. You must call this function before a producer object passes out of
+	// scope, as it may otherwise leak memory. You must call this before calling
+	// Close on the underlying client.
+	Close() error
+}
+
+type syncProducer struct {
+	producer *asyncProducer
+	wg       sync.WaitGroup
+}
+
+// NewSyncProducer creates a new SyncProducer using the given broker addresses and configuration.
+func NewSyncProducer(addrs []string, config *Config) (SyncProducer, error) {
+	if config == nil {
+		config = NewConfig()
+		config.Producer.Return.Successes = true
+	}
+
+	if err := verifyProducerConfig(config); err != nil {
+		return nil, err
+	}
+
+	p, err := NewAsyncProducer(addrs, config)
+	if err != nil {
+		return nil, err
+	}
+	return newSyncProducerFromAsyncProducer(p.(*asyncProducer)), nil
+}
+
+// NewSyncProducerFromClient creates a new SyncProducer using the given client. It is still
+// necessary to call Close() on the underlying client when shutting down this producer.
+func NewSyncProducerFromClient(client Client) (SyncProducer, error) {
+	if err := verifyProducerConfig(client.Config()); err != nil {
+		return nil, err
+	}
+
+	p, err := NewAsyncProducerFromClient(client)
+	if err != nil {
+		return nil, err
+	}
+	return newSyncProducerFromAsyncProducer(p.(*asyncProducer)), nil
+}
+
+func newSyncProducerFromAsyncProducer(p *asyncProducer) *syncProducer {
+	sp := &syncProducer{producer: p}
+
+	sp.wg.Add(2)
+	go withRecover(sp.handleSuccesses)
+	go withRecover(sp.handleErrors)
+
+	return sp
+}
+
+func verifyProducerConfig(config *Config) error {
+	if !config.Producer.Return.Errors {
+		return ConfigurationError("Producer.Return.Errors must be true to be used in a SyncProducer")
+	}
+	if !config.Producer.Return.Successes {
+		return ConfigurationError("Producer.Return.Successes must be true to be used in a SyncProducer")
+	}
+	return nil
+}
+
+func (sp *syncProducer) SendMessage(msg *ProducerMessage) (partition int32, offset int64, err error) {
+	expectation := make(chan *ProducerError, 1)
+	msg.expectation = expectation
+	sp.producer.Input() <- msg
+
+	if err := <-expectation; err != nil {
+		return -1, -1, err.Err
+	}
+
+	return msg.Partition, msg.Offset, nil
+}
+
+func (sp *syncProducer) SendMessages(msgs []*ProducerMessage) error {
+	expectations := make(chan chan *ProducerError, len(msgs))
+	go func() {
+		for _, msg := range msgs {
+			expectation := make(chan *ProducerError, 1)
+			msg.expectation = expectation
+			sp.producer.Input() <- msg
+			expectations <- expectation
+		}
+		close(expectations)
+	}()
+
+	var errors ProducerErrors
+	for expectation := range expectations {
+		if err := <-expectation; err != nil {
+			errors = append(errors, err)
+		}
+	}
+
+	if len(errors) > 0 {
+		return errors
+	}
+	return nil
+}
+
+func (sp *syncProducer) handleSuccesses() {
+	defer sp.wg.Done()
+	for msg := range sp.producer.Successes() {
+		expectation := msg.expectation
+		expectation <- nil
+	}
+}
+
+func (sp *syncProducer) handleErrors() {
+	defer sp.wg.Done()
+	for err := range sp.producer.Errors() {
+		expectation := err.Msg.expectation
+		expectation <- err
+	}
+}
+
+func (sp *syncProducer) Close() error {
+	sp.producer.AsyncClose()
+	sp.wg.Wait()
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/timestamp.go b/vendor/github.com/Shopify/sarama/timestamp.go
new file mode 100644
index 0000000..372278d
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/timestamp.go

@@ -0,0 +1,40 @@
+package sarama
+
+import (
+	"fmt"
+	"time"
+)
+
+type Timestamp struct {
+	*time.Time
+}
+
+func (t Timestamp) encode(pe packetEncoder) error {
+	timestamp := int64(-1)
+
+	if !t.Before(time.Unix(0, 0)) {
+		timestamp = t.UnixNano() / int64(time.Millisecond)
+	} else if !t.IsZero() {
+		return PacketEncodingError{fmt.Sprintf("invalid timestamp (%v)", t)}
+	}
+
+	pe.putInt64(timestamp)
+	return nil
+}
+
+func (t Timestamp) decode(pd packetDecoder) error {
+	millis, err := pd.getInt64()
+	if err != nil {
+		return err
+	}
+
+	// negative timestamps are invalid, in these cases we should return
+	// a zero time
+	timestamp := time.Time{}
+	if millis >= 0 {
+		timestamp = time.Unix(millis/1000, (millis%1000)*int64(time.Millisecond))
+	}
+
+	*t.Time = timestamp
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/txn_offset_commit_request.go b/vendor/github.com/Shopify/sarama/txn_offset_commit_request.go
new file mode 100644
index 0000000..71e95b8
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/txn_offset_commit_request.go

@@ -0,0 +1,126 @@
+package sarama
+
+type TxnOffsetCommitRequest struct {
+	TransactionalID string
+	GroupID         string
+	ProducerID      int64
+	ProducerEpoch   int16
+	Topics          map[string][]*PartitionOffsetMetadata
+}
+
+func (t *TxnOffsetCommitRequest) encode(pe packetEncoder) error {
+	if err := pe.putString(t.TransactionalID); err != nil {
+		return err
+	}
+	if err := pe.putString(t.GroupID); err != nil {
+		return err
+	}
+	pe.putInt64(t.ProducerID)
+	pe.putInt16(t.ProducerEpoch)
+
+	if err := pe.putArrayLength(len(t.Topics)); err != nil {
+		return err
+	}
+	for topic, partitions := range t.Topics {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(partitions)); err != nil {
+			return err
+		}
+		for _, partition := range partitions {
+			if err := partition.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (t *TxnOffsetCommitRequest) decode(pd packetDecoder, version int16) (err error) {
+	if t.TransactionalID, err = pd.getString(); err != nil {
+		return err
+	}
+	if t.GroupID, err = pd.getString(); err != nil {
+		return err
+	}
+	if t.ProducerID, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if t.ProducerEpoch, err = pd.getInt16(); err != nil {
+		return err
+	}
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	t.Topics = make(map[string][]*PartitionOffsetMetadata)
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		m, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		t.Topics[topic] = make([]*PartitionOffsetMetadata, m)
+
+		for j := 0; j < m; j++ {
+			partitionOffsetMetadata := new(PartitionOffsetMetadata)
+			if err := partitionOffsetMetadata.decode(pd, version); err != nil {
+				return err
+			}
+			t.Topics[topic][j] = partitionOffsetMetadata
+		}
+	}
+
+	return nil
+}
+
+func (a *TxnOffsetCommitRequest) key() int16 {
+	return 28
+}
+
+func (a *TxnOffsetCommitRequest) version() int16 {
+	return 0
+}
+
+func (a *TxnOffsetCommitRequest) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}
+
+type PartitionOffsetMetadata struct {
+	Partition int32
+	Offset    int64
+	Metadata  *string
+}
+
+func (p *PartitionOffsetMetadata) encode(pe packetEncoder) error {
+	pe.putInt32(p.Partition)
+	pe.putInt64(p.Offset)
+	if err := pe.putNullableString(p.Metadata); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p *PartitionOffsetMetadata) decode(pd packetDecoder, version int16) (err error) {
+	if p.Partition, err = pd.getInt32(); err != nil {
+		return err
+	}
+	if p.Offset, err = pd.getInt64(); err != nil {
+		return err
+	}
+	if p.Metadata, err = pd.getNullableString(); err != nil {
+		return err
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/Shopify/sarama/txn_offset_commit_response.go b/vendor/github.com/Shopify/sarama/txn_offset_commit_response.go
new file mode 100644
index 0000000..6c980f4
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/txn_offset_commit_response.go

@@ -0,0 +1,83 @@
+package sarama
+
+import (
+	"time"
+)
+
+type TxnOffsetCommitResponse struct {
+	ThrottleTime time.Duration
+	Topics       map[string][]*PartitionError
+}
+
+func (t *TxnOffsetCommitResponse) encode(pe packetEncoder) error {
+	pe.putInt32(int32(t.ThrottleTime / time.Millisecond))
+	if err := pe.putArrayLength(len(t.Topics)); err != nil {
+		return err
+	}
+
+	for topic, e := range t.Topics {
+		if err := pe.putString(topic); err != nil {
+			return err
+		}
+		if err := pe.putArrayLength(len(e)); err != nil {
+			return err
+		}
+		for _, partitionError := range e {
+			if err := partitionError.encode(pe); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (t *TxnOffsetCommitResponse) decode(pd packetDecoder, version int16) (err error) {
+	throttleTime, err := pd.getInt32()
+	if err != nil {
+		return err
+	}
+	t.ThrottleTime = time.Duration(throttleTime) * time.Millisecond
+
+	n, err := pd.getArrayLength()
+	if err != nil {
+		return err
+	}
+
+	t.Topics = make(map[string][]*PartitionError)
+
+	for i := 0; i < n; i++ {
+		topic, err := pd.getString()
+		if err != nil {
+			return err
+		}
+
+		m, err := pd.getArrayLength()
+		if err != nil {
+			return err
+		}
+
+		t.Topics[topic] = make([]*PartitionError, m)
+
+		for j := 0; j < m; j++ {
+			t.Topics[topic][j] = new(PartitionError)
+			if err := t.Topics[topic][j].decode(pd, version); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+func (a *TxnOffsetCommitResponse) key() int16 {
+	return 28
+}
+
+func (a *TxnOffsetCommitResponse) version() int16 {
+	return 0
+}
+
+func (a *TxnOffsetCommitResponse) requiredVersion() KafkaVersion {
+	return V0_11_0_0
+}

diff --git a/vendor/github.com/Shopify/sarama/utils.go b/vendor/github.com/Shopify/sarama/utils.go
new file mode 100644
index 0000000..7dcbf03
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/utils.go

@@ -0,0 +1,220 @@
+package sarama
+
+import (
+	"bufio"
+	"fmt"
+	"net"
+	"regexp"
+)
+
+type none struct{}
+
+// make []int32 sortable so we can sort partition numbers
+type int32Slice []int32
+
+func (slice int32Slice) Len() int {
+	return len(slice)
+}
+
+func (slice int32Slice) Less(i, j int) bool {
+	return slice[i] < slice[j]
+}
+
+func (slice int32Slice) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
+func dupInt32Slice(input []int32) []int32 {
+	ret := make([]int32, 0, len(input))
+	for _, val := range input {
+		ret = append(ret, val)
+	}
+	return ret
+}
+
+func withRecover(fn func()) {
+	defer func() {
+		handler := PanicHandler
+		if handler != nil {
+			if err := recover(); err != nil {
+				handler(err)
+			}
+		}
+	}()
+
+	fn()
+}
+
+func safeAsyncClose(b *Broker) {
+	tmp := b // local var prevents clobbering in goroutine
+	go withRecover(func() {
+		if connected, _ := tmp.Connected(); connected {
+			if err := tmp.Close(); err != nil {
+				Logger.Println("Error closing broker", tmp.ID(), ":", err)
+			}
+		}
+	})
+}
+
+// Encoder is a simple interface for any type that can be encoded as an array of bytes
+// in order to be sent as the key or value of a Kafka message. Length() is provided as an
+// optimization, and must return the same as len() on the result of Encode().
+type Encoder interface {
+	Encode() ([]byte, error)
+	Length() int
+}
+
+// make strings and byte slices encodable for convenience so they can be used as keys
+// and/or values in kafka messages
+
+// StringEncoder implements the Encoder interface for Go strings so that they can be used
+// as the Key or Value in a ProducerMessage.
+type StringEncoder string
+
+func (s StringEncoder) Encode() ([]byte, error) {
+	return []byte(s), nil
+}
+
+func (s StringEncoder) Length() int {
+	return len(s)
+}
+
+// ByteEncoder implements the Encoder interface for Go byte slices so that they can be used
+// as the Key or Value in a ProducerMessage.
+type ByteEncoder []byte
+
+func (b ByteEncoder) Encode() ([]byte, error) {
+	return b, nil
+}
+
+func (b ByteEncoder) Length() int {
+	return len(b)
+}
+
+// bufConn wraps a net.Conn with a buffer for reads to reduce the number of
+// reads that trigger syscalls.
+type bufConn struct {
+	net.Conn
+	buf *bufio.Reader
+}
+
+func newBufConn(conn net.Conn) *bufConn {
+	return &bufConn{
+		Conn: conn,
+		buf:  bufio.NewReader(conn),
+	}
+}
+
+func (bc *bufConn) Read(b []byte) (n int, err error) {
+	return bc.buf.Read(b)
+}
+
+// KafkaVersion instances represent versions of the upstream Kafka broker.
+type KafkaVersion struct {
+	// it's a struct rather than just typing the array directly to make it opaque and stop people
+	// generating their own arbitrary versions
+	version [4]uint
+}
+
+func newKafkaVersion(major, minor, veryMinor, patch uint) KafkaVersion {
+	return KafkaVersion{
+		version: [4]uint{major, minor, veryMinor, patch},
+	}
+}
+
+// IsAtLeast return true if and only if the version it is called on is
+// greater than or equal to the version passed in:
+//    V1.IsAtLeast(V2) // false
+//    V2.IsAtLeast(V1) // true
+func (v KafkaVersion) IsAtLeast(other KafkaVersion) bool {
+	for i := range v.version {
+		if v.version[i] > other.version[i] {
+			return true
+		} else if v.version[i] < other.version[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// Effective constants defining the supported kafka versions.
+var (
+	V0_8_2_0  = newKafkaVersion(0, 8, 2, 0)
+	V0_8_2_1  = newKafkaVersion(0, 8, 2, 1)
+	V0_8_2_2  = newKafkaVersion(0, 8, 2, 2)
+	V0_9_0_0  = newKafkaVersion(0, 9, 0, 0)
+	V0_9_0_1  = newKafkaVersion(0, 9, 0, 1)
+	V0_10_0_0 = newKafkaVersion(0, 10, 0, 0)
+	V0_10_0_1 = newKafkaVersion(0, 10, 0, 1)
+	V0_10_1_0 = newKafkaVersion(0, 10, 1, 0)
+	V0_10_1_1 = newKafkaVersion(0, 10, 1, 1)
+	V0_10_2_0 = newKafkaVersion(0, 10, 2, 0)
+	V0_10_2_1 = newKafkaVersion(0, 10, 2, 1)
+	V0_11_0_0 = newKafkaVersion(0, 11, 0, 0)
+	V0_11_0_1 = newKafkaVersion(0, 11, 0, 1)
+	V0_11_0_2 = newKafkaVersion(0, 11, 0, 2)
+	V1_0_0_0  = newKafkaVersion(1, 0, 0, 0)
+	V1_1_0_0  = newKafkaVersion(1, 1, 0, 0)
+	V1_1_1_0  = newKafkaVersion(1, 1, 1, 0)
+	V2_0_0_0  = newKafkaVersion(2, 0, 0, 0)
+	V2_0_1_0  = newKafkaVersion(2, 0, 1, 0)
+	V2_1_0_0  = newKafkaVersion(2, 1, 0, 0)
+
+	SupportedVersions = []KafkaVersion{
+		V0_8_2_0,
+		V0_8_2_1,
+		V0_8_2_2,
+		V0_9_0_0,
+		V0_9_0_1,
+		V0_10_0_0,
+		V0_10_0_1,
+		V0_10_1_0,
+		V0_10_1_1,
+		V0_10_2_0,
+		V0_10_2_1,
+		V0_11_0_0,
+		V0_11_0_1,
+		V0_11_0_2,
+		V1_0_0_0,
+		V1_1_0_0,
+		V1_1_1_0,
+		V2_0_0_0,
+		V2_0_1_0,
+		V2_1_0_0,
+	}
+	MinVersion = V0_8_2_0
+	MaxVersion = V2_1_0_0
+)
+
+func ParseKafkaVersion(s string) (KafkaVersion, error) {
+	if len(s) < 5 {
+		return MinVersion, fmt.Errorf("invalid version `%s`", s)
+	}
+	var major, minor, veryMinor, patch uint
+	var err error
+	if s[0] == '0' {
+		err = scanKafkaVersion(s, `^0\.\d+\.\d+\.\d+$`, "0.%d.%d.%d", [3]*uint{&minor, &veryMinor, &patch})
+	} else {
+		err = scanKafkaVersion(s, `^\d+\.\d+\.\d+$`, "%d.%d.%d", [3]*uint{&major, &minor, &veryMinor})
+	}
+	if err != nil {
+		return MinVersion, err
+	}
+	return newKafkaVersion(major, minor, veryMinor, patch), nil
+}
+
+func scanKafkaVersion(s string, pattern string, format string, v [3]*uint) error {
+	if !regexp.MustCompile(pattern).MatchString(s) {
+		return fmt.Errorf("invalid version `%s`", s)
+	}
+	_, err := fmt.Sscanf(s, format, v[0], v[1], v[2])
+	return err
+}
+
+func (v KafkaVersion) String() string {
+	if v.version[0] == 0 {
+		return fmt.Sprintf("0.%d.%d.%d", v.version[1], v.version[2], v.version[3])
+	}
+
+	return fmt.Sprintf("%d.%d.%d", v.version[0], v.version[1], v.version[2])
+}

diff --git a/vendor/github.com/Shopify/sarama/zstd_cgo.go b/vendor/github.com/Shopify/sarama/zstd_cgo.go
new file mode 100644
index 0000000..f5ccb31
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/zstd_cgo.go

@@ -0,0 +1,13 @@
+// +build cgo
+
+package sarama
+
+import "github.com/DataDog/zstd"
+
+func zstdDecompress(dst, src []byte) ([]byte, error) {
+	return zstd.Decompress(dst, src)
+}
+
+func zstdCompressLevel(dst, src []byte, level int) ([]byte, error) {
+	return zstd.CompressLevel(dst, src, level)
+}

diff --git a/vendor/github.com/Shopify/sarama/zstd_fallback.go b/vendor/github.com/Shopify/sarama/zstd_fallback.go
new file mode 100644
index 0000000..381a56b
--- /dev/null
+++ b/vendor/github.com/Shopify/sarama/zstd_fallback.go

@@ -0,0 +1,17 @@
+// +build !cgo
+
+package sarama
+
+import (
+	"errors"
+)
+
+var errZstdCgo = errors.New("zstd compression requires building with cgo enabled")
+
+func zstdDecompress(dst, src []byte) ([]byte, error) {
+	return nil, errZstdCgo
+}
+
+func zstdCompressLevel(dst, src []byte, level int) ([]byte, error) {
+	return nil, errZstdCgo
+}

diff --git a/vendor/github.com/davecgh/go-spew/LICENSE b/vendor/github.com/davecgh/go-spew/LICENSE
new file mode 100644
index 0000000..bc52e96
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/LICENSE

@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2012-2016 Dave Collins <dave@davec.name>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

diff --git a/vendor/github.com/davecgh/go-spew/spew/bypass.go b/vendor/github.com/davecgh/go-spew/spew/bypass.go
new file mode 100644
index 0000000..7929947
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/bypass.go

@@ -0,0 +1,145 @@
+// Copyright (c) 2015-2016 Dave Collins <dave@davec.name>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// NOTE: Due to the following build constraints, this file will only be compiled
+// when the code is not running on Google App Engine, compiled by GopherJS, and
+// "-tags safe" is not added to the go build command line.  The "disableunsafe"
+// tag is deprecated and thus should not be used.
+// Go versions prior to 1.4 are disabled because they use a different layout
+// for interfaces which make the implementation of unsafeReflectValue more complex.
+// +build !js,!appengine,!safe,!disableunsafe,go1.4
+
+package spew
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+const (
+	// UnsafeDisabled is a build-time constant which specifies whether or
+	// not access to the unsafe package is available.
+	UnsafeDisabled = false
+
+	// ptrSize is the size of a pointer on the current arch.
+	ptrSize = unsafe.Sizeof((*byte)(nil))
+)
+
+type flag uintptr
+
+var (
+	// flagRO indicates whether the value field of a reflect.Value
+	// is read-only.
+	flagRO flag
+
+	// flagAddr indicates whether the address of the reflect.Value's
+	// value may be taken.
+	flagAddr flag
+)
+
+// flagKindMask holds the bits that make up the kind
+// part of the flags field. In all the supported versions,
+// it is in the lower 5 bits.
+const flagKindMask = flag(0x1f)
+
+// Different versions of Go have used different
+// bit layouts for the flags type. This table
+// records the known combinations.
+var okFlags = []struct {
+	ro, addr flag
+}{{
+	// From Go 1.4 to 1.5
+	ro:   1 << 5,
+	addr: 1 << 7,
+}, {
+	// Up to Go tip.
+	ro:   1<<5 | 1<<6,
+	addr: 1 << 8,
+}}
+
+var flagValOffset = func() uintptr {
+	field, ok := reflect.TypeOf(reflect.Value{}).FieldByName("flag")
+	if !ok {
+		panic("reflect.Value has no flag field")
+	}
+	return field.Offset
+}()
+
+// flagField returns a pointer to the flag field of a reflect.Value.
+func flagField(v *reflect.Value) *flag {
+	return (*flag)(unsafe.Pointer(uintptr(unsafe.Pointer(v)) + flagValOffset))
+}
+
+// unsafeReflectValue converts the passed reflect.Value into a one that bypasses
+// the typical safety restrictions preventing access to unaddressable and
+// unexported data.  It works by digging the raw pointer to the underlying
+// value out of the protected value and generating a new unprotected (unsafe)
+// reflect.Value to it.
+//
+// This allows us to check for implementations of the Stringer and error
+// interfaces to be used for pretty printing ordinarily unaddressable and
+// inaccessible values such as unexported struct fields.
+func unsafeReflectValue(v reflect.Value) reflect.Value {
+	if !v.IsValid() || (v.CanInterface() && v.CanAddr()) {
+		return v
+	}
+	flagFieldPtr := flagField(&v)
+	*flagFieldPtr &^= flagRO
+	*flagFieldPtr |= flagAddr
+	return v
+}
+
+// Sanity checks against future reflect package changes
+// to the type or semantics of the Value.flag field.
+func init() {
+	field, ok := reflect.TypeOf(reflect.Value{}).FieldByName("flag")
+	if !ok {
+		panic("reflect.Value has no flag field")
+	}
+	if field.Type.Kind() != reflect.TypeOf(flag(0)).Kind() {
+		panic("reflect.Value flag field has changed kind")
+	}
+	type t0 int
+	var t struct {
+		A t0
+		// t0 will have flagEmbedRO set.
+		t0
+		// a will have flagStickyRO set
+		a t0
+	}
+	vA := reflect.ValueOf(t).FieldByName("A")
+	va := reflect.ValueOf(t).FieldByName("a")
+	vt0 := reflect.ValueOf(t).FieldByName("t0")
+
+	// Infer flagRO from the difference between the flags
+	// for the (otherwise identical) fields in t.
+	flagPublic := *flagField(&vA)
+	flagWithRO := *flagField(&va) | *flagField(&vt0)
+	flagRO = flagPublic ^ flagWithRO
+
+	// Infer flagAddr from the difference between a value
+	// taken from a pointer and not.
+	vPtrA := reflect.ValueOf(&t).Elem().FieldByName("A")
+	flagNoPtr := *flagField(&vA)
+	flagPtr := *flagField(&vPtrA)
+	flagAddr = flagNoPtr ^ flagPtr
+
+	// Check that the inferred flags tally with one of the known versions.
+	for _, f := range okFlags {
+		if flagRO == f.ro && flagAddr == f.addr {
+			return
+		}
+	}
+	panic("reflect.Value read-only flag has changed semantics")
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/bypasssafe.go b/vendor/github.com/davecgh/go-spew/spew/bypasssafe.go
new file mode 100644
index 0000000..205c28d
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/bypasssafe.go

@@ -0,0 +1,38 @@
+// Copyright (c) 2015-2016 Dave Collins <dave@davec.name>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// NOTE: Due to the following build constraints, this file will only be compiled
+// when the code is running on Google App Engine, compiled by GopherJS, or
+// "-tags safe" is added to the go build command line.  The "disableunsafe"
+// tag is deprecated and thus should not be used.
+// +build js appengine safe disableunsafe !go1.4
+
+package spew
+
+import "reflect"
+
+const (
+	// UnsafeDisabled is a build-time constant which specifies whether or
+	// not access to the unsafe package is available.
+	UnsafeDisabled = true
+)
+
+// unsafeReflectValue typically converts the passed reflect.Value into a one
+// that bypasses the typical safety restrictions preventing access to
+// unaddressable and unexported data.  However, doing this relies on access to
+// the unsafe package.  This is a stub version which simply returns the passed
+// reflect.Value when the unsafe package is not available.
+func unsafeReflectValue(v reflect.Value) reflect.Value {
+	return v
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/common.go b/vendor/github.com/davecgh/go-spew/spew/common.go
new file mode 100644
index 0000000..1be8ce9
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/common.go

@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+package spew
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"reflect"
+	"sort"
+	"strconv"
+)
+
+// Some constants in the form of bytes to avoid string overhead.  This mirrors
+// the technique used in the fmt package.
+var (
+	panicBytes            = []byte("(PANIC=")
+	plusBytes             = []byte("+")
+	iBytes                = []byte("i")
+	trueBytes             = []byte("true")
+	falseBytes            = []byte("false")
+	interfaceBytes        = []byte("(interface {})")
+	commaNewlineBytes     = []byte(",\n")
+	newlineBytes          = []byte("\n")
+	openBraceBytes        = []byte("{")
+	openBraceNewlineBytes = []byte("{\n")
+	closeBraceBytes       = []byte("}")
+	asteriskBytes         = []byte("*")
+	colonBytes            = []byte(":")
+	colonSpaceBytes       = []byte(": ")
+	openParenBytes        = []byte("(")
+	closeParenBytes       = []byte(")")
+	spaceBytes            = []byte(" ")
+	pointerChainBytes     = []byte("->")
+	nilAngleBytes         = []byte("<nil>")
+	maxNewlineBytes       = []byte("<max depth reached>\n")
+	maxShortBytes         = []byte("<max>")
+	circularBytes         = []byte("<already shown>")
+	circularShortBytes    = []byte("<shown>")
+	invalidAngleBytes     = []byte("<invalid>")
+	openBracketBytes      = []byte("[")
+	closeBracketBytes     = []byte("]")
+	percentBytes          = []byte("%")
+	precisionBytes        = []byte(".")
+	openAngleBytes        = []byte("<")
+	closeAngleBytes       = []byte(">")
+	openMapBytes          = []byte("map[")
+	closeMapBytes         = []byte("]")
+	lenEqualsBytes        = []byte("len=")
+	capEqualsBytes        = []byte("cap=")
+)
+
+// hexDigits is used to map a decimal value to a hex digit.
+var hexDigits = "0123456789abcdef"
+
+// catchPanic handles any panics that might occur during the handleMethods
+// calls.
+func catchPanic(w io.Writer, v reflect.Value) {
+	if err := recover(); err != nil {
+		w.Write(panicBytes)
+		fmt.Fprintf(w, "%v", err)
+		w.Write(closeParenBytes)
+	}
+}
+
+// handleMethods attempts to call the Error and String methods on the underlying
+// type the passed reflect.Value represents and outputes the result to Writer w.
+//
+// It handles panics in any called methods by catching and displaying the error
+// as the formatted value.
+func handleMethods(cs *ConfigState, w io.Writer, v reflect.Value) (handled bool) {
+	// We need an interface to check if the type implements the error or
+	// Stringer interface.  However, the reflect package won't give us an
+	// interface on certain things like unexported struct fields in order
+	// to enforce visibility rules.  We use unsafe, when it's available,
+	// to bypass these restrictions since this package does not mutate the
+	// values.
+	if !v.CanInterface() {
+		if UnsafeDisabled {
+			return false
+		}
+
+		v = unsafeReflectValue(v)
+	}
+
+	// Choose whether or not to do error and Stringer interface lookups against
+	// the base type or a pointer to the base type depending on settings.
+	// Technically calling one of these methods with a pointer receiver can
+	// mutate the value, however, types which choose to satisify an error or
+	// Stringer interface with a pointer receiver should not be mutating their
+	// state inside these interface methods.
+	if !cs.DisablePointerMethods && !UnsafeDisabled && !v.CanAddr() {
+		v = unsafeReflectValue(v)
+	}
+	if v.CanAddr() {
+		v = v.Addr()
+	}
+
+	// Is it an error or Stringer?
+	switch iface := v.Interface().(type) {
+	case error:
+		defer catchPanic(w, v)
+		if cs.ContinueOnMethod {
+			w.Write(openParenBytes)
+			w.Write([]byte(iface.Error()))
+			w.Write(closeParenBytes)
+			w.Write(spaceBytes)
+			return false
+		}
+
+		w.Write([]byte(iface.Error()))
+		return true
+
+	case fmt.Stringer:
+		defer catchPanic(w, v)
+		if cs.ContinueOnMethod {
+			w.Write(openParenBytes)
+			w.Write([]byte(iface.String()))
+			w.Write(closeParenBytes)
+			w.Write(spaceBytes)
+			return false
+		}
+		w.Write([]byte(iface.String()))
+		return true
+	}
+	return false
+}
+
+// printBool outputs a boolean value as true or false to Writer w.
+func printBool(w io.Writer, val bool) {
+	if val {
+		w.Write(trueBytes)
+	} else {
+		w.Write(falseBytes)
+	}
+}
+
+// printInt outputs a signed integer value to Writer w.
+func printInt(w io.Writer, val int64, base int) {
+	w.Write([]byte(strconv.FormatInt(val, base)))
+}
+
+// printUint outputs an unsigned integer value to Writer w.
+func printUint(w io.Writer, val uint64, base int) {
+	w.Write([]byte(strconv.FormatUint(val, base)))
+}
+
+// printFloat outputs a floating point value using the specified precision,
+// which is expected to be 32 or 64bit, to Writer w.
+func printFloat(w io.Writer, val float64, precision int) {
+	w.Write([]byte(strconv.FormatFloat(val, 'g', -1, precision)))
+}
+
+// printComplex outputs a complex value using the specified float precision
+// for the real and imaginary parts to Writer w.
+func printComplex(w io.Writer, c complex128, floatPrecision int) {
+	r := real(c)
+	w.Write(openParenBytes)
+	w.Write([]byte(strconv.FormatFloat(r, 'g', -1, floatPrecision)))
+	i := imag(c)
+	if i >= 0 {
+		w.Write(plusBytes)
+	}
+	w.Write([]byte(strconv.FormatFloat(i, 'g', -1, floatPrecision)))
+	w.Write(iBytes)
+	w.Write(closeParenBytes)
+}
+
+// printHexPtr outputs a uintptr formatted as hexadecimal with a leading '0x'
+// prefix to Writer w.
+func printHexPtr(w io.Writer, p uintptr) {
+	// Null pointer.
+	num := uint64(p)
+	if num == 0 {
+		w.Write(nilAngleBytes)
+		return
+	}
+
+	// Max uint64 is 16 bytes in hex + 2 bytes for '0x' prefix
+	buf := make([]byte, 18)
+
+	// It's simpler to construct the hex string right to left.
+	base := uint64(16)
+	i := len(buf) - 1
+	for num >= base {
+		buf[i] = hexDigits[num%base]
+		num /= base
+		i--
+	}
+	buf[i] = hexDigits[num]
+
+	// Add '0x' prefix.
+	i--
+	buf[i] = 'x'
+	i--
+	buf[i] = '0'
+
+	// Strip unused leading bytes.
+	buf = buf[i:]
+	w.Write(buf)
+}
+
+// valuesSorter implements sort.Interface to allow a slice of reflect.Value
+// elements to be sorted.
+type valuesSorter struct {
+	values  []reflect.Value
+	strings []string // either nil or same len and values
+	cs      *ConfigState
+}
+
+// newValuesSorter initializes a valuesSorter instance, which holds a set of
+// surrogate keys on which the data should be sorted.  It uses flags in
+// ConfigState to decide if and how to populate those surrogate keys.
+func newValuesSorter(values []reflect.Value, cs *ConfigState) sort.Interface {
+	vs := &valuesSorter{values: values, cs: cs}
+	if canSortSimply(vs.values[0].Kind()) {
+		return vs
+	}
+	if !cs.DisableMethods {
+		vs.strings = make([]string, len(values))
+		for i := range vs.values {
+			b := bytes.Buffer{}
+			if !handleMethods(cs, &b, vs.values[i]) {
+				vs.strings = nil
+				break
+			}
+			vs.strings[i] = b.String()
+		}
+	}
+	if vs.strings == nil && cs.SpewKeys {
+		vs.strings = make([]string, len(values))
+		for i := range vs.values {
+			vs.strings[i] = Sprintf("%#v", vs.values[i].Interface())
+		}
+	}
+	return vs
+}
+
+// canSortSimply tests whether a reflect.Kind is a primitive that can be sorted
+// directly, or whether it should be considered for sorting by surrogate keys
+// (if the ConfigState allows it).
+func canSortSimply(kind reflect.Kind) bool {
+	// This switch parallels valueSortLess, except for the default case.
+	switch kind {
+	case reflect.Bool:
+		return true
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
+		return true
+	case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint:
+		return true
+	case reflect.Float32, reflect.Float64:
+		return true
+	case reflect.String:
+		return true
+	case reflect.Uintptr:
+		return true
+	case reflect.Array:
+		return true
+	}
+	return false
+}
+
+// Len returns the number of values in the slice.  It is part of the
+// sort.Interface implementation.
+func (s *valuesSorter) Len() int {
+	return len(s.values)
+}
+
+// Swap swaps the values at the passed indices.  It is part of the
+// sort.Interface implementation.
+func (s *valuesSorter) Swap(i, j int) {
+	s.values[i], s.values[j] = s.values[j], s.values[i]
+	if s.strings != nil {
+		s.strings[i], s.strings[j] = s.strings[j], s.strings[i]
+	}
+}
+
+// valueSortLess returns whether the first value should sort before the second
+// value.  It is used by valueSorter.Less as part of the sort.Interface
+// implementation.
+func valueSortLess(a, b reflect.Value) bool {
+	switch a.Kind() {
+	case reflect.Bool:
+		return !a.Bool() && b.Bool()
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
+		return a.Int() < b.Int()
+	case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint:
+		return a.Uint() < b.Uint()
+	case reflect.Float32, reflect.Float64:
+		return a.Float() < b.Float()
+	case reflect.String:
+		return a.String() < b.String()
+	case reflect.Uintptr:
+		return a.Uint() < b.Uint()
+	case reflect.Array:
+		// Compare the contents of both arrays.
+		l := a.Len()
+		for i := 0; i < l; i++ {
+			av := a.Index(i)
+			bv := b.Index(i)
+			if av.Interface() == bv.Interface() {
+				continue
+			}
+			return valueSortLess(av, bv)
+		}
+	}
+	return a.String() < b.String()
+}
+
+// Less returns whether the value at index i should sort before the
+// value at index j.  It is part of the sort.Interface implementation.
+func (s *valuesSorter) Less(i, j int) bool {
+	if s.strings == nil {
+		return valueSortLess(s.values[i], s.values[j])
+	}
+	return s.strings[i] < s.strings[j]
+}
+
+// sortValues is a sort function that handles both native types and any type that
+// can be converted to error or Stringer.  Other inputs are sorted according to
+// their Value.String() value to ensure display stability.
+func sortValues(values []reflect.Value, cs *ConfigState) {
+	if len(values) == 0 {
+		return
+	}
+	sort.Sort(newValuesSorter(values, cs))
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/config.go b/vendor/github.com/davecgh/go-spew/spew/config.go
new file mode 100644
index 0000000..2e3d22f
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/config.go

@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+package spew
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+)
+
+// ConfigState houses the configuration options used by spew to format and
+// display values.  There is a global instance, Config, that is used to control
+// all top-level Formatter and Dump functionality.  Each ConfigState instance
+// provides methods equivalent to the top-level functions.
+//
+// The zero value for ConfigState provides no indentation.  You would typically
+// want to set it to a space or a tab.
+//
+// Alternatively, you can use NewDefaultConfig to get a ConfigState instance
+// with default settings.  See the documentation of NewDefaultConfig for default
+// values.
+type ConfigState struct {
+	// Indent specifies the string to use for each indentation level.  The
+	// global config instance that all top-level functions use set this to a
+	// single space by default.  If you would like more indentation, you might
+	// set this to a tab with "\t" or perhaps two spaces with "  ".
+	Indent string
+
+	// MaxDepth controls the maximum number of levels to descend into nested
+	// data structures.  The default, 0, means there is no limit.
+	//
+	// NOTE: Circular data structures are properly detected, so it is not
+	// necessary to set this value unless you specifically want to limit deeply
+	// nested data structures.
+	MaxDepth int
+
+	// DisableMethods specifies whether or not error and Stringer interfaces are
+	// invoked for types that implement them.
+	DisableMethods bool
+
+	// DisablePointerMethods specifies whether or not to check for and invoke
+	// error and Stringer interfaces on types which only accept a pointer
+	// receiver when the current type is not a pointer.
+	//
+	// NOTE: This might be an unsafe action since calling one of these methods
+	// with a pointer receiver could technically mutate the value, however,
+	// in practice, types which choose to satisify an error or Stringer
+	// interface with a pointer receiver should not be mutating their state
+	// inside these interface methods.  As a result, this option relies on
+	// access to the unsafe package, so it will not have any effect when
+	// running in environments without access to the unsafe package such as
+	// Google App Engine or with the "safe" build tag specified.
+	DisablePointerMethods bool
+
+	// DisablePointerAddresses specifies whether to disable the printing of
+	// pointer addresses. This is useful when diffing data structures in tests.
+	DisablePointerAddresses bool
+
+	// DisableCapacities specifies whether to disable the printing of capacities
+	// for arrays, slices, maps and channels. This is useful when diffing
+	// data structures in tests.
+	DisableCapacities bool
+
+	// ContinueOnMethod specifies whether or not recursion should continue once
+	// a custom error or Stringer interface is invoked.  The default, false,
+	// means it will print the results of invoking the custom error or Stringer
+	// interface and return immediately instead of continuing to recurse into
+	// the internals of the data type.
+	//
+	// NOTE: This flag does not have any effect if method invocation is disabled
+	// via the DisableMethods or DisablePointerMethods options.
+	ContinueOnMethod bool
+
+	// SortKeys specifies map keys should be sorted before being printed. Use
+	// this to have a more deterministic, diffable output.  Note that only
+	// native types (bool, int, uint, floats, uintptr and string) and types
+	// that support the error or Stringer interfaces (if methods are
+	// enabled) are supported, with other types sorted according to the
+	// reflect.Value.String() output which guarantees display stability.
+	SortKeys bool
+
+	// SpewKeys specifies that, as a last resort attempt, map keys should
+	// be spewed to strings and sorted by those strings.  This is only
+	// considered if SortKeys is true.
+	SpewKeys bool
+}
+
+// Config is the active configuration of the top-level functions.
+// The configuration can be changed by modifying the contents of spew.Config.
+var Config = ConfigState{Indent: " "}
+
+// Errorf is a wrapper for fmt.Errorf that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the formatted string as a value that satisfies error.  See NewFormatter
+// for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Errorf(format, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Errorf(format string, a ...interface{}) (err error) {
+	return fmt.Errorf(format, c.convertArgs(a)...)
+}
+
+// Fprint is a wrapper for fmt.Fprint that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprint(w, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Fprint(w io.Writer, a ...interface{}) (n int, err error) {
+	return fmt.Fprint(w, c.convertArgs(a)...)
+}
+
+// Fprintf is a wrapper for fmt.Fprintf that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprintf(w, format, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Fprintf(w io.Writer, format string, a ...interface{}) (n int, err error) {
+	return fmt.Fprintf(w, format, c.convertArgs(a)...)
+}
+
+// Fprintln is a wrapper for fmt.Fprintln that treats each argument as if it
+// passed with a Formatter interface returned by c.NewFormatter.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprintln(w, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Fprintln(w io.Writer, a ...interface{}) (n int, err error) {
+	return fmt.Fprintln(w, c.convertArgs(a)...)
+}
+
+// Print is a wrapper for fmt.Print that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Print(c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Print(a ...interface{}) (n int, err error) {
+	return fmt.Print(c.convertArgs(a)...)
+}
+
+// Printf is a wrapper for fmt.Printf that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Printf(format, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Printf(format string, a ...interface{}) (n int, err error) {
+	return fmt.Printf(format, c.convertArgs(a)...)
+}
+
+// Println is a wrapper for fmt.Println that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Println(c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Println(a ...interface{}) (n int, err error) {
+	return fmt.Println(c.convertArgs(a)...)
+}
+
+// Sprint is a wrapper for fmt.Sprint that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprint(c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Sprint(a ...interface{}) string {
+	return fmt.Sprint(c.convertArgs(a)...)
+}
+
+// Sprintf is a wrapper for fmt.Sprintf that treats each argument as if it were
+// passed with a Formatter interface returned by c.NewFormatter.  It returns
+// the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprintf(format, c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Sprintf(format string, a ...interface{}) string {
+	return fmt.Sprintf(format, c.convertArgs(a)...)
+}
+
+// Sprintln is a wrapper for fmt.Sprintln that treats each argument as if it
+// were passed with a Formatter interface returned by c.NewFormatter.  It
+// returns the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprintln(c.NewFormatter(a), c.NewFormatter(b))
+func (c *ConfigState) Sprintln(a ...interface{}) string {
+	return fmt.Sprintln(c.convertArgs(a)...)
+}
+
+/*
+NewFormatter returns a custom formatter that satisfies the fmt.Formatter
+interface.  As a result, it integrates cleanly with standard fmt package
+printing functions.  The formatter is useful for inline printing of smaller data
+types similar to the standard %v format specifier.
+
+The custom formatter only responds to the %v (most compact), %+v (adds pointer
+addresses), %#v (adds types), and %#+v (adds types and pointer addresses) verb
+combinations.  Any other verbs such as %x and %q will be sent to the the
+standard fmt package for formatting.  In addition, the custom formatter ignores
+the width and precision arguments (however they will still work on the format
+specifiers not handled by the custom formatter).
+
+Typically this function shouldn't be called directly.  It is much easier to make
+use of the custom formatter by calling one of the convenience functions such as
+c.Printf, c.Println, or c.Printf.
+*/
+func (c *ConfigState) NewFormatter(v interface{}) fmt.Formatter {
+	return newFormatter(c, v)
+}
+
+// Fdump formats and displays the passed arguments to io.Writer w.  It formats
+// exactly the same as Dump.
+func (c *ConfigState) Fdump(w io.Writer, a ...interface{}) {
+	fdump(c, w, a...)
+}
+
+/*
+Dump displays the passed parameters to standard out with newlines, customizable
+indentation, and additional debug information such as complete types and all
+pointer addresses used to indirect to the final value.  It provides the
+following features over the built-in printing facilities provided by the fmt
+package:
+
+	* Pointers are dereferenced and followed
+	* Circular data structures are detected and handled properly
+	* Custom Stringer/error interfaces are optionally invoked, including
+	  on unexported types
+	* Custom types which only implement the Stringer/error interfaces via
+	  a pointer receiver are optionally invoked when passing non-pointer
+	  variables
+	* Byte arrays and slices are dumped like the hexdump -C command which
+	  includes offsets, byte values in hex, and ASCII output
+
+The configuration options are controlled by modifying the public members
+of c.  See ConfigState for options documentation.
+
+See Fdump if you would prefer dumping to an arbitrary io.Writer or Sdump to
+get the formatted result as a string.
+*/
+func (c *ConfigState) Dump(a ...interface{}) {
+	fdump(c, os.Stdout, a...)
+}
+
+// Sdump returns a string with the passed arguments formatted exactly the same
+// as Dump.
+func (c *ConfigState) Sdump(a ...interface{}) string {
+	var buf bytes.Buffer
+	fdump(c, &buf, a...)
+	return buf.String()
+}
+
+// convertArgs accepts a slice of arguments and returns a slice of the same
+// length with each argument converted to a spew Formatter interface using
+// the ConfigState associated with s.
+func (c *ConfigState) convertArgs(args []interface{}) (formatters []interface{}) {
+	formatters = make([]interface{}, len(args))
+	for index, arg := range args {
+		formatters[index] = newFormatter(c, arg)
+	}
+	return formatters
+}
+
+// NewDefaultConfig returns a ConfigState with the following default settings.
+//
+// 	Indent: " "
+// 	MaxDepth: 0
+// 	DisableMethods: false
+// 	DisablePointerMethods: false
+// 	ContinueOnMethod: false
+// 	SortKeys: false
+func NewDefaultConfig() *ConfigState {
+	return &ConfigState{Indent: " "}
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/doc.go b/vendor/github.com/davecgh/go-spew/spew/doc.go
new file mode 100644
index 0000000..aacaac6
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/doc.go

@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+Package spew implements a deep pretty printer for Go data structures to aid in
+debugging.
+
+A quick overview of the additional features spew provides over the built-in
+printing facilities for Go data types are as follows:
+
+	* Pointers are dereferenced and followed
+	* Circular data structures are detected and handled properly
+	* Custom Stringer/error interfaces are optionally invoked, including
+	  on unexported types
+	* Custom types which only implement the Stringer/error interfaces via
+	  a pointer receiver are optionally invoked when passing non-pointer
+	  variables
+	* Byte arrays and slices are dumped like the hexdump -C command which
+	  includes offsets, byte values in hex, and ASCII output (only when using
+	  Dump style)
+
+There are two different approaches spew allows for dumping Go data structures:
+
+	* Dump style which prints with newlines, customizable indentation,
+	  and additional debug information such as types and all pointer addresses
+	  used to indirect to the final value
+	* A custom Formatter interface that integrates cleanly with the standard fmt
+	  package and replaces %v, %+v, %#v, and %#+v to provide inline printing
+	  similar to the default %v while providing the additional functionality
+	  outlined above and passing unsupported format verbs such as %x and %q
+	  along to fmt
+
+Quick Start
+
+This section demonstrates how to quickly get started with spew.  See the
+sections below for further details on formatting and configuration options.
+
+To dump a variable with full newlines, indentation, type, and pointer
+information use Dump, Fdump, or Sdump:
+	spew.Dump(myVar1, myVar2, ...)
+	spew.Fdump(someWriter, myVar1, myVar2, ...)
+	str := spew.Sdump(myVar1, myVar2, ...)
+
+Alternatively, if you would prefer to use format strings with a compacted inline
+printing style, use the convenience wrappers Printf, Fprintf, etc with
+%v (most compact), %+v (adds pointer addresses), %#v (adds types), or
+%#+v (adds types and pointer addresses):
+	spew.Printf("myVar1: %v -- myVar2: %+v", myVar1, myVar2)
+	spew.Printf("myVar3: %#v -- myVar4: %#+v", myVar3, myVar4)
+	spew.Fprintf(someWriter, "myVar1: %v -- myVar2: %+v", myVar1, myVar2)
+	spew.Fprintf(someWriter, "myVar3: %#v -- myVar4: %#+v", myVar3, myVar4)
+
+Configuration Options
+
+Configuration of spew is handled by fields in the ConfigState type.  For
+convenience, all of the top-level functions use a global state available
+via the spew.Config global.
+
+It is also possible to create a ConfigState instance that provides methods
+equivalent to the top-level functions.  This allows concurrent configuration
+options.  See the ConfigState documentation for more details.
+
+The following configuration options are available:
+	* Indent
+		String to use for each indentation level for Dump functions.
+		It is a single space by default.  A popular alternative is "\t".
+
+	* MaxDepth
+		Maximum number of levels to descend into nested data structures.
+		There is no limit by default.
+
+	* DisableMethods
+		Disables invocation of error and Stringer interface methods.
+		Method invocation is enabled by default.
+
+	* DisablePointerMethods
+		Disables invocation of error and Stringer interface methods on types
+		which only accept pointer receivers from non-pointer variables.
+		Pointer method invocation is enabled by default.
+
+	* DisablePointerAddresses
+		DisablePointerAddresses specifies whether to disable the printing of
+		pointer addresses. This is useful when diffing data structures in tests.
+
+	* DisableCapacities
+		DisableCapacities specifies whether to disable the printing of
+		capacities for arrays, slices, maps and channels. This is useful when
+		diffing data structures in tests.
+
+	* ContinueOnMethod
+		Enables recursion into types after invoking error and Stringer interface
+		methods. Recursion after method invocation is disabled by default.
+
+	* SortKeys
+		Specifies map keys should be sorted before being printed. Use
+		this to have a more deterministic, diffable output.  Note that
+		only native types (bool, int, uint, floats, uintptr and string)
+		and types which implement error or Stringer interfaces are
+		supported with other types sorted according to the
+		reflect.Value.String() output which guarantees display
+		stability.  Natural map order is used by default.
+
+	* SpewKeys
+		Specifies that, as a last resort attempt, map keys should be
+		spewed to strings and sorted by those strings.  This is only
+		considered if SortKeys is true.
+
+Dump Usage
+
+Simply call spew.Dump with a list of variables you want to dump:
+
+	spew.Dump(myVar1, myVar2, ...)
+
+You may also call spew.Fdump if you would prefer to output to an arbitrary
+io.Writer.  For example, to dump to standard error:
+
+	spew.Fdump(os.Stderr, myVar1, myVar2, ...)
+
+A third option is to call spew.Sdump to get the formatted output as a string:
+
+	str := spew.Sdump(myVar1, myVar2, ...)
+
+Sample Dump Output
+
+See the Dump example for details on the setup of the types and variables being
+shown here.
+
+	(main.Foo) {
+	 unexportedField: (*main.Bar)(0xf84002e210)({
+	  flag: (main.Flag) flagTwo,
+	  data: (uintptr) <nil>
+	 }),
+	 ExportedField: (map[interface {}]interface {}) (len=1) {
+	  (string) (len=3) "one": (bool) true
+	 }
+	}
+
+Byte (and uint8) arrays and slices are displayed uniquely like the hexdump -C
+command as shown.
+	([]uint8) (len=32 cap=32) {
+	 00000000  11 12 13 14 15 16 17 18  19 1a 1b 1c 1d 1e 1f 20  |............... |
+	 00000010  21 22 23 24 25 26 27 28  29 2a 2b 2c 2d 2e 2f 30  |!"#$%&'()*+,-./0|
+	 00000020  31 32                                             |12|
+	}
+
+Custom Formatter
+
+Spew provides a custom formatter that implements the fmt.Formatter interface
+so that it integrates cleanly with standard fmt package printing functions. The
+formatter is useful for inline printing of smaller data types similar to the
+standard %v format specifier.
+
+The custom formatter only responds to the %v (most compact), %+v (adds pointer
+addresses), %#v (adds types), or %#+v (adds types and pointer addresses) verb
+combinations.  Any other verbs such as %x and %q will be sent to the the
+standard fmt package for formatting.  In addition, the custom formatter ignores
+the width and precision arguments (however they will still work on the format
+specifiers not handled by the custom formatter).
+
+Custom Formatter Usage
+
+The simplest way to make use of the spew custom formatter is to call one of the
+convenience functions such as spew.Printf, spew.Println, or spew.Printf.  The
+functions have syntax you are most likely already familiar with:
+
+	spew.Printf("myVar1: %v -- myVar2: %+v", myVar1, myVar2)
+	spew.Printf("myVar3: %#v -- myVar4: %#+v", myVar3, myVar4)
+	spew.Println(myVar, myVar2)
+	spew.Fprintf(os.Stderr, "myVar1: %v -- myVar2: %+v", myVar1, myVar2)
+	spew.Fprintf(os.Stderr, "myVar3: %#v -- myVar4: %#+v", myVar3, myVar4)
+
+See the Index for the full list convenience functions.
+
+Sample Formatter Output
+
+Double pointer to a uint8:
+	  %v: <**>5
+	 %+v: <**>(0xf8400420d0->0xf8400420c8)5
+	 %#v: (**uint8)5
+	%#+v: (**uint8)(0xf8400420d0->0xf8400420c8)5
+
+Pointer to circular struct with a uint8 field and a pointer to itself:
+	  %v: <*>{1 <*><shown>}
+	 %+v: <*>(0xf84003e260){ui8:1 c:<*>(0xf84003e260)<shown>}
+	 %#v: (*main.circular){ui8:(uint8)1 c:(*main.circular)<shown>}
+	%#+v: (*main.circular)(0xf84003e260){ui8:(uint8)1 c:(*main.circular)(0xf84003e260)<shown>}
+
+See the Printf example for details on the setup of variables being shown
+here.
+
+Errors
+
+Since it is possible for custom Stringer/error interfaces to panic, spew
+detects them and handles them internally by printing the panic information
+inline with the output.  Since spew is intended to provide deep pretty printing
+capabilities on structures, it intentionally does not return any errors.
+*/
+package spew

diff --git a/vendor/github.com/davecgh/go-spew/spew/dump.go b/vendor/github.com/davecgh/go-spew/spew/dump.go
new file mode 100644
index 0000000..f78d89f
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/dump.go

@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+package spew
+
+import (
+	"bytes"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"os"
+	"reflect"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+var (
+	// uint8Type is a reflect.Type representing a uint8.  It is used to
+	// convert cgo types to uint8 slices for hexdumping.
+	uint8Type = reflect.TypeOf(uint8(0))
+
+	// cCharRE is a regular expression that matches a cgo char.
+	// It is used to detect character arrays to hexdump them.
+	cCharRE = regexp.MustCompile(`^.*\._Ctype_char$`)
+
+	// cUnsignedCharRE is a regular expression that matches a cgo unsigned
+	// char.  It is used to detect unsigned character arrays to hexdump
+	// them.
+	cUnsignedCharRE = regexp.MustCompile(`^.*\._Ctype_unsignedchar$`)
+
+	// cUint8tCharRE is a regular expression that matches a cgo uint8_t.
+	// It is used to detect uint8_t arrays to hexdump them.
+	cUint8tCharRE = regexp.MustCompile(`^.*\._Ctype_uint8_t$`)
+)
+
+// dumpState contains information about the state of a dump operation.
+type dumpState struct {
+	w                io.Writer
+	depth            int
+	pointers         map[uintptr]int
+	ignoreNextType   bool
+	ignoreNextIndent bool
+	cs               *ConfigState
+}
+
+// indent performs indentation according to the depth level and cs.Indent
+// option.
+func (d *dumpState) indent() {
+	if d.ignoreNextIndent {
+		d.ignoreNextIndent = false
+		return
+	}
+	d.w.Write(bytes.Repeat([]byte(d.cs.Indent), d.depth))
+}
+
+// unpackValue returns values inside of non-nil interfaces when possible.
+// This is useful for data types like structs, arrays, slices, and maps which
+// can contain varying types packed inside an interface.
+func (d *dumpState) unpackValue(v reflect.Value) reflect.Value {
+	if v.Kind() == reflect.Interface && !v.IsNil() {
+		v = v.Elem()
+	}
+	return v
+}
+
+// dumpPtr handles formatting of pointers by indirecting them as necessary.
+func (d *dumpState) dumpPtr(v reflect.Value) {
+	// Remove pointers at or below the current depth from map used to detect
+	// circular refs.
+	for k, depth := range d.pointers {
+		if depth >= d.depth {
+			delete(d.pointers, k)
+		}
+	}
+
+	// Keep list of all dereferenced pointers to show later.
+	pointerChain := make([]uintptr, 0)
+
+	// Figure out how many levels of indirection there are by dereferencing
+	// pointers and unpacking interfaces down the chain while detecting circular
+	// references.
+	nilFound := false
+	cycleFound := false
+	indirects := 0
+	ve := v
+	for ve.Kind() == reflect.Ptr {
+		if ve.IsNil() {
+			nilFound = true
+			break
+		}
+		indirects++
+		addr := ve.Pointer()
+		pointerChain = append(pointerChain, addr)
+		if pd, ok := d.pointers[addr]; ok && pd < d.depth {
+			cycleFound = true
+			indirects--
+			break
+		}
+		d.pointers[addr] = d.depth
+
+		ve = ve.Elem()
+		if ve.Kind() == reflect.Interface {
+			if ve.IsNil() {
+				nilFound = true
+				break
+			}
+			ve = ve.Elem()
+		}
+	}
+
+	// Display type information.
+	d.w.Write(openParenBytes)
+	d.w.Write(bytes.Repeat(asteriskBytes, indirects))
+	d.w.Write([]byte(ve.Type().String()))
+	d.w.Write(closeParenBytes)
+
+	// Display pointer information.
+	if !d.cs.DisablePointerAddresses && len(pointerChain) > 0 {
+		d.w.Write(openParenBytes)
+		for i, addr := range pointerChain {
+			if i > 0 {
+				d.w.Write(pointerChainBytes)
+			}
+			printHexPtr(d.w, addr)
+		}
+		d.w.Write(closeParenBytes)
+	}
+
+	// Display dereferenced value.
+	d.w.Write(openParenBytes)
+	switch {
+	case nilFound:
+		d.w.Write(nilAngleBytes)
+
+	case cycleFound:
+		d.w.Write(circularBytes)
+
+	default:
+		d.ignoreNextType = true
+		d.dump(ve)
+	}
+	d.w.Write(closeParenBytes)
+}
+
+// dumpSlice handles formatting of arrays and slices.  Byte (uint8 under
+// reflection) arrays and slices are dumped in hexdump -C fashion.
+func (d *dumpState) dumpSlice(v reflect.Value) {
+	// Determine whether this type should be hex dumped or not.  Also,
+	// for types which should be hexdumped, try to use the underlying data
+	// first, then fall back to trying to convert them to a uint8 slice.
+	var buf []uint8
+	doConvert := false
+	doHexDump := false
+	numEntries := v.Len()
+	if numEntries > 0 {
+		vt := v.Index(0).Type()
+		vts := vt.String()
+		switch {
+		// C types that need to be converted.
+		case cCharRE.MatchString(vts):
+			fallthrough
+		case cUnsignedCharRE.MatchString(vts):
+			fallthrough
+		case cUint8tCharRE.MatchString(vts):
+			doConvert = true
+
+		// Try to use existing uint8 slices and fall back to converting
+		// and copying if that fails.
+		case vt.Kind() == reflect.Uint8:
+			// We need an addressable interface to convert the type
+			// to a byte slice.  However, the reflect package won't
+			// give us an interface on certain things like
+			// unexported struct fields in order to enforce
+			// visibility rules.  We use unsafe, when available, to
+			// bypass these restrictions since this package does not
+			// mutate the values.
+			vs := v
+			if !vs.CanInterface() || !vs.CanAddr() {
+				vs = unsafeReflectValue(vs)
+			}
+			if !UnsafeDisabled {
+				vs = vs.Slice(0, numEntries)
+
+				// Use the existing uint8 slice if it can be
+				// type asserted.
+				iface := vs.Interface()
+				if slice, ok := iface.([]uint8); ok {
+					buf = slice
+					doHexDump = true
+					break
+				}
+			}
+
+			// The underlying data needs to be converted if it can't
+			// be type asserted to a uint8 slice.
+			doConvert = true
+		}
+
+		// Copy and convert the underlying type if needed.
+		if doConvert && vt.ConvertibleTo(uint8Type) {
+			// Convert and copy each element into a uint8 byte
+			// slice.
+			buf = make([]uint8, numEntries)
+			for i := 0; i < numEntries; i++ {
+				vv := v.Index(i)
+				buf[i] = uint8(vv.Convert(uint8Type).Uint())
+			}
+			doHexDump = true
+		}
+	}
+
+	// Hexdump the entire slice as needed.
+	if doHexDump {
+		indent := strings.Repeat(d.cs.Indent, d.depth)
+		str := indent + hex.Dump(buf)
+		str = strings.Replace(str, "\n", "\n"+indent, -1)
+		str = strings.TrimRight(str, d.cs.Indent)
+		d.w.Write([]byte(str))
+		return
+	}
+
+	// Recursively call dump for each item.
+	for i := 0; i < numEntries; i++ {
+		d.dump(d.unpackValue(v.Index(i)))
+		if i < (numEntries - 1) {
+			d.w.Write(commaNewlineBytes)
+		} else {
+			d.w.Write(newlineBytes)
+		}
+	}
+}
+
+// dump is the main workhorse for dumping a value.  It uses the passed reflect
+// value to figure out what kind of object we are dealing with and formats it
+// appropriately.  It is a recursive function, however circular data structures
+// are detected and handled properly.
+func (d *dumpState) dump(v reflect.Value) {
+	// Handle invalid reflect values immediately.
+	kind := v.Kind()
+	if kind == reflect.Invalid {
+		d.w.Write(invalidAngleBytes)
+		return
+	}
+
+	// Handle pointers specially.
+	if kind == reflect.Ptr {
+		d.indent()
+		d.dumpPtr(v)
+		return
+	}
+
+	// Print type information unless already handled elsewhere.
+	if !d.ignoreNextType {
+		d.indent()
+		d.w.Write(openParenBytes)
+		d.w.Write([]byte(v.Type().String()))
+		d.w.Write(closeParenBytes)
+		d.w.Write(spaceBytes)
+	}
+	d.ignoreNextType = false
+
+	// Display length and capacity if the built-in len and cap functions
+	// work with the value's kind and the len/cap itself is non-zero.
+	valueLen, valueCap := 0, 0
+	switch v.Kind() {
+	case reflect.Array, reflect.Slice, reflect.Chan:
+		valueLen, valueCap = v.Len(), v.Cap()
+	case reflect.Map, reflect.String:
+		valueLen = v.Len()
+	}
+	if valueLen != 0 || !d.cs.DisableCapacities && valueCap != 0 {
+		d.w.Write(openParenBytes)
+		if valueLen != 0 {
+			d.w.Write(lenEqualsBytes)
+			printInt(d.w, int64(valueLen), 10)
+		}
+		if !d.cs.DisableCapacities && valueCap != 0 {
+			if valueLen != 0 {
+				d.w.Write(spaceBytes)
+			}
+			d.w.Write(capEqualsBytes)
+			printInt(d.w, int64(valueCap), 10)
+		}
+		d.w.Write(closeParenBytes)
+		d.w.Write(spaceBytes)
+	}
+
+	// Call Stringer/error interfaces if they exist and the handle methods flag
+	// is enabled
+	if !d.cs.DisableMethods {
+		if (kind != reflect.Invalid) && (kind != reflect.Interface) {
+			if handled := handleMethods(d.cs, d.w, v); handled {
+				return
+			}
+		}
+	}
+
+	switch kind {
+	case reflect.Invalid:
+		// Do nothing.  We should never get here since invalid has already
+		// been handled above.
+
+	case reflect.Bool:
+		printBool(d.w, v.Bool())
+
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
+		printInt(d.w, v.Int(), 10)
+
+	case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint:
+		printUint(d.w, v.Uint(), 10)
+
+	case reflect.Float32:
+		printFloat(d.w, v.Float(), 32)
+
+	case reflect.Float64:
+		printFloat(d.w, v.Float(), 64)
+
+	case reflect.Complex64:
+		printComplex(d.w, v.Complex(), 32)
+
+	case reflect.Complex128:
+		printComplex(d.w, v.Complex(), 64)
+
+	case reflect.Slice:
+		if v.IsNil() {
+			d.w.Write(nilAngleBytes)
+			break
+		}
+		fallthrough
+
+	case reflect.Array:
+		d.w.Write(openBraceNewlineBytes)
+		d.depth++
+		if (d.cs.MaxDepth != 0) && (d.depth > d.cs.MaxDepth) {
+			d.indent()
+			d.w.Write(maxNewlineBytes)
+		} else {
+			d.dumpSlice(v)
+		}
+		d.depth--
+		d.indent()
+		d.w.Write(closeBraceBytes)
+
+	case reflect.String:
+		d.w.Write([]byte(strconv.Quote(v.String())))
+
+	case reflect.Interface:
+		// The only time we should get here is for nil interfaces due to
+		// unpackValue calls.
+		if v.IsNil() {
+			d.w.Write(nilAngleBytes)
+		}
+
+	case reflect.Ptr:
+		// Do nothing.  We should never get here since pointers have already
+		// been handled above.
+
+	case reflect.Map:
+		// nil maps should be indicated as different than empty maps
+		if v.IsNil() {
+			d.w.Write(nilAngleBytes)
+			break
+		}
+
+		d.w.Write(openBraceNewlineBytes)
+		d.depth++
+		if (d.cs.MaxDepth != 0) && (d.depth > d.cs.MaxDepth) {
+			d.indent()
+			d.w.Write(maxNewlineBytes)
+		} else {
+			numEntries := v.Len()
+			keys := v.MapKeys()
+			if d.cs.SortKeys {
+				sortValues(keys, d.cs)
+			}
+			for i, key := range keys {
+				d.dump(d.unpackValue(key))
+				d.w.Write(colonSpaceBytes)
+				d.ignoreNextIndent = true
+				d.dump(d.unpackValue(v.MapIndex(key)))
+				if i < (numEntries - 1) {
+					d.w.Write(commaNewlineBytes)
+				} else {
+					d.w.Write(newlineBytes)
+				}
+			}
+		}
+		d.depth--
+		d.indent()
+		d.w.Write(closeBraceBytes)
+
+	case reflect.Struct:
+		d.w.Write(openBraceNewlineBytes)
+		d.depth++
+		if (d.cs.MaxDepth != 0) && (d.depth > d.cs.MaxDepth) {
+			d.indent()
+			d.w.Write(maxNewlineBytes)
+		} else {
+			vt := v.Type()
+			numFields := v.NumField()
+			for i := 0; i < numFields; i++ {
+				d.indent()
+				vtf := vt.Field(i)
+				d.w.Write([]byte(vtf.Name))
+				d.w.Write(colonSpaceBytes)
+				d.ignoreNextIndent = true
+				d.dump(d.unpackValue(v.Field(i)))
+				if i < (numFields - 1) {
+					d.w.Write(commaNewlineBytes)
+				} else {
+					d.w.Write(newlineBytes)
+				}
+			}
+		}
+		d.depth--
+		d.indent()
+		d.w.Write(closeBraceBytes)
+
+	case reflect.Uintptr:
+		printHexPtr(d.w, uintptr(v.Uint()))
+
+	case reflect.UnsafePointer, reflect.Chan, reflect.Func:
+		printHexPtr(d.w, v.Pointer())
+
+	// There were not any other types at the time this code was written, but
+	// fall back to letting the default fmt package handle it in case any new
+	// types are added.
+	default:
+		if v.CanInterface() {
+			fmt.Fprintf(d.w, "%v", v.Interface())
+		} else {
+			fmt.Fprintf(d.w, "%v", v.String())
+		}
+	}
+}
+
+// fdump is a helper function to consolidate the logic from the various public
+// methods which take varying writers and config states.
+func fdump(cs *ConfigState, w io.Writer, a ...interface{}) {
+	for _, arg := range a {
+		if arg == nil {
+			w.Write(interfaceBytes)
+			w.Write(spaceBytes)
+			w.Write(nilAngleBytes)
+			w.Write(newlineBytes)
+			continue
+		}
+
+		d := dumpState{w: w, cs: cs}
+		d.pointers = make(map[uintptr]int)
+		d.dump(reflect.ValueOf(arg))
+		d.w.Write(newlineBytes)
+	}
+}
+
+// Fdump formats and displays the passed arguments to io.Writer w.  It formats
+// exactly the same as Dump.
+func Fdump(w io.Writer, a ...interface{}) {
+	fdump(&Config, w, a...)
+}
+
+// Sdump returns a string with the passed arguments formatted exactly the same
+// as Dump.
+func Sdump(a ...interface{}) string {
+	var buf bytes.Buffer
+	fdump(&Config, &buf, a...)
+	return buf.String()
+}
+
+/*
+Dump displays the passed parameters to standard out with newlines, customizable
+indentation, and additional debug information such as complete types and all
+pointer addresses used to indirect to the final value.  It provides the
+following features over the built-in printing facilities provided by the fmt
+package:
+
+	* Pointers are dereferenced and followed
+	* Circular data structures are detected and handled properly
+	* Custom Stringer/error interfaces are optionally invoked, including
+	  on unexported types
+	* Custom types which only implement the Stringer/error interfaces via
+	  a pointer receiver are optionally invoked when passing non-pointer
+	  variables
+	* Byte arrays and slices are dumped like the hexdump -C command which
+	  includes offsets, byte values in hex, and ASCII output
+
+The configuration options are controlled by an exported package global,
+spew.Config.  See ConfigState for options documentation.
+
+See Fdump if you would prefer dumping to an arbitrary io.Writer or Sdump to
+get the formatted result as a string.
+*/
+func Dump(a ...interface{}) {
+	fdump(&Config, os.Stdout, a...)
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/format.go b/vendor/github.com/davecgh/go-spew/spew/format.go
new file mode 100644
index 0000000..b04edb7
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/format.go

@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+package spew
+
+import (
+	"bytes"
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
+)
+
+// supportedFlags is a list of all the character flags supported by fmt package.
+const supportedFlags = "0-+# "
+
+// formatState implements the fmt.Formatter interface and contains information
+// about the state of a formatting operation.  The NewFormatter function can
+// be used to get a new Formatter which can be used directly as arguments
+// in standard fmt package printing calls.
+type formatState struct {
+	value          interface{}
+	fs             fmt.State
+	depth          int
+	pointers       map[uintptr]int
+	ignoreNextType bool
+	cs             *ConfigState
+}
+
+// buildDefaultFormat recreates the original format string without precision
+// and width information to pass in to fmt.Sprintf in the case of an
+// unrecognized type.  Unless new types are added to the language, this
+// function won't ever be called.
+func (f *formatState) buildDefaultFormat() (format string) {
+	buf := bytes.NewBuffer(percentBytes)
+
+	for _, flag := range supportedFlags {
+		if f.fs.Flag(int(flag)) {
+			buf.WriteRune(flag)
+		}
+	}
+
+	buf.WriteRune('v')
+
+	format = buf.String()
+	return format
+}
+
+// constructOrigFormat recreates the original format string including precision
+// and width information to pass along to the standard fmt package.  This allows
+// automatic deferral of all format strings this package doesn't support.
+func (f *formatState) constructOrigFormat(verb rune) (format string) {
+	buf := bytes.NewBuffer(percentBytes)
+
+	for _, flag := range supportedFlags {
+		if f.fs.Flag(int(flag)) {
+			buf.WriteRune(flag)
+		}
+	}
+
+	if width, ok := f.fs.Width(); ok {
+		buf.WriteString(strconv.Itoa(width))
+	}
+
+	if precision, ok := f.fs.Precision(); ok {
+		buf.Write(precisionBytes)
+		buf.WriteString(strconv.Itoa(precision))
+	}
+
+	buf.WriteRune(verb)
+
+	format = buf.String()
+	return format
+}
+
+// unpackValue returns values inside of non-nil interfaces when possible and
+// ensures that types for values which have been unpacked from an interface
+// are displayed when the show types flag is also set.
+// This is useful for data types like structs, arrays, slices, and maps which
+// can contain varying types packed inside an interface.
+func (f *formatState) unpackValue(v reflect.Value) reflect.Value {
+	if v.Kind() == reflect.Interface {
+		f.ignoreNextType = false
+		if !v.IsNil() {
+			v = v.Elem()
+		}
+	}
+	return v
+}
+
+// formatPtr handles formatting of pointers by indirecting them as necessary.
+func (f *formatState) formatPtr(v reflect.Value) {
+	// Display nil if top level pointer is nil.
+	showTypes := f.fs.Flag('#')
+	if v.IsNil() && (!showTypes || f.ignoreNextType) {
+		f.fs.Write(nilAngleBytes)
+		return
+	}
+
+	// Remove pointers at or below the current depth from map used to detect
+	// circular refs.
+	for k, depth := range f.pointers {
+		if depth >= f.depth {
+			delete(f.pointers, k)
+		}
+	}
+
+	// Keep list of all dereferenced pointers to possibly show later.
+	pointerChain := make([]uintptr, 0)
+
+	// Figure out how many levels of indirection there are by derferencing
+	// pointers and unpacking interfaces down the chain while detecting circular
+	// references.
+	nilFound := false
+	cycleFound := false
+	indirects := 0
+	ve := v
+	for ve.Kind() == reflect.Ptr {
+		if ve.IsNil() {
+			nilFound = true
+			break
+		}
+		indirects++
+		addr := ve.Pointer()
+		pointerChain = append(pointerChain, addr)
+		if pd, ok := f.pointers[addr]; ok && pd < f.depth {
+			cycleFound = true
+			indirects--
+			break
+		}
+		f.pointers[addr] = f.depth
+
+		ve = ve.Elem()
+		if ve.Kind() == reflect.Interface {
+			if ve.IsNil() {
+				nilFound = true
+				break
+			}
+			ve = ve.Elem()
+		}
+	}
+
+	// Display type or indirection level depending on flags.
+	if showTypes && !f.ignoreNextType {
+		f.fs.Write(openParenBytes)
+		f.fs.Write(bytes.Repeat(asteriskBytes, indirects))
+		f.fs.Write([]byte(ve.Type().String()))
+		f.fs.Write(closeParenBytes)
+	} else {
+		if nilFound || cycleFound {
+			indirects += strings.Count(ve.Type().String(), "*")
+		}
+		f.fs.Write(openAngleBytes)
+		f.fs.Write([]byte(strings.Repeat("*", indirects)))
+		f.fs.Write(closeAngleBytes)
+	}
+
+	// Display pointer information depending on flags.
+	if f.fs.Flag('+') && (len(pointerChain) > 0) {
+		f.fs.Write(openParenBytes)
+		for i, addr := range pointerChain {
+			if i > 0 {
+				f.fs.Write(pointerChainBytes)
+			}
+			printHexPtr(f.fs, addr)
+		}
+		f.fs.Write(closeParenBytes)
+	}
+
+	// Display dereferenced value.
+	switch {
+	case nilFound:
+		f.fs.Write(nilAngleBytes)
+
+	case cycleFound:
+		f.fs.Write(circularShortBytes)
+
+	default:
+		f.ignoreNextType = true
+		f.format(ve)
+	}
+}
+
+// format is the main workhorse for providing the Formatter interface.  It
+// uses the passed reflect value to figure out what kind of object we are
+// dealing with and formats it appropriately.  It is a recursive function,
+// however circular data structures are detected and handled properly.
+func (f *formatState) format(v reflect.Value) {
+	// Handle invalid reflect values immediately.
+	kind := v.Kind()
+	if kind == reflect.Invalid {
+		f.fs.Write(invalidAngleBytes)
+		return
+	}
+
+	// Handle pointers specially.
+	if kind == reflect.Ptr {
+		f.formatPtr(v)
+		return
+	}
+
+	// Print type information unless already handled elsewhere.
+	if !f.ignoreNextType && f.fs.Flag('#') {
+		f.fs.Write(openParenBytes)
+		f.fs.Write([]byte(v.Type().String()))
+		f.fs.Write(closeParenBytes)
+	}
+	f.ignoreNextType = false
+
+	// Call Stringer/error interfaces if they exist and the handle methods
+	// flag is enabled.
+	if !f.cs.DisableMethods {
+		if (kind != reflect.Invalid) && (kind != reflect.Interface) {
+			if handled := handleMethods(f.cs, f.fs, v); handled {
+				return
+			}
+		}
+	}
+
+	switch kind {
+	case reflect.Invalid:
+		// Do nothing.  We should never get here since invalid has already
+		// been handled above.
+
+	case reflect.Bool:
+		printBool(f.fs, v.Bool())
+
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
+		printInt(f.fs, v.Int(), 10)
+
+	case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint:
+		printUint(f.fs, v.Uint(), 10)
+
+	case reflect.Float32:
+		printFloat(f.fs, v.Float(), 32)
+
+	case reflect.Float64:
+		printFloat(f.fs, v.Float(), 64)
+
+	case reflect.Complex64:
+		printComplex(f.fs, v.Complex(), 32)
+
+	case reflect.Complex128:
+		printComplex(f.fs, v.Complex(), 64)
+
+	case reflect.Slice:
+		if v.IsNil() {
+			f.fs.Write(nilAngleBytes)
+			break
+		}
+		fallthrough
+
+	case reflect.Array:
+		f.fs.Write(openBracketBytes)
+		f.depth++
+		if (f.cs.MaxDepth != 0) && (f.depth > f.cs.MaxDepth) {
+			f.fs.Write(maxShortBytes)
+		} else {
+			numEntries := v.Len()
+			for i := 0; i < numEntries; i++ {
+				if i > 0 {
+					f.fs.Write(spaceBytes)
+				}
+				f.ignoreNextType = true
+				f.format(f.unpackValue(v.Index(i)))
+			}
+		}
+		f.depth--
+		f.fs.Write(closeBracketBytes)
+
+	case reflect.String:
+		f.fs.Write([]byte(v.String()))
+
+	case reflect.Interface:
+		// The only time we should get here is for nil interfaces due to
+		// unpackValue calls.
+		if v.IsNil() {
+			f.fs.Write(nilAngleBytes)
+		}
+
+	case reflect.Ptr:
+		// Do nothing.  We should never get here since pointers have already
+		// been handled above.
+
+	case reflect.Map:
+		// nil maps should be indicated as different than empty maps
+		if v.IsNil() {
+			f.fs.Write(nilAngleBytes)
+			break
+		}
+
+		f.fs.Write(openMapBytes)
+		f.depth++
+		if (f.cs.MaxDepth != 0) && (f.depth > f.cs.MaxDepth) {
+			f.fs.Write(maxShortBytes)
+		} else {
+			keys := v.MapKeys()
+			if f.cs.SortKeys {
+				sortValues(keys, f.cs)
+			}
+			for i, key := range keys {
+				if i > 0 {
+					f.fs.Write(spaceBytes)
+				}
+				f.ignoreNextType = true
+				f.format(f.unpackValue(key))
+				f.fs.Write(colonBytes)
+				f.ignoreNextType = true
+				f.format(f.unpackValue(v.MapIndex(key)))
+			}
+		}
+		f.depth--
+		f.fs.Write(closeMapBytes)
+
+	case reflect.Struct:
+		numFields := v.NumField()
+		f.fs.Write(openBraceBytes)
+		f.depth++
+		if (f.cs.MaxDepth != 0) && (f.depth > f.cs.MaxDepth) {
+			f.fs.Write(maxShortBytes)
+		} else {
+			vt := v.Type()
+			for i := 0; i < numFields; i++ {
+				if i > 0 {
+					f.fs.Write(spaceBytes)
+				}
+				vtf := vt.Field(i)
+				if f.fs.Flag('+') || f.fs.Flag('#') {
+					f.fs.Write([]byte(vtf.Name))
+					f.fs.Write(colonBytes)
+				}
+				f.format(f.unpackValue(v.Field(i)))
+			}
+		}
+		f.depth--
+		f.fs.Write(closeBraceBytes)
+
+	case reflect.Uintptr:
+		printHexPtr(f.fs, uintptr(v.Uint()))
+
+	case reflect.UnsafePointer, reflect.Chan, reflect.Func:
+		printHexPtr(f.fs, v.Pointer())
+
+	// There were not any other types at the time this code was written, but
+	// fall back to letting the default fmt package handle it if any get added.
+	default:
+		format := f.buildDefaultFormat()
+		if v.CanInterface() {
+			fmt.Fprintf(f.fs, format, v.Interface())
+		} else {
+			fmt.Fprintf(f.fs, format, v.String())
+		}
+	}
+}
+
+// Format satisfies the fmt.Formatter interface. See NewFormatter for usage
+// details.
+func (f *formatState) Format(fs fmt.State, verb rune) {
+	f.fs = fs
+
+	// Use standard formatting for verbs that are not v.
+	if verb != 'v' {
+		format := f.constructOrigFormat(verb)
+		fmt.Fprintf(fs, format, f.value)
+		return
+	}
+
+	if f.value == nil {
+		if fs.Flag('#') {
+			fs.Write(interfaceBytes)
+		}
+		fs.Write(nilAngleBytes)
+		return
+	}
+
+	f.format(reflect.ValueOf(f.value))
+}
+
+// newFormatter is a helper function to consolidate the logic from the various
+// public methods which take varying config states.
+func newFormatter(cs *ConfigState, v interface{}) fmt.Formatter {
+	fs := &formatState{value: v, cs: cs}
+	fs.pointers = make(map[uintptr]int)
+	return fs
+}
+
+/*
+NewFormatter returns a custom formatter that satisfies the fmt.Formatter
+interface.  As a result, it integrates cleanly with standard fmt package
+printing functions.  The formatter is useful for inline printing of smaller data
+types similar to the standard %v format specifier.
+
+The custom formatter only responds to the %v (most compact), %+v (adds pointer
+addresses), %#v (adds types), or %#+v (adds types and pointer addresses) verb
+combinations.  Any other verbs such as %x and %q will be sent to the the
+standard fmt package for formatting.  In addition, the custom formatter ignores
+the width and precision arguments (however they will still work on the format
+specifiers not handled by the custom formatter).
+
+Typically this function shouldn't be called directly.  It is much easier to make
+use of the custom formatter by calling one of the convenience functions such as
+Printf, Println, or Fprintf.
+*/
+func NewFormatter(v interface{}) fmt.Formatter {
+	return newFormatter(&Config, v)
+}

diff --git a/vendor/github.com/davecgh/go-spew/spew/spew.go b/vendor/github.com/davecgh/go-spew/spew/spew.go
new file mode 100644
index 0000000..32c0e33
--- /dev/null
+++ b/vendor/github.com/davecgh/go-spew/spew/spew.go

@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2013-2016 Dave Collins <dave@davec.name>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+package spew
+
+import (
+	"fmt"
+	"io"
+)
+
+// Errorf is a wrapper for fmt.Errorf that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the formatted string as a value that satisfies error.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Errorf(format, spew.NewFormatter(a), spew.NewFormatter(b))
+func Errorf(format string, a ...interface{}) (err error) {
+	return fmt.Errorf(format, convertArgs(a)...)
+}
+
+// Fprint is a wrapper for fmt.Fprint that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprint(w, spew.NewFormatter(a), spew.NewFormatter(b))
+func Fprint(w io.Writer, a ...interface{}) (n int, err error) {
+	return fmt.Fprint(w, convertArgs(a)...)
+}
+
+// Fprintf is a wrapper for fmt.Fprintf that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprintf(w, format, spew.NewFormatter(a), spew.NewFormatter(b))
+func Fprintf(w io.Writer, format string, a ...interface{}) (n int, err error) {
+	return fmt.Fprintf(w, format, convertArgs(a)...)
+}
+
+// Fprintln is a wrapper for fmt.Fprintln that treats each argument as if it
+// passed with a default Formatter interface returned by NewFormatter.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Fprintln(w, spew.NewFormatter(a), spew.NewFormatter(b))
+func Fprintln(w io.Writer, a ...interface{}) (n int, err error) {
+	return fmt.Fprintln(w, convertArgs(a)...)
+}
+
+// Print is a wrapper for fmt.Print that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Print(spew.NewFormatter(a), spew.NewFormatter(b))
+func Print(a ...interface{}) (n int, err error) {
+	return fmt.Print(convertArgs(a)...)
+}
+
+// Printf is a wrapper for fmt.Printf that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Printf(format, spew.NewFormatter(a), spew.NewFormatter(b))
+func Printf(format string, a ...interface{}) (n int, err error) {
+	return fmt.Printf(format, convertArgs(a)...)
+}
+
+// Println is a wrapper for fmt.Println that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the number of bytes written and any write error encountered.  See
+// NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Println(spew.NewFormatter(a), spew.NewFormatter(b))
+func Println(a ...interface{}) (n int, err error) {
+	return fmt.Println(convertArgs(a)...)
+}
+
+// Sprint is a wrapper for fmt.Sprint that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprint(spew.NewFormatter(a), spew.NewFormatter(b))
+func Sprint(a ...interface{}) string {
+	return fmt.Sprint(convertArgs(a)...)
+}
+
+// Sprintf is a wrapper for fmt.Sprintf that treats each argument as if it were
+// passed with a default Formatter interface returned by NewFormatter.  It
+// returns the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprintf(format, spew.NewFormatter(a), spew.NewFormatter(b))
+func Sprintf(format string, a ...interface{}) string {
+	return fmt.Sprintf(format, convertArgs(a)...)
+}
+
+// Sprintln is a wrapper for fmt.Sprintln that treats each argument as if it
+// were passed with a default Formatter interface returned by NewFormatter.  It
+// returns the resulting string.  See NewFormatter for formatting details.
+//
+// This function is shorthand for the following syntax:
+//
+//	fmt.Sprintln(spew.NewFormatter(a), spew.NewFormatter(b))
+func Sprintln(a ...interface{}) string {
+	return fmt.Sprintln(convertArgs(a)...)
+}
+
+// convertArgs accepts a slice of arguments and returns a slice of the same
+// length with each argument converted to a default spew Formatter interface.
+func convertArgs(args []interface{}) (formatters []interface{}) {
+	formatters = make([]interface{}, len(args))
+	for index, arg := range args {
+		formatters[index] = NewFormatter(arg)
+	}
+	return formatters
+}

diff --git a/vendor/github.com/eapache/go-resiliency/LICENSE b/vendor/github.com/eapache/go-resiliency/LICENSE
new file mode 100644
index 0000000..698a3f5
--- /dev/null
+++ b/vendor/github.com/eapache/go-resiliency/LICENSE

@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Evan Huus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+

diff --git a/vendor/github.com/eapache/go-resiliency/breaker/README.md b/vendor/github.com/eapache/go-resiliency/breaker/README.md
new file mode 100644
index 0000000..2d1b3d9
--- /dev/null
+++ b/vendor/github.com/eapache/go-resiliency/breaker/README.md

@@ -0,0 +1,34 @@
+circuit-breaker
+===============
+
+[![Build Status](https://travis-ci.org/eapache/go-resiliency.svg?branch=master)](https://travis-ci.org/eapache/go-resiliency)
+[![GoDoc](https://godoc.org/github.com/eapache/go-resiliency/breaker?status.svg)](https://godoc.org/github.com/eapache/go-resiliency/breaker)
+[![Code of Conduct](https://img.shields.io/badge/code%20of%20conduct-active-blue.svg)](https://eapache.github.io/conduct.html)
+
+The circuit-breaker resiliency pattern for golang.
+
+Creating a breaker takes three parameters:
+- error threshold (for opening the breaker)
+- success threshold (for closing the breaker)
+- timeout (how long to keep the breaker open)
+
+```go
+b := breaker.New(3, 1, 5*time.Second)
+
+for {
+	result := b.Run(func() error {
+		// communicate with some external service and
+		// return an error if the communication failed
+		return nil
+	})
+
+	switch result {
+	case nil:
+		// success!
+	case breaker.ErrBreakerOpen:
+		// our function wasn't run because the breaker was open
+	default:
+		// some other error
+	}
+}
+```

diff --git a/vendor/github.com/eapache/go-resiliency/breaker/breaker.go b/vendor/github.com/eapache/go-resiliency/breaker/breaker.go
new file mode 100644
index 0000000..f88ca72
--- /dev/null
+++ b/vendor/github.com/eapache/go-resiliency/breaker/breaker.go

@@ -0,0 +1,161 @@
+// Package breaker implements the circuit-breaker resiliency pattern for Go.
+package breaker
+
+import (
+	"errors"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+// ErrBreakerOpen is the error returned from Run() when the function is not executed
+// because the breaker is currently open.
+var ErrBreakerOpen = errors.New("circuit breaker is open")
+
+const (
+	closed uint32 = iota
+	open
+	halfOpen
+)
+
+// Breaker implements the circuit-breaker resiliency pattern
+type Breaker struct {
+	errorThreshold, successThreshold int
+	timeout                          time.Duration
+
+	lock              sync.Mutex
+	state             uint32
+	errors, successes int
+	lastError         time.Time
+}
+
+// New constructs a new circuit-breaker that starts closed.
+// From closed, the breaker opens if "errorThreshold" errors are seen
+// without an error-free period of at least "timeout". From open, the
+// breaker half-closes after "timeout". From half-open, the breaker closes
+// after "successThreshold" consecutive successes, or opens on a single error.
+func New(errorThreshold, successThreshold int, timeout time.Duration) *Breaker {
+	return &Breaker{
+		errorThreshold:   errorThreshold,
+		successThreshold: successThreshold,
+		timeout:          timeout,
+	}
+}
+
+// Run will either return ErrBreakerOpen immediately if the circuit-breaker is
+// already open, or it will run the given function and pass along its return
+// value. It is safe to call Run concurrently on the same Breaker.
+func (b *Breaker) Run(work func() error) error {
+	state := atomic.LoadUint32(&b.state)
+
+	if state == open {
+		return ErrBreakerOpen
+	}
+
+	return b.doWork(state, work)
+}
+
+// Go will either return ErrBreakerOpen immediately if the circuit-breaker is
+// already open, or it will run the given function in a separate goroutine.
+// If the function is run, Go will return nil immediately, and will *not* return
+// the return value of the function. It is safe to call Go concurrently on the
+// same Breaker.
+func (b *Breaker) Go(work func() error) error {
+	state := atomic.LoadUint32(&b.state)
+
+	if state == open {
+		return ErrBreakerOpen
+	}
+
+	// errcheck complains about ignoring the error return value, but
+	// that's on purpose; if you want an error from a goroutine you have to
+	// get it over a channel or something
+	go b.doWork(state, work)
+
+	return nil
+}
+
+func (b *Breaker) doWork(state uint32, work func() error) error {
+	var panicValue interface{}
+
+	result := func() error {
+		defer func() {
+			panicValue = recover()
+		}()
+		return work()
+	}()
+
+	if result == nil && panicValue == nil && state == closed {
+		// short-circuit the normal, success path without contending
+		// on the lock
+		return nil
+	}
+
+	// oh well, I guess we have to contend on the lock
+	b.processResult(result, panicValue)
+
+	if panicValue != nil {
+		// as close as Go lets us come to a "rethrow" although unfortunately
+		// we lose the original panicing location
+		panic(panicValue)
+	}
+
+	return result
+}
+
+func (b *Breaker) processResult(result error, panicValue interface{}) {
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	if result == nil && panicValue == nil {
+		if b.state == halfOpen {
+			b.successes++
+			if b.successes == b.successThreshold {
+				b.closeBreaker()
+			}
+		}
+	} else {
+		if b.errors > 0 {
+			expiry := b.lastError.Add(b.timeout)
+			if time.Now().After(expiry) {
+				b.errors = 0
+			}
+		}
+
+		switch b.state {
+		case closed:
+			b.errors++
+			if b.errors == b.errorThreshold {
+				b.openBreaker()
+			} else {
+				b.lastError = time.Now()
+			}
+		case halfOpen:
+			b.openBreaker()
+		}
+	}
+}
+
+func (b *Breaker) openBreaker() {
+	b.changeState(open)
+	go b.timer()
+}
+
+func (b *Breaker) closeBreaker() {
+	b.changeState(closed)
+}
+
+func (b *Breaker) timer() {
+	time.Sleep(b.timeout)
+
+	b.lock.Lock()
+	defer b.lock.Unlock()
+
+	b.changeState(halfOpen)
+}
+
+func (b *Breaker) changeState(newState uint32) {
+	b.errors = 0
+	b.successes = 0
+	atomic.StoreUint32(&b.state, newState)
+}

diff --git a/vendor/github.com/eapache/go-xerial-snappy/.gitignore b/vendor/github.com/eapache/go-xerial-snappy/.gitignore
new file mode 100644
index 0000000..daf913b
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/.gitignore

@@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof

diff --git a/vendor/github.com/eapache/go-xerial-snappy/.travis.yml b/vendor/github.com/eapache/go-xerial-snappy/.travis.yml
new file mode 100644
index 0000000..d6cf4f1
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/.travis.yml

@@ -0,0 +1,7 @@
+language: go
+
+go:
+- 1.5.4
+- 1.6.1
+
+sudo: false

diff --git a/vendor/github.com/eapache/go-xerial-snappy/LICENSE b/vendor/github.com/eapache/go-xerial-snappy/LICENSE
new file mode 100644
index 0000000..5bf3688
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/LICENSE

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Evan Huus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

diff --git a/vendor/github.com/eapache/go-xerial-snappy/README.md b/vendor/github.com/eapache/go-xerial-snappy/README.md
new file mode 100644
index 0000000..3f2695c
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/README.md

@@ -0,0 +1,13 @@
+# go-xerial-snappy
+
+[![Build Status](https://travis-ci.org/eapache/go-xerial-snappy.svg?branch=master)](https://travis-ci.org/eapache/go-xerial-snappy)
+
+Xerial-compatible Snappy framing support for golang.
+
+Packages using Xerial for snappy encoding use a framing format incompatible with
+basically everything else in existence. This package wraps Go's built-in snappy
+package to support it.
+
+Apps that use this format include Apache Kafka (see
+https://github.com/dpkp/kafka-python/issues/126#issuecomment-35478921 for
+details).

diff --git a/vendor/github.com/eapache/go-xerial-snappy/fuzz.go b/vendor/github.com/eapache/go-xerial-snappy/fuzz.go
new file mode 100644
index 0000000..6a46f47
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/fuzz.go

@@ -0,0 +1,16 @@
+// +build gofuzz
+
+package snappy
+
+func Fuzz(data []byte) int {
+	decode, err := Decode(data)
+	if decode == nil && err == nil {
+		panic("nil error with nil result")
+	}
+
+	if err != nil {
+		return 0
+	}
+
+	return 1
+}

diff --git a/vendor/github.com/eapache/go-xerial-snappy/snappy.go b/vendor/github.com/eapache/go-xerial-snappy/snappy.go
new file mode 100644
index 0000000..ea8f7af
--- /dev/null
+++ b/vendor/github.com/eapache/go-xerial-snappy/snappy.go

@@ -0,0 +1,131 @@
+package snappy
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+
+	master "github.com/golang/snappy"
+)
+
+const (
+	sizeOffset = 16
+	sizeBytes  = 4
+)
+
+var (
+	xerialHeader = []byte{130, 83, 78, 65, 80, 80, 89, 0}
+
+	// This is xerial version 1 and minimally compatible with version 1
+	xerialVersionInfo = []byte{0, 0, 0, 1, 0, 0, 0, 1}
+
+	// ErrMalformed is returned by the decoder when the xerial framing
+	// is malformed
+	ErrMalformed = errors.New("malformed xerial framing")
+)
+
+func min(x, y int) int {
+    if x < y {
+        return x
+    }
+    return y
+}
+
+// Encode encodes data as snappy with no framing header.
+func Encode(src []byte) []byte {
+	return master.Encode(nil, src)
+}
+
+// EncodeStream *appends* to the specified 'dst' the compressed
+// 'src' in xerial framing format. If 'dst' does not have enough
+// capacity, then a new slice will be allocated. If 'dst' has
+// non-zero length, then if *must* have been built using this function.
+func EncodeStream(dst, src []byte) []byte {
+	if len(dst) == 0 {
+		dst = append(dst, xerialHeader...)
+		dst = append(dst, xerialVersionInfo...)
+	}
+
+	// Snappy encode in blocks of maximum 32KB
+	var (
+		max = len(src)
+		blockSize = 32 * 1024
+		pos   = 0
+		chunk []byte
+	)
+
+	for pos < max {
+		newPos := min(pos + blockSize, max)
+		chunk = master.Encode(chunk[:cap(chunk)], src[pos:newPos])
+
+		// First encode the compressed size (big-endian)
+		// Put* panics if the buffer is too small, so pad 4 bytes first
+		origLen := len(dst)
+		dst = append(dst, dst[0:4]...)
+		binary.BigEndian.PutUint32(dst[origLen:], uint32(len(chunk)))
+
+		// And now the compressed data
+		dst = append(dst, chunk...)
+		pos = newPos
+	}
+	return dst
+}
+
+// Decode decodes snappy data whether it is traditional unframed
+// or includes the xerial framing format.
+func Decode(src []byte) ([]byte, error) {
+	return DecodeInto(nil, src)
+}
+
+// DecodeInto decodes snappy data whether it is traditional unframed
+// or includes the xerial framing format into the specified `dst`.
+// It is assumed that the entirety of `dst` including all capacity is available
+// for use by this function. If `dst` is nil *or* insufficiently large to hold
+// the decoded `src`, new space will be allocated.
+func DecodeInto(dst, src []byte) ([]byte, error) {
+	var max = len(src)
+	if max < len(xerialHeader) {
+		return nil, ErrMalformed
+	}
+
+	if !bytes.Equal(src[:8], xerialHeader) {
+		return master.Decode(dst[:cap(dst)], src)
+	}
+
+	if max < sizeOffset+sizeBytes {
+		return nil, ErrMalformed
+	}
+
+	if dst == nil {
+		dst = make([]byte, 0, len(src))
+	}
+
+	dst = dst[:0]
+	var (
+		pos   = sizeOffset
+		chunk []byte
+		err       error
+	)
+
+	for pos+sizeBytes <= max {
+		size := int(binary.BigEndian.Uint32(src[pos : pos+sizeBytes]))
+		pos += sizeBytes
+
+		nextPos := pos + size
+		// On architectures where int is 32-bytes wide size + pos could
+		// overflow so we need to check the low bound as well as the
+		// high
+		if nextPos < pos || nextPos > max {
+			return nil, ErrMalformed
+		}
+
+		chunk, err = master.Decode(chunk[:cap(chunk)], src[pos:nextPos])
+
+		if err != nil {
+			return nil, err
+		}
+		pos = nextPos
+		dst = append(dst, chunk...)
+	}
+	return dst, nil
+}

diff --git a/vendor/github.com/eapache/queue/.gitignore b/vendor/github.com/eapache/queue/.gitignore
new file mode 100644
index 0000000..8365624
--- /dev/null
+++ b/vendor/github.com/eapache/queue/.gitignore

@@ -0,0 +1,23 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test

diff --git a/vendor/github.com/eapache/queue/.travis.yml b/vendor/github.com/eapache/queue/.travis.yml
new file mode 100644
index 0000000..235a40a
--- /dev/null
+++ b/vendor/github.com/eapache/queue/.travis.yml

@@ -0,0 +1,7 @@
+language: go
+sudo: false
+
+go:
+  - 1.2
+  - 1.3
+  - 1.4

diff --git a/vendor/github.com/eapache/queue/LICENSE b/vendor/github.com/eapache/queue/LICENSE
new file mode 100644
index 0000000..d5f36db
--- /dev/null
+++ b/vendor/github.com/eapache/queue/LICENSE

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Evan Huus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file

diff --git a/vendor/github.com/eapache/queue/README.md b/vendor/github.com/eapache/queue/README.md
new file mode 100644
index 0000000..8e78233
--- /dev/null
+++ b/vendor/github.com/eapache/queue/README.md

@@ -0,0 +1,16 @@
+Queue
+=====
+
+[![Build Status](https://travis-ci.org/eapache/queue.svg)](https://travis-ci.org/eapache/queue)
+[![GoDoc](https://godoc.org/github.com/eapache/queue?status.png)](https://godoc.org/github.com/eapache/queue)
+[![Code of Conduct](https://img.shields.io/badge/code%20of%20conduct-active-blue.svg)](https://eapache.github.io/conduct.html)
+
+A fast Golang queue using a ring-buffer, based on the version suggested by Dariusz Górecki.
+Using this instead of other, simpler, queue implementations (slice+append or linked list) provides
+substantial memory and time benefits, and fewer GC pauses.
+
+The queue implemented here is as fast as it is in part because it is *not* thread-safe.
+
+Follows semantic versioning using https://gopkg.in/ - import from
+[`gopkg.in/eapache/queue.v1`](https://gopkg.in/eapache/queue.v1)
+for guaranteed API stability.

diff --git a/vendor/github.com/eapache/queue/queue.go b/vendor/github.com/eapache/queue/queue.go
new file mode 100644
index 0000000..71d1acd
--- /dev/null
+++ b/vendor/github.com/eapache/queue/queue.go

@@ -0,0 +1,102 @@
+/*
+Package queue provides a fast, ring-buffer queue based on the version suggested by Dariusz Górecki.
+Using this instead of other, simpler, queue implementations (slice+append or linked list) provides
+substantial memory and time benefits, and fewer GC pauses.
+
+The queue implemented here is as fast as it is for an additional reason: it is *not* thread-safe.
+*/
+package queue
+
+// minQueueLen is smallest capacity that queue may have.
+// Must be power of 2 for bitwise modulus: x % n == x & (n - 1).
+const minQueueLen = 16
+
+// Queue represents a single instance of the queue data structure.
+type Queue struct {
+	buf               []interface{}
+	head, tail, count int
+}
+
+// New constructs and returns a new Queue.
+func New() *Queue {
+	return &Queue{
+		buf: make([]interface{}, minQueueLen),
+	}
+}
+
+// Length returns the number of elements currently stored in the queue.
+func (q *Queue) Length() int {
+	return q.count
+}
+
+// resizes the queue to fit exactly twice its current contents
+// this can result in shrinking if the queue is less than half-full
+func (q *Queue) resize() {
+	newBuf := make([]interface{}, q.count<<1)
+
+	if q.tail > q.head {
+		copy(newBuf, q.buf[q.head:q.tail])
+	} else {
+		n := copy(newBuf, q.buf[q.head:])
+		copy(newBuf[n:], q.buf[:q.tail])
+	}
+
+	q.head = 0
+	q.tail = q.count
+	q.buf = newBuf
+}
+
+// Add puts an element on the end of the queue.
+func (q *Queue) Add(elem interface{}) {
+	if q.count == len(q.buf) {
+		q.resize()
+	}
+
+	q.buf[q.tail] = elem
+	// bitwise modulus
+	q.tail = (q.tail + 1) & (len(q.buf) - 1)
+	q.count++
+}
+
+// Peek returns the element at the head of the queue. This call panics
+// if the queue is empty.
+func (q *Queue) Peek() interface{} {
+	if q.count <= 0 {
+		panic("queue: Peek() called on empty queue")
+	}
+	return q.buf[q.head]
+}
+
+// Get returns the element at index i in the queue. If the index is
+// invalid, the call will panic. This method accepts both positive and
+// negative index values. Index 0 refers to the first element, and
+// index -1 refers to the last.
+func (q *Queue) Get(i int) interface{} {
+	// If indexing backwards, convert to positive index.
+	if i < 0 {
+		i += q.count
+	}
+	if i < 0 || i >= q.count {
+		panic("queue: Get() called with index out of range")
+	}
+	// bitwise modulus
+	return q.buf[(q.head+i)&(len(q.buf)-1)]
+}
+
+// Remove removes and returns the element from the front of the queue. If the
+// queue is empty, the call will panic.
+func (q *Queue) Remove() interface{} {
+	if q.count <= 0 {
+		panic("queue: Remove() called on empty queue")
+	}
+	ret := q.buf[q.head]
+	q.buf[q.head] = nil
+	// bitwise modulus
+	q.head = (q.head + 1) & (len(q.buf) - 1)
+	q.count--
+	// Resize down if buffer 1/4 full.
+	if len(q.buf) > minQueueLen && (q.count<<2) == len(q.buf) {
+		q.resize()
+	}
+	return ret
+}

diff --git a/vendor/github.com/gfremex/logrus-kafka-hook/README.md b/vendor/github.com/gfremex/logrus-kafka-hook/README.md
new file mode 100644
index 0000000..d03739d
--- /dev/null
+++ b/vendor/github.com/gfremex/logrus-kafka-hook/README.md

@@ -0,0 +1,81 @@
+## logrus-kafka-hook
+
+
+A [logrus.Hook](https://godoc.org/github.com/sirupsen/logrus#Hook) which sends a single
+log entry to multiple kafka topics simultaneously.
+
+## How to use
+
+### Import package
+
+```Go
+import lkh "github.com/gfremex/logrus-kafka-hook"
+```
+
+### Create a hook (KafkaHook)
+
+```Go
+NewKafkaHook(id string, levels []logrus.Level, formatter logrus.Formatter, brokers []string) (*KafkaHook, error)
+```
+
+- id: Hook Id
+- levels: [logrus.Levels](https://godoc.org/github.com/sirupsen/logrus#Level) supported by the hook
+- formatter: [logrus.Formatter](https://godoc.org/github.com/sirupsen/logrus#Formatter) used by the hook
+- brokers: Kafka brokers
+
+For example:
+
+```Go
+hook, err := lkh.NewKafkaHook(
+		"kh",
+		[]logrus.Level{logrus.InfoLevel, logrus.WarnLevel, logrus.ErrorLevel},
+		&logrus.JSONFormatter{},
+		[]string{"192.168.60.5:9092", "192.168.60.6:9092", "192.168.60.7:9092"},
+	)
+```
+
+### Create a [logrus.Logger](https://godoc.org/github.com/sirupsen/logrus#Logger)
+
+For example:
+
+```Go
+logger := logrus.New()
+```
+
+### Add hook to logger
+
+```Go
+logger.Hooks.Add(hook)
+```
+
+### Add topics
+
+```Go
+l := logger.WithField("topics", []string{"topic_1", "topic_2", "topic_3"})
+```
+
+The field name must be ***topics***.
+
+If only one topic needed, then
+
+```Go
+l := logger.WithField("topics", []string{"topic_1"})
+```
+
+### Send messages to logger
+
+For example:
+
+```Go
+l.Debug("This must not be logged")
+
+l.Info("This is an Info msg")
+
+l.Warn("This is a Warn msg")
+
+l.Error("This is an Error msg")
+```
+
+#### Complete examples
+
+[https://github.com/gfremex/logrus-kafka-hook/tree/master/examples](https://github.com/gfremex/logrus-kafka-hook/tree/master/examples)

diff --git a/vendor/github.com/gfremex/logrus-kafka-hook/kafka-hook.go b/vendor/github.com/gfremex/logrus-kafka-hook/kafka-hook.go
new file mode 100644
index 0000000..2684dda
--- /dev/null
+++ b/vendor/github.com/gfremex/logrus-kafka-hook/kafka-hook.go

@@ -0,0 +1,109 @@
+package logrus_kafka_hook
+
+import (
+	"errors"
+	"github.com/Shopify/sarama"
+	"github.com/sirupsen/logrus"
+	"log"
+	"time"
+)
+
+type KafkaHook struct {
+	// Id of the hook
+	id string
+
+	// Log levels allowed
+	levels []logrus.Level
+
+	// Log entry formatter
+	formatter logrus.Formatter
+
+	// sarama.AsyncProducer
+	producer sarama.AsyncProducer
+}
+
+// Create a new KafkaHook.
+func NewKafkaHook(id string, levels []logrus.Level, formatter logrus.Formatter, brokers []string) (*KafkaHook, error) {
+	kafkaConfig := sarama.NewConfig()
+	kafkaConfig.Producer.RequiredAcks = sarama.WaitForLocal       // Only wait for the leader to ack
+	kafkaConfig.Producer.Compression = sarama.CompressionSnappy   // Compress messages
+	kafkaConfig.Producer.Flush.Frequency = 500 * time.Millisecond // Flush batches every 500ms
+
+	producer, err := sarama.NewAsyncProducer(brokers, kafkaConfig)
+
+	if err != nil {
+		return nil, err
+	}
+
+	// We will just log to STDOUT if we're not able to produce messages.
+	// Note: messages will only be returned here after all retry attempts are exhausted.
+	go func() {
+		for err := range producer.Errors() {
+			log.Printf("Failed to send log entry to kafka: %v\n", err)
+		}
+	}()
+
+	hook := &KafkaHook{
+		id,
+		levels,
+		formatter,
+		producer,
+	}
+
+	return hook, nil
+}
+
+func (hook *KafkaHook) Id() string {
+	return hook.id
+}
+
+func (hook *KafkaHook) Levels() []logrus.Level {
+	return hook.levels
+}
+
+func (hook *KafkaHook) Fire(entry *logrus.Entry) error {
+	// Check time for partition key
+	var partitionKey sarama.ByteEncoder
+
+	// Get field time
+	t, _ := entry.Data["time"].(time.Time)
+
+	// Convert it to bytes
+	b, err := t.MarshalBinary()
+
+	if err != nil {
+		return err
+	}
+
+	partitionKey = sarama.ByteEncoder(b)
+
+	// Check topics
+	var topics []string
+
+	if ts, ok := entry.Data["topics"]; ok {
+		if topics, ok = ts.([]string); !ok {
+			return errors.New("Field topics must be []string")
+		}
+	} else {
+		return errors.New("Field topics not found")
+	}
+
+	// Format before writing
+	b, err = hook.formatter.Format(entry)
+
+	if err != nil {
+		return err
+	}
+
+	value := sarama.ByteEncoder(b)
+
+	for _, topic := range topics {
+		hook.producer.Input() <- &sarama.ProducerMessage{
+			Key:   partitionKey,
+			Topic: topic,
+			Value: value,
+		}
+	}
+
+	return nil
+}

diff --git a/vendor/github.com/golang/snappy/.gitignore b/vendor/github.com/golang/snappy/.gitignore
new file mode 100644
index 0000000..042091d
--- /dev/null
+++ b/vendor/github.com/golang/snappy/.gitignore

@@ -0,0 +1,16 @@
+cmd/snappytool/snappytool
+testdata/bench
+
+# These explicitly listed benchmark data files are for an obsolete version of
+# snappy_test.go.
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K

diff --git a/vendor/github.com/golang/snappy/AUTHORS b/vendor/github.com/golang/snappy/AUTHORS
new file mode 100644
index 0000000..bcfa195
--- /dev/null
+++ b/vendor/github.com/golang/snappy/AUTHORS

@@ -0,0 +1,15 @@
+# This is the official list of Snappy-Go authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Google Inc.
+Jan Mercl <0xjnml@gmail.com>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>

diff --git a/vendor/github.com/golang/snappy/CONTRIBUTORS b/vendor/github.com/golang/snappy/CONTRIBUTORS
new file mode 100644
index 0000000..931ae31
--- /dev/null
+++ b/vendor/github.com/golang/snappy/CONTRIBUTORS

@@ -0,0 +1,37 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the Snappy-Go repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Jan Mercl <0xjnml@gmail.com>
+Kai Backman <kaib@golang.org>
+Marc-Antoine Ruel <maruel@chromium.org>
+Nigel Tao <nigeltao@golang.org>
+Rob Pike <r@golang.org>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Russ Cox <rsc@golang.org>
+Sebastien Binet <seb.binet@gmail.com>

diff --git a/vendor/github.com/golang/snappy/LICENSE b/vendor/github.com/golang/snappy/LICENSE
new file mode 100644
index 0000000..6050c10
--- /dev/null
+++ b/vendor/github.com/golang/snappy/LICENSE

@@ -0,0 +1,27 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

diff --git a/vendor/github.com/golang/snappy/README b/vendor/github.com/golang/snappy/README
new file mode 100644
index 0000000..cea1287
--- /dev/null
+++ b/vendor/github.com/golang/snappy/README

@@ -0,0 +1,107 @@
+The Snappy compression format in the Go programming language.
+
+To download and install from source:
+$ go get github.com/golang/snappy
+
+Unless otherwise noted, the Snappy-Go source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+
+
+Benchmarks.
+
+The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
+or so files, the same set used by the C++ Snappy code (github.com/google/snappy
+and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
+3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
+
+"go test -test.bench=."
+
+_UFlat0-8         2.19GB/s ± 0%  html
+_UFlat1-8         1.41GB/s ± 0%  urls
+_UFlat2-8         23.5GB/s ± 2%  jpg
+_UFlat3-8         1.91GB/s ± 0%  jpg_200
+_UFlat4-8         14.0GB/s ± 1%  pdf
+_UFlat5-8         1.97GB/s ± 0%  html4
+_UFlat6-8          814MB/s ± 0%  txt1
+_UFlat7-8          785MB/s ± 0%  txt2
+_UFlat8-8          857MB/s ± 0%  txt3
+_UFlat9-8          719MB/s ± 1%  txt4
+_UFlat10-8        2.84GB/s ± 0%  pb
+_UFlat11-8        1.05GB/s ± 0%  gaviota
+
+_ZFlat0-8         1.04GB/s ± 0%  html
+_ZFlat1-8          534MB/s ± 0%  urls
+_ZFlat2-8         15.7GB/s ± 1%  jpg
+_ZFlat3-8          740MB/s ± 3%  jpg_200
+_ZFlat4-8         9.20GB/s ± 1%  pdf
+_ZFlat5-8          991MB/s ± 0%  html4
+_ZFlat6-8          379MB/s ± 0%  txt1
+_ZFlat7-8          352MB/s ± 0%  txt2
+_ZFlat8-8          396MB/s ± 1%  txt3
+_ZFlat9-8          327MB/s ± 1%  txt4
+_ZFlat10-8        1.33GB/s ± 1%  pb
+_ZFlat11-8         605MB/s ± 1%  gaviota
+
+
+
+"go test -test.bench=. -tags=noasm"
+
+_UFlat0-8          621MB/s ± 2%  html
+_UFlat1-8          494MB/s ± 1%  urls
+_UFlat2-8         23.2GB/s ± 1%  jpg
+_UFlat3-8         1.12GB/s ± 1%  jpg_200
+_UFlat4-8         4.35GB/s ± 1%  pdf
+_UFlat5-8          609MB/s ± 0%  html4
+_UFlat6-8          296MB/s ± 0%  txt1
+_UFlat7-8          288MB/s ± 0%  txt2
+_UFlat8-8          309MB/s ± 1%  txt3
+_UFlat9-8          280MB/s ± 1%  txt4
+_UFlat10-8         753MB/s ± 0%  pb
+_UFlat11-8         400MB/s ± 0%  gaviota
+
+_ZFlat0-8          409MB/s ± 1%  html
+_ZFlat1-8          250MB/s ± 1%  urls
+_ZFlat2-8         12.3GB/s ± 1%  jpg
+_ZFlat3-8          132MB/s ± 0%  jpg_200
+_ZFlat4-8         2.92GB/s ± 0%  pdf
+_ZFlat5-8          405MB/s ± 1%  html4
+_ZFlat6-8          179MB/s ± 1%  txt1
+_ZFlat7-8          170MB/s ± 1%  txt2
+_ZFlat8-8          189MB/s ± 1%  txt3
+_ZFlat9-8          164MB/s ± 1%  txt4
+_ZFlat10-8         479MB/s ± 1%  pb
+_ZFlat11-8         270MB/s ± 1%  gaviota
+
+
+
+For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
+are the numbers from C++ Snappy's
+
+make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
+
+BM_UFlat/0     2.4GB/s  html
+BM_UFlat/1     1.4GB/s  urls
+BM_UFlat/2    21.8GB/s  jpg
+BM_UFlat/3     1.5GB/s  jpg_200
+BM_UFlat/4    13.3GB/s  pdf
+BM_UFlat/5     2.1GB/s  html4
+BM_UFlat/6     1.0GB/s  txt1
+BM_UFlat/7   959.4MB/s  txt2
+BM_UFlat/8     1.0GB/s  txt3
+BM_UFlat/9   864.5MB/s  txt4
+BM_UFlat/10    2.9GB/s  pb
+BM_UFlat/11    1.2GB/s  gaviota
+
+BM_ZFlat/0   944.3MB/s  html (22.31 %)
+BM_ZFlat/1   501.6MB/s  urls (47.78 %)
+BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
+BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
+BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
+BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
+BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
+BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
+BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
+BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
+BM_ZFlat/10    1.2GB/s  pb (19.68 %)
+BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)

diff --git a/vendor/github.com/golang/snappy/decode.go b/vendor/github.com/golang/snappy/decode.go
new file mode 100644
index 0000000..72efb03
--- /dev/null
+++ b/vendor/github.com/golang/snappy/decode.go

@@ -0,0 +1,237 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid.
+	ErrCorrupt = errors.New("snappy: corrupt input")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("snappy: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported.
+	ErrUnsupported = errors.New("snappy: unsupported input")
+
+	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
+)
+
+// DecodedLen returns the length of the decoded block.
+func DecodedLen(src []byte) (int, error) {
+	v, _, err := decodedLen(src)
+	return v, err
+}
+
+// decodedLen returns the length of the decoded block and the number of bytes
+// that the length header occupied.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	v, n := binary.Uvarint(src)
+	if n <= 0 || v > 0xffffffff {
+		return 0, 0, ErrCorrupt
+	}
+
+	const wordSize = 32 << (^uint(0) >> 32 & 1)
+	if wordSize == 32 && v > 0x7fffffff {
+		return 0, 0, ErrTooLarge
+	}
+	return int(v), n, nil
+}
+
+const (
+	decodeErrCodeCorrupt                  = 1
+	decodeErrCodeUnsupportedLiteralLength = 2
+)
+
+// Decode returns the decoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire decoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Decode(dst, src []byte) ([]byte, error) {
+	dLen, s, err := decodedLen(src)
+	if err != nil {
+		return nil, err
+	}
+	if dLen <= len(dst) {
+		dst = dst[:dLen]
+	} else {
+		dst = make([]byte, dLen)
+	}
+	switch decode(dst, src[s:]) {
+	case 0:
+		return dst, nil
+	case decodeErrCodeUnsupportedLiteralLength:
+		return nil, errUnsupportedLiteralLength
+	}
+	return nil, ErrCorrupt
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func NewReader(r io.Reader) *Reader {
+	return &Reader{
+		r:       r,
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes.
+type Reader struct {
+	r       io.Reader
+	err     error
+	decoded []byte
+	buf     []byte
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j       int
+	readHeader bool
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from r. This permits reusing a Reader rather than allocating
+// a new one.
+func (r *Reader) Reset(reader io.Reader) {
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.readHeader = false
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+		}
+		return false
+	}
+	return true
+}
+
+// Read satisfies the io.Reader interface.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+	for {
+		if r.i < r.j {
+			n := copy(p, r.decoded[r.i:r.j])
+			r.i += n
+			return n, nil
+		}
+		if !r.readFull(r.buf[:4], true) {
+			return 0, r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+		if chunkLen > len(r.buf) {
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[checksumSize:]
+
+			n, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if _, err := Decode(r.decoded, buf); err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n := chunkLen - checksumSize
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			for i := 0; i < len(magicBody); i++ {
+				if r.buf[i] != magicBody[i] {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.readFull(r.buf[:chunkLen], false) {
+			return 0, r.err
+		}
+	}
+}

diff --git a/vendor/github.com/golang/snappy/decode_amd64.go b/vendor/github.com/golang/snappy/decode_amd64.go
new file mode 100644
index 0000000..fcd192b
--- /dev/null
+++ b/vendor/github.com/golang/snappy/decode_amd64.go

@@ -0,0 +1,14 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// decode has the same semantics as in decode_other.go.
+//
+//go:noescape
+func decode(dst, src []byte) int

diff --git a/vendor/github.com/golang/snappy/decode_amd64.s b/vendor/github.com/golang/snappy/decode_amd64.s
new file mode 100644
index 0000000..e6179f6
--- /dev/null
+++ b/vendor/github.com/golang/snappy/decode_amd64.s

@@ -0,0 +1,490 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- AX	scratch
+//	- BX	scratch
+//	- CX	length or x
+//	- DX	offset
+//	- SI	&src[s]
+//	- DI	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy
+//	- R15	used by doCopy
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
+// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
+TEXT ·decode(SB), NOSPLIT, $48-56
+	// Initialize SI, DI and R8-R13.
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, DI
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, SI
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+loop:
+	// for s < len(src)
+	CMPQ SI, R13
+	JEQ  end
+
+	// CX = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (SI), CX
+	MOVL    CX, BX
+	ANDL    $3, BX
+	CMPL    BX, $1
+	JAE     tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, CX
+	CMPL CX, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ SI
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// CX can hold 64 bits, so the increment cannot overflow.
+	INCQ CX
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// AX = len(dst) - d
+	// BX = len(src) - s
+	MOVQ R10, AX
+	SUBQ DI, AX
+	MOVQ R13, BX
+	SUBQ SI, BX
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ CX, AX
+	JGT  errCorrupt
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ CX, SI
+	SUBQ $58, SI
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// case x == 60:
+	CMPL CX, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(SI), CX
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(SI), CX
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL CX, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVWLZX -3(SI), CX
+	MOVBLZX -1(SI), BX
+	SHLL    $16, BX
+	ORL     BX, CX
+	JMP     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(SI), CX
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(SI), DX
+	JMP     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVWQZX -2(SI), DX
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- BX == src[s] & 0x03
+	//	- CX == src[s]
+	CMPQ BX, $2
+	JEQ  tagCopy2
+	JA   tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVQ    CX, DX
+	ANDQ    $0xe0, DX
+	SHLQ    $3, DX
+	MOVBQZX -1(SI), BX
+	ORQ     BX, DX
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	SHRQ $2, CX
+	ANDQ $7, CX
+	ADDQ $4, CX
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- CX == length && CX > 0
+	//	- DX == offset
+
+	// if offset <= 0 { etc }
+	CMPQ DX, $0
+	JLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVQ DI, BX
+	SUBQ R8, BX
+	CMPQ BX, DX
+	JLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R10, BX
+	SUBQ DI, BX
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
+	MOVQ DI, R15
+	SUBQ DX, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  slowForwardCopy
+	CMPQ DX, $8
+	JLT  slowForwardCopy
+	CMPQ R14, $16
+	JLT  slowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), BX
+	MOVB BX, (DI)
+	INCQ R15
+	INCQ DI
+	DECQ CX
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ DI, R10
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVQ $1, ret+48(FP)
+	RET

diff --git a/vendor/github.com/golang/snappy/decode_other.go b/vendor/github.com/golang/snappy/decode_other.go
new file mode 100644
index 0000000..8c9f204
--- /dev/null
+++ b/vendor/github.com/golang/snappy/decode_other.go

@@ -0,0 +1,101 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+// decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func decode(dst, src []byte) int {
+	var d, s, offset, length int
+	for s < len(src) {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			length = int(x) + 1
+			if length <= 0 {
+				return decodeErrCodeUnsupportedLiteralLength
+			}
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7
+			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
+		// the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		for end := d + length; d != end; d++ {
+			dst[d] = dst[d-offset]
+		}
+	}
+	if d != len(dst) {
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}

diff --git a/vendor/github.com/golang/snappy/encode.go b/vendor/github.com/golang/snappy/encode.go
new file mode 100644
index 0000000..8d393e9
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode.go

@@ -0,0 +1,285 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// Encode returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Encode(dst, src []byte) []byte {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		panic(ErrTooLarge)
+	} else if len(dst) < n {
+		dst = make([]byte, n)
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d := binary.PutUvarint(dst, uint64(len(src)))
+
+	for len(src) > 0 {
+		p := src
+		src = nil
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
+		}
+		if len(p) < minNonLiteralBlockSize {
+			d += emitLiteral(dst[d:], p)
+		} else {
+			d += encodeBlock(dst[d:], p)
+		}
+	}
+	return dst[:d]
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+const inputMargin = 16 - 1
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// could be encoded with a copy tag. This is the minimum with respect to the
+// algorithm used by encodeBlock, not a minimum enforced by the file format.
+//
+// The encoded output must start with at least a 1 byte literal, as there are
+// no previous bytes to copy. A minimal (1 byte) copy after that, generated
+// from an emitCopy call in encodeBlock's main loop, would require at least
+// another inputMargin bytes, for the reason above: we want any emitLiteral
+// calls inside encodeBlock's main loop to use the fast path if possible, which
+// requires being able to overrun by inputMargin bytes. Thus,
+// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
+//
+// The C++ code doesn't use this exact threshold, but it could, as discussed at
+// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
+// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
+// optimization. It should not affect the encoded form. This is tested by
+// TestSameEncodingAsCppShortCopies.
+const minNonLiteralBlockSize = 1 + 1 + inputMargin
+
+// MaxEncodedLen returns the maximum length of a snappy block, given its
+// uncompressed length.
+//
+// It will return a negative value if srcLen is too large to encode.
+func MaxEncodedLen(srcLen int) int {
+	n := uint64(srcLen)
+	if n > 0xffffffff {
+		return -1
+	}
+	// Compressed data can be defined as:
+	//    compressed := item* literal*
+	//    item       := literal* copy
+	//
+	// The trailing literal sequence has a space blowup of at most 62/60
+	// since a literal of length 60 needs one tag byte + one extra byte
+	// for length information.
+	//
+	// Item blowup is trickier to measure. Suppose the "copy" op copies
+	// 4 bytes of data. Because of a special check in the encoding code,
+	// we produce a 4-byte copy only if the offset is < 65536. Therefore
+	// the copy op takes 3 bytes to encode, and this type of item leads
+	// to at most the 62/60 blowup for representing literals.
+	//
+	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
+	// enough, it will take 5 bytes to encode the copy op. Therefore the
+	// worst case here is a one-byte literal followed by a five-byte copy.
+	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
+	//
+	// This last factor dominates the blowup, so the final estimate is:
+	n = 32 + n + n/6
+	if n > 0xffffffff {
+		return -1
+	}
+	return int(n)
+}
+
+var errClosed = errors.New("snappy: Writer is closed")
+
+// NewWriter returns a new Writer that compresses to w.
+//
+// The Writer returned does not buffer writes. There is no need to Flush or
+// Close such a Writer.
+//
+// Deprecated: the Writer returned is not suitable for many small writes, only
+// for few large writes. Use NewBufferedWriter instead, which is efficient
+// regardless of the frequency and shape of the writes, and remember to Close
+// that Writer when done.
+func NewWriter(w io.Writer) *Writer {
+	return &Writer{
+		w:    w,
+		obuf: make([]byte, obufLen),
+	}
+}
+
+// NewBufferedWriter returns a new Writer that compresses to w, using the
+// framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// The Writer returned buffers writes. Users must call Close to guarantee all
+// data has been forwarded to the underlying io.Writer. They may also call
+// Flush zero or more times before calling Close.
+func NewBufferedWriter(w io.Writer) *Writer {
+	return &Writer{
+		w:    w,
+		ibuf: make([]byte, 0, maxBlockSize),
+		obuf: make([]byte, obufLen),
+	}
+}
+
+// Writer is an io.Writer that can write Snappy-compressed bytes.
+type Writer struct {
+	w   io.Writer
+	err error
+
+	// ibuf is a buffer for the incoming (uncompressed) bytes.
+	//
+	// Its use is optional. For backwards compatibility, Writers created by the
+	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
+	// therefore do not need to be Flush'ed or Close'd.
+	ibuf []byte
+
+	// obuf is a buffer for the outgoing (compressed) bytes.
+	obuf []byte
+
+	// wroteStreamHeader is whether we have written the stream header.
+	wroteStreamHeader bool
+}
+
+// Reset discards the writer's state and switches the Snappy writer to write to
+// w. This permits reusing a Writer rather than allocating a new one.
+func (w *Writer) Reset(writer io.Writer) {
+	w.w = writer
+	w.err = nil
+	if w.ibuf != nil {
+		w.ibuf = w.ibuf[:0]
+	}
+	w.wroteStreamHeader = false
+}
+
+// Write satisfies the io.Writer interface.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if w.ibuf == nil {
+		// Do not buffer incoming bytes. This does not perform or compress well
+		// if the caller of Writer.Write writes many small slices. This
+		// behavior is therefore deprecated, but still supported for backwards
+		// compatibility with code that doesn't explicitly Flush or Close.
+		return w.write(p)
+	}
+
+	// The remainder of this method is based on bufio.Writer.Write from the
+	// standard library.
+
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
+		var n int
+		if len(w.ibuf) == 0 {
+			// Large write, empty buffer.
+			// Write directly from p to avoid copy.
+			n, _ = w.write(p)
+		} else {
+			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+			w.ibuf = w.ibuf[:len(w.ibuf)+n]
+			w.Flush()
+		}
+		nRet += n
+		p = p[n:]
+	}
+	if w.err != nil {
+		return nRet, w.err
+	}
+	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+	w.ibuf = w.ibuf[:len(w.ibuf)+n]
+	nRet += n
+	return nRet, nil
+}
+
+func (w *Writer) write(p []byte) (nRet int, errRet error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	for len(p) > 0 {
+		obufStart := len(magicChunk)
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			copy(w.obuf, magicChunk)
+			obufStart = 0
+		}
+
+		var uncompressed []byte
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+		checksum := crc(uncompressed)
+
+		// Compress the buffer, discarding the result if the improvement
+		// isn't at least 12.5%.
+		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
+		chunkType := uint8(chunkTypeCompressedData)
+		chunkLen := 4 + len(compressed)
+		obufEnd := obufHeaderLen + len(compressed)
+		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
+			chunkType = chunkTypeUncompressedData
+			chunkLen = 4 + len(uncompressed)
+			obufEnd = obufHeaderLen
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		w.obuf[len(magicChunk)+0] = chunkType
+		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
+		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
+		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
+		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
+		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
+		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
+		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
+
+		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
+			w.err = err
+			return nRet, err
+		}
+		if chunkType == chunkTypeUncompressedData {
+			if _, err := w.w.Write(uncompressed); err != nil {
+				w.err = err
+				return nRet, err
+			}
+		}
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+func (w *Writer) Flush() error {
+	if w.err != nil {
+		return w.err
+	}
+	if len(w.ibuf) == 0 {
+		return nil
+	}
+	w.write(w.ibuf)
+	w.ibuf = w.ibuf[:0]
+	return w.err
+}
+
+// Close calls Flush and then closes the Writer.
+func (w *Writer) Close() error {
+	w.Flush()
+	ret := w.err
+	if w.err == nil {
+		w.err = errClosed
+	}
+	return ret
+}

diff --git a/vendor/github.com/golang/snappy/encode_amd64.go b/vendor/github.com/golang/snappy/encode_amd64.go
new file mode 100644
index 0000000..150d91b
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode_amd64.go

@@ -0,0 +1,29 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// emitLiteral has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
+// emitCopy has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
+// extendMatch has the same semantics as in encode_other.go.
+//
+//go:noescape
+func extendMatch(src []byte, i, j int) int
+
+// encodeBlock has the same semantics as in encode_other.go.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)

diff --git a/vendor/github.com/golang/snappy/encode_amd64.s b/vendor/github.com/golang/snappy/encode_amd64.s
new file mode 100644
index 0000000..adfd979
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode_amd64.s

@@ -0,0 +1,730 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
+// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
+// https://github.com/golang/snappy/issues/29
+//
+// As a workaround, the package was built with a known good assembler, and
+// those instructions were disassembled by "objdump -d" to yield the
+//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+// style comments, in AT&T asm syntax. Note that rsp here is a physical
+// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
+// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
+// fine on Go 1.6.
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	len(lit)
+//	- BX	n
+//	- DX	return value
+//	- DI	&dst[i]
+//	- R10	&lit[0]
+//
+// The 24 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	MOVQ AX, DX
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  oneByte
+	CMPL BX, $256
+	JLT  twoBytes
+
+threeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	ADDQ $3, DX
+	JMP  memmove
+
+twoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	ADDQ $2, DX
+	JMP  memmove
+
+oneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+	ADDQ $1, DX
+
+memmove:
+	MOVQ DX, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	length
+//	- SI	&dst[0]
+//	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, SI
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL AX, $12
+	JGE  step3
+	CMPL R11, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	ADDQ DX, R14
+	ADDQ DX, R15
+	ADDQ DX, SI
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   cmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  extendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  extendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
+	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
+	// 2048 writes that would zero-initialize all of table's 32768 bytes.
+	SHRQ $3, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL runtime·memmove(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+	JMP  inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVQ src_len+32(FP), R14
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADDQ $4, R15
+	ADDQ DX, R15
+
+	// !!! s += 4
+	ADDQ $4, SI
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVQ SI, AX
+	SUBQ R12, AX
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[currHash] = uint16(s)
+	ADDQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+encodeBlockEnd:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET

diff --git a/vendor/github.com/golang/snappy/encode_other.go b/vendor/github.com/golang/snappy/encode_other.go
new file mode 100644
index 0000000..dbcae90
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode_other.go

@@ -0,0 +1,238 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	i, n := 0, uint(len(lit)-1)
+	switch {
+	case n < 60:
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8:
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		i = 2
+	default:
+		dst[0] = 61<<2 | tagLiteral
+		dst[1] = uint8(n)
+		dst[2] = uint8(n >> 8)
+		i = 3
+	}
+	return i + copy(dst[i:], lit)
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	i := 0
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes.
+		dst[i+0] = 63<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 64
+	}
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes.
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	dst[i+1] = uint8(offset)
+	return i + 2
+}
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents.
+//
+// It assumes that:
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
+	}
+	return j
+}
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}

diff --git a/vendor/github.com/golang/snappy/snappy.go b/vendor/github.com/golang/snappy/snappy.go
new file mode 100644
index 0000000..ece692e
--- /dev/null
+++ b/vendor/github.com/golang/snappy/snappy.go

@@ -0,0 +1,98 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package snappy implements the Snappy compression format. It aims for very
+// high speeds and reasonable compression.
+//
+// There are actually two Snappy formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a Snappy stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// The canonical, C++ implementation is at https://github.com/google/snappy and
+// it only implements the block format.
+package snappy // import "github.com/golang/snappy"
+
+import (
+	"hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, this tag is a legacy format that is no longer issued by most
+    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const (
+	tagLiteral = 0x00
+	tagCopy1   = 0x01
+	tagCopy2   = 0x02
+	tagCopy4   = 0x03
+)
+
+const (
+	checksumSize    = 4
+	chunkHeaderSize = 4
+	magicChunk      = "\xff\x06\x00\x00" + magicBody
+	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
+
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
+
+	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
+)
+
+const (
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff
+)
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli)
+
+// crc implements the checksum specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func crc(b []byte) uint32 {
+	c := crc32.Update(0, crcTable, b)
+	return uint32(c>>15|c<<17) + 0xa282ead8
+}

diff --git a/vendor/github.com/pierrec/lz4/.gitignore b/vendor/github.com/pierrec/lz4/.gitignore
new file mode 100644
index 0000000..e48bab3
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/.gitignore

@@ -0,0 +1,33 @@
+# Created by https://www.gitignore.io/api/macos
+
+### macOS ###
+*.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# End of https://www.gitignore.io/api/macos
+
+lz4c/lz4c

diff --git a/vendor/github.com/pierrec/lz4/.travis.yml b/vendor/github.com/pierrec/lz4/.travis.yml
new file mode 100644
index 0000000..204afe1
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/.travis.yml

@@ -0,0 +1,22 @@
+language: go
+
+env:
+  - GO111MODULE=off
+  - GO111MODULE=on
+
+go:
+  - 1.9.x
+  - 1.10.x
+  - 1.11.x
+  - master
+
+matrix:
+ fast_finish: true
+ allow_failures:
+   - go: master
+
+sudo: false
+
+script: 
+ - go test -v -cpu=2
+ - go test -v -cpu=2 -race

diff --git a/vendor/github.com/pierrec/lz4/LICENSE b/vendor/github.com/pierrec/lz4/LICENSE
new file mode 100644
index 0000000..bd899d8
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/LICENSE

@@ -0,0 +1,28 @@
+Copyright (c) 2015, Pierre Curto
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of xxHash nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+

diff --git a/vendor/github.com/pierrec/lz4/README.md b/vendor/github.com/pierrec/lz4/README.md
new file mode 100644
index 0000000..e71ebd5
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/README.md

@@ -0,0 +1,24 @@
+[![godoc](https://godoc.org/github.com/pierrec/lz4?status.png)](https://godoc.org/github.com/pierrec/lz4)
+
+# lz4
+LZ4 compression and decompression in pure Go.
+
+## Usage
+
+```go
+import "github.com/pierrec/lz4/v2"
+```
+
+## Description
+Package lz4 implements reading and writing lz4 compressed data (a frame),
+as specified in http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html.
+
+This package is **compatible with the LZ4 frame format** although the block level compression 
+and decompression functions are exposed and are fully compatible with the lz4 block format 
+definition, they are low level and should not be used directly.
+
+For a complete description of an lz4 compressed block, see:
+http://fastcompression.blogspot.fr/2011/05/lz4-explained.html
+
+See https://github.com/Cyan4973/lz4 for the reference C implementation.
+

diff --git a/vendor/github.com/pierrec/lz4/block.go b/vendor/github.com/pierrec/lz4/block.go
new file mode 100644
index 0000000..00b1111
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/block.go

@@ -0,0 +1,397 @@
+package lz4
+
+import (
+	"encoding/binary"
+	"errors"
+)
+
+var (
+	// ErrInvalidSourceShortBuffer is returned by UncompressBlock or CompressBLock when a compressed
+	// block is corrupted or the destination buffer is not large enough for the uncompressed data.
+	ErrInvalidSourceShortBuffer = errors.New("lz4: invalid source or destination buffer too short")
+	// ErrInvalid is returned when reading an invalid LZ4 archive.
+	ErrInvalid = errors.New("lz4: bad magic number")
+)
+
+// blockHash hashes 4 bytes into a value < winSize.
+func blockHash(x uint32) uint32 {
+	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
+	return x * hasher >> hashShift
+}
+
+// CompressBlockBound returns the maximum size of a given buffer of size n, when not compressible.
+func CompressBlockBound(n int) int {
+	return n + n/255 + 16
+}
+
+// UncompressBlock uncompresses the source buffer into the destination one,
+// and returns the uncompressed size.
+//
+// The destination buffer must be sized appropriately.
+//
+// An error is returned if the source data is invalid or the destination buffer is too small.
+func UncompressBlock(src, dst []byte) (si int, err error) {
+	defer func() {
+		// It is now faster to let the runtime panic and recover on out of bound slice access
+		// than checking indices as we go along.
+		if recover() != nil {
+			err = ErrInvalidSourceShortBuffer
+		}
+	}()
+	sn := len(src)
+	if sn == 0 {
+		return 0, nil
+	}
+	var di int
+
+	for {
+		// Literals and match lengths (token).
+		b := int(src[si])
+		si++
+
+		// Literals.
+		if lLen := b >> 4; lLen > 0 {
+			if lLen == 0xF {
+				for src[si] == 0xFF {
+					lLen += 0xFF
+					si++
+				}
+				lLen += int(src[si])
+				si++
+			}
+			i := si
+			si += lLen
+			di += copy(dst[di:], src[i:si])
+
+			if si >= sn {
+				return di, nil
+			}
+		}
+
+		si++
+		_ = src[si] // Bound check elimination.
+		offset := int(src[si-1]) | int(src[si])<<8
+		si++
+
+		// Match.
+		mLen := b & 0xF
+		if mLen == 0xF {
+			for src[si] == 0xFF {
+				mLen += 0xFF
+				si++
+			}
+			mLen += int(src[si])
+			si++
+		}
+		mLen += minMatch
+
+		// Copy the match.
+		i := di - offset
+		if offset > 0 && mLen >= offset {
+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
+			bytesToCopy := offset * (mLen / offset)
+			expanded := dst[i:]
+			for n := offset; n <= bytesToCopy+offset; n *= 2 {
+				copy(expanded[n:], expanded[:n])
+			}
+			di += bytesToCopy
+			mLen -= bytesToCopy
+		}
+		di += copy(dst[di:], dst[i:i+mLen])
+	}
+}
+
+// CompressBlock compresses the source buffer into the destination one.
+// This is the fast version of LZ4 compression and also the default one.
+// The size of hashTable must be at least 64Kb.
+//
+// The size of the compressed data is returned. If it is 0 and no error, then the data is incompressible.
+//
+// An error is returned if the destination buffer is too small.
+func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
+	defer func() {
+		if recover() != nil {
+			err = ErrInvalidSourceShortBuffer
+		}
+	}()
+
+	sn, dn := len(src)-mfLimit, len(dst)
+	if sn <= 0 || dn == 0 {
+		return 0, nil
+	}
+	var si int
+
+	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
+	// const accInit = 1 << skipStrength
+
+	anchor := si // Position of the current literals.
+	// acc := accInit // Variable step: improves performance on non-compressible data.
+
+	for si < sn {
+		// Hash the next 4 bytes (sequence)...
+		match := binary.LittleEndian.Uint32(src[si:])
+		h := blockHash(match)
+
+		ref := hashTable[h]
+		hashTable[h] = si
+		if ref >= sn { // Invalid reference (dirty hashtable).
+			si++
+			continue
+		}
+		offset := si - ref
+		if offset <= 0 || offset >= winSize || // Out of window.
+			match != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
+			// si += acc >> skipStrength
+			// acc++
+			si++
+			continue
+		}
+
+		// Match found.
+		// acc = accInit
+		lLen := si - anchor // Literal length.
+
+		// Encode match length part 1.
+		si += minMatch
+		mLen := si // Match length has minMatch already.
+		// Find the longest match, first looking by batches of 8 bytes.
+		for si < sn && binary.LittleEndian.Uint64(src[si:]) == binary.LittleEndian.Uint64(src[si-offset:]) {
+			si += 8
+		}
+		// Then byte by byte.
+		for si < sn && src[si] == src[si-offset] {
+			si++
+		}
+
+		mLen = si - mLen
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF; l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		copy(dst[di:], src[anchor:anchor+lLen])
+		di += lLen + 2
+		anchor = si
+
+		// Encode offset.
+		_ = dst[di] // Bound check elimination.
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+	}
+
+	if anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		for lLen -= 0xF; lLen >= 0xFF; lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	if di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	di += copy(dst[di:], src[anchor:])
+	return di, nil
+}
+
+// CompressBlockHC compresses the source buffer src into the destination dst
+// with max search depth (use 0 or negative value for no max).
+//
+// CompressBlockHC compression ratio is better than CompressBlock but it is also slower.
+//
+// The size of the compressed data is returned. If it is 0 and no error, then the data is not compressible.
+//
+// An error is returned if the destination buffer is too small.
+func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
+	defer func() {
+		if recover() != nil {
+			err = ErrInvalidSourceShortBuffer
+		}
+	}()
+
+	sn, dn := len(src)-mfLimit, len(dst)
+	if sn <= 0 || dn == 0 {
+		return 0, nil
+	}
+	var si int
+
+	// hashTable: stores the last position found for a given hash
+	// chaingTable: stores previous positions for a given hash
+	var hashTable, chainTable [winSize]int
+
+	if depth <= 0 {
+		depth = winSize
+	}
+
+	anchor := si
+	for si < sn {
+		// Hash the next 4 bytes (sequence).
+		match := binary.LittleEndian.Uint32(src[si:])
+		h := blockHash(match)
+
+		// Follow the chain until out of window and give the longest match.
+		mLen := 0
+		offset := 0
+		for next, try := hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next = chainTable[next&winMask] {
+			// The first (mLen==0) or next byte (mLen>=minMatch) at current match length
+			// must match to improve on the match length.
+			if src[next+mLen] != src[si+mLen] {
+				continue
+			}
+			ml := 0
+			// Compare the current position with a previous with the same hash.
+			for ml < sn-si && binary.LittleEndian.Uint64(src[next+ml:]) == binary.LittleEndian.Uint64(src[si+ml:]) {
+				ml += 8
+			}
+			for ml < sn-si && src[next+ml] == src[si+ml] {
+				ml++
+			}
+			if ml < minMatch || ml <= mLen {
+				// Match too small (<minMath) or smaller than the current match.
+				continue
+			}
+			// Found a longer match, keep its position and length.
+			mLen = ml
+			offset = si - next
+			// Try another previous position with the same hash.
+			try--
+		}
+		chainTable[si&winMask] = hashTable[h]
+		hashTable[h] = si
+
+		// No match found.
+		if mLen == 0 {
+			si++
+			continue
+		}
+
+		// Match found.
+		// Update hash/chain tables with overlapping bytes:
+		// si already hashed, add everything from si+1 up to the match length.
+		winStart := si + 1
+		if ws := si + mLen - winSize; ws > winStart {
+			winStart = ws
+		}
+		for si, ml := winStart, si+mLen; si < ml; {
+			match >>= 8
+			match |= uint32(src[si+3]) << 24
+			h := blockHash(match)
+			chainTable[si&winMask] = hashTable[h]
+			hashTable[h] = si
+			si++
+		}
+
+		lLen := si - anchor
+		si += mLen
+		mLen -= minMatch // Match length does not include minMatch.
+
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF; l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		copy(dst[di:], src[anchor:anchor+lLen])
+		di += lLen
+		anchor = si
+
+		// Encode offset.
+		di += 2
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+	}
+
+	if anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		lLen -= 0xF
+		for ; lLen >= 0xFF; lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	if di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	di += copy(dst[di:], src[anchor:])
+	return di, nil
+}

diff --git a/vendor/github.com/pierrec/lz4/debug.go b/vendor/github.com/pierrec/lz4/debug.go
new file mode 100644
index 0000000..bc5e78d
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/debug.go

@@ -0,0 +1,23 @@
+// +build lz4debug
+
+package lz4
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+)
+
+const debugFlag = true
+
+func debug(args ...interface{}) {
+	_, file, line, _ := runtime.Caller(1)
+	file = filepath.Base(file)
+
+	f := fmt.Sprintf("LZ4: %s:%d %s", file, line, args[0])
+	if f[len(f)-1] != '\n' {
+		f += "\n"
+	}
+	fmt.Fprintf(os.Stderr, f, args[1:]...)
+}

diff --git a/vendor/github.com/pierrec/lz4/debug_stub.go b/vendor/github.com/pierrec/lz4/debug_stub.go
new file mode 100644
index 0000000..44211ad
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/debug_stub.go

@@ -0,0 +1,7 @@
+// +build !lz4debug
+
+package lz4
+
+const debugFlag = false
+
+func debug(args ...interface{}) {}

diff --git a/vendor/github.com/pierrec/lz4/go.mod b/vendor/github.com/pierrec/lz4/go.mod
new file mode 100644
index 0000000..f9f570a
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/go.mod

@@ -0,0 +1,3 @@
+module github.com/pierrec/lz4
+
+require github.com/pkg/profile v1.2.1

diff --git a/vendor/github.com/pierrec/lz4/go.sum b/vendor/github.com/pierrec/lz4/go.sum
new file mode 100644
index 0000000..6ca7598
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/go.sum

@@ -0,0 +1,2 @@
+github.com/pkg/profile v1.2.1 h1:F++O52m40owAmADcojzM+9gyjmMOY/T4oYJkgFDH8RE=
+github.com/pkg/profile v1.2.1/go.mod h1:hJw3o1OdXxsrSjjVksARp5W95eeEaEfptyVZyv6JUPA=

diff --git a/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go b/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go
new file mode 100644
index 0000000..850a6fd
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go

@@ -0,0 +1,222 @@
+// Package xxh32 implements the very fast XXH hashing algorithm (32 bits version).
+// (https://github.com/Cyan4973/XXH/)
+package xxh32
+
+import (
+	"encoding/binary"
+)
+
+const (
+	prime32_1 uint32 = 2654435761
+	prime32_2 uint32 = 2246822519
+	prime32_3 uint32 = 3266489917
+	prime32_4 uint32 = 668265263
+	prime32_5 uint32 = 374761393
+
+	prime32_1plus2 uint32 = 606290984
+	prime32_minus1 uint32 = 1640531535
+)
+
+// XXHZero represents an xxhash32 object with seed 0.
+type XXHZero struct {
+	v1       uint32
+	v2       uint32
+	v3       uint32
+	v4       uint32
+	totalLen uint64
+	buf      [16]byte
+	bufused  int
+}
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (xxh XXHZero) Sum(b []byte) []byte {
+	h32 := xxh.Sum32()
+	return append(b, byte(h32), byte(h32>>8), byte(h32>>16), byte(h32>>24))
+}
+
+// Reset resets the Hash to its initial state.
+func (xxh *XXHZero) Reset() {
+	xxh.v1 = prime32_1plus2
+	xxh.v2 = prime32_2
+	xxh.v3 = 0
+	xxh.v4 = prime32_minus1
+	xxh.totalLen = 0
+	xxh.bufused = 0
+}
+
+// Size returns the number of bytes returned by Sum().
+func (xxh *XXHZero) Size() int {
+	return 4
+}
+
+// BlockSize gives the minimum number of bytes accepted by Write().
+func (xxh *XXHZero) BlockSize() int {
+	return 1
+}
+
+// Write adds input bytes to the Hash.
+// It never returns an error.
+func (xxh *XXHZero) Write(input []byte) (int, error) {
+	if xxh.totalLen == 0 {
+		xxh.Reset()
+	}
+	n := len(input)
+	m := xxh.bufused
+
+	xxh.totalLen += uint64(n)
+
+	r := len(xxh.buf) - m
+	if n < r {
+		copy(xxh.buf[m:], input)
+		xxh.bufused += len(input)
+		return n, nil
+	}
+
+	p := 0
+	// Causes compiler to work directly from registers instead of stack:
+	v1, v2, v3, v4 := xxh.v1, xxh.v2, xxh.v3, xxh.v4
+	if m > 0 {
+		// some data left from previous update
+		copy(xxh.buf[xxh.bufused:], input[:r])
+		xxh.bufused += len(input) - r
+
+		// fast rotl(13)
+		buf := xxh.buf[:16] // BCE hint.
+		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime32_2) * prime32_1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime32_2) * prime32_1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime32_2) * prime32_1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime32_2) * prime32_1
+		p = r
+		xxh.bufused = 0
+	}
+
+	for n := n - 16; p <= n; p += 16 {
+		sub := input[p:][:16] //BCE hint for compiler
+		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime32_2) * prime32_1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime32_2) * prime32_1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime32_2) * prime32_1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime32_2) * prime32_1
+	}
+	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
+
+	copy(xxh.buf[xxh.bufused:], input[p:])
+	xxh.bufused += len(input) - p
+
+	return n, nil
+}
+
+// Sum32 returns the 32 bits Hash value.
+func (xxh *XXHZero) Sum32() uint32 {
+	h32 := uint32(xxh.totalLen)
+	if h32 >= 16 {
+		h32 += rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
+	} else {
+		h32 += prime32_5
+	}
+
+	p := 0
+	n := xxh.bufused
+	buf := xxh.buf
+	for n := n - 4; p <= n; p += 4 {
+		h32 += binary.LittleEndian.Uint32(buf[p:p+4]) * prime32_3
+		h32 = rol17(h32) * prime32_4
+	}
+	for ; p < n; p++ {
+		h32 += uint32(buf[p]) * prime32_5
+		h32 = rol11(h32) * prime32_1
+	}
+
+	h32 ^= h32 >> 15
+	h32 *= prime32_2
+	h32 ^= h32 >> 13
+	h32 *= prime32_3
+	h32 ^= h32 >> 16
+
+	return h32
+}
+
+// ChecksumZero returns the 32bits Hash value.
+func ChecksumZero(input []byte) uint32 {
+	n := len(input)
+	h32 := uint32(n)
+
+	if n < 16 {
+		h32 += prime32_5
+	} else {
+		v1 := prime32_1plus2
+		v2 := prime32_2
+		v3 := uint32(0)
+		v4 := prime32_minus1
+		p := 0
+		for n := n - 16; p <= n; p += 16 {
+			sub := input[p:][:16] //BCE hint for compiler
+			v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime32_2) * prime32_1
+			v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime32_2) * prime32_1
+			v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime32_2) * prime32_1
+			v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime32_2) * prime32_1
+		}
+		input = input[p:]
+		n -= p
+		h32 += rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
+	}
+
+	p := 0
+	for n := n - 4; p <= n; p += 4 {
+		h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime32_3
+		h32 = rol17(h32) * prime32_4
+	}
+	for p < n {
+		h32 += uint32(input[p]) * prime32_5
+		h32 = rol11(h32) * prime32_1
+		p++
+	}
+
+	h32 ^= h32 >> 15
+	h32 *= prime32_2
+	h32 ^= h32 >> 13
+	h32 *= prime32_3
+	h32 ^= h32 >> 16
+
+	return h32
+}
+
+// Uint32Zero hashes x with seed 0.
+func Uint32Zero(x uint32) uint32 {
+	h := prime32_5 + 4 + x*prime32_3
+	h = rol17(h) * prime32_4
+	h ^= h >> 15
+	h *= prime32_2
+	h ^= h >> 13
+	h *= prime32_3
+	h ^= h >> 16
+	return h
+}
+
+func rol1(u uint32) uint32 {
+	return u<<1 | u>>31
+}
+
+func rol7(u uint32) uint32 {
+	return u<<7 | u>>25
+}
+
+func rol11(u uint32) uint32 {
+	return u<<11 | u>>21
+}
+
+func rol12(u uint32) uint32 {
+	return u<<12 | u>>20
+}
+
+func rol13(u uint32) uint32 {
+	return u<<13 | u>>19
+}
+
+func rol17(u uint32) uint32 {
+	return u<<17 | u>>15
+}
+
+func rol18(u uint32) uint32 {
+	return u<<18 | u>>14
+}

diff --git a/vendor/github.com/pierrec/lz4/lz4.go b/vendor/github.com/pierrec/lz4/lz4.go
new file mode 100644
index 0000000..3580275
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/lz4.go

@@ -0,0 +1,68 @@
+// Package lz4 implements reading and writing lz4 compressed data (a frame),
+// as specified in http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html.
+//
+// Although the block level compression and decompression functions are exposed and are fully compatible
+// with the lz4 block format definition, they are low level and should not be used directly.
+// For a complete description of an lz4 compressed block, see:
+// http://fastcompression.blogspot.fr/2011/05/lz4-explained.html
+//
+// See https://github.com/Cyan4973/lz4 for the reference C implementation.
+//
+package lz4
+
+const (
+	// Extension is the LZ4 frame file name extension
+	Extension = ".lz4"
+	// Version is the LZ4 frame format version
+	Version = 1
+
+	frameMagic     uint32 = 0x184D2204
+	frameSkipMagic uint32 = 0x184D2A50
+
+	// The following constants are used to setup the compression algorithm.
+	minMatch            = 4  // the minimum size of the match sequence size (4 bytes)
+	winSizeLog          = 16 // LZ4 64Kb window size limit
+	winSize             = 1 << winSizeLog
+	winMask             = winSize - 1 // 64Kb window of previous data for dependent blocks
+	compressedBlockFlag = 1 << 31
+	compressedBlockMask = compressedBlockFlag - 1
+
+	// hashLog determines the size of the hash table used to quickly find a previous match position.
+	// Its value influences the compression speed and memory usage, the lower the faster,
+	// but at the expense of the compression ratio.
+	// 16 seems to be the best compromise.
+	hashLog       = 16
+	hashTableSize = 1 << hashLog
+	hashShift     = uint((minMatch * 8) - hashLog)
+
+	mfLimit      = 8 + minMatch // The last match cannot start within the last 12 bytes.
+	skipStrength = 6            // variable step for fast scan
+)
+
+// map the block max size id with its value in bytes: 64Kb, 256Kb, 1Mb and 4Mb.
+var (
+	bsMapID    = map[byte]int{4: 64 << 10, 5: 256 << 10, 6: 1 << 20, 7: 4 << 20}
+	bsMapValue = make(map[int]byte, len(bsMapID))
+)
+
+// Reversed.
+func init() {
+	for i, v := range bsMapID {
+		bsMapValue[v] = i
+	}
+}
+
+// Header describes the various flags that can be set on a Writer or obtained from a Reader.
+// The default values match those of the LZ4 frame format definition
+// (http://fastcompression.blogspot.com/2013/04/lz4-streaming-format-final.html).
+//
+// NB. in a Reader, in case of concatenated frames, the Header values may change between Read() calls.
+// It is the caller responsibility to check them if necessary.
+type Header struct {
+	BlockChecksum    bool   // Compressed blocks checksum flag.
+	NoChecksum       bool   // Frame checksum flag.
+	BlockMaxSize     int    // Size of the uncompressed data block (one of [64KB, 256KB, 1MB, 4MB]). Default=4MB.
+	Size             uint64 // Frame total size. It is _not_ computed by the Writer.
+	CompressionLevel int    // Compression level (higher is better, use 0 for fastest compression).
+	done             bool   // Header processed flag (Read or Write and checked).
+}

diff --git a/vendor/github.com/pierrec/lz4/lz4_go1.10.go b/vendor/github.com/pierrec/lz4/lz4_go1.10.go
new file mode 100644
index 0000000..9a0fb00
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/lz4_go1.10.go

@@ -0,0 +1,29 @@
+//+build go1.10
+
+package lz4
+
+import (
+	"fmt"
+	"strings"
+)
+
+func (h Header) String() string {
+	var s strings.Builder
+
+	s.WriteString(fmt.Sprintf("%T{", h))
+	if h.BlockChecksum {
+		s.WriteString("BlockChecksum: true ")
+	}
+	if h.NoChecksum {
+		s.WriteString("NoChecksum: true ")
+	}
+	if bs := h.BlockMaxSize; bs != 0 && bs != 4<<20 {
+		s.WriteString(fmt.Sprintf("BlockMaxSize: %d ", bs))
+	}
+	if l := h.CompressionLevel; l != 0 {
+		s.WriteString(fmt.Sprintf("CompressionLevel: %d ", l))
+	}
+	s.WriteByte('}')
+
+	return s.String()
+}

diff --git a/vendor/github.com/pierrec/lz4/lz4_notgo1.10.go b/vendor/github.com/pierrec/lz4/lz4_notgo1.10.go
new file mode 100644
index 0000000..12c761a
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/lz4_notgo1.10.go

@@ -0,0 +1,29 @@
+//+build !go1.10
+
+package lz4
+
+import (
+	"bytes"
+	"fmt"
+)
+
+func (h Header) String() string {
+	var s bytes.Buffer
+
+	s.WriteString(fmt.Sprintf("%T{", h))
+	if h.BlockChecksum {
+		s.WriteString("BlockChecksum: true ")
+	}
+	if h.NoChecksum {
+		s.WriteString("NoChecksum: true ")
+	}
+	if bs := h.BlockMaxSize; bs != 0 && bs != 4<<20 {
+		s.WriteString(fmt.Sprintf("BlockMaxSize: %d ", bs))
+	}
+	if l := h.CompressionLevel; l != 0 {
+		s.WriteString(fmt.Sprintf("CompressionLevel: %d ", l))
+	}
+	s.WriteByte('}')
+
+	return s.String()
+}

diff --git a/vendor/github.com/pierrec/lz4/reader.go b/vendor/github.com/pierrec/lz4/reader.go
new file mode 100644
index 0000000..f08db47
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/reader.go

@@ -0,0 +1,295 @@
+package lz4
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"io/ioutil"
+
+	"github.com/pierrec/lz4/internal/xxh32"
+)
+
+// Reader implements the LZ4 frame decoder.
+// The Header is set after the first call to Read().
+// The Header may change between Read() calls in case of concatenated frames.
+type Reader struct {
+	Header
+
+	buf      [8]byte       // Scrap buffer.
+	pos      int64         // Current position in src.
+	src      io.Reader     // Source.
+	zdata    []byte        // Compressed data.
+	data     []byte        // Uncompressed data.
+	idx      int           // Index of unread bytes into data.
+	checksum xxh32.XXHZero // Frame hash.
+}
+
+// NewReader returns a new LZ4 frame decoder.
+// No access to the underlying io.Reader is performed.
+func NewReader(src io.Reader) *Reader {
+	r := &Reader{src: src}
+	return r
+}
+
+// readHeader checks the frame magic number and parses the frame descriptoz.
+// Skippable frames are supported even as a first frame although the LZ4
+// specifications recommends skippable frames not to be used as first frames.
+func (z *Reader) readHeader(first bool) error {
+	defer z.checksum.Reset()
+
+	buf := z.buf[:]
+	for {
+		magic, err := z.readUint32()
+		if err != nil {
+			z.pos += 4
+			if !first && err == io.ErrUnexpectedEOF {
+				return io.EOF
+			}
+			return err
+		}
+		if magic == frameMagic {
+			break
+		}
+		if magic>>8 != frameSkipMagic>>8 {
+			return ErrInvalid
+		}
+		skipSize, err := z.readUint32()
+		if err != nil {
+			return err
+		}
+		z.pos += 4
+		m, err := io.CopyN(ioutil.Discard, z.src, int64(skipSize))
+		if err != nil {
+			return err
+		}
+		z.pos += m
+	}
+
+	// Header.
+	if _, err := io.ReadFull(z.src, buf[:2]); err != nil {
+		return err
+	}
+	z.pos += 8
+
+	b := buf[0]
+	if v := b >> 6; v != Version {
+		return fmt.Errorf("lz4: invalid version: got %d; expected %d", v, Version)
+	}
+	if b>>5&1 == 0 {
+		return fmt.Errorf("lz4: block dependency not supported")
+	}
+	z.BlockChecksum = b>>4&1 > 0
+	frameSize := b>>3&1 > 0
+	z.NoChecksum = b>>2&1 == 0
+
+	bmsID := buf[1] >> 4 & 0x7
+	bSize, ok := bsMapID[bmsID]
+	if !ok {
+		return fmt.Errorf("lz4: invalid block max size ID: %d", bmsID)
+	}
+	z.BlockMaxSize = bSize
+
+	// Allocate the compressed/uncompressed buffers.
+	// The compressed buffer cannot exceed the uncompressed one.
+	if n := 2 * bSize; cap(z.zdata) < n {
+		z.zdata = make([]byte, n, n)
+	}
+	if debugFlag {
+		debug("header block max size id=%d size=%d", bmsID, bSize)
+	}
+	z.zdata = z.zdata[:bSize]
+	z.data = z.zdata[:cap(z.zdata)][bSize:]
+	z.idx = len(z.data)
+
+	z.checksum.Write(buf[0:2])
+
+	if frameSize {
+		buf := buf[:8]
+		if _, err := io.ReadFull(z.src, buf); err != nil {
+			return err
+		}
+		z.Size = binary.LittleEndian.Uint64(buf)
+		z.pos += 8
+		z.checksum.Write(buf)
+	}
+
+	// Header checksum.
+	if _, err := io.ReadFull(z.src, buf[:1]); err != nil {
+		return err
+	}
+	z.pos++
+	if h := byte(z.checksum.Sum32() >> 8 & 0xFF); h != buf[0] {
+		return fmt.Errorf("lz4: invalid header checksum: got %x; expected %x", buf[0], h)
+	}
+
+	z.Header.done = true
+	if debugFlag {
+		debug("header read: %v", z.Header)
+	}
+
+	return nil
+}
+
+// Read decompresses data from the underlying source into the supplied buffer.
+//
+// Since there can be multiple streams concatenated, Header values may
+// change between calls to Read(). If that is the case, no data is actually read from
+// the underlying io.Reader, to allow for potential input buffer resizing.
+func (z *Reader) Read(buf []byte) (int, error) {
+	if debugFlag {
+		debug("Read buf len=%d", len(buf))
+	}
+	if !z.Header.done {
+		if err := z.readHeader(true); err != nil {
+			return 0, err
+		}
+		if debugFlag {
+			debug("header read OK compressed buffer %d / %d uncompressed buffer %d : %d index=%d",
+				len(z.zdata), cap(z.zdata), len(z.data), cap(z.data), z.idx)
+		}
+	}
+
+	if len(buf) == 0 {
+		return 0, nil
+	}
+
+	if z.idx == len(z.data) {
+		// No data ready for reading, process the next block.
+		if debugFlag {
+			debug("reading block from writer")
+		}
+		// Block length: 0 = end of frame, highest bit set: uncompressed.
+		bLen, err := z.readUint32()
+		if err != nil {
+			return 0, err
+		}
+		z.pos += 4
+
+		if bLen == 0 {
+			// End of frame reached.
+			if !z.NoChecksum {
+				// Validate the frame checksum.
+				checksum, err := z.readUint32()
+				if err != nil {
+					return 0, err
+				}
+				if debugFlag {
+					debug("frame checksum got=%x / want=%x", z.checksum.Sum32(), checksum)
+				}
+				z.pos += 4
+				if h := z.checksum.Sum32(); checksum != h {
+					return 0, fmt.Errorf("lz4: invalid frame checksum: got %x; expected %x", h, checksum)
+				}
+			}
+
+			// Get ready for the next concatenated frame and keep the position.
+			pos := z.pos
+			z.Reset(z.src)
+			z.pos = pos
+
+			// Since multiple frames can be concatenated, check for more.
+			return 0, z.readHeader(false)
+		}
+
+		if debugFlag {
+			debug("raw block size %d", bLen)
+		}
+		if bLen&compressedBlockFlag > 0 {
+			// Uncompressed block.
+			bLen &= compressedBlockMask
+			if debugFlag {
+				debug("uncompressed block size %d", bLen)
+			}
+			if int(bLen) > cap(z.data) {
+				return 0, fmt.Errorf("lz4: invalid block size: %d", bLen)
+			}
+			z.data = z.data[:bLen]
+			if _, err := io.ReadFull(z.src, z.data); err != nil {
+				return 0, err
+			}
+			z.pos += int64(bLen)
+
+			if z.BlockChecksum {
+				checksum, err := z.readUint32()
+				if err != nil {
+					return 0, err
+				}
+				z.pos += 4
+
+				if h := xxh32.ChecksumZero(z.data); h != checksum {
+					return 0, fmt.Errorf("lz4: invalid block checksum: got %x; expected %x", h, checksum)
+				}
+			}
+
+		} else {
+			// Compressed block.
+			if debugFlag {
+				debug("compressed block size %d", bLen)
+			}
+			if int(bLen) > cap(z.data) {
+				return 0, fmt.Errorf("lz4: invalid block size: %d", bLen)
+			}
+			zdata := z.zdata[:bLen]
+			if _, err := io.ReadFull(z.src, zdata); err != nil {
+				return 0, err
+			}
+			z.pos += int64(bLen)
+
+			if z.BlockChecksum {
+				checksum, err := z.readUint32()
+				if err != nil {
+					return 0, err
+				}
+				z.pos += 4
+
+				if h := xxh32.ChecksumZero(zdata); h != checksum {
+					return 0, fmt.Errorf("lz4: invalid block checksum: got %x; expected %x", h, checksum)
+				}
+			}
+
+			n, err := UncompressBlock(zdata, z.data)
+			if err != nil {
+				return 0, err
+			}
+			z.data = z.data[:n]
+		}
+
+		if !z.NoChecksum {
+			z.checksum.Write(z.data)
+			if debugFlag {
+				debug("current frame checksum %x", z.checksum.Sum32())
+			}
+		}
+		z.idx = 0
+	}
+
+	n := copy(buf, z.data[z.idx:])
+	z.idx += n
+	if debugFlag {
+		debug("copied %d bytes to input", n)
+	}
+
+	return n, nil
+}
+
+// Reset discards the Reader's state and makes it equivalent to the
+// result of its original state from NewReader, but reading from r instead.
+// This permits reusing a Reader rather than allocating a new one.
+func (z *Reader) Reset(r io.Reader) {
+	z.Header = Header{}
+	z.pos = 0
+	z.src = r
+	z.zdata = z.zdata[:0]
+	z.data = z.data[:0]
+	z.idx = 0
+	z.checksum.Reset()
+}
+
+// readUint32 reads an uint32 into the supplied buffer.
+// The idea is to make use of the already allocated buffers avoiding additional allocations.
+func (z *Reader) readUint32() (uint32, error) {
+	buf := z.buf[:4]
+	_, err := io.ReadFull(z.src, buf)
+	x := binary.LittleEndian.Uint32(buf)
+	return x, err
+}

diff --git a/vendor/github.com/pierrec/lz4/writer.go b/vendor/github.com/pierrec/lz4/writer.go
new file mode 100644
index 0000000..0120438
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/writer.go

@@ -0,0 +1,267 @@
+package lz4
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"github.com/pierrec/lz4/internal/xxh32"
+)
+
+// Writer implements the LZ4 frame encoder.
+type Writer struct {
+	Header
+
+	buf       [19]byte      // magic number(4) + header(flags(2)+[Size(8)+DictID(4)]+checksum(1)) does not exceed 19 bytes
+	dst       io.Writer     // Destination.
+	checksum  xxh32.XXHZero // Frame checksum.
+	zdata     []byte        // Compressed data.
+	data      []byte        // Data to be compressed.
+	idx       int           // Index into data.
+	hashtable [winSize]int  // Hash table used in CompressBlock().
+}
+
+// NewWriter returns a new LZ4 frame encoder.
+// No access to the underlying io.Writer is performed.
+// The supplied Header is checked at the first Write.
+// It is ok to change it before the first Write but then not until a Reset() is performed.
+func NewWriter(dst io.Writer) *Writer {
+	return &Writer{dst: dst}
+}
+
+// writeHeader builds and writes the header (magic+header) to the underlying io.Writer.
+func (z *Writer) writeHeader() error {
+	// Default to 4Mb if BlockMaxSize is not set.
+	if z.Header.BlockMaxSize == 0 {
+		z.Header.BlockMaxSize = bsMapID[7]
+	}
+	// The only option that needs to be validated.
+	bSize := z.Header.BlockMaxSize
+	bSizeID, ok := bsMapValue[bSize]
+	if !ok {
+		return fmt.Errorf("lz4: invalid block max size: %d", bSize)
+	}
+	// Allocate the compressed/uncompressed buffers.
+	// The compressed buffer cannot exceed the uncompressed one.
+	if n := 2 * bSize; cap(z.zdata) < n {
+		z.zdata = make([]byte, n, n)
+	}
+	z.zdata = z.zdata[:bSize]
+	z.data = z.zdata[:cap(z.zdata)][bSize:]
+	z.idx = 0
+
+	// Size is optional.
+	buf := z.buf[:]
+
+	// Set the fixed size data: magic number, block max size and flags.
+	binary.LittleEndian.PutUint32(buf[0:], frameMagic)
+	flg := byte(Version << 6)
+	flg |= 1 << 5 // No block dependency.
+	if z.Header.BlockChecksum {
+		flg |= 1 << 4
+	}
+	if z.Header.Size > 0 {
+		flg |= 1 << 3
+	}
+	if !z.Header.NoChecksum {
+		flg |= 1 << 2
+	}
+	buf[4] = flg
+	buf[5] = bSizeID << 4
+
+	// Current buffer size: magic(4) + flags(1) + block max size (1).
+	n := 6
+	// Optional items.
+	if z.Header.Size > 0 {
+		binary.LittleEndian.PutUint64(buf[n:], z.Header.Size)
+		n += 8
+	}
+
+	// The header checksum includes the flags, block max size and optional Size.
+	buf[n] = byte(xxh32.ChecksumZero(buf[4:n]) >> 8 & 0xFF)
+	z.checksum.Reset()
+
+	// Header ready, write it out.
+	if _, err := z.dst.Write(buf[0 : n+1]); err != nil {
+		return err
+	}
+	z.Header.done = true
+	if debugFlag {
+		debug("wrote header %v", z.Header)
+	}
+
+	return nil
+}
+
+// Write compresses data from the supplied buffer into the underlying io.Writer.
+// Write does not return until the data has been written.
+func (z *Writer) Write(buf []byte) (int, error) {
+	if !z.Header.done {
+		if err := z.writeHeader(); err != nil {
+			return 0, err
+		}
+	}
+	if debugFlag {
+		debug("input buffer len=%d index=%d", len(buf), z.idx)
+	}
+
+	zn := len(z.data)
+	var n int
+	for len(buf) > 0 {
+		if z.idx == 0 && len(buf) >= zn {
+			// Avoid a copy as there is enough data for a block.
+			if err := z.compressBlock(buf[:zn]); err != nil {
+				return n, err
+			}
+			n += zn
+			buf = buf[zn:]
+			continue
+		}
+		// Accumulate the data to be compressed.
+		m := copy(z.data[z.idx:], buf)
+		n += m
+		z.idx += m
+		buf = buf[m:]
+		if debugFlag {
+			debug("%d bytes copied to buf, current index %d", n, z.idx)
+		}
+
+		if z.idx < len(z.data) {
+			// Buffer not filled.
+			if debugFlag {
+				debug("need more data for compression")
+			}
+			return n, nil
+		}
+
+		// Buffer full.
+		if err := z.compressBlock(z.data); err != nil {
+			return n, err
+		}
+		z.idx = 0
+	}
+
+	return n, nil
+}
+
+// compressBlock compresses a block.
+func (z *Writer) compressBlock(data []byte) error {
+	if !z.NoChecksum {
+		z.checksum.Write(data)
+	}
+
+	// The compressed block size cannot exceed the input's.
+	var zn int
+	var err error
+
+	if level := z.Header.CompressionLevel; level != 0 {
+		zn, err = CompressBlockHC(data, z.zdata, level)
+	} else {
+		zn, err = CompressBlock(data, z.zdata, z.hashtable[:])
+	}
+
+	var zdata []byte
+	var bLen uint32
+	if debugFlag {
+		debug("block compression %d => %d", len(data), zn)
+	}
+	if err == nil && zn > 0 && zn < len(data) {
+		// Compressible and compressed size smaller than uncompressed: ok!
+		bLen = uint32(zn)
+		zdata = z.zdata[:zn]
+	} else {
+		// Uncompressed block.
+		bLen = uint32(len(data)) | compressedBlockFlag
+		zdata = data
+	}
+	if debugFlag {
+		debug("block compression to be written len=%d data len=%d", bLen, len(zdata))
+	}
+
+	// Write the block.
+	if err := z.writeUint32(bLen); err != nil {
+		return err
+	}
+	if _, err := z.dst.Write(zdata); err != nil {
+		return err
+	}
+
+	if z.BlockChecksum {
+		checksum := xxh32.ChecksumZero(zdata)
+		if debugFlag {
+			debug("block checksum %x", checksum)
+		}
+		if err := z.writeUint32(checksum); err != nil {
+			return err
+		}
+	}
+	if debugFlag {
+		debug("current frame checksum %x", z.checksum.Sum32())
+	}
+
+	return nil
+}
+
+// Flush flushes any pending compressed data to the underlying writer.
+// Flush does not return until the data has been written.
+// If the underlying writer returns an error, Flush returns that error.
+func (z *Writer) Flush() error {
+	if debugFlag {
+		debug("flush with index %d", z.idx)
+	}
+	if z.idx == 0 {
+		return nil
+	}
+
+	return z.compressBlock(z.data[:z.idx])
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying io.Writer, but does not close the underlying io.Writer.
+func (z *Writer) Close() error {
+	if !z.Header.done {
+		if err := z.writeHeader(); err != nil {
+			return err
+		}
+	}
+
+	if err := z.Flush(); err != nil {
+		return err
+	}
+
+	if debugFlag {
+		debug("writing last empty block")
+	}
+	if err := z.writeUint32(0); err != nil {
+		return err
+	}
+	if !z.NoChecksum {
+		checksum := z.checksum.Sum32()
+		if debugFlag {
+			debug("stream checksum %x", checksum)
+		}
+		if err := z.writeUint32(checksum); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Reset clears the state of the Writer z such that it is equivalent to its
+// initial state from NewWriter, but instead writing to w.
+// No access to the underlying io.Writer is performed.
+func (z *Writer) Reset(w io.Writer) {
+	z.Header = Header{}
+	z.dst = w
+	z.checksum.Reset()
+	z.zdata = z.zdata[:0]
+	z.data = z.data[:0]
+	z.idx = 0
+}
+
+// writeUint32 writes a uint32 to the underlying writer.
+func (z *Writer) writeUint32(x uint32) error {
+	buf := z.buf[:4]
+	binary.LittleEndian.PutUint32(buf, x)
+	_, err := z.dst.Write(buf)
+	return err
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/.gitignore b/vendor/github.com/rcrowley/go-metrics/.gitignore
new file mode 100644
index 0000000..83c8f82
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/.gitignore

@@ -0,0 +1,9 @@
+*.[68]
+*.a
+*.out
+*.swp
+_obj
+_testmain.go
+cmd/metrics-bench/metrics-bench
+cmd/metrics-example/metrics-example
+cmd/never-read/never-read

diff --git a/vendor/github.com/rcrowley/go-metrics/.travis.yml b/vendor/github.com/rcrowley/go-metrics/.travis.yml
new file mode 100644
index 0000000..117763e
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/.travis.yml

@@ -0,0 +1,17 @@
+language: go
+
+go:
+    - 1.3
+    - 1.4
+    - 1.5
+    - 1.6
+    - 1.7
+    - 1.8
+    - 1.9
+
+script:
+    - ./validate.sh
+
+# this should give us faster builds according to 
+# http://docs.travis-ci.com/user/migrating-from-legacy/
+sudo: false

diff --git a/vendor/github.com/rcrowley/go-metrics/LICENSE b/vendor/github.com/rcrowley/go-metrics/LICENSE
new file mode 100644
index 0000000..363fa9e
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/LICENSE

@@ -0,0 +1,29 @@
+Copyright 2012 Richard Crowley. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    1.  Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+
+    2.  Redistributions in binary form must reproduce the above
+        copyright notice, this list of conditions and the following
+        disclaimer in the documentation and/or other materials provided
+        with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY RICHARD CROWLEY ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL RICHARD CROWLEY OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation
+are those of the authors and should not be interpreted as representing
+official policies, either expressed or implied, of Richard Crowley.

diff --git a/vendor/github.com/rcrowley/go-metrics/README.md b/vendor/github.com/rcrowley/go-metrics/README.md
new file mode 100644
index 0000000..b7356b5
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/README.md

@@ -0,0 +1,168 @@
+go-metrics
+==========
+
+![travis build status](https://travis-ci.org/rcrowley/go-metrics.svg?branch=master)
+
+Go port of Coda Hale's Metrics library: <https://github.com/dropwizard/metrics>.
+
+Documentation: <http://godoc.org/github.com/rcrowley/go-metrics>.
+
+Usage
+-----
+
+Create and update metrics:
+
+```go
+c := metrics.NewCounter()
+metrics.Register("foo", c)
+c.Inc(47)
+
+g := metrics.NewGauge()
+metrics.Register("bar", g)
+g.Update(47)
+
+r := NewRegistry()
+g := metrics.NewRegisteredFunctionalGauge("cache-evictions", r, func() int64 { return cache.getEvictionsCount() })
+
+s := metrics.NewExpDecaySample(1028, 0.015) // or metrics.NewUniformSample(1028)
+h := metrics.NewHistogram(s)
+metrics.Register("baz", h)
+h.Update(47)
+
+m := metrics.NewMeter()
+metrics.Register("quux", m)
+m.Mark(47)
+
+t := metrics.NewTimer()
+metrics.Register("bang", t)
+t.Time(func() {})
+t.Update(47)
+```
+
+Register() is not threadsafe. For threadsafe metric registration use
+GetOrRegister:
+
+```go
+t := metrics.GetOrRegisterTimer("account.create.latency", nil)
+t.Time(func() {})
+t.Update(47)
+```
+
+**NOTE:** Be sure to unregister short-lived meters and timers otherwise they will
+leak memory:
+
+```go
+// Will call Stop() on the Meter to allow for garbage collection
+metrics.Unregister("quux")
+// Or similarly for a Timer that embeds a Meter
+metrics.Unregister("bang")
+```
+
+Periodically log every metric in human-readable form to standard error:
+
+```go
+go metrics.Log(metrics.DefaultRegistry, 5 * time.Second, log.New(os.Stderr, "metrics: ", log.Lmicroseconds))
+```
+
+Periodically log every metric in slightly-more-parseable form to syslog:
+
+```go
+w, _ := syslog.Dial("unixgram", "/dev/log", syslog.LOG_INFO, "metrics")
+go metrics.Syslog(metrics.DefaultRegistry, 60e9, w)
+```
+
+Periodically emit every metric to Graphite using the [Graphite client](https://github.com/cyberdelia/go-metrics-graphite):
+
+```go
+
+import "github.com/cyberdelia/go-metrics-graphite"
+
+addr, _ := net.ResolveTCPAddr("tcp", "127.0.0.1:2003")
+go graphite.Graphite(metrics.DefaultRegistry, 10e9, "metrics", addr)
+```
+
+Periodically emit every metric into InfluxDB:
+
+**NOTE:** this has been pulled out of the library due to constant fluctuations
+in the InfluxDB API. In fact, all client libraries are on their way out. see
+issues [#121](https://github.com/rcrowley/go-metrics/issues/121) and
+[#124](https://github.com/rcrowley/go-metrics/issues/124) for progress and details.
+
+```go
+import "github.com/vrischmann/go-metrics-influxdb"
+
+go influxdb.InfluxDB(metrics.DefaultRegistry,
+  10e9, 
+  "127.0.0.1:8086", 
+  "database-name", 
+  "username", 
+  "password"
+)
+```
+
+Periodically upload every metric to Librato using the [Librato client](https://github.com/mihasya/go-metrics-librato):
+
+**Note**: the client included with this repository under the `librato` package
+has been deprecated and moved to the repository linked above.
+
+```go
+import "github.com/mihasya/go-metrics-librato"
+
+go librato.Librato(metrics.DefaultRegistry,
+    10e9,                  // interval
+    "example@example.com", // account owner email address
+    "token",               // Librato API token
+    "hostname",            // source
+    []float64{0.95},       // percentiles to send
+    time.Millisecond,      // time unit
+)
+```
+
+Periodically emit every metric to StatHat:
+
+```go
+import "github.com/rcrowley/go-metrics/stathat"
+
+go stathat.Stathat(metrics.DefaultRegistry, 10e9, "example@example.com")
+```
+
+Maintain all metrics along with expvars at `/debug/metrics`:
+
+This uses the same mechanism as [the official expvar](http://golang.org/pkg/expvar/)
+but exposed under `/debug/metrics`, which shows a json representation of all your usual expvars
+as well as all your go-metrics.
+
+
+```go
+import "github.com/rcrowley/go-metrics/exp"
+
+exp.Exp(metrics.DefaultRegistry)
+```
+
+Installation
+------------
+
+```sh
+go get github.com/rcrowley/go-metrics
+```
+
+StatHat support additionally requires their Go client:
+
+```sh
+go get github.com/stathat/go
+```
+
+Publishing Metrics
+------------------
+
+Clients are available for the following destinations:
+
+* Librato - https://github.com/mihasya/go-metrics-librato
+* Graphite - https://github.com/cyberdelia/go-metrics-graphite
+* InfluxDB - https://github.com/vrischmann/go-metrics-influxdb
+* Ganglia - https://github.com/appscode/metlia
+* Prometheus - https://github.com/deathowl/go-metrics-prometheus
+* DataDog - https://github.com/syntaqx/go-metrics-datadog
+* SignalFX - https://github.com/pascallouisperez/go-metrics-signalfx
+* Honeycomb - https://github.com/getspine/go-metrics-honeycomb
+* Wavefront - https://github.com/wavefrontHQ/go-metrics-wavefront

diff --git a/vendor/github.com/rcrowley/go-metrics/counter.go b/vendor/github.com/rcrowley/go-metrics/counter.go
new file mode 100644
index 0000000..bb7b039
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/counter.go

@@ -0,0 +1,112 @@
+package metrics
+
+import "sync/atomic"
+
+// Counters hold an int64 value that can be incremented and decremented.
+type Counter interface {
+	Clear()
+	Count() int64
+	Dec(int64)
+	Inc(int64)
+	Snapshot() Counter
+}
+
+// GetOrRegisterCounter returns an existing Counter or constructs and registers
+// a new StandardCounter.
+func GetOrRegisterCounter(name string, r Registry) Counter {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, NewCounter).(Counter)
+}
+
+// NewCounter constructs a new StandardCounter.
+func NewCounter() Counter {
+	if UseNilMetrics {
+		return NilCounter{}
+	}
+	return &StandardCounter{0}
+}
+
+// NewRegisteredCounter constructs and registers a new StandardCounter.
+func NewRegisteredCounter(name string, r Registry) Counter {
+	c := NewCounter()
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// CounterSnapshot is a read-only copy of another Counter.
+type CounterSnapshot int64
+
+// Clear panics.
+func (CounterSnapshot) Clear() {
+	panic("Clear called on a CounterSnapshot")
+}
+
+// Count returns the count at the time the snapshot was taken.
+func (c CounterSnapshot) Count() int64 { return int64(c) }
+
+// Dec panics.
+func (CounterSnapshot) Dec(int64) {
+	panic("Dec called on a CounterSnapshot")
+}
+
+// Inc panics.
+func (CounterSnapshot) Inc(int64) {
+	panic("Inc called on a CounterSnapshot")
+}
+
+// Snapshot returns the snapshot.
+func (c CounterSnapshot) Snapshot() Counter { return c }
+
+// NilCounter is a no-op Counter.
+type NilCounter struct{}
+
+// Clear is a no-op.
+func (NilCounter) Clear() {}
+
+// Count is a no-op.
+func (NilCounter) Count() int64 { return 0 }
+
+// Dec is a no-op.
+func (NilCounter) Dec(i int64) {}
+
+// Inc is a no-op.
+func (NilCounter) Inc(i int64) {}
+
+// Snapshot is a no-op.
+func (NilCounter) Snapshot() Counter { return NilCounter{} }
+
+// StandardCounter is the standard implementation of a Counter and uses the
+// sync/atomic package to manage a single int64 value.
+type StandardCounter struct {
+	count int64
+}
+
+// Clear sets the counter to zero.
+func (c *StandardCounter) Clear() {
+	atomic.StoreInt64(&c.count, 0)
+}
+
+// Count returns the current count.
+func (c *StandardCounter) Count() int64 {
+	return atomic.LoadInt64(&c.count)
+}
+
+// Dec decrements the counter by the given amount.
+func (c *StandardCounter) Dec(i int64) {
+	atomic.AddInt64(&c.count, -i)
+}
+
+// Inc increments the counter by the given amount.
+func (c *StandardCounter) Inc(i int64) {
+	atomic.AddInt64(&c.count, i)
+}
+
+// Snapshot returns a read-only copy of the counter.
+func (c *StandardCounter) Snapshot() Counter {
+	return CounterSnapshot(c.Count())
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/debug.go b/vendor/github.com/rcrowley/go-metrics/debug.go
new file mode 100644
index 0000000..043ccef
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/debug.go

@@ -0,0 +1,76 @@
+package metrics
+
+import (
+	"runtime/debug"
+	"time"
+)
+
+var (
+	debugMetrics struct {
+		GCStats struct {
+			LastGC Gauge
+			NumGC  Gauge
+			Pause  Histogram
+			//PauseQuantiles Histogram
+			PauseTotal Gauge
+		}
+		ReadGCStats Timer
+	}
+	gcStats debug.GCStats
+)
+
+// Capture new values for the Go garbage collector statistics exported in
+// debug.GCStats.  This is designed to be called as a goroutine.
+func CaptureDebugGCStats(r Registry, d time.Duration) {
+	for _ = range time.Tick(d) {
+		CaptureDebugGCStatsOnce(r)
+	}
+}
+
+// Capture new values for the Go garbage collector statistics exported in
+// debug.GCStats.  This is designed to be called in a background goroutine.
+// Giving a registry which has not been given to RegisterDebugGCStats will
+// panic.
+//
+// Be careful (but much less so) with this because debug.ReadGCStats calls
+// the C function runtime·lock(runtime·mheap) which, while not a stop-the-world
+// operation, isn't something you want to be doing all the time.
+func CaptureDebugGCStatsOnce(r Registry) {
+	lastGC := gcStats.LastGC
+	t := time.Now()
+	debug.ReadGCStats(&gcStats)
+	debugMetrics.ReadGCStats.UpdateSince(t)
+
+	debugMetrics.GCStats.LastGC.Update(int64(gcStats.LastGC.UnixNano()))
+	debugMetrics.GCStats.NumGC.Update(int64(gcStats.NumGC))
+	if lastGC != gcStats.LastGC && 0 < len(gcStats.Pause) {
+		debugMetrics.GCStats.Pause.Update(int64(gcStats.Pause[0]))
+	}
+	//debugMetrics.GCStats.PauseQuantiles.Update(gcStats.PauseQuantiles)
+	debugMetrics.GCStats.PauseTotal.Update(int64(gcStats.PauseTotal))
+}
+
+// Register metrics for the Go garbage collector statistics exported in
+// debug.GCStats.  The metrics are named by their fully-qualified Go symbols,
+// i.e. debug.GCStats.PauseTotal.
+func RegisterDebugGCStats(r Registry) {
+	debugMetrics.GCStats.LastGC = NewGauge()
+	debugMetrics.GCStats.NumGC = NewGauge()
+	debugMetrics.GCStats.Pause = NewHistogram(NewExpDecaySample(1028, 0.015))
+	//debugMetrics.GCStats.PauseQuantiles = NewHistogram(NewExpDecaySample(1028, 0.015))
+	debugMetrics.GCStats.PauseTotal = NewGauge()
+	debugMetrics.ReadGCStats = NewTimer()
+
+	r.Register("debug.GCStats.LastGC", debugMetrics.GCStats.LastGC)
+	r.Register("debug.GCStats.NumGC", debugMetrics.GCStats.NumGC)
+	r.Register("debug.GCStats.Pause", debugMetrics.GCStats.Pause)
+	//r.Register("debug.GCStats.PauseQuantiles", debugMetrics.GCStats.PauseQuantiles)
+	r.Register("debug.GCStats.PauseTotal", debugMetrics.GCStats.PauseTotal)
+	r.Register("debug.ReadGCStats", debugMetrics.ReadGCStats)
+}
+
+// Allocate an initial slice for gcStats.Pause to avoid allocations during
+// normal operation.
+func init() {
+	gcStats.Pause = make([]time.Duration, 11)
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/ewma.go b/vendor/github.com/rcrowley/go-metrics/ewma.go
new file mode 100644
index 0000000..a8183dd
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/ewma.go

@@ -0,0 +1,138 @@
+package metrics
+
+import (
+	"math"
+	"sync"
+	"sync/atomic"
+)
+
+// EWMAs continuously calculate an exponentially-weighted moving average
+// based on an outside source of clock ticks.
+type EWMA interface {
+	Rate() float64
+	Snapshot() EWMA
+	Tick()
+	Update(int64)
+}
+
+// NewEWMA constructs a new EWMA with the given alpha.
+func NewEWMA(alpha float64) EWMA {
+	if UseNilMetrics {
+		return NilEWMA{}
+	}
+	return &StandardEWMA{alpha: alpha}
+}
+
+// NewEWMA1 constructs a new EWMA for a one-minute moving average.
+func NewEWMA1() EWMA {
+	return NewEWMA(1 - math.Exp(-5.0/60.0/1))
+}
+
+// NewEWMA5 constructs a new EWMA for a five-minute moving average.
+func NewEWMA5() EWMA {
+	return NewEWMA(1 - math.Exp(-5.0/60.0/5))
+}
+
+// NewEWMA15 constructs a new EWMA for a fifteen-minute moving average.
+func NewEWMA15() EWMA {
+	return NewEWMA(1 - math.Exp(-5.0/60.0/15))
+}
+
+// EWMASnapshot is a read-only copy of another EWMA.
+type EWMASnapshot float64
+
+// Rate returns the rate of events per second at the time the snapshot was
+// taken.
+func (a EWMASnapshot) Rate() float64 { return float64(a) }
+
+// Snapshot returns the snapshot.
+func (a EWMASnapshot) Snapshot() EWMA { return a }
+
+// Tick panics.
+func (EWMASnapshot) Tick() {
+	panic("Tick called on an EWMASnapshot")
+}
+
+// Update panics.
+func (EWMASnapshot) Update(int64) {
+	panic("Update called on an EWMASnapshot")
+}
+
+// NilEWMA is a no-op EWMA.
+type NilEWMA struct{}
+
+// Rate is a no-op.
+func (NilEWMA) Rate() float64 { return 0.0 }
+
+// Snapshot is a no-op.
+func (NilEWMA) Snapshot() EWMA { return NilEWMA{} }
+
+// Tick is a no-op.
+func (NilEWMA) Tick() {}
+
+// Update is a no-op.
+func (NilEWMA) Update(n int64) {}
+
+// StandardEWMA is the standard implementation of an EWMA and tracks the number
+// of uncounted events and processes them on each tick.  It uses the
+// sync/atomic package to manage uncounted events.
+type StandardEWMA struct {
+	uncounted int64 // /!\ this should be the first member to ensure 64-bit alignment
+	alpha     float64
+	rate      uint64
+	init      uint32
+	mutex     sync.Mutex
+}
+
+// Rate returns the moving average rate of events per second.
+func (a *StandardEWMA) Rate() float64 {
+	currentRate := math.Float64frombits(atomic.LoadUint64(&a.rate)) * float64(1e9)
+	return currentRate
+}
+
+// Snapshot returns a read-only copy of the EWMA.
+func (a *StandardEWMA) Snapshot() EWMA {
+	return EWMASnapshot(a.Rate())
+}
+
+// Tick ticks the clock to update the moving average.  It assumes it is called
+// every five seconds.
+func (a *StandardEWMA) Tick() {
+	// Optimization to avoid mutex locking in the hot-path.
+	if atomic.LoadUint32(&a.init) == 1 {
+		a.updateRate(a.fetchInstantRate())
+	} else {
+		// Slow-path: this is only needed on the first Tick() and preserves transactional updating
+		// of init and rate in the else block. The first conditional is needed below because
+		// a different thread could have set a.init = 1 between the time of the first atomic load and when
+		// the lock was acquired.
+		a.mutex.Lock()
+		if atomic.LoadUint32(&a.init) == 1 {
+			// The fetchInstantRate() uses atomic loading, which is unecessary in this critical section
+			// but again, this section is only invoked on the first successful Tick() operation.
+			a.updateRate(a.fetchInstantRate())
+		} else {
+			atomic.StoreUint32(&a.init, 1)
+			atomic.StoreUint64(&a.rate, math.Float64bits(a.fetchInstantRate()))
+		}
+		a.mutex.Unlock()
+	}
+}
+
+func (a *StandardEWMA) fetchInstantRate() float64 {
+	count := atomic.LoadInt64(&a.uncounted)
+	atomic.AddInt64(&a.uncounted, -count)
+	instantRate := float64(count) / float64(5e9)
+	return instantRate
+}
+
+func (a *StandardEWMA) updateRate(instantRate float64) {
+	currentRate := math.Float64frombits(atomic.LoadUint64(&a.rate))
+	currentRate += a.alpha * (instantRate - currentRate)
+	atomic.StoreUint64(&a.rate, math.Float64bits(currentRate))
+}
+
+// Update adds n uncounted events.
+func (a *StandardEWMA) Update(n int64) {
+	atomic.AddInt64(&a.uncounted, n)
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/gauge.go b/vendor/github.com/rcrowley/go-metrics/gauge.go
new file mode 100644
index 0000000..cb57a93
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/gauge.go

@@ -0,0 +1,120 @@
+package metrics
+
+import "sync/atomic"
+
+// Gauges hold an int64 value that can be set arbitrarily.
+type Gauge interface {
+	Snapshot() Gauge
+	Update(int64)
+	Value() int64
+}
+
+// GetOrRegisterGauge returns an existing Gauge or constructs and registers a
+// new StandardGauge.
+func GetOrRegisterGauge(name string, r Registry) Gauge {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, NewGauge).(Gauge)
+}
+
+// NewGauge constructs a new StandardGauge.
+func NewGauge() Gauge {
+	if UseNilMetrics {
+		return NilGauge{}
+	}
+	return &StandardGauge{0}
+}
+
+// NewRegisteredGauge constructs and registers a new StandardGauge.
+func NewRegisteredGauge(name string, r Registry) Gauge {
+	c := NewGauge()
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// NewFunctionalGauge constructs a new FunctionalGauge.
+func NewFunctionalGauge(f func() int64) Gauge {
+	if UseNilMetrics {
+		return NilGauge{}
+	}
+	return &FunctionalGauge{value: f}
+}
+
+// NewRegisteredFunctionalGauge constructs and registers a new StandardGauge.
+func NewRegisteredFunctionalGauge(name string, r Registry, f func() int64) Gauge {
+	c := NewFunctionalGauge(f)
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// GaugeSnapshot is a read-only copy of another Gauge.
+type GaugeSnapshot int64
+
+// Snapshot returns the snapshot.
+func (g GaugeSnapshot) Snapshot() Gauge { return g }
+
+// Update panics.
+func (GaugeSnapshot) Update(int64) {
+	panic("Update called on a GaugeSnapshot")
+}
+
+// Value returns the value at the time the snapshot was taken.
+func (g GaugeSnapshot) Value() int64 { return int64(g) }
+
+// NilGauge is a no-op Gauge.
+type NilGauge struct{}
+
+// Snapshot is a no-op.
+func (NilGauge) Snapshot() Gauge { return NilGauge{} }
+
+// Update is a no-op.
+func (NilGauge) Update(v int64) {}
+
+// Value is a no-op.
+func (NilGauge) Value() int64 { return 0 }
+
+// StandardGauge is the standard implementation of a Gauge and uses the
+// sync/atomic package to manage a single int64 value.
+type StandardGauge struct {
+	value int64
+}
+
+// Snapshot returns a read-only copy of the gauge.
+func (g *StandardGauge) Snapshot() Gauge {
+	return GaugeSnapshot(g.Value())
+}
+
+// Update updates the gauge's value.
+func (g *StandardGauge) Update(v int64) {
+	atomic.StoreInt64(&g.value, v)
+}
+
+// Value returns the gauge's current value.
+func (g *StandardGauge) Value() int64 {
+	return atomic.LoadInt64(&g.value)
+}
+
+// FunctionalGauge returns value from given function
+type FunctionalGauge struct {
+	value func() int64
+}
+
+// Value returns the gauge's current value.
+func (g FunctionalGauge) Value() int64 {
+	return g.value()
+}
+
+// Snapshot returns the snapshot.
+func (g FunctionalGauge) Snapshot() Gauge { return GaugeSnapshot(g.Value()) }
+
+// Update panics.
+func (FunctionalGauge) Update(int64) {
+	panic("Update called on a FunctionalGauge")
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/gauge_float64.go b/vendor/github.com/rcrowley/go-metrics/gauge_float64.go
new file mode 100644
index 0000000..3962e6d
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/gauge_float64.go

@@ -0,0 +1,125 @@
+package metrics
+
+import (
+	"math"
+	"sync/atomic"
+)
+
+// GaugeFloat64s hold a float64 value that can be set arbitrarily.
+type GaugeFloat64 interface {
+	Snapshot() GaugeFloat64
+	Update(float64)
+	Value() float64
+}
+
+// GetOrRegisterGaugeFloat64 returns an existing GaugeFloat64 or constructs and registers a
+// new StandardGaugeFloat64.
+func GetOrRegisterGaugeFloat64(name string, r Registry) GaugeFloat64 {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, NewGaugeFloat64()).(GaugeFloat64)
+}
+
+// NewGaugeFloat64 constructs a new StandardGaugeFloat64.
+func NewGaugeFloat64() GaugeFloat64 {
+	if UseNilMetrics {
+		return NilGaugeFloat64{}
+	}
+	return &StandardGaugeFloat64{
+		value: 0.0,
+	}
+}
+
+// NewRegisteredGaugeFloat64 constructs and registers a new StandardGaugeFloat64.
+func NewRegisteredGaugeFloat64(name string, r Registry) GaugeFloat64 {
+	c := NewGaugeFloat64()
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// NewFunctionalGauge constructs a new FunctionalGauge.
+func NewFunctionalGaugeFloat64(f func() float64) GaugeFloat64 {
+	if UseNilMetrics {
+		return NilGaugeFloat64{}
+	}
+	return &FunctionalGaugeFloat64{value: f}
+}
+
+// NewRegisteredFunctionalGauge constructs and registers a new StandardGauge.
+func NewRegisteredFunctionalGaugeFloat64(name string, r Registry, f func() float64) GaugeFloat64 {
+	c := NewFunctionalGaugeFloat64(f)
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// GaugeFloat64Snapshot is a read-only copy of another GaugeFloat64.
+type GaugeFloat64Snapshot float64
+
+// Snapshot returns the snapshot.
+func (g GaugeFloat64Snapshot) Snapshot() GaugeFloat64 { return g }
+
+// Update panics.
+func (GaugeFloat64Snapshot) Update(float64) {
+	panic("Update called on a GaugeFloat64Snapshot")
+}
+
+// Value returns the value at the time the snapshot was taken.
+func (g GaugeFloat64Snapshot) Value() float64 { return float64(g) }
+
+// NilGauge is a no-op Gauge.
+type NilGaugeFloat64 struct{}
+
+// Snapshot is a no-op.
+func (NilGaugeFloat64) Snapshot() GaugeFloat64 { return NilGaugeFloat64{} }
+
+// Update is a no-op.
+func (NilGaugeFloat64) Update(v float64) {}
+
+// Value is a no-op.
+func (NilGaugeFloat64) Value() float64 { return 0.0 }
+
+// StandardGaugeFloat64 is the standard implementation of a GaugeFloat64 and uses
+// sync.Mutex to manage a single float64 value.
+type StandardGaugeFloat64 struct {
+	value uint64
+}
+
+// Snapshot returns a read-only copy of the gauge.
+func (g *StandardGaugeFloat64) Snapshot() GaugeFloat64 {
+	return GaugeFloat64Snapshot(g.Value())
+}
+
+// Update updates the gauge's value.
+func (g *StandardGaugeFloat64) Update(v float64) {
+	atomic.StoreUint64(&g.value, math.Float64bits(v))
+}
+
+// Value returns the gauge's current value.
+func (g *StandardGaugeFloat64) Value() float64 {
+	return math.Float64frombits(atomic.LoadUint64(&g.value))
+}
+
+// FunctionalGaugeFloat64 returns value from given function
+type FunctionalGaugeFloat64 struct {
+	value func() float64
+}
+
+// Value returns the gauge's current value.
+func (g FunctionalGaugeFloat64) Value() float64 {
+	return g.value()
+}
+
+// Snapshot returns the snapshot.
+func (g FunctionalGaugeFloat64) Snapshot() GaugeFloat64 { return GaugeFloat64Snapshot(g.Value()) }
+
+// Update panics.
+func (FunctionalGaugeFloat64) Update(float64) {
+	panic("Update called on a FunctionalGaugeFloat64")
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/graphite.go b/vendor/github.com/rcrowley/go-metrics/graphite.go
new file mode 100644
index 0000000..abd0a7d
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/graphite.go

@@ -0,0 +1,113 @@
+package metrics
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"net"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// GraphiteConfig provides a container with configuration parameters for
+// the Graphite exporter
+type GraphiteConfig struct {
+	Addr          *net.TCPAddr  // Network address to connect to
+	Registry      Registry      // Registry to be exported
+	FlushInterval time.Duration // Flush interval
+	DurationUnit  time.Duration // Time conversion unit for durations
+	Prefix        string        // Prefix to be prepended to metric names
+	Percentiles   []float64     // Percentiles to export from timers and histograms
+}
+
+// Graphite is a blocking exporter function which reports metrics in r
+// to a graphite server located at addr, flushing them every d duration
+// and prepending metric names with prefix.
+func Graphite(r Registry, d time.Duration, prefix string, addr *net.TCPAddr) {
+	GraphiteWithConfig(GraphiteConfig{
+		Addr:          addr,
+		Registry:      r,
+		FlushInterval: d,
+		DurationUnit:  time.Nanosecond,
+		Prefix:        prefix,
+		Percentiles:   []float64{0.5, 0.75, 0.95, 0.99, 0.999},
+	})
+}
+
+// GraphiteWithConfig is a blocking exporter function just like Graphite,
+// but it takes a GraphiteConfig instead.
+func GraphiteWithConfig(c GraphiteConfig) {
+	log.Printf("WARNING: This go-metrics client has been DEPRECATED! It has been moved to https://github.com/cyberdelia/go-metrics-graphite and will be removed from rcrowley/go-metrics on August 12th 2015")
+	for _ = range time.Tick(c.FlushInterval) {
+		if err := graphite(&c); nil != err {
+			log.Println(err)
+		}
+	}
+}
+
+// GraphiteOnce performs a single submission to Graphite, returning a
+// non-nil error on failed connections. This can be used in a loop
+// similar to GraphiteWithConfig for custom error handling.
+func GraphiteOnce(c GraphiteConfig) error {
+	log.Printf("WARNING: This go-metrics client has been DEPRECATED! It has been moved to https://github.com/cyberdelia/go-metrics-graphite and will be removed from rcrowley/go-metrics on August 12th 2015")
+	return graphite(&c)
+}
+
+func graphite(c *GraphiteConfig) error {
+	now := time.Now().Unix()
+	du := float64(c.DurationUnit)
+	conn, err := net.DialTCP("tcp", nil, c.Addr)
+	if nil != err {
+		return err
+	}
+	defer conn.Close()
+	w := bufio.NewWriter(conn)
+	c.Registry.Each(func(name string, i interface{}) {
+		switch metric := i.(type) {
+		case Counter:
+			fmt.Fprintf(w, "%s.%s.count %d %d\n", c.Prefix, name, metric.Count(), now)
+		case Gauge:
+			fmt.Fprintf(w, "%s.%s.value %d %d\n", c.Prefix, name, metric.Value(), now)
+		case GaugeFloat64:
+			fmt.Fprintf(w, "%s.%s.value %f %d\n", c.Prefix, name, metric.Value(), now)
+		case Histogram:
+			h := metric.Snapshot()
+			ps := h.Percentiles(c.Percentiles)
+			fmt.Fprintf(w, "%s.%s.count %d %d\n", c.Prefix, name, h.Count(), now)
+			fmt.Fprintf(w, "%s.%s.min %d %d\n", c.Prefix, name, h.Min(), now)
+			fmt.Fprintf(w, "%s.%s.max %d %d\n", c.Prefix, name, h.Max(), now)
+			fmt.Fprintf(w, "%s.%s.mean %.2f %d\n", c.Prefix, name, h.Mean(), now)
+			fmt.Fprintf(w, "%s.%s.std-dev %.2f %d\n", c.Prefix, name, h.StdDev(), now)
+			for psIdx, psKey := range c.Percentiles {
+				key := strings.Replace(strconv.FormatFloat(psKey*100.0, 'f', -1, 64), ".", "", 1)
+				fmt.Fprintf(w, "%s.%s.%s-percentile %.2f %d\n", c.Prefix, name, key, ps[psIdx], now)
+			}
+		case Meter:
+			m := metric.Snapshot()
+			fmt.Fprintf(w, "%s.%s.count %d %d\n", c.Prefix, name, m.Count(), now)
+			fmt.Fprintf(w, "%s.%s.one-minute %.2f %d\n", c.Prefix, name, m.Rate1(), now)
+			fmt.Fprintf(w, "%s.%s.five-minute %.2f %d\n", c.Prefix, name, m.Rate5(), now)
+			fmt.Fprintf(w, "%s.%s.fifteen-minute %.2f %d\n", c.Prefix, name, m.Rate15(), now)
+			fmt.Fprintf(w, "%s.%s.mean %.2f %d\n", c.Prefix, name, m.RateMean(), now)
+		case Timer:
+			t := metric.Snapshot()
+			ps := t.Percentiles(c.Percentiles)
+			fmt.Fprintf(w, "%s.%s.count %d %d\n", c.Prefix, name, t.Count(), now)
+			fmt.Fprintf(w, "%s.%s.min %d %d\n", c.Prefix, name, t.Min()/int64(du), now)
+			fmt.Fprintf(w, "%s.%s.max %d %d\n", c.Prefix, name, t.Max()/int64(du), now)
+			fmt.Fprintf(w, "%s.%s.mean %.2f %d\n", c.Prefix, name, t.Mean()/du, now)
+			fmt.Fprintf(w, "%s.%s.std-dev %.2f %d\n", c.Prefix, name, t.StdDev()/du, now)
+			for psIdx, psKey := range c.Percentiles {
+				key := strings.Replace(strconv.FormatFloat(psKey*100.0, 'f', -1, 64), ".", "", 1)
+				fmt.Fprintf(w, "%s.%s.%s-percentile %.2f %d\n", c.Prefix, name, key, ps[psIdx], now)
+			}
+			fmt.Fprintf(w, "%s.%s.one-minute %.2f %d\n", c.Prefix, name, t.Rate1(), now)
+			fmt.Fprintf(w, "%s.%s.five-minute %.2f %d\n", c.Prefix, name, t.Rate5(), now)
+			fmt.Fprintf(w, "%s.%s.fifteen-minute %.2f %d\n", c.Prefix, name, t.Rate15(), now)
+			fmt.Fprintf(w, "%s.%s.mean-rate %.2f %d\n", c.Prefix, name, t.RateMean(), now)
+		}
+		w.Flush()
+	})
+	return nil
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/healthcheck.go b/vendor/github.com/rcrowley/go-metrics/healthcheck.go
new file mode 100644
index 0000000..445131c
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/healthcheck.go

@@ -0,0 +1,61 @@
+package metrics
+
+// Healthchecks hold an error value describing an arbitrary up/down status.
+type Healthcheck interface {
+	Check()
+	Error() error
+	Healthy()
+	Unhealthy(error)
+}
+
+// NewHealthcheck constructs a new Healthcheck which will use the given
+// function to update its status.
+func NewHealthcheck(f func(Healthcheck)) Healthcheck {
+	if UseNilMetrics {
+		return NilHealthcheck{}
+	}
+	return &StandardHealthcheck{nil, f}
+}
+
+// NilHealthcheck is a no-op.
+type NilHealthcheck struct{}
+
+// Check is a no-op.
+func (NilHealthcheck) Check() {}
+
+// Error is a no-op.
+func (NilHealthcheck) Error() error { return nil }
+
+// Healthy is a no-op.
+func (NilHealthcheck) Healthy() {}
+
+// Unhealthy is a no-op.
+func (NilHealthcheck) Unhealthy(error) {}
+
+// StandardHealthcheck is the standard implementation of a Healthcheck and
+// stores the status and a function to call to update the status.
+type StandardHealthcheck struct {
+	err error
+	f   func(Healthcheck)
+}
+
+// Check runs the healthcheck function to update the healthcheck's status.
+func (h *StandardHealthcheck) Check() {
+	h.f(h)
+}
+
+// Error returns the healthcheck's status, which will be nil if it is healthy.
+func (h *StandardHealthcheck) Error() error {
+	return h.err
+}
+
+// Healthy marks the healthcheck as healthy.
+func (h *StandardHealthcheck) Healthy() {
+	h.err = nil
+}
+
+// Unhealthy marks the healthcheck as unhealthy.  The error is stored and
+// may be retrieved by the Error method.
+func (h *StandardHealthcheck) Unhealthy(err error) {
+	h.err = err
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/histogram.go b/vendor/github.com/rcrowley/go-metrics/histogram.go
new file mode 100644
index 0000000..dbc837f
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/histogram.go

@@ -0,0 +1,202 @@
+package metrics
+
+// Histograms calculate distribution statistics from a series of int64 values.
+type Histogram interface {
+	Clear()
+	Count() int64
+	Max() int64
+	Mean() float64
+	Min() int64
+	Percentile(float64) float64
+	Percentiles([]float64) []float64
+	Sample() Sample
+	Snapshot() Histogram
+	StdDev() float64
+	Sum() int64
+	Update(int64)
+	Variance() float64
+}
+
+// GetOrRegisterHistogram returns an existing Histogram or constructs and
+// registers a new StandardHistogram.
+func GetOrRegisterHistogram(name string, r Registry, s Sample) Histogram {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, func() Histogram { return NewHistogram(s) }).(Histogram)
+}
+
+// NewHistogram constructs a new StandardHistogram from a Sample.
+func NewHistogram(s Sample) Histogram {
+	if UseNilMetrics {
+		return NilHistogram{}
+	}
+	return &StandardHistogram{sample: s}
+}
+
+// NewRegisteredHistogram constructs and registers a new StandardHistogram from
+// a Sample.
+func NewRegisteredHistogram(name string, r Registry, s Sample) Histogram {
+	c := NewHistogram(s)
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// HistogramSnapshot is a read-only copy of another Histogram.
+type HistogramSnapshot struct {
+	sample *SampleSnapshot
+}
+
+// Clear panics.
+func (*HistogramSnapshot) Clear() {
+	panic("Clear called on a HistogramSnapshot")
+}
+
+// Count returns the number of samples recorded at the time the snapshot was
+// taken.
+func (h *HistogramSnapshot) Count() int64 { return h.sample.Count() }
+
+// Max returns the maximum value in the sample at the time the snapshot was
+// taken.
+func (h *HistogramSnapshot) Max() int64 { return h.sample.Max() }
+
+// Mean returns the mean of the values in the sample at the time the snapshot
+// was taken.
+func (h *HistogramSnapshot) Mean() float64 { return h.sample.Mean() }
+
+// Min returns the minimum value in the sample at the time the snapshot was
+// taken.
+func (h *HistogramSnapshot) Min() int64 { return h.sample.Min() }
+
+// Percentile returns an arbitrary percentile of values in the sample at the
+// time the snapshot was taken.
+func (h *HistogramSnapshot) Percentile(p float64) float64 {
+	return h.sample.Percentile(p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of values in the sample
+// at the time the snapshot was taken.
+func (h *HistogramSnapshot) Percentiles(ps []float64) []float64 {
+	return h.sample.Percentiles(ps)
+}
+
+// Sample returns the Sample underlying the histogram.
+func (h *HistogramSnapshot) Sample() Sample { return h.sample }
+
+// Snapshot returns the snapshot.
+func (h *HistogramSnapshot) Snapshot() Histogram { return h }
+
+// StdDev returns the standard deviation of the values in the sample at the
+// time the snapshot was taken.
+func (h *HistogramSnapshot) StdDev() float64 { return h.sample.StdDev() }
+
+// Sum returns the sum in the sample at the time the snapshot was taken.
+func (h *HistogramSnapshot) Sum() int64 { return h.sample.Sum() }
+
+// Update panics.
+func (*HistogramSnapshot) Update(int64) {
+	panic("Update called on a HistogramSnapshot")
+}
+
+// Variance returns the variance of inputs at the time the snapshot was taken.
+func (h *HistogramSnapshot) Variance() float64 { return h.sample.Variance() }
+
+// NilHistogram is a no-op Histogram.
+type NilHistogram struct{}
+
+// Clear is a no-op.
+func (NilHistogram) Clear() {}
+
+// Count is a no-op.
+func (NilHistogram) Count() int64 { return 0 }
+
+// Max is a no-op.
+func (NilHistogram) Max() int64 { return 0 }
+
+// Mean is a no-op.
+func (NilHistogram) Mean() float64 { return 0.0 }
+
+// Min is a no-op.
+func (NilHistogram) Min() int64 { return 0 }
+
+// Percentile is a no-op.
+func (NilHistogram) Percentile(p float64) float64 { return 0.0 }
+
+// Percentiles is a no-op.
+func (NilHistogram) Percentiles(ps []float64) []float64 {
+	return make([]float64, len(ps))
+}
+
+// Sample is a no-op.
+func (NilHistogram) Sample() Sample { return NilSample{} }
+
+// Snapshot is a no-op.
+func (NilHistogram) Snapshot() Histogram { return NilHistogram{} }
+
+// StdDev is a no-op.
+func (NilHistogram) StdDev() float64 { return 0.0 }
+
+// Sum is a no-op.
+func (NilHistogram) Sum() int64 { return 0 }
+
+// Update is a no-op.
+func (NilHistogram) Update(v int64) {}
+
+// Variance is a no-op.
+func (NilHistogram) Variance() float64 { return 0.0 }
+
+// StandardHistogram is the standard implementation of a Histogram and uses a
+// Sample to bound its memory use.
+type StandardHistogram struct {
+	sample Sample
+}
+
+// Clear clears the histogram and its sample.
+func (h *StandardHistogram) Clear() { h.sample.Clear() }
+
+// Count returns the number of samples recorded since the histogram was last
+// cleared.
+func (h *StandardHistogram) Count() int64 { return h.sample.Count() }
+
+// Max returns the maximum value in the sample.
+func (h *StandardHistogram) Max() int64 { return h.sample.Max() }
+
+// Mean returns the mean of the values in the sample.
+func (h *StandardHistogram) Mean() float64 { return h.sample.Mean() }
+
+// Min returns the minimum value in the sample.
+func (h *StandardHistogram) Min() int64 { return h.sample.Min() }
+
+// Percentile returns an arbitrary percentile of the values in the sample.
+func (h *StandardHistogram) Percentile(p float64) float64 {
+	return h.sample.Percentile(p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of the values in the
+// sample.
+func (h *StandardHistogram) Percentiles(ps []float64) []float64 {
+	return h.sample.Percentiles(ps)
+}
+
+// Sample returns the Sample underlying the histogram.
+func (h *StandardHistogram) Sample() Sample { return h.sample }
+
+// Snapshot returns a read-only copy of the histogram.
+func (h *StandardHistogram) Snapshot() Histogram {
+	return &HistogramSnapshot{sample: h.sample.Snapshot().(*SampleSnapshot)}
+}
+
+// StdDev returns the standard deviation of the values in the sample.
+func (h *StandardHistogram) StdDev() float64 { return h.sample.StdDev() }
+
+// Sum returns the sum in the sample.
+func (h *StandardHistogram) Sum() int64 { return h.sample.Sum() }
+
+// Update samples a new value.
+func (h *StandardHistogram) Update(v int64) { h.sample.Update(v) }
+
+// Variance returns the variance of the values in the sample.
+func (h *StandardHistogram) Variance() float64 { return h.sample.Variance() }

diff --git a/vendor/github.com/rcrowley/go-metrics/json.go b/vendor/github.com/rcrowley/go-metrics/json.go
new file mode 100644
index 0000000..174b947
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/json.go

@@ -0,0 +1,31 @@
+package metrics
+
+import (
+	"encoding/json"
+	"io"
+	"time"
+)
+
+// MarshalJSON returns a byte slice containing a JSON representation of all
+// the metrics in the Registry.
+func (r *StandardRegistry) MarshalJSON() ([]byte, error) {
+	return json.Marshal(r.GetAll())
+}
+
+// WriteJSON writes metrics from the given registry  periodically to the
+// specified io.Writer as JSON.
+func WriteJSON(r Registry, d time.Duration, w io.Writer) {
+	for _ = range time.Tick(d) {
+		WriteJSONOnce(r, w)
+	}
+}
+
+// WriteJSONOnce writes metrics from the given registry to the specified
+// io.Writer as JSON.
+func WriteJSONOnce(r Registry, w io.Writer) {
+	json.NewEncoder(w).Encode(r)
+}
+
+func (p *PrefixedRegistry) MarshalJSON() ([]byte, error) {
+	return json.Marshal(p.GetAll())
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/log.go b/vendor/github.com/rcrowley/go-metrics/log.go
new file mode 100644
index 0000000..f8074c0
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/log.go

@@ -0,0 +1,80 @@
+package metrics
+
+import (
+	"time"
+)
+
+type Logger interface {
+	Printf(format string, v ...interface{})
+}
+
+func Log(r Registry, freq time.Duration, l Logger) {
+	LogScaled(r, freq, time.Nanosecond, l)
+}
+
+// Output each metric in the given registry periodically using the given
+// logger. Print timings in `scale` units (eg time.Millisecond) rather than nanos.
+func LogScaled(r Registry, freq time.Duration, scale time.Duration, l Logger) {
+	du := float64(scale)
+	duSuffix := scale.String()[1:]
+
+	for _ = range time.Tick(freq) {
+		r.Each(func(name string, i interface{}) {
+			switch metric := i.(type) {
+			case Counter:
+				l.Printf("counter %s\n", name)
+				l.Printf("  count:       %9d\n", metric.Count())
+			case Gauge:
+				l.Printf("gauge %s\n", name)
+				l.Printf("  value:       %9d\n", metric.Value())
+			case GaugeFloat64:
+				l.Printf("gauge %s\n", name)
+				l.Printf("  value:       %f\n", metric.Value())
+			case Healthcheck:
+				metric.Check()
+				l.Printf("healthcheck %s\n", name)
+				l.Printf("  error:       %v\n", metric.Error())
+			case Histogram:
+				h := metric.Snapshot()
+				ps := h.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+				l.Printf("histogram %s\n", name)
+				l.Printf("  count:       %9d\n", h.Count())
+				l.Printf("  min:         %9d\n", h.Min())
+				l.Printf("  max:         %9d\n", h.Max())
+				l.Printf("  mean:        %12.2f\n", h.Mean())
+				l.Printf("  stddev:      %12.2f\n", h.StdDev())
+				l.Printf("  median:      %12.2f\n", ps[0])
+				l.Printf("  75%%:         %12.2f\n", ps[1])
+				l.Printf("  95%%:         %12.2f\n", ps[2])
+				l.Printf("  99%%:         %12.2f\n", ps[3])
+				l.Printf("  99.9%%:       %12.2f\n", ps[4])
+			case Meter:
+				m := metric.Snapshot()
+				l.Printf("meter %s\n", name)
+				l.Printf("  count:       %9d\n", m.Count())
+				l.Printf("  1-min rate:  %12.2f\n", m.Rate1())
+				l.Printf("  5-min rate:  %12.2f\n", m.Rate5())
+				l.Printf("  15-min rate: %12.2f\n", m.Rate15())
+				l.Printf("  mean rate:   %12.2f\n", m.RateMean())
+			case Timer:
+				t := metric.Snapshot()
+				ps := t.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+				l.Printf("timer %s\n", name)
+				l.Printf("  count:       %9d\n", t.Count())
+				l.Printf("  min:         %12.2f%s\n", float64(t.Min())/du, duSuffix)
+				l.Printf("  max:         %12.2f%s\n", float64(t.Max())/du, duSuffix)
+				l.Printf("  mean:        %12.2f%s\n", t.Mean()/du, duSuffix)
+				l.Printf("  stddev:      %12.2f%s\n", t.StdDev()/du, duSuffix)
+				l.Printf("  median:      %12.2f%s\n", ps[0]/du, duSuffix)
+				l.Printf("  75%%:         %12.2f%s\n", ps[1]/du, duSuffix)
+				l.Printf("  95%%:         %12.2f%s\n", ps[2]/du, duSuffix)
+				l.Printf("  99%%:         %12.2f%s\n", ps[3]/du, duSuffix)
+				l.Printf("  99.9%%:       %12.2f%s\n", ps[4]/du, duSuffix)
+				l.Printf("  1-min rate:  %12.2f\n", t.Rate1())
+				l.Printf("  5-min rate:  %12.2f\n", t.Rate5())
+				l.Printf("  15-min rate: %12.2f\n", t.Rate15())
+				l.Printf("  mean rate:   %12.2f\n", t.RateMean())
+			}
+		})
+	}
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/memory.md b/vendor/github.com/rcrowley/go-metrics/memory.md
new file mode 100644
index 0000000..47454f5
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/memory.md

@@ -0,0 +1,285 @@
+Memory usage
+============
+
+(Highly unscientific.)
+
+Command used to gather static memory usage:
+
+```sh
+grep ^Vm "/proc/$(ps fax | grep [m]etrics-bench | awk '{print $1}')/status"
+```
+
+Program used to gather baseline memory usage:
+
+```go
+package main
+
+import "time"
+
+func main() {
+	time.Sleep(600e9)
+}
+```
+
+Baseline
+--------
+
+```
+VmPeak:    42604 kB
+VmSize:    42604 kB
+VmLck:         0 kB
+VmHWM:      1120 kB
+VmRSS:      1120 kB
+VmData:    35460 kB
+VmStk:       136 kB
+VmExe:      1020 kB
+VmLib:      1848 kB
+VmPTE:        36 kB
+VmSwap:        0 kB
+```
+
+Program used to gather metric memory usage (with other metrics being similar):
+
+```go
+package main
+
+import (
+	"fmt"
+	"metrics"
+	"time"
+)
+
+func main() {
+	fmt.Sprintf("foo")
+	metrics.NewRegistry()
+	time.Sleep(600e9)
+}
+```
+
+1000 counters registered
+------------------------
+
+```
+VmPeak:    44016 kB
+VmSize:    44016 kB
+VmLck:         0 kB
+VmHWM:      1928 kB
+VmRSS:      1928 kB
+VmData:    36868 kB
+VmStk:       136 kB
+VmExe:      1024 kB
+VmLib:      1848 kB
+VmPTE:        40 kB
+VmSwap:        0 kB
+```
+
+**1.412 kB virtual, TODO 0.808 kB resident per counter.**
+
+100000 counters registered
+--------------------------
+
+```
+VmPeak:    55024 kB
+VmSize:    55024 kB
+VmLck:         0 kB
+VmHWM:     12440 kB
+VmRSS:     12440 kB
+VmData:    47876 kB
+VmStk:       136 kB
+VmExe:      1024 kB
+VmLib:      1848 kB
+VmPTE:        64 kB
+VmSwap:        0 kB
+```
+
+**0.1242 kB virtual, 0.1132 kB resident per counter.**
+
+1000 gauges registered
+----------------------
+
+```
+VmPeak:    44012 kB
+VmSize:    44012 kB
+VmLck:         0 kB
+VmHWM:      1928 kB
+VmRSS:      1928 kB
+VmData:    36868 kB
+VmStk:       136 kB
+VmExe:      1020 kB
+VmLib:      1848 kB
+VmPTE:        40 kB
+VmSwap:        0 kB
+```
+
+**1.408 kB virtual, 0.808 kB resident per counter.**
+
+100000 gauges registered
+------------------------
+
+```
+VmPeak:    55020 kB
+VmSize:    55020 kB
+VmLck:         0 kB
+VmHWM:     12432 kB
+VmRSS:     12432 kB
+VmData:    47876 kB
+VmStk:       136 kB
+VmExe:      1020 kB
+VmLib:      1848 kB
+VmPTE:        60 kB
+VmSwap:        0 kB
+```
+
+**0.12416 kB virtual, 0.11312 resident per gauge.**
+
+1000 histograms with a uniform sample size of 1028
+--------------------------------------------------
+
+```
+VmPeak:    72272 kB
+VmSize:    72272 kB
+VmLck:         0 kB
+VmHWM:     16204 kB
+VmRSS:     16204 kB
+VmData:    65100 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:        80 kB
+VmSwap:        0 kB
+```
+
+**29.668 kB virtual, TODO 15.084 resident per histogram.**
+
+10000 histograms with a uniform sample size of 1028
+---------------------------------------------------
+
+```
+VmPeak:   256912 kB
+VmSize:   256912 kB
+VmLck:         0 kB
+VmHWM:    146204 kB
+VmRSS:    146204 kB
+VmData:   249740 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:       448 kB
+VmSwap:        0 kB
+```
+
+**21.4308 kB virtual, 14.5084 kB resident per histogram.**
+
+50000 histograms with a uniform sample size of 1028
+---------------------------------------------------
+
+```
+VmPeak:   908112 kB
+VmSize:   908112 kB
+VmLck:         0 kB
+VmHWM:    645832 kB
+VmRSS:    645588 kB
+VmData:   900940 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:      1716 kB
+VmSwap:     1544 kB
+```
+
+**17.31016 kB virtual, 12.88936 kB resident per histogram.**
+
+1000 histograms with an exponentially-decaying sample size of 1028 and alpha of 0.015
+-------------------------------------------------------------------------------------
+
+```
+VmPeak:    62480 kB
+VmSize:    62480 kB
+VmLck:         0 kB
+VmHWM:     11572 kB
+VmRSS:     11572 kB
+VmData:    55308 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:        64 kB
+VmSwap:        0 kB
+```
+
+**19.876 kB virtual, 10.452 kB resident per histogram.**
+
+10000 histograms with an exponentially-decaying sample size of 1028 and alpha of 0.015
+--------------------------------------------------------------------------------------
+
+```
+VmPeak:   153296 kB
+VmSize:   153296 kB
+VmLck:         0 kB
+VmHWM:    101176 kB
+VmRSS:    101176 kB
+VmData:   146124 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:       240 kB
+VmSwap:        0 kB
+```
+
+**11.0692 kB virtual, 10.0056 kB resident per histogram.**
+
+50000 histograms with an exponentially-decaying sample size of 1028 and alpha of 0.015
+--------------------------------------------------------------------------------------
+
+```
+VmPeak:   557264 kB
+VmSize:   557264 kB
+VmLck:         0 kB
+VmHWM:    501056 kB
+VmRSS:    501056 kB
+VmData:   550092 kB
+VmStk:       136 kB
+VmExe:      1048 kB
+VmLib:      1848 kB
+VmPTE:      1032 kB
+VmSwap:        0 kB
+```
+
+**10.2932 kB virtual, 9.99872 kB resident per histogram.**
+
+1000 meters
+-----------
+
+```
+VmPeak:    74504 kB
+VmSize:    74504 kB
+VmLck:         0 kB
+VmHWM:     24124 kB
+VmRSS:     24124 kB
+VmData:    67340 kB
+VmStk:       136 kB
+VmExe:      1040 kB
+VmLib:      1848 kB
+VmPTE:        92 kB
+VmSwap:        0 kB
+```
+
+**31.9 kB virtual, 23.004 kB resident per meter.**
+
+10000 meters
+------------
+
+```
+VmPeak:   278920 kB
+VmSize:   278920 kB
+VmLck:         0 kB
+VmHWM:    227300 kB
+VmRSS:    227300 kB
+VmData:   271756 kB
+VmStk:       136 kB
+VmExe:      1040 kB
+VmLib:      1848 kB
+VmPTE:       488 kB
+VmSwap:        0 kB
+```
+
+**23.6316 kB virtual, 22.618 kB resident per meter.**

diff --git a/vendor/github.com/rcrowley/go-metrics/meter.go b/vendor/github.com/rcrowley/go-metrics/meter.go
new file mode 100644
index 0000000..223669b
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/meter.go

@@ -0,0 +1,251 @@
+package metrics
+
+import (
+	"math"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+// Meters count events to produce exponentially-weighted moving average rates
+// at one-, five-, and fifteen-minutes and a mean rate.
+type Meter interface {
+	Count() int64
+	Mark(int64)
+	Rate1() float64
+	Rate5() float64
+	Rate15() float64
+	RateMean() float64
+	Snapshot() Meter
+	Stop()
+}
+
+// GetOrRegisterMeter returns an existing Meter or constructs and registers a
+// new StandardMeter.
+// Be sure to unregister the meter from the registry once it is of no use to
+// allow for garbage collection.
+func GetOrRegisterMeter(name string, r Registry) Meter {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, NewMeter).(Meter)
+}
+
+// NewMeter constructs a new StandardMeter and launches a goroutine.
+// Be sure to call Stop() once the meter is of no use to allow for garbage collection.
+func NewMeter() Meter {
+	if UseNilMetrics {
+		return NilMeter{}
+	}
+	m := newStandardMeter()
+	arbiter.Lock()
+	defer arbiter.Unlock()
+	arbiter.meters[m] = struct{}{}
+	if !arbiter.started {
+		arbiter.started = true
+		go arbiter.tick()
+	}
+	return m
+}
+
+// NewMeter constructs and registers a new StandardMeter and launches a
+// goroutine.
+// Be sure to unregister the meter from the registry once it is of no use to
+// allow for garbage collection.
+func NewRegisteredMeter(name string, r Registry) Meter {
+	c := NewMeter()
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// MeterSnapshot is a read-only copy of another Meter.
+type MeterSnapshot struct {
+	count                          int64
+	rate1, rate5, rate15, rateMean uint64
+}
+
+// Count returns the count of events at the time the snapshot was taken.
+func (m *MeterSnapshot) Count() int64 { return m.count }
+
+// Mark panics.
+func (*MeterSnapshot) Mark(n int64) {
+	panic("Mark called on a MeterSnapshot")
+}
+
+// Rate1 returns the one-minute moving average rate of events per second at the
+// time the snapshot was taken.
+func (m *MeterSnapshot) Rate1() float64 { return math.Float64frombits(m.rate1) }
+
+// Rate5 returns the five-minute moving average rate of events per second at
+// the time the snapshot was taken.
+func (m *MeterSnapshot) Rate5() float64 { return math.Float64frombits(m.rate5) }
+
+// Rate15 returns the fifteen-minute moving average rate of events per second
+// at the time the snapshot was taken.
+func (m *MeterSnapshot) Rate15() float64 { return math.Float64frombits(m.rate15) }
+
+// RateMean returns the meter's mean rate of events per second at the time the
+// snapshot was taken.
+func (m *MeterSnapshot) RateMean() float64 { return math.Float64frombits(m.rateMean) }
+
+// Snapshot returns the snapshot.
+func (m *MeterSnapshot) Snapshot() Meter { return m }
+
+// Stop is a no-op.
+func (m *MeterSnapshot) Stop() {}
+
+// NilMeter is a no-op Meter.
+type NilMeter struct{}
+
+// Count is a no-op.
+func (NilMeter) Count() int64 { return 0 }
+
+// Mark is a no-op.
+func (NilMeter) Mark(n int64) {}
+
+// Rate1 is a no-op.
+func (NilMeter) Rate1() float64 { return 0.0 }
+
+// Rate5 is a no-op.
+func (NilMeter) Rate5() float64 { return 0.0 }
+
+// Rate15is a no-op.
+func (NilMeter) Rate15() float64 { return 0.0 }
+
+// RateMean is a no-op.
+func (NilMeter) RateMean() float64 { return 0.0 }
+
+// Snapshot is a no-op.
+func (NilMeter) Snapshot() Meter { return NilMeter{} }
+
+// Stop is a no-op.
+func (NilMeter) Stop() {}
+
+// StandardMeter is the standard implementation of a Meter.
+type StandardMeter struct {
+	snapshot    *MeterSnapshot
+	a1, a5, a15 EWMA
+	startTime   time.Time
+	stopped     uint32
+}
+
+func newStandardMeter() *StandardMeter {
+	return &StandardMeter{
+		snapshot:  &MeterSnapshot{},
+		a1:        NewEWMA1(),
+		a5:        NewEWMA5(),
+		a15:       NewEWMA15(),
+		startTime: time.Now(),
+	}
+}
+
+// Stop stops the meter, Mark() will be a no-op if you use it after being stopped.
+func (m *StandardMeter) Stop() {
+	if atomic.CompareAndSwapUint32(&m.stopped, 0, 1) {
+		arbiter.Lock()
+		delete(arbiter.meters, m)
+		arbiter.Unlock()
+	}
+}
+
+// Count returns the number of events recorded.
+func (m *StandardMeter) Count() int64 {
+	return atomic.LoadInt64(&m.snapshot.count)
+}
+
+// Mark records the occurance of n events.
+func (m *StandardMeter) Mark(n int64) {
+	if atomic.LoadUint32(&m.stopped) == 1 {
+		return
+	}
+
+	atomic.AddInt64(&m.snapshot.count, n)
+
+	m.a1.Update(n)
+	m.a5.Update(n)
+	m.a15.Update(n)
+	m.updateSnapshot()
+}
+
+// Rate1 returns the one-minute moving average rate of events per second.
+func (m *StandardMeter) Rate1() float64 {
+	return math.Float64frombits(atomic.LoadUint64(&m.snapshot.rate1))
+}
+
+// Rate5 returns the five-minute moving average rate of events per second.
+func (m *StandardMeter) Rate5() float64 {
+	return math.Float64frombits(atomic.LoadUint64(&m.snapshot.rate5))
+}
+
+// Rate15 returns the fifteen-minute moving average rate of events per second.
+func (m *StandardMeter) Rate15() float64 {
+	return math.Float64frombits(atomic.LoadUint64(&m.snapshot.rate15))
+}
+
+// RateMean returns the meter's mean rate of events per second.
+func (m *StandardMeter) RateMean() float64 {
+	return math.Float64frombits(atomic.LoadUint64(&m.snapshot.rateMean))
+}
+
+// Snapshot returns a read-only copy of the meter.
+func (m *StandardMeter) Snapshot() Meter {
+	copiedSnapshot := MeterSnapshot{
+		count:    atomic.LoadInt64(&m.snapshot.count),
+		rate1:    atomic.LoadUint64(&m.snapshot.rate1),
+		rate5:    atomic.LoadUint64(&m.snapshot.rate5),
+		rate15:   atomic.LoadUint64(&m.snapshot.rate15),
+		rateMean: atomic.LoadUint64(&m.snapshot.rateMean),
+	}
+	return &copiedSnapshot
+}
+
+func (m *StandardMeter) updateSnapshot() {
+	rate1 := math.Float64bits(m.a1.Rate())
+	rate5 := math.Float64bits(m.a5.Rate())
+	rate15 := math.Float64bits(m.a15.Rate())
+	rateMean := math.Float64bits(float64(m.Count()) / time.Since(m.startTime).Seconds())
+
+	atomic.StoreUint64(&m.snapshot.rate1, rate1)
+	atomic.StoreUint64(&m.snapshot.rate5, rate5)
+	atomic.StoreUint64(&m.snapshot.rate15, rate15)
+	atomic.StoreUint64(&m.snapshot.rateMean, rateMean)
+}
+
+func (m *StandardMeter) tick() {
+	m.a1.Tick()
+	m.a5.Tick()
+	m.a15.Tick()
+	m.updateSnapshot()
+}
+
+// meterArbiter ticks meters every 5s from a single goroutine.
+// meters are references in a set for future stopping.
+type meterArbiter struct {
+	sync.RWMutex
+	started bool
+	meters  map[*StandardMeter]struct{}
+	ticker  *time.Ticker
+}
+
+var arbiter = meterArbiter{ticker: time.NewTicker(5e9), meters: make(map[*StandardMeter]struct{})}
+
+// Ticks meters on the scheduled interval
+func (ma *meterArbiter) tick() {
+	for {
+		select {
+		case <-ma.ticker.C:
+			ma.tickMeters()
+		}
+	}
+}
+
+func (ma *meterArbiter) tickMeters() {
+	ma.RLock()
+	defer ma.RUnlock()
+	for meter := range ma.meters {
+		meter.tick()
+	}
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/metrics.go b/vendor/github.com/rcrowley/go-metrics/metrics.go
new file mode 100644
index 0000000..b97a49e
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/metrics.go

@@ -0,0 +1,13 @@
+// Go port of Coda Hale's Metrics library
+//
+// <https://github.com/rcrowley/go-metrics>
+//
+// Coda Hale's original work: <https://github.com/codahale/metrics>
+package metrics
+
+// UseNilMetrics is checked by the constructor functions for all of the
+// standard metrics.  If it is true, the metric returned is a stub.
+//
+// This global kill-switch helps quantify the observer effect and makes
+// for less cluttered pprof profiles.
+var UseNilMetrics bool = false

diff --git a/vendor/github.com/rcrowley/go-metrics/opentsdb.go b/vendor/github.com/rcrowley/go-metrics/opentsdb.go
new file mode 100644
index 0000000..266b6c9
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/opentsdb.go

@@ -0,0 +1,119 @@
+package metrics
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"net"
+	"os"
+	"strings"
+	"time"
+)
+
+var shortHostName string = ""
+
+// OpenTSDBConfig provides a container with configuration parameters for
+// the OpenTSDB exporter
+type OpenTSDBConfig struct {
+	Addr          *net.TCPAddr  // Network address to connect to
+	Registry      Registry      // Registry to be exported
+	FlushInterval time.Duration // Flush interval
+	DurationUnit  time.Duration // Time conversion unit for durations
+	Prefix        string        // Prefix to be prepended to metric names
+}
+
+// OpenTSDB is a blocking exporter function which reports metrics in r
+// to a TSDB server located at addr, flushing them every d duration
+// and prepending metric names with prefix.
+func OpenTSDB(r Registry, d time.Duration, prefix string, addr *net.TCPAddr) {
+	OpenTSDBWithConfig(OpenTSDBConfig{
+		Addr:          addr,
+		Registry:      r,
+		FlushInterval: d,
+		DurationUnit:  time.Nanosecond,
+		Prefix:        prefix,
+	})
+}
+
+// OpenTSDBWithConfig is a blocking exporter function just like OpenTSDB,
+// but it takes a OpenTSDBConfig instead.
+func OpenTSDBWithConfig(c OpenTSDBConfig) {
+	for _ = range time.Tick(c.FlushInterval) {
+		if err := openTSDB(&c); nil != err {
+			log.Println(err)
+		}
+	}
+}
+
+func getShortHostname() string {
+	if shortHostName == "" {
+		host, _ := os.Hostname()
+		if index := strings.Index(host, "."); index > 0 {
+			shortHostName = host[:index]
+		} else {
+			shortHostName = host
+		}
+	}
+	return shortHostName
+}
+
+func openTSDB(c *OpenTSDBConfig) error {
+	shortHostname := getShortHostname()
+	now := time.Now().Unix()
+	du := float64(c.DurationUnit)
+	conn, err := net.DialTCP("tcp", nil, c.Addr)
+	if nil != err {
+		return err
+	}
+	defer conn.Close()
+	w := bufio.NewWriter(conn)
+	c.Registry.Each(func(name string, i interface{}) {
+		switch metric := i.(type) {
+		case Counter:
+			fmt.Fprintf(w, "put %s.%s.count %d %d host=%s\n", c.Prefix, name, now, metric.Count(), shortHostname)
+		case Gauge:
+			fmt.Fprintf(w, "put %s.%s.value %d %d host=%s\n", c.Prefix, name, now, metric.Value(), shortHostname)
+		case GaugeFloat64:
+			fmt.Fprintf(w, "put %s.%s.value %d %f host=%s\n", c.Prefix, name, now, metric.Value(), shortHostname)
+		case Histogram:
+			h := metric.Snapshot()
+			ps := h.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			fmt.Fprintf(w, "put %s.%s.count %d %d host=%s\n", c.Prefix, name, now, h.Count(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.min %d %d host=%s\n", c.Prefix, name, now, h.Min(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.max %d %d host=%s\n", c.Prefix, name, now, h.Max(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.mean %d %.2f host=%s\n", c.Prefix, name, now, h.Mean(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.std-dev %d %.2f host=%s\n", c.Prefix, name, now, h.StdDev(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.50-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[0], shortHostname)
+			fmt.Fprintf(w, "put %s.%s.75-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[1], shortHostname)
+			fmt.Fprintf(w, "put %s.%s.95-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[2], shortHostname)
+			fmt.Fprintf(w, "put %s.%s.99-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[3], shortHostname)
+			fmt.Fprintf(w, "put %s.%s.999-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[4], shortHostname)
+		case Meter:
+			m := metric.Snapshot()
+			fmt.Fprintf(w, "put %s.%s.count %d %d host=%s\n", c.Prefix, name, now, m.Count(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.one-minute %d %.2f host=%s\n", c.Prefix, name, now, m.Rate1(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.five-minute %d %.2f host=%s\n", c.Prefix, name, now, m.Rate5(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.fifteen-minute %d %.2f host=%s\n", c.Prefix, name, now, m.Rate15(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.mean %d %.2f host=%s\n", c.Prefix, name, now, m.RateMean(), shortHostname)
+		case Timer:
+			t := metric.Snapshot()
+			ps := t.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			fmt.Fprintf(w, "put %s.%s.count %d %d host=%s\n", c.Prefix, name, now, t.Count(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.min %d %d host=%s\n", c.Prefix, name, now, t.Min()/int64(du), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.max %d %d host=%s\n", c.Prefix, name, now, t.Max()/int64(du), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.mean %d %.2f host=%s\n", c.Prefix, name, now, t.Mean()/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.std-dev %d %.2f host=%s\n", c.Prefix, name, now, t.StdDev()/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.50-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[0]/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.75-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[1]/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.95-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[2]/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.99-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[3]/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.999-percentile %d %.2f host=%s\n", c.Prefix, name, now, ps[4]/du, shortHostname)
+			fmt.Fprintf(w, "put %s.%s.one-minute %d %.2f host=%s\n", c.Prefix, name, now, t.Rate1(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.five-minute %d %.2f host=%s\n", c.Prefix, name, now, t.Rate5(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.fifteen-minute %d %.2f host=%s\n", c.Prefix, name, now, t.Rate15(), shortHostname)
+			fmt.Fprintf(w, "put %s.%s.mean-rate %d %.2f host=%s\n", c.Prefix, name, now, t.RateMean(), shortHostname)
+		}
+		w.Flush()
+	})
+	return nil
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/registry.go b/vendor/github.com/rcrowley/go-metrics/registry.go
new file mode 100644
index 0000000..b3bab64
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/registry.go

@@ -0,0 +1,363 @@
+package metrics
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+	"sync"
+)
+
+// DuplicateMetric is the error returned by Registry.Register when a metric
+// already exists.  If you mean to Register that metric you must first
+// Unregister the existing metric.
+type DuplicateMetric string
+
+func (err DuplicateMetric) Error() string {
+	return fmt.Sprintf("duplicate metric: %s", string(err))
+}
+
+// A Registry holds references to a set of metrics by name and can iterate
+// over them, calling callback functions provided by the user.
+//
+// This is an interface so as to encourage other structs to implement
+// the Registry API as appropriate.
+type Registry interface {
+
+	// Call the given function for each registered metric.
+	Each(func(string, interface{}))
+
+	// Get the metric by the given name or nil if none is registered.
+	Get(string) interface{}
+
+	// GetAll metrics in the Registry.
+	GetAll() map[string]map[string]interface{}
+
+	// Gets an existing metric or registers the given one.
+	// The interface can be the metric to register if not found in registry,
+	// or a function returning the metric for lazy instantiation.
+	GetOrRegister(string, interface{}) interface{}
+
+	// Register the given metric under the given name.
+	Register(string, interface{}) error
+
+	// Run all registered healthchecks.
+	RunHealthchecks()
+
+	// Unregister the metric with the given name.
+	Unregister(string)
+
+	// Unregister all metrics.  (Mostly for testing.)
+	UnregisterAll()
+}
+
+// The standard implementation of a Registry is a mutex-protected map
+// of names to metrics.
+type StandardRegistry struct {
+	metrics map[string]interface{}
+	mutex   sync.RWMutex
+}
+
+// Create a new registry.
+func NewRegistry() Registry {
+	return &StandardRegistry{metrics: make(map[string]interface{})}
+}
+
+// Call the given function for each registered metric.
+func (r *StandardRegistry) Each(f func(string, interface{})) {
+	for name, i := range r.registered() {
+		f(name, i)
+	}
+}
+
+// Get the metric by the given name or nil if none is registered.
+func (r *StandardRegistry) Get(name string) interface{} {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
+	return r.metrics[name]
+}
+
+// Gets an existing metric or creates and registers a new one. Threadsafe
+// alternative to calling Get and Register on failure.
+// The interface can be the metric to register if not found in registry,
+// or a function returning the metric for lazy instantiation.
+func (r *StandardRegistry) GetOrRegister(name string, i interface{}) interface{} {
+	// access the read lock first which should be re-entrant
+	r.mutex.RLock()
+	metric, ok := r.metrics[name]
+	r.mutex.RUnlock()
+	if ok {
+		return metric
+	}
+
+	// only take the write lock if we'll be modifying the metrics map
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+	if metric, ok := r.metrics[name]; ok {
+		return metric
+	}
+	if v := reflect.ValueOf(i); v.Kind() == reflect.Func {
+		i = v.Call(nil)[0].Interface()
+	}
+	r.register(name, i)
+	return i
+}
+
+// Register the given metric under the given name.  Returns a DuplicateMetric
+// if a metric by the given name is already registered.
+func (r *StandardRegistry) Register(name string, i interface{}) error {
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+	return r.register(name, i)
+}
+
+// Run all registered healthchecks.
+func (r *StandardRegistry) RunHealthchecks() {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
+	for _, i := range r.metrics {
+		if h, ok := i.(Healthcheck); ok {
+			h.Check()
+		}
+	}
+}
+
+// GetAll metrics in the Registry
+func (r *StandardRegistry) GetAll() map[string]map[string]interface{} {
+	data := make(map[string]map[string]interface{})
+	r.Each(func(name string, i interface{}) {
+		values := make(map[string]interface{})
+		switch metric := i.(type) {
+		case Counter:
+			values["count"] = metric.Count()
+		case Gauge:
+			values["value"] = metric.Value()
+		case GaugeFloat64:
+			values["value"] = metric.Value()
+		case Healthcheck:
+			values["error"] = nil
+			metric.Check()
+			if err := metric.Error(); nil != err {
+				values["error"] = metric.Error().Error()
+			}
+		case Histogram:
+			h := metric.Snapshot()
+			ps := h.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			values["count"] = h.Count()
+			values["min"] = h.Min()
+			values["max"] = h.Max()
+			values["mean"] = h.Mean()
+			values["stddev"] = h.StdDev()
+			values["median"] = ps[0]
+			values["75%"] = ps[1]
+			values["95%"] = ps[2]
+			values["99%"] = ps[3]
+			values["99.9%"] = ps[4]
+		case Meter:
+			m := metric.Snapshot()
+			values["count"] = m.Count()
+			values["1m.rate"] = m.Rate1()
+			values["5m.rate"] = m.Rate5()
+			values["15m.rate"] = m.Rate15()
+			values["mean.rate"] = m.RateMean()
+		case Timer:
+			t := metric.Snapshot()
+			ps := t.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			values["count"] = t.Count()
+			values["min"] = t.Min()
+			values["max"] = t.Max()
+			values["mean"] = t.Mean()
+			values["stddev"] = t.StdDev()
+			values["median"] = ps[0]
+			values["75%"] = ps[1]
+			values["95%"] = ps[2]
+			values["99%"] = ps[3]
+			values["99.9%"] = ps[4]
+			values["1m.rate"] = t.Rate1()
+			values["5m.rate"] = t.Rate5()
+			values["15m.rate"] = t.Rate15()
+			values["mean.rate"] = t.RateMean()
+		}
+		data[name] = values
+	})
+	return data
+}
+
+// Unregister the metric with the given name.
+func (r *StandardRegistry) Unregister(name string) {
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+	r.stop(name)
+	delete(r.metrics, name)
+}
+
+// Unregister all metrics.  (Mostly for testing.)
+func (r *StandardRegistry) UnregisterAll() {
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+	for name, _ := range r.metrics {
+		r.stop(name)
+		delete(r.metrics, name)
+	}
+}
+
+func (r *StandardRegistry) register(name string, i interface{}) error {
+	if _, ok := r.metrics[name]; ok {
+		return DuplicateMetric(name)
+	}
+	switch i.(type) {
+	case Counter, Gauge, GaugeFloat64, Healthcheck, Histogram, Meter, Timer:
+		r.metrics[name] = i
+	}
+	return nil
+}
+
+func (r *StandardRegistry) registered() map[string]interface{} {
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+	metrics := make(map[string]interface{}, len(r.metrics))
+	for name, i := range r.metrics {
+		metrics[name] = i
+	}
+	return metrics
+}
+
+func (r *StandardRegistry) stop(name string) {
+	if i, ok := r.metrics[name]; ok {
+		if s, ok := i.(Stoppable); ok {
+			s.Stop()
+		}
+	}
+}
+
+// Stoppable defines the metrics which has to be stopped.
+type Stoppable interface {
+	Stop()
+}
+
+type PrefixedRegistry struct {
+	underlying Registry
+	prefix     string
+}
+
+func NewPrefixedRegistry(prefix string) Registry {
+	return &PrefixedRegistry{
+		underlying: NewRegistry(),
+		prefix:     prefix,
+	}
+}
+
+func NewPrefixedChildRegistry(parent Registry, prefix string) Registry {
+	return &PrefixedRegistry{
+		underlying: parent,
+		prefix:     prefix,
+	}
+}
+
+// Call the given function for each registered metric.
+func (r *PrefixedRegistry) Each(fn func(string, interface{})) {
+	wrappedFn := func(prefix string) func(string, interface{}) {
+		return func(name string, iface interface{}) {
+			if strings.HasPrefix(name, prefix) {
+				fn(name, iface)
+			} else {
+				return
+			}
+		}
+	}
+
+	baseRegistry, prefix := findPrefix(r, "")
+	baseRegistry.Each(wrappedFn(prefix))
+}
+
+func findPrefix(registry Registry, prefix string) (Registry, string) {
+	switch r := registry.(type) {
+	case *PrefixedRegistry:
+		return findPrefix(r.underlying, r.prefix+prefix)
+	case *StandardRegistry:
+		return r, prefix
+	}
+	return nil, ""
+}
+
+// Get the metric by the given name or nil if none is registered.
+func (r *PrefixedRegistry) Get(name string) interface{} {
+	realName := r.prefix + name
+	return r.underlying.Get(realName)
+}
+
+// Gets an existing metric or registers the given one.
+// The interface can be the metric to register if not found in registry,
+// or a function returning the metric for lazy instantiation.
+func (r *PrefixedRegistry) GetOrRegister(name string, metric interface{}) interface{} {
+	realName := r.prefix + name
+	return r.underlying.GetOrRegister(realName, metric)
+}
+
+// Register the given metric under the given name. The name will be prefixed.
+func (r *PrefixedRegistry) Register(name string, metric interface{}) error {
+	realName := r.prefix + name
+	return r.underlying.Register(realName, metric)
+}
+
+// Run all registered healthchecks.
+func (r *PrefixedRegistry) RunHealthchecks() {
+	r.underlying.RunHealthchecks()
+}
+
+// GetAll metrics in the Registry
+func (r *PrefixedRegistry) GetAll() map[string]map[string]interface{} {
+	return r.underlying.GetAll()
+}
+
+// Unregister the metric with the given name. The name will be prefixed.
+func (r *PrefixedRegistry) Unregister(name string) {
+	realName := r.prefix + name
+	r.underlying.Unregister(realName)
+}
+
+// Unregister all metrics.  (Mostly for testing.)
+func (r *PrefixedRegistry) UnregisterAll() {
+	r.underlying.UnregisterAll()
+}
+
+var DefaultRegistry Registry = NewRegistry()
+
+// Call the given function for each registered metric.
+func Each(f func(string, interface{})) {
+	DefaultRegistry.Each(f)
+}
+
+// Get the metric by the given name or nil if none is registered.
+func Get(name string) interface{} {
+	return DefaultRegistry.Get(name)
+}
+
+// Gets an existing metric or creates and registers a new one. Threadsafe
+// alternative to calling Get and Register on failure.
+func GetOrRegister(name string, i interface{}) interface{} {
+	return DefaultRegistry.GetOrRegister(name, i)
+}
+
+// Register the given metric under the given name.  Returns a DuplicateMetric
+// if a metric by the given name is already registered.
+func Register(name string, i interface{}) error {
+	return DefaultRegistry.Register(name, i)
+}
+
+// Register the given metric under the given name.  Panics if a metric by the
+// given name is already registered.
+func MustRegister(name string, i interface{}) {
+	if err := Register(name, i); err != nil {
+		panic(err)
+	}
+}
+
+// Run all registered healthchecks.
+func RunHealthchecks() {
+	DefaultRegistry.RunHealthchecks()
+}
+
+// Unregister the metric with the given name.
+func Unregister(name string) {
+	DefaultRegistry.Unregister(name)
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/runtime.go b/vendor/github.com/rcrowley/go-metrics/runtime.go
new file mode 100644
index 0000000..11c6b78
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/runtime.go

@@ -0,0 +1,212 @@
+package metrics
+
+import (
+	"runtime"
+	"runtime/pprof"
+	"time"
+)
+
+var (
+	memStats       runtime.MemStats
+	runtimeMetrics struct {
+		MemStats struct {
+			Alloc         Gauge
+			BuckHashSys   Gauge
+			DebugGC       Gauge
+			EnableGC      Gauge
+			Frees         Gauge
+			HeapAlloc     Gauge
+			HeapIdle      Gauge
+			HeapInuse     Gauge
+			HeapObjects   Gauge
+			HeapReleased  Gauge
+			HeapSys       Gauge
+			LastGC        Gauge
+			Lookups       Gauge
+			Mallocs       Gauge
+			MCacheInuse   Gauge
+			MCacheSys     Gauge
+			MSpanInuse    Gauge
+			MSpanSys      Gauge
+			NextGC        Gauge
+			NumGC         Gauge
+			GCCPUFraction GaugeFloat64
+			PauseNs       Histogram
+			PauseTotalNs  Gauge
+			StackInuse    Gauge
+			StackSys      Gauge
+			Sys           Gauge
+			TotalAlloc    Gauge
+		}
+		NumCgoCall   Gauge
+		NumGoroutine Gauge
+		NumThread    Gauge
+		ReadMemStats Timer
+	}
+	frees       uint64
+	lookups     uint64
+	mallocs     uint64
+	numGC       uint32
+	numCgoCalls int64
+
+	threadCreateProfile = pprof.Lookup("threadcreate")
+)
+
+// Capture new values for the Go runtime statistics exported in
+// runtime.MemStats.  This is designed to be called as a goroutine.
+func CaptureRuntimeMemStats(r Registry, d time.Duration) {
+	for _ = range time.Tick(d) {
+		CaptureRuntimeMemStatsOnce(r)
+	}
+}
+
+// Capture new values for the Go runtime statistics exported in
+// runtime.MemStats.  This is designed to be called in a background
+// goroutine.  Giving a registry which has not been given to
+// RegisterRuntimeMemStats will panic.
+//
+// Be very careful with this because runtime.ReadMemStats calls the C
+// functions runtime·semacquire(&runtime·worldsema) and runtime·stoptheworld()
+// and that last one does what it says on the tin.
+func CaptureRuntimeMemStatsOnce(r Registry) {
+	t := time.Now()
+	runtime.ReadMemStats(&memStats) // This takes 50-200us.
+	runtimeMetrics.ReadMemStats.UpdateSince(t)
+
+	runtimeMetrics.MemStats.Alloc.Update(int64(memStats.Alloc))
+	runtimeMetrics.MemStats.BuckHashSys.Update(int64(memStats.BuckHashSys))
+	if memStats.DebugGC {
+		runtimeMetrics.MemStats.DebugGC.Update(1)
+	} else {
+		runtimeMetrics.MemStats.DebugGC.Update(0)
+	}
+	if memStats.EnableGC {
+		runtimeMetrics.MemStats.EnableGC.Update(1)
+	} else {
+		runtimeMetrics.MemStats.EnableGC.Update(0)
+	}
+
+	runtimeMetrics.MemStats.Frees.Update(int64(memStats.Frees - frees))
+	runtimeMetrics.MemStats.HeapAlloc.Update(int64(memStats.HeapAlloc))
+	runtimeMetrics.MemStats.HeapIdle.Update(int64(memStats.HeapIdle))
+	runtimeMetrics.MemStats.HeapInuse.Update(int64(memStats.HeapInuse))
+	runtimeMetrics.MemStats.HeapObjects.Update(int64(memStats.HeapObjects))
+	runtimeMetrics.MemStats.HeapReleased.Update(int64(memStats.HeapReleased))
+	runtimeMetrics.MemStats.HeapSys.Update(int64(memStats.HeapSys))
+	runtimeMetrics.MemStats.LastGC.Update(int64(memStats.LastGC))
+	runtimeMetrics.MemStats.Lookups.Update(int64(memStats.Lookups - lookups))
+	runtimeMetrics.MemStats.Mallocs.Update(int64(memStats.Mallocs - mallocs))
+	runtimeMetrics.MemStats.MCacheInuse.Update(int64(memStats.MCacheInuse))
+	runtimeMetrics.MemStats.MCacheSys.Update(int64(memStats.MCacheSys))
+	runtimeMetrics.MemStats.MSpanInuse.Update(int64(memStats.MSpanInuse))
+	runtimeMetrics.MemStats.MSpanSys.Update(int64(memStats.MSpanSys))
+	runtimeMetrics.MemStats.NextGC.Update(int64(memStats.NextGC))
+	runtimeMetrics.MemStats.NumGC.Update(int64(memStats.NumGC - numGC))
+	runtimeMetrics.MemStats.GCCPUFraction.Update(gcCPUFraction(&memStats))
+
+	// <https://code.google.com/p/go/source/browse/src/pkg/runtime/mgc0.c>
+	i := numGC % uint32(len(memStats.PauseNs))
+	ii := memStats.NumGC % uint32(len(memStats.PauseNs))
+	if memStats.NumGC-numGC >= uint32(len(memStats.PauseNs)) {
+		for i = 0; i < uint32(len(memStats.PauseNs)); i++ {
+			runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i]))
+		}
+	} else {
+		if i > ii {
+			for ; i < uint32(len(memStats.PauseNs)); i++ {
+				runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i]))
+			}
+			i = 0
+		}
+		for ; i < ii; i++ {
+			runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i]))
+		}
+	}
+	frees = memStats.Frees
+	lookups = memStats.Lookups
+	mallocs = memStats.Mallocs
+	numGC = memStats.NumGC
+
+	runtimeMetrics.MemStats.PauseTotalNs.Update(int64(memStats.PauseTotalNs))
+	runtimeMetrics.MemStats.StackInuse.Update(int64(memStats.StackInuse))
+	runtimeMetrics.MemStats.StackSys.Update(int64(memStats.StackSys))
+	runtimeMetrics.MemStats.Sys.Update(int64(memStats.Sys))
+	runtimeMetrics.MemStats.TotalAlloc.Update(int64(memStats.TotalAlloc))
+
+	currentNumCgoCalls := numCgoCall()
+	runtimeMetrics.NumCgoCall.Update(currentNumCgoCalls - numCgoCalls)
+	numCgoCalls = currentNumCgoCalls
+
+	runtimeMetrics.NumGoroutine.Update(int64(runtime.NumGoroutine()))
+
+	runtimeMetrics.NumThread.Update(int64(threadCreateProfile.Count()))
+}
+
+// Register runtimeMetrics for the Go runtime statistics exported in runtime and
+// specifically runtime.MemStats.  The runtimeMetrics are named by their
+// fully-qualified Go symbols, i.e. runtime.MemStats.Alloc.
+func RegisterRuntimeMemStats(r Registry) {
+	runtimeMetrics.MemStats.Alloc = NewGauge()
+	runtimeMetrics.MemStats.BuckHashSys = NewGauge()
+	runtimeMetrics.MemStats.DebugGC = NewGauge()
+	runtimeMetrics.MemStats.EnableGC = NewGauge()
+	runtimeMetrics.MemStats.Frees = NewGauge()
+	runtimeMetrics.MemStats.HeapAlloc = NewGauge()
+	runtimeMetrics.MemStats.HeapIdle = NewGauge()
+	runtimeMetrics.MemStats.HeapInuse = NewGauge()
+	runtimeMetrics.MemStats.HeapObjects = NewGauge()
+	runtimeMetrics.MemStats.HeapReleased = NewGauge()
+	runtimeMetrics.MemStats.HeapSys = NewGauge()
+	runtimeMetrics.MemStats.LastGC = NewGauge()
+	runtimeMetrics.MemStats.Lookups = NewGauge()
+	runtimeMetrics.MemStats.Mallocs = NewGauge()
+	runtimeMetrics.MemStats.MCacheInuse = NewGauge()
+	runtimeMetrics.MemStats.MCacheSys = NewGauge()
+	runtimeMetrics.MemStats.MSpanInuse = NewGauge()
+	runtimeMetrics.MemStats.MSpanSys = NewGauge()
+	runtimeMetrics.MemStats.NextGC = NewGauge()
+	runtimeMetrics.MemStats.NumGC = NewGauge()
+	runtimeMetrics.MemStats.GCCPUFraction = NewGaugeFloat64()
+	runtimeMetrics.MemStats.PauseNs = NewHistogram(NewExpDecaySample(1028, 0.015))
+	runtimeMetrics.MemStats.PauseTotalNs = NewGauge()
+	runtimeMetrics.MemStats.StackInuse = NewGauge()
+	runtimeMetrics.MemStats.StackSys = NewGauge()
+	runtimeMetrics.MemStats.Sys = NewGauge()
+	runtimeMetrics.MemStats.TotalAlloc = NewGauge()
+	runtimeMetrics.NumCgoCall = NewGauge()
+	runtimeMetrics.NumGoroutine = NewGauge()
+	runtimeMetrics.NumThread = NewGauge()
+	runtimeMetrics.ReadMemStats = NewTimer()
+
+	r.Register("runtime.MemStats.Alloc", runtimeMetrics.MemStats.Alloc)
+	r.Register("runtime.MemStats.BuckHashSys", runtimeMetrics.MemStats.BuckHashSys)
+	r.Register("runtime.MemStats.DebugGC", runtimeMetrics.MemStats.DebugGC)
+	r.Register("runtime.MemStats.EnableGC", runtimeMetrics.MemStats.EnableGC)
+	r.Register("runtime.MemStats.Frees", runtimeMetrics.MemStats.Frees)
+	r.Register("runtime.MemStats.HeapAlloc", runtimeMetrics.MemStats.HeapAlloc)
+	r.Register("runtime.MemStats.HeapIdle", runtimeMetrics.MemStats.HeapIdle)
+	r.Register("runtime.MemStats.HeapInuse", runtimeMetrics.MemStats.HeapInuse)
+	r.Register("runtime.MemStats.HeapObjects", runtimeMetrics.MemStats.HeapObjects)
+	r.Register("runtime.MemStats.HeapReleased", runtimeMetrics.MemStats.HeapReleased)
+	r.Register("runtime.MemStats.HeapSys", runtimeMetrics.MemStats.HeapSys)
+	r.Register("runtime.MemStats.LastGC", runtimeMetrics.MemStats.LastGC)
+	r.Register("runtime.MemStats.Lookups", runtimeMetrics.MemStats.Lookups)
+	r.Register("runtime.MemStats.Mallocs", runtimeMetrics.MemStats.Mallocs)
+	r.Register("runtime.MemStats.MCacheInuse", runtimeMetrics.MemStats.MCacheInuse)
+	r.Register("runtime.MemStats.MCacheSys", runtimeMetrics.MemStats.MCacheSys)
+	r.Register("runtime.MemStats.MSpanInuse", runtimeMetrics.MemStats.MSpanInuse)
+	r.Register("runtime.MemStats.MSpanSys", runtimeMetrics.MemStats.MSpanSys)
+	r.Register("runtime.MemStats.NextGC", runtimeMetrics.MemStats.NextGC)
+	r.Register("runtime.MemStats.NumGC", runtimeMetrics.MemStats.NumGC)
+	r.Register("runtime.MemStats.GCCPUFraction", runtimeMetrics.MemStats.GCCPUFraction)
+	r.Register("runtime.MemStats.PauseNs", runtimeMetrics.MemStats.PauseNs)
+	r.Register("runtime.MemStats.PauseTotalNs", runtimeMetrics.MemStats.PauseTotalNs)
+	r.Register("runtime.MemStats.StackInuse", runtimeMetrics.MemStats.StackInuse)
+	r.Register("runtime.MemStats.StackSys", runtimeMetrics.MemStats.StackSys)
+	r.Register("runtime.MemStats.Sys", runtimeMetrics.MemStats.Sys)
+	r.Register("runtime.MemStats.TotalAlloc", runtimeMetrics.MemStats.TotalAlloc)
+	r.Register("runtime.NumCgoCall", runtimeMetrics.NumCgoCall)
+	r.Register("runtime.NumGoroutine", runtimeMetrics.NumGoroutine)
+	r.Register("runtime.NumThread", runtimeMetrics.NumThread)
+	r.Register("runtime.ReadMemStats", runtimeMetrics.ReadMemStats)
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/runtime_cgo.go b/vendor/github.com/rcrowley/go-metrics/runtime_cgo.go
new file mode 100644
index 0000000..e3391f4
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/runtime_cgo.go

@@ -0,0 +1,10 @@
+// +build cgo
+// +build !appengine
+
+package metrics
+
+import "runtime"
+
+func numCgoCall() int64 {
+	return runtime.NumCgoCall()
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/runtime_gccpufraction.go b/vendor/github.com/rcrowley/go-metrics/runtime_gccpufraction.go
new file mode 100644
index 0000000..ca12c05
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/runtime_gccpufraction.go

@@ -0,0 +1,9 @@
+// +build go1.5
+
+package metrics
+
+import "runtime"
+
+func gcCPUFraction(memStats *runtime.MemStats) float64 {
+	return memStats.GCCPUFraction
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/runtime_no_cgo.go b/vendor/github.com/rcrowley/go-metrics/runtime_no_cgo.go
new file mode 100644
index 0000000..616a3b4
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/runtime_no_cgo.go

@@ -0,0 +1,7 @@
+// +build !cgo appengine
+
+package metrics
+
+func numCgoCall() int64 {
+	return 0
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/runtime_no_gccpufraction.go b/vendor/github.com/rcrowley/go-metrics/runtime_no_gccpufraction.go
new file mode 100644
index 0000000..be96aa6
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/runtime_no_gccpufraction.go

@@ -0,0 +1,9 @@
+// +build !go1.5
+
+package metrics
+
+import "runtime"
+
+func gcCPUFraction(memStats *runtime.MemStats) float64 {
+	return 0
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/sample.go b/vendor/github.com/rcrowley/go-metrics/sample.go
new file mode 100644
index 0000000..fecee5e
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/sample.go

@@ -0,0 +1,616 @@
+package metrics
+
+import (
+	"math"
+	"math/rand"
+	"sort"
+	"sync"
+	"time"
+)
+
+const rescaleThreshold = time.Hour
+
+// Samples maintain a statistically-significant selection of values from
+// a stream.
+type Sample interface {
+	Clear()
+	Count() int64
+	Max() int64
+	Mean() float64
+	Min() int64
+	Percentile(float64) float64
+	Percentiles([]float64) []float64
+	Size() int
+	Snapshot() Sample
+	StdDev() float64
+	Sum() int64
+	Update(int64)
+	Values() []int64
+	Variance() float64
+}
+
+// ExpDecaySample is an exponentially-decaying sample using a forward-decaying
+// priority reservoir.  See Cormode et al's "Forward Decay: A Practical Time
+// Decay Model for Streaming Systems".
+//
+// <http://dimacs.rutgers.edu/~graham/pubs/papers/fwddecay.pdf>
+type ExpDecaySample struct {
+	alpha         float64
+	count         int64
+	mutex         sync.Mutex
+	reservoirSize int
+	t0, t1        time.Time
+	values        *expDecaySampleHeap
+}
+
+// NewExpDecaySample constructs a new exponentially-decaying sample with the
+// given reservoir size and alpha.
+func NewExpDecaySample(reservoirSize int, alpha float64) Sample {
+	if UseNilMetrics {
+		return NilSample{}
+	}
+	s := &ExpDecaySample{
+		alpha:         alpha,
+		reservoirSize: reservoirSize,
+		t0:            time.Now(),
+		values:        newExpDecaySampleHeap(reservoirSize),
+	}
+	s.t1 = s.t0.Add(rescaleThreshold)
+	return s
+}
+
+// Clear clears all samples.
+func (s *ExpDecaySample) Clear() {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	s.count = 0
+	s.t0 = time.Now()
+	s.t1 = s.t0.Add(rescaleThreshold)
+	s.values.Clear()
+}
+
+// Count returns the number of samples recorded, which may exceed the
+// reservoir size.
+func (s *ExpDecaySample) Count() int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return s.count
+}
+
+// Max returns the maximum value in the sample, which may not be the maximum
+// value ever to be part of the sample.
+func (s *ExpDecaySample) Max() int64 {
+	return SampleMax(s.Values())
+}
+
+// Mean returns the mean of the values in the sample.
+func (s *ExpDecaySample) Mean() float64 {
+	return SampleMean(s.Values())
+}
+
+// Min returns the minimum value in the sample, which may not be the minimum
+// value ever to be part of the sample.
+func (s *ExpDecaySample) Min() int64 {
+	return SampleMin(s.Values())
+}
+
+// Percentile returns an arbitrary percentile of values in the sample.
+func (s *ExpDecaySample) Percentile(p float64) float64 {
+	return SamplePercentile(s.Values(), p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of values in the
+// sample.
+func (s *ExpDecaySample) Percentiles(ps []float64) []float64 {
+	return SamplePercentiles(s.Values(), ps)
+}
+
+// Size returns the size of the sample, which is at most the reservoir size.
+func (s *ExpDecaySample) Size() int {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return s.values.Size()
+}
+
+// Snapshot returns a read-only copy of the sample.
+func (s *ExpDecaySample) Snapshot() Sample {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	vals := s.values.Values()
+	values := make([]int64, len(vals))
+	for i, v := range vals {
+		values[i] = v.v
+	}
+	return &SampleSnapshot{
+		count:  s.count,
+		values: values,
+	}
+}
+
+// StdDev returns the standard deviation of the values in the sample.
+func (s *ExpDecaySample) StdDev() float64 {
+	return SampleStdDev(s.Values())
+}
+
+// Sum returns the sum of the values in the sample.
+func (s *ExpDecaySample) Sum() int64 {
+	return SampleSum(s.Values())
+}
+
+// Update samples a new value.
+func (s *ExpDecaySample) Update(v int64) {
+	s.update(time.Now(), v)
+}
+
+// Values returns a copy of the values in the sample.
+func (s *ExpDecaySample) Values() []int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	vals := s.values.Values()
+	values := make([]int64, len(vals))
+	for i, v := range vals {
+		values[i] = v.v
+	}
+	return values
+}
+
+// Variance returns the variance of the values in the sample.
+func (s *ExpDecaySample) Variance() float64 {
+	return SampleVariance(s.Values())
+}
+
+// update samples a new value at a particular timestamp.  This is a method all
+// its own to facilitate testing.
+func (s *ExpDecaySample) update(t time.Time, v int64) {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	s.count++
+	if s.values.Size() == s.reservoirSize {
+		s.values.Pop()
+	}
+	s.values.Push(expDecaySample{
+		k: math.Exp(t.Sub(s.t0).Seconds()*s.alpha) / rand.Float64(),
+		v: v,
+	})
+	if t.After(s.t1) {
+		values := s.values.Values()
+		t0 := s.t0
+		s.values.Clear()
+		s.t0 = t
+		s.t1 = s.t0.Add(rescaleThreshold)
+		for _, v := range values {
+			v.k = v.k * math.Exp(-s.alpha*s.t0.Sub(t0).Seconds())
+			s.values.Push(v)
+		}
+	}
+}
+
+// NilSample is a no-op Sample.
+type NilSample struct{}
+
+// Clear is a no-op.
+func (NilSample) Clear() {}
+
+// Count is a no-op.
+func (NilSample) Count() int64 { return 0 }
+
+// Max is a no-op.
+func (NilSample) Max() int64 { return 0 }
+
+// Mean is a no-op.
+func (NilSample) Mean() float64 { return 0.0 }
+
+// Min is a no-op.
+func (NilSample) Min() int64 { return 0 }
+
+// Percentile is a no-op.
+func (NilSample) Percentile(p float64) float64 { return 0.0 }
+
+// Percentiles is a no-op.
+func (NilSample) Percentiles(ps []float64) []float64 {
+	return make([]float64, len(ps))
+}
+
+// Size is a no-op.
+func (NilSample) Size() int { return 0 }
+
+// Sample is a no-op.
+func (NilSample) Snapshot() Sample { return NilSample{} }
+
+// StdDev is a no-op.
+func (NilSample) StdDev() float64 { return 0.0 }
+
+// Sum is a no-op.
+func (NilSample) Sum() int64 { return 0 }
+
+// Update is a no-op.
+func (NilSample) Update(v int64) {}
+
+// Values is a no-op.
+func (NilSample) Values() []int64 { return []int64{} }
+
+// Variance is a no-op.
+func (NilSample) Variance() float64 { return 0.0 }
+
+// SampleMax returns the maximum value of the slice of int64.
+func SampleMax(values []int64) int64 {
+	if 0 == len(values) {
+		return 0
+	}
+	var max int64 = math.MinInt64
+	for _, v := range values {
+		if max < v {
+			max = v
+		}
+	}
+	return max
+}
+
+// SampleMean returns the mean value of the slice of int64.
+func SampleMean(values []int64) float64 {
+	if 0 == len(values) {
+		return 0.0
+	}
+	return float64(SampleSum(values)) / float64(len(values))
+}
+
+// SampleMin returns the minimum value of the slice of int64.
+func SampleMin(values []int64) int64 {
+	if 0 == len(values) {
+		return 0
+	}
+	var min int64 = math.MaxInt64
+	for _, v := range values {
+		if min > v {
+			min = v
+		}
+	}
+	return min
+}
+
+// SamplePercentiles returns an arbitrary percentile of the slice of int64.
+func SamplePercentile(values int64Slice, p float64) float64 {
+	return SamplePercentiles(values, []float64{p})[0]
+}
+
+// SamplePercentiles returns a slice of arbitrary percentiles of the slice of
+// int64.
+func SamplePercentiles(values int64Slice, ps []float64) []float64 {
+	scores := make([]float64, len(ps))
+	size := len(values)
+	if size > 0 {
+		sort.Sort(values)
+		for i, p := range ps {
+			pos := p * float64(size+1)
+			if pos < 1.0 {
+				scores[i] = float64(values[0])
+			} else if pos >= float64(size) {
+				scores[i] = float64(values[size-1])
+			} else {
+				lower := float64(values[int(pos)-1])
+				upper := float64(values[int(pos)])
+				scores[i] = lower + (pos-math.Floor(pos))*(upper-lower)
+			}
+		}
+	}
+	return scores
+}
+
+// SampleSnapshot is a read-only copy of another Sample.
+type SampleSnapshot struct {
+	count  int64
+	values []int64
+}
+
+func NewSampleSnapshot(count int64, values []int64) *SampleSnapshot {
+	return &SampleSnapshot{
+		count:  count,
+		values: values,
+	}
+}
+
+// Clear panics.
+func (*SampleSnapshot) Clear() {
+	panic("Clear called on a SampleSnapshot")
+}
+
+// Count returns the count of inputs at the time the snapshot was taken.
+func (s *SampleSnapshot) Count() int64 { return s.count }
+
+// Max returns the maximal value at the time the snapshot was taken.
+func (s *SampleSnapshot) Max() int64 { return SampleMax(s.values) }
+
+// Mean returns the mean value at the time the snapshot was taken.
+func (s *SampleSnapshot) Mean() float64 { return SampleMean(s.values) }
+
+// Min returns the minimal value at the time the snapshot was taken.
+func (s *SampleSnapshot) Min() int64 { return SampleMin(s.values) }
+
+// Percentile returns an arbitrary percentile of values at the time the
+// snapshot was taken.
+func (s *SampleSnapshot) Percentile(p float64) float64 {
+	return SamplePercentile(s.values, p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of values at the time
+// the snapshot was taken.
+func (s *SampleSnapshot) Percentiles(ps []float64) []float64 {
+	return SamplePercentiles(s.values, ps)
+}
+
+// Size returns the size of the sample at the time the snapshot was taken.
+func (s *SampleSnapshot) Size() int { return len(s.values) }
+
+// Snapshot returns the snapshot.
+func (s *SampleSnapshot) Snapshot() Sample { return s }
+
+// StdDev returns the standard deviation of values at the time the snapshot was
+// taken.
+func (s *SampleSnapshot) StdDev() float64 { return SampleStdDev(s.values) }
+
+// Sum returns the sum of values at the time the snapshot was taken.
+func (s *SampleSnapshot) Sum() int64 { return SampleSum(s.values) }
+
+// Update panics.
+func (*SampleSnapshot) Update(int64) {
+	panic("Update called on a SampleSnapshot")
+}
+
+// Values returns a copy of the values in the sample.
+func (s *SampleSnapshot) Values() []int64 {
+	values := make([]int64, len(s.values))
+	copy(values, s.values)
+	return values
+}
+
+// Variance returns the variance of values at the time the snapshot was taken.
+func (s *SampleSnapshot) Variance() float64 { return SampleVariance(s.values) }
+
+// SampleStdDev returns the standard deviation of the slice of int64.
+func SampleStdDev(values []int64) float64 {
+	return math.Sqrt(SampleVariance(values))
+}
+
+// SampleSum returns the sum of the slice of int64.
+func SampleSum(values []int64) int64 {
+	var sum int64
+	for _, v := range values {
+		sum += v
+	}
+	return sum
+}
+
+// SampleVariance returns the variance of the slice of int64.
+func SampleVariance(values []int64) float64 {
+	if 0 == len(values) {
+		return 0.0
+	}
+	m := SampleMean(values)
+	var sum float64
+	for _, v := range values {
+		d := float64(v) - m
+		sum += d * d
+	}
+	return sum / float64(len(values))
+}
+
+// A uniform sample using Vitter's Algorithm R.
+//
+// <http://www.cs.umd.edu/~samir/498/vitter.pdf>
+type UniformSample struct {
+	count         int64
+	mutex         sync.Mutex
+	reservoirSize int
+	values        []int64
+}
+
+// NewUniformSample constructs a new uniform sample with the given reservoir
+// size.
+func NewUniformSample(reservoirSize int) Sample {
+	if UseNilMetrics {
+		return NilSample{}
+	}
+	return &UniformSample{
+		reservoirSize: reservoirSize,
+		values:        make([]int64, 0, reservoirSize),
+	}
+}
+
+// Clear clears all samples.
+func (s *UniformSample) Clear() {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	s.count = 0
+	s.values = make([]int64, 0, s.reservoirSize)
+}
+
+// Count returns the number of samples recorded, which may exceed the
+// reservoir size.
+func (s *UniformSample) Count() int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return s.count
+}
+
+// Max returns the maximum value in the sample, which may not be the maximum
+// value ever to be part of the sample.
+func (s *UniformSample) Max() int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleMax(s.values)
+}
+
+// Mean returns the mean of the values in the sample.
+func (s *UniformSample) Mean() float64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleMean(s.values)
+}
+
+// Min returns the minimum value in the sample, which may not be the minimum
+// value ever to be part of the sample.
+func (s *UniformSample) Min() int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleMin(s.values)
+}
+
+// Percentile returns an arbitrary percentile of values in the sample.
+func (s *UniformSample) Percentile(p float64) float64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SamplePercentile(s.values, p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of values in the
+// sample.
+func (s *UniformSample) Percentiles(ps []float64) []float64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SamplePercentiles(s.values, ps)
+}
+
+// Size returns the size of the sample, which is at most the reservoir size.
+func (s *UniformSample) Size() int {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return len(s.values)
+}
+
+// Snapshot returns a read-only copy of the sample.
+func (s *UniformSample) Snapshot() Sample {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	values := make([]int64, len(s.values))
+	copy(values, s.values)
+	return &SampleSnapshot{
+		count:  s.count,
+		values: values,
+	}
+}
+
+// StdDev returns the standard deviation of the values in the sample.
+func (s *UniformSample) StdDev() float64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleStdDev(s.values)
+}
+
+// Sum returns the sum of the values in the sample.
+func (s *UniformSample) Sum() int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleSum(s.values)
+}
+
+// Update samples a new value.
+func (s *UniformSample) Update(v int64) {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	s.count++
+	if len(s.values) < s.reservoirSize {
+		s.values = append(s.values, v)
+	} else {
+		r := rand.Int63n(s.count)
+		if r < int64(len(s.values)) {
+			s.values[int(r)] = v
+		}
+	}
+}
+
+// Values returns a copy of the values in the sample.
+func (s *UniformSample) Values() []int64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	values := make([]int64, len(s.values))
+	copy(values, s.values)
+	return values
+}
+
+// Variance returns the variance of the values in the sample.
+func (s *UniformSample) Variance() float64 {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return SampleVariance(s.values)
+}
+
+// expDecaySample represents an individual sample in a heap.
+type expDecaySample struct {
+	k float64
+	v int64
+}
+
+func newExpDecaySampleHeap(reservoirSize int) *expDecaySampleHeap {
+	return &expDecaySampleHeap{make([]expDecaySample, 0, reservoirSize)}
+}
+
+// expDecaySampleHeap is a min-heap of expDecaySamples.
+// The internal implementation is copied from the standard library's container/heap
+type expDecaySampleHeap struct {
+	s []expDecaySample
+}
+
+func (h *expDecaySampleHeap) Clear() {
+	h.s = h.s[:0]
+}
+
+func (h *expDecaySampleHeap) Push(s expDecaySample) {
+	n := len(h.s)
+	h.s = h.s[0 : n+1]
+	h.s[n] = s
+	h.up(n)
+}
+
+func (h *expDecaySampleHeap) Pop() expDecaySample {
+	n := len(h.s) - 1
+	h.s[0], h.s[n] = h.s[n], h.s[0]
+	h.down(0, n)
+
+	n = len(h.s)
+	s := h.s[n-1]
+	h.s = h.s[0 : n-1]
+	return s
+}
+
+func (h *expDecaySampleHeap) Size() int {
+	return len(h.s)
+}
+
+func (h *expDecaySampleHeap) Values() []expDecaySample {
+	return h.s
+}
+
+func (h *expDecaySampleHeap) up(j int) {
+	for {
+		i := (j - 1) / 2 // parent
+		if i == j || !(h.s[j].k < h.s[i].k) {
+			break
+		}
+		h.s[i], h.s[j] = h.s[j], h.s[i]
+		j = i
+	}
+}
+
+func (h *expDecaySampleHeap) down(i, n int) {
+	for {
+		j1 := 2*i + 1
+		if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
+			break
+		}
+		j := j1 // left child
+		if j2 := j1 + 1; j2 < n && !(h.s[j1].k < h.s[j2].k) {
+			j = j2 // = 2*i + 2  // right child
+		}
+		if !(h.s[j].k < h.s[i].k) {
+			break
+		}
+		h.s[i], h.s[j] = h.s[j], h.s[i]
+		i = j
+	}
+}
+
+type int64Slice []int64
+
+func (p int64Slice) Len() int           { return len(p) }
+func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] }
+func (p int64Slice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

diff --git a/vendor/github.com/rcrowley/go-metrics/syslog.go b/vendor/github.com/rcrowley/go-metrics/syslog.go
new file mode 100644
index 0000000..693f190
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/syslog.go

@@ -0,0 +1,78 @@
+// +build !windows
+
+package metrics
+
+import (
+	"fmt"
+	"log/syslog"
+	"time"
+)
+
+// Output each metric in the given registry to syslog periodically using
+// the given syslogger.
+func Syslog(r Registry, d time.Duration, w *syslog.Writer) {
+	for _ = range time.Tick(d) {
+		r.Each(func(name string, i interface{}) {
+			switch metric := i.(type) {
+			case Counter:
+				w.Info(fmt.Sprintf("counter %s: count: %d", name, metric.Count()))
+			case Gauge:
+				w.Info(fmt.Sprintf("gauge %s: value: %d", name, metric.Value()))
+			case GaugeFloat64:
+				w.Info(fmt.Sprintf("gauge %s: value: %f", name, metric.Value()))
+			case Healthcheck:
+				metric.Check()
+				w.Info(fmt.Sprintf("healthcheck %s: error: %v", name, metric.Error()))
+			case Histogram:
+				h := metric.Snapshot()
+				ps := h.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+				w.Info(fmt.Sprintf(
+					"histogram %s: count: %d min: %d max: %d mean: %.2f stddev: %.2f median: %.2f 75%%: %.2f 95%%: %.2f 99%%: %.2f 99.9%%: %.2f",
+					name,
+					h.Count(),
+					h.Min(),
+					h.Max(),
+					h.Mean(),
+					h.StdDev(),
+					ps[0],
+					ps[1],
+					ps[2],
+					ps[3],
+					ps[4],
+				))
+			case Meter:
+				m := metric.Snapshot()
+				w.Info(fmt.Sprintf(
+					"meter %s: count: %d 1-min: %.2f 5-min: %.2f 15-min: %.2f mean: %.2f",
+					name,
+					m.Count(),
+					m.Rate1(),
+					m.Rate5(),
+					m.Rate15(),
+					m.RateMean(),
+				))
+			case Timer:
+				t := metric.Snapshot()
+				ps := t.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+				w.Info(fmt.Sprintf(
+					"timer %s: count: %d min: %d max: %d mean: %.2f stddev: %.2f median: %.2f 75%%: %.2f 95%%: %.2f 99%%: %.2f 99.9%%: %.2f 1-min: %.2f 5-min: %.2f 15-min: %.2f mean-rate: %.2f",
+					name,
+					t.Count(),
+					t.Min(),
+					t.Max(),
+					t.Mean(),
+					t.StdDev(),
+					ps[0],
+					ps[1],
+					ps[2],
+					ps[3],
+					ps[4],
+					t.Rate1(),
+					t.Rate5(),
+					t.Rate15(),
+					t.RateMean(),
+				))
+			}
+		})
+	}
+}

diff --git a/vendor/github.com/rcrowley/go-metrics/timer.go b/vendor/github.com/rcrowley/go-metrics/timer.go
new file mode 100644
index 0000000..d6ec4c6
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/timer.go

@@ -0,0 +1,329 @@
+package metrics
+
+import (
+	"sync"
+	"time"
+)
+
+// Timers capture the duration and rate of events.
+type Timer interface {
+	Count() int64
+	Max() int64
+	Mean() float64
+	Min() int64
+	Percentile(float64) float64
+	Percentiles([]float64) []float64
+	Rate1() float64
+	Rate5() float64
+	Rate15() float64
+	RateMean() float64
+	Snapshot() Timer
+	StdDev() float64
+	Stop()
+	Sum() int64
+	Time(func())
+	Update(time.Duration)
+	UpdateSince(time.Time)
+	Variance() float64
+}
+
+// GetOrRegisterTimer returns an existing Timer or constructs and registers a
+// new StandardTimer.
+// Be sure to unregister the meter from the registry once it is of no use to
+// allow for garbage collection.
+func GetOrRegisterTimer(name string, r Registry) Timer {
+	if nil == r {
+		r = DefaultRegistry
+	}
+	return r.GetOrRegister(name, NewTimer).(Timer)
+}
+
+// NewCustomTimer constructs a new StandardTimer from a Histogram and a Meter.
+// Be sure to call Stop() once the timer is of no use to allow for garbage collection.
+func NewCustomTimer(h Histogram, m Meter) Timer {
+	if UseNilMetrics {
+		return NilTimer{}
+	}
+	return &StandardTimer{
+		histogram: h,
+		meter:     m,
+	}
+}
+
+// NewRegisteredTimer constructs and registers a new StandardTimer.
+// Be sure to unregister the meter from the registry once it is of no use to
+// allow for garbage collection.
+func NewRegisteredTimer(name string, r Registry) Timer {
+	c := NewTimer()
+	if nil == r {
+		r = DefaultRegistry
+	}
+	r.Register(name, c)
+	return c
+}
+
+// NewTimer constructs a new StandardTimer using an exponentially-decaying
+// sample with the same reservoir size and alpha as UNIX load averages.
+// Be sure to call Stop() once the timer is of no use to allow for garbage collection.
+func NewTimer() Timer {
+	if UseNilMetrics {
+		return NilTimer{}
+	}
+	return &StandardTimer{
+		histogram: NewHistogram(NewExpDecaySample(1028, 0.015)),
+		meter:     NewMeter(),
+	}
+}
+
+// NilTimer is a no-op Timer.
+type NilTimer struct {
+	h Histogram
+	m Meter
+}
+
+// Count is a no-op.
+func (NilTimer) Count() int64 { return 0 }
+
+// Max is a no-op.
+func (NilTimer) Max() int64 { return 0 }
+
+// Mean is a no-op.
+func (NilTimer) Mean() float64 { return 0.0 }
+
+// Min is a no-op.
+func (NilTimer) Min() int64 { return 0 }
+
+// Percentile is a no-op.
+func (NilTimer) Percentile(p float64) float64 { return 0.0 }
+
+// Percentiles is a no-op.
+func (NilTimer) Percentiles(ps []float64) []float64 {
+	return make([]float64, len(ps))
+}
+
+// Rate1 is a no-op.
+func (NilTimer) Rate1() float64 { return 0.0 }
+
+// Rate5 is a no-op.
+func (NilTimer) Rate5() float64 { return 0.0 }
+
+// Rate15 is a no-op.
+func (NilTimer) Rate15() float64 { return 0.0 }
+
+// RateMean is a no-op.
+func (NilTimer) RateMean() float64 { return 0.0 }
+
+// Snapshot is a no-op.
+func (NilTimer) Snapshot() Timer { return NilTimer{} }
+
+// StdDev is a no-op.
+func (NilTimer) StdDev() float64 { return 0.0 }
+
+// Stop is a no-op.
+func (NilTimer) Stop() {}
+
+// Sum is a no-op.
+func (NilTimer) Sum() int64 { return 0 }
+
+// Time is a no-op.
+func (NilTimer) Time(func()) {}
+
+// Update is a no-op.
+func (NilTimer) Update(time.Duration) {}
+
+// UpdateSince is a no-op.
+func (NilTimer) UpdateSince(time.Time) {}
+
+// Variance is a no-op.
+func (NilTimer) Variance() float64 { return 0.0 }
+
+// StandardTimer is the standard implementation of a Timer and uses a Histogram
+// and Meter.
+type StandardTimer struct {
+	histogram Histogram
+	meter     Meter
+	mutex     sync.Mutex
+}
+
+// Count returns the number of events recorded.
+func (t *StandardTimer) Count() int64 {
+	return t.histogram.Count()
+}
+
+// Max returns the maximum value in the sample.
+func (t *StandardTimer) Max() int64 {
+	return t.histogram.Max()
+}
+
+// Mean returns the mean of the values in the sample.
+func (t *StandardTimer) Mean() float64 {
+	return t.histogram.Mean()
+}
+
+// Min returns the minimum value in the sample.
+func (t *StandardTimer) Min() int64 {
+	return t.histogram.Min()
+}
+
+// Percentile returns an arbitrary percentile of the values in the sample.
+func (t *StandardTimer) Percentile(p float64) float64 {
+	return t.histogram.Percentile(p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of the values in the
+// sample.
+func (t *StandardTimer) Percentiles(ps []float64) []float64 {
+	return t.histogram.Percentiles(ps)
+}
+
+// Rate1 returns the one-minute moving average rate of events per second.
+func (t *StandardTimer) Rate1() float64 {
+	return t.meter.Rate1()
+}
+
+// Rate5 returns the five-minute moving average rate of events per second.
+func (t *StandardTimer) Rate5() float64 {
+	return t.meter.Rate5()
+}
+
+// Rate15 returns the fifteen-minute moving average rate of events per second.
+func (t *StandardTimer) Rate15() float64 {
+	return t.meter.Rate15()
+}
+
+// RateMean returns the meter's mean rate of events per second.
+func (t *StandardTimer) RateMean() float64 {
+	return t.meter.RateMean()
+}
+
+// Snapshot returns a read-only copy of the timer.
+func (t *StandardTimer) Snapshot() Timer {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+	return &TimerSnapshot{
+		histogram: t.histogram.Snapshot().(*HistogramSnapshot),
+		meter:     t.meter.Snapshot().(*MeterSnapshot),
+	}
+}
+
+// StdDev returns the standard deviation of the values in the sample.
+func (t *StandardTimer) StdDev() float64 {
+	return t.histogram.StdDev()
+}
+
+// Stop stops the meter.
+func (t *StandardTimer) Stop() {
+	t.meter.Stop()
+}
+
+// Sum returns the sum in the sample.
+func (t *StandardTimer) Sum() int64 {
+	return t.histogram.Sum()
+}
+
+// Record the duration of the execution of the given function.
+func (t *StandardTimer) Time(f func()) {
+	ts := time.Now()
+	f()
+	t.Update(time.Since(ts))
+}
+
+// Record the duration of an event.
+func (t *StandardTimer) Update(d time.Duration) {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+	t.histogram.Update(int64(d))
+	t.meter.Mark(1)
+}
+
+// Record the duration of an event that started at a time and ends now.
+func (t *StandardTimer) UpdateSince(ts time.Time) {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+	t.histogram.Update(int64(time.Since(ts)))
+	t.meter.Mark(1)
+}
+
+// Variance returns the variance of the values in the sample.
+func (t *StandardTimer) Variance() float64 {
+	return t.histogram.Variance()
+}
+
+// TimerSnapshot is a read-only copy of another Timer.
+type TimerSnapshot struct {
+	histogram *HistogramSnapshot
+	meter     *MeterSnapshot
+}
+
+// Count returns the number of events recorded at the time the snapshot was
+// taken.
+func (t *TimerSnapshot) Count() int64 { return t.histogram.Count() }
+
+// Max returns the maximum value at the time the snapshot was taken.
+func (t *TimerSnapshot) Max() int64 { return t.histogram.Max() }
+
+// Mean returns the mean value at the time the snapshot was taken.
+func (t *TimerSnapshot) Mean() float64 { return t.histogram.Mean() }
+
+// Min returns the minimum value at the time the snapshot was taken.
+func (t *TimerSnapshot) Min() int64 { return t.histogram.Min() }
+
+// Percentile returns an arbitrary percentile of sampled values at the time the
+// snapshot was taken.
+func (t *TimerSnapshot) Percentile(p float64) float64 {
+	return t.histogram.Percentile(p)
+}
+
+// Percentiles returns a slice of arbitrary percentiles of sampled values at
+// the time the snapshot was taken.
+func (t *TimerSnapshot) Percentiles(ps []float64) []float64 {
+	return t.histogram.Percentiles(ps)
+}
+
+// Rate1 returns the one-minute moving average rate of events per second at the
+// time the snapshot was taken.
+func (t *TimerSnapshot) Rate1() float64 { return t.meter.Rate1() }
+
+// Rate5 returns the five-minute moving average rate of events per second at
+// the time the snapshot was taken.
+func (t *TimerSnapshot) Rate5() float64 { return t.meter.Rate5() }
+
+// Rate15 returns the fifteen-minute moving average rate of events per second
+// at the time the snapshot was taken.
+func (t *TimerSnapshot) Rate15() float64 { return t.meter.Rate15() }
+
+// RateMean returns the meter's mean rate of events per second at the time the
+// snapshot was taken.
+func (t *TimerSnapshot) RateMean() float64 { return t.meter.RateMean() }
+
+// Snapshot returns the snapshot.
+func (t *TimerSnapshot) Snapshot() Timer { return t }
+
+// StdDev returns the standard deviation of the values at the time the snapshot
+// was taken.
+func (t *TimerSnapshot) StdDev() float64 { return t.histogram.StdDev() }
+
+// Stop is a no-op.
+func (t *TimerSnapshot) Stop() {}
+
+// Sum returns the sum at the time the snapshot was taken.
+func (t *TimerSnapshot) Sum() int64 { return t.histogram.Sum() }
+
+// Time panics.
+func (*TimerSnapshot) Time(func()) {
+	panic("Time called on a TimerSnapshot")
+}
+
+// Update panics.
+func (*TimerSnapshot) Update(time.Duration) {
+	panic("Update called on a TimerSnapshot")
+}
+
+// UpdateSince panics.
+func (*TimerSnapshot) UpdateSince(time.Time) {
+	panic("UpdateSince called on a TimerSnapshot")
+}
+
+// Variance returns the variance of the values at the time the snapshot was
+// taken.
+func (t *TimerSnapshot) Variance() float64 { return t.histogram.Variance() }

diff --git a/vendor/github.com/rcrowley/go-metrics/validate.sh b/vendor/github.com/rcrowley/go-metrics/validate.sh
new file mode 100755
index 0000000..c4ae91e
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/validate.sh

@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -e
+
+# check there are no formatting issues
+GOFMT_LINES=`gofmt -l . | wc -l | xargs`
+test $GOFMT_LINES -eq 0 || echo "gofmt needs to be run, ${GOFMT_LINES} files have issues"
+
+# run the tests for the root package
+go test -race .

diff --git a/vendor/github.com/rcrowley/go-metrics/writer.go b/vendor/github.com/rcrowley/go-metrics/writer.go
new file mode 100644
index 0000000..091e971
--- /dev/null
+++ b/vendor/github.com/rcrowley/go-metrics/writer.go

@@ -0,0 +1,100 @@
+package metrics
+
+import (
+	"fmt"
+	"io"
+	"sort"
+	"time"
+)
+
+// Write sorts writes each metric in the given registry periodically to the
+// given io.Writer.
+func Write(r Registry, d time.Duration, w io.Writer) {
+	for _ = range time.Tick(d) {
+		WriteOnce(r, w)
+	}
+}
+
+// WriteOnce sorts and writes metrics in the given registry to the given
+// io.Writer.
+func WriteOnce(r Registry, w io.Writer) {
+	var namedMetrics namedMetricSlice
+	r.Each(func(name string, i interface{}) {
+		namedMetrics = append(namedMetrics, namedMetric{name, i})
+	})
+
+	sort.Sort(namedMetrics)
+	for _, namedMetric := range namedMetrics {
+		switch metric := namedMetric.m.(type) {
+		case Counter:
+			fmt.Fprintf(w, "counter %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  count:       %9d\n", metric.Count())
+		case Gauge:
+			fmt.Fprintf(w, "gauge %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  value:       %9d\n", metric.Value())
+		case GaugeFloat64:
+			fmt.Fprintf(w, "gauge %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  value:       %f\n", metric.Value())
+		case Healthcheck:
+			metric.Check()
+			fmt.Fprintf(w, "healthcheck %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  error:       %v\n", metric.Error())
+		case Histogram:
+			h := metric.Snapshot()
+			ps := h.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			fmt.Fprintf(w, "histogram %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  count:       %9d\n", h.Count())
+			fmt.Fprintf(w, "  min:         %9d\n", h.Min())
+			fmt.Fprintf(w, "  max:         %9d\n", h.Max())
+			fmt.Fprintf(w, "  mean:        %12.2f\n", h.Mean())
+			fmt.Fprintf(w, "  stddev:      %12.2f\n", h.StdDev())
+			fmt.Fprintf(w, "  median:      %12.2f\n", ps[0])
+			fmt.Fprintf(w, "  75%%:         %12.2f\n", ps[1])
+			fmt.Fprintf(w, "  95%%:         %12.2f\n", ps[2])
+			fmt.Fprintf(w, "  99%%:         %12.2f\n", ps[3])
+			fmt.Fprintf(w, "  99.9%%:       %12.2f\n", ps[4])
+		case Meter:
+			m := metric.Snapshot()
+			fmt.Fprintf(w, "meter %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  count:       %9d\n", m.Count())
+			fmt.Fprintf(w, "  1-min rate:  %12.2f\n", m.Rate1())
+			fmt.Fprintf(w, "  5-min rate:  %12.2f\n", m.Rate5())
+			fmt.Fprintf(w, "  15-min rate: %12.2f\n", m.Rate15())
+			fmt.Fprintf(w, "  mean rate:   %12.2f\n", m.RateMean())
+		case Timer:
+			t := metric.Snapshot()
+			ps := t.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999})
+			fmt.Fprintf(w, "timer %s\n", namedMetric.name)
+			fmt.Fprintf(w, "  count:       %9d\n", t.Count())
+			fmt.Fprintf(w, "  min:         %9d\n", t.Min())
+			fmt.Fprintf(w, "  max:         %9d\n", t.Max())
+			fmt.Fprintf(w, "  mean:        %12.2f\n", t.Mean())
+			fmt.Fprintf(w, "  stddev:      %12.2f\n", t.StdDev())
+			fmt.Fprintf(w, "  median:      %12.2f\n", ps[0])
+			fmt.Fprintf(w, "  75%%:         %12.2f\n", ps[1])
+			fmt.Fprintf(w, "  95%%:         %12.2f\n", ps[2])
+			fmt.Fprintf(w, "  99%%:         %12.2f\n", ps[3])
+			fmt.Fprintf(w, "  99.9%%:       %12.2f\n", ps[4])
+			fmt.Fprintf(w, "  1-min rate:  %12.2f\n", t.Rate1())
+			fmt.Fprintf(w, "  5-min rate:  %12.2f\n", t.Rate5())
+			fmt.Fprintf(w, "  15-min rate: %12.2f\n", t.Rate15())
+			fmt.Fprintf(w, "  mean rate:   %12.2f\n", t.RateMean())
+		}
+	}
+}
+
+type namedMetric struct {
+	name string
+	m    interface{}
+}
+
+// namedMetricSlice is a slice of namedMetrics that implements sort.Interface.
+type namedMetricSlice []namedMetric
+
+func (nms namedMetricSlice) Len() int { return len(nms) }
+
+func (nms namedMetricSlice) Swap(i, j int) { nms[i], nms[j] = nms[j], nms[i] }
+
+func (nms namedMetricSlice) Less(i, j int) bool {
+	return nms[i].name < nms[j].name
+}

diff --git a/vendor/github.com/Sirupsen/logrus/LICENSE b/vendor/github.com/sirupsen/logrus/LICENSE
similarity index 100%
rename from vendor/github.com/Sirupsen/logrus/LICENSE
rename to vendor/github.com/sirupsen/logrus/LICENSE


diff --git a/vendor/github.com/Sirupsen/logrus/hooks.go b/vendor/github.com/sirupsen/logrus/hooks.go
similarity index 100%
rename from vendor/github.com/Sirupsen/logrus/hooks.go
rename to vendor/github.com/sirupsen/logrus/hooks.go
commit	9a2772a1846566b498d4fb1fefa3959d0de823eb	[log] [tgz]
author	Matteo Scandolo <matteo.scandolo@gmail.com>	Mon Nov 19 14:56:26 2018 -0800
committer	Matteo Scandolo <matteo.scandolo@gmail.com>	Mon Dec 10 13:15:52 2018 -0800
tree	996f9cb56ed03fb317229f581b6bdc69ca6d375f
parent	1f49bf54632d764fb201c185fc576511d73626d8 [diff]