VOL-1558 Implementation of openolt adapter with dep for dependency management
Also updated the build system to take this into account.

Currently dep ensure fails due to missing libraries in voltha-go, but the vendor folder has been updated otherwise.
This can be worked around in development using the LOCAL_VOLTHAGO variable described in the readme
This does not build currrently, but that is due to missing code in voltha-go master.

This pattern is consistent with how voltha-go does things, but does not leave you dependent on it to build.

See the readme for how to use dep.

The resourcemanager file is no longer hidden.

Change-Id: I25b8472dbc517b193970597c9f43ddff18c2d89f
diff --git a/Gopkg.lock b/Gopkg.lock
new file mode 100644
index 0000000..6a9680d
--- /dev/null
+++ b/Gopkg.lock
@@ -0,0 +1,420 @@
+# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
+
+
+[[projects]]
+  digest = "1:ed77032e4241e3b8329c9304d66452ed196e795876e14be677a546f36b94e67a"
+  name = "github.com/DataDog/zstd"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "c7161f8c63c045cbc7ca051dcc969dd0e4054de2"
+  version = "v1.3.5"
+
+[[projects]]
+  digest = "1:82a18170c9c41e36939cb5d26da1546b2cfa786aa030a978d3bf183519849230"
+  name = "github.com/Shopify/sarama"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "4602b5a8c6e826f9e0737865818dd43b2339a092"
+  version = "v1.21.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:ef5b0622d834c139454148b8fd0c92bb314828900532b267ae62da9fec109866"
+  name = "github.com/armon/go-metrics"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "f0300d1749da6fa982027e449ec0c7a145510c3c"
+
+[[projects]]
+  digest = "1:526d64d0a3ac6c24875724a9355895be56a21f89a5d3ab5ba88d91244269a7d8"
+  name = "github.com/bsm/sarama-cluster"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "c618e605e15c0d7535f6c96ff8efbb0dba4fd66c"
+  version = "v2.1.15"
+
+[[projects]]
+  digest = "1:bcf36df8d43860bfde913d008301aef27c6e9a303582118a837c4a34c0d18167"
+  name = "github.com/coreos/go-systemd"
+  packages = ["journal"]
+  pruneopts = "UT"
+  revision = "95778dfbb74eb7e4dbaf43bf7d71809650ef8076"
+  version = "v19"
+
+[[projects]]
+  digest = "1:39ff4d4d6baca1ac63c6a03404b761d664be14a71bb5a4aca8fef643d0f66fef"
+  name = "github.com/coreos/pkg"
+  packages = ["capnslog"]
+  pruneopts = "UT"
+  revision = "97fdf19511ea361ae1c100dd393cc47f8dcfa1e1"
+  version = "v4"
+
+[[projects]]
+  digest = "1:ffe9824d294da03b391f44e1ae8281281b4afc1bdaa9588c9097785e3af10cec"
+  name = "github.com/davecgh/go-spew"
+  packages = ["spew"]
+  pruneopts = "UT"
+  revision = "8991bc29aa16c548c550c7ff78260e27b9ab7c73"
+  version = "v1.1.1"
+
+[[projects]]
+  digest = "1:1f0c7ab489b407a7f8f9ad16c25a504d28ab461517a971d341388a56156c1bd7"
+  name = "github.com/eapache/go-resiliency"
+  packages = ["breaker"]
+  pruneopts = "UT"
+  revision = "ea41b0fad31007accc7f806884dcdf3da98b79ce"
+  version = "v1.1.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:79f16588b5576b1b3cd90e48d2374cc9a1a8776862d28d8fd0f23b0e15534967"
+  name = "github.com/eapache/go-xerial-snappy"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "776d5712da21bc4762676d614db1d8a64f4238b0"
+
+[[projects]]
+  digest = "1:444b82bfe35c83bbcaf84e310fb81a1f9ece03edfed586483c869e2c046aef69"
+  name = "github.com/eapache/queue"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "44cc805cf13205b55f69e14bcb69867d1ae92f98"
+  version = "v1.1.0"
+
+[[projects]]
+  digest = "1:4652941215f23c7ad6546bab86964c72248a1134baa53f0df618661344ba5cac"
+  name = "github.com/gogo/protobuf"
+  packages = [
+    "gogoproto",
+    "proto",
+    "protoc-gen-gogo/descriptor",
+  ]
+  pruneopts = "UT"
+  revision = "ba06b47c162d49f2af050fb4c75bcbc86a159d5c"
+  version = "v1.2.1"
+
+[[projects]]
+  digest = "1:f2bc519aefa5511928d70d067ae2828cfa60fd3947cc8d166f070a0ce705f782"
+  name = "github.com/golang/protobuf"
+  packages = [
+    "proto",
+    "protoc-gen-go/descriptor",
+    "ptypes",
+    "ptypes/any",
+    "ptypes/duration",
+    "ptypes/empty",
+    "ptypes/timestamp",
+  ]
+  pruneopts = "UT"
+  revision = "b5d812f8a3706043e23a9cd5babf2e5423744d30"
+  version = "v1.3.1"
+
+[[projects]]
+  digest = "1:e4f5819333ac698d294fe04dbf640f84719658d5c7ce195b10060cc37292ce79"
+  name = "github.com/golang/snappy"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "2a8bb927dd31d8daada140a5d09578521ce5c36a"
+  version = "v0.0.1"
+
+[[projects]]
+  digest = "1:582b704bebaa06b48c29b0cec224a6058a09c86883aaddabde889cd1a5f73e1b"
+  name = "github.com/google/uuid"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "0cd6bf5da1e1c83f8b45653022c74f71af0538a4"
+  version = "v1.1.1"
+
+[[projects]]
+  digest = "1:9537873f2fc92467f451178506405fae087efc624d8d92644042209a03a2bfa0"
+  name = "github.com/hashicorp/consul"
+  packages = ["api"]
+  pruneopts = "UT"
+  revision = "ea5210a30e154f4da9a4c8e729b45b8ce7b9b92c"
+  version = "v1.4.4"
+
+[[projects]]
+  digest = "1:af105c7c5dc0b4ae41991f122cae860b9600f7d226072c2a83127048c991660c"
+  name = "github.com/hashicorp/go-cleanhttp"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "eda1e5db218aad1db63ca4642c8906b26bcf2744"
+  version = "v0.5.1"
+
+[[projects]]
+  digest = "1:2be5a35f0c5b35162c41bb24971e5dcf6ce825403296ee435429cdcc4e1e847e"
+  name = "github.com/hashicorp/go-immutable-radix"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "27df80928bb34bb1b0d6d0e01b9e679902e7a6b5"
+  version = "v1.0.0"
+
+[[projects]]
+  digest = "1:a54ada9beb59fdc35b69322979e870ff0b780e03f4dc309c4c8674b94927df75"
+  name = "github.com/hashicorp/go-rootcerts"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "63503fb4e1eca22f9ae0f90b49c5d5538a0e87eb"
+  version = "v1.0.0"
+
+[[projects]]
+  digest = "1:67474f760e9ac3799f740db2c489e6423a4cde45520673ec123ac831ad849cb8"
+  name = "github.com/hashicorp/golang-lru"
+  packages = ["simplelru"]
+  pruneopts = "UT"
+  revision = "7087cb70de9f7a8bc0a10c375cb0d2280a8edf9c"
+  version = "v0.5.1"
+
+[[projects]]
+  digest = "1:acc81e4e4289587b257ccdfccbc6eaf16d4c2fb57dda73c6bb349bf50f02501f"
+  name = "github.com/hashicorp/serf"
+  packages = ["coordinate"]
+  pruneopts = "UT"
+  revision = "b89a09ebd4b1b570e0076d5097272e67c10ac4f6"
+  version = "v0.8.2"
+
+[[projects]]
+  digest = "1:5d231480e1c64a726869bc4142d270184c419749d34f167646baa21008eb0a79"
+  name = "github.com/mitchellh/go-homedir"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "af06845cf3004701891bf4fdb884bfe4920b3727"
+  version = "v1.1.0"
+
+[[projects]]
+  digest = "1:53bc4cd4914cd7cd52139990d5170d6dc99067ae31c56530621b18b35fc30318"
+  name = "github.com/mitchellh/mapstructure"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "3536a929edddb9a5b34bd6861dc4a9647cb459fe"
+  version = "v1.1.2"
+
+[[projects]]
+  branch = "master"
+  digest = "1:724ce67c43f6b392e3e57a05f566ebe339354a9b3245cba64515769d8506e22f"
+  name = "github.com/opencord/voltha-go"
+  packages = [
+    "adapters",
+    "adapters/common",
+    "common/log",
+    "db/kvstore",
+    "kafka",
+  ]
+  pruneopts = "UT"
+  revision = "c91266ef204b7747a2cbe9bb53005d92d88ea497"
+
+[[projects]]
+  branch = "master"
+  digest = "1:9075f267bf479a68085b08d33b7df7f938b6f813ad51de98d890ea95fc4c81cf"
+  name = "github.com/opencord/voltha-protos"
+  packages = [
+    "go/common",
+    "go/inter_container",
+    "go/omci",
+    "go/openflow_13",
+    "go/openolt",
+    "go/voltha",
+  ]
+  pruneopts = "UT"
+  revision = "61e94873aa4b4cfd5e9277c666402214b031c05b"
+
+[[projects]]
+  digest = "1:d886a3c32c8c1a770d07e36340f061d3afc948d065ffc3c9a19b01b34d4f0b65"
+  name = "github.com/pierrec/lz4"
+  packages = [
+    ".",
+    "internal/xxh32",
+  ]
+  pruneopts = "UT"
+  revision = "315a67e90e415bcdaff33057da191569bf4d8479"
+  version = "v2.1.1"
+
+[[projects]]
+  branch = "master"
+  digest = "1:d38f81081a389f1466ec98192cf9115a82158854d6f01e1c23e2e7554b97db71"
+  name = "github.com/rcrowley/go-metrics"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "3113b8401b8a98917cde58f8bbd42a1b1c03b1fd"
+
+[[projects]]
+  branch = "master"
+  digest = "1:254ca48be90c0a91c39b2be976a8b7a96e59c47e43e892cb8449f279d8951d70"
+  name = "go.etcd.io/etcd"
+  packages = [
+    "auth/authpb",
+    "clientv3",
+    "clientv3/balancer",
+    "clientv3/balancer/picker",
+    "clientv3/balancer/resolver/endpoint",
+    "clientv3/concurrency",
+    "etcdserver/api/v3rpc/rpctypes",
+    "etcdserver/etcdserverpb",
+    "mvcc/mvccpb",
+    "pkg/logutil",
+    "pkg/systemd",
+    "pkg/types",
+    "raft",
+    "raft/raftpb",
+  ]
+  pruneopts = "UT"
+  revision = "a621d807f061e1dd635033a8d6bc261461429e27"
+
+[[projects]]
+  digest = "1:3c1a69cdae3501bf75e76d0d86dc6f2b0a7421bc205c0cb7b96b19eed464a34d"
+  name = "go.uber.org/atomic"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "1ea20fb1cbb1cc08cbd0d913a96dead89aa18289"
+  version = "v1.3.2"
+
+[[projects]]
+  digest = "1:60bf2a5e347af463c42ed31a493d817f8a72f102543060ed992754e689805d1a"
+  name = "go.uber.org/multierr"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "3c4937480c32f4c13a875a1829af76c98ca3d40a"
+  version = "v1.1.0"
+
+[[projects]]
+  digest = "1:c52caf7bd44f92e54627a31b85baf06a68333a196b3d8d241480a774733dcf8b"
+  name = "go.uber.org/zap"
+  packages = [
+    ".",
+    "buffer",
+    "internal/bufferpool",
+    "internal/color",
+    "internal/exit",
+    "zapcore",
+  ]
+  pruneopts = "UT"
+  revision = "ff33455a0e382e8a81d14dd7c922020b6b5e7982"
+  version = "v1.9.1"
+
+[[projects]]
+  branch = "master"
+  digest = "1:a7550c619a7f9701bc2f0b2c866668a207cfb693a2834796f723d2c78b514372"
+  name = "golang.org/x/net"
+  packages = [
+    "context",
+    "http/httpguts",
+    "http2",
+    "http2/hpack",
+    "idna",
+    "internal/timeseries",
+    "trace",
+  ]
+  pruneopts = "UT"
+  revision = "b630fd6fe46bcfc98f989005d8b8ec1400e60a6e"
+
+[[projects]]
+  branch = "master"
+  digest = "1:e18c1645adfe83d66f80d3676040e478895669f9456ca8bafa6a7290656c1b48"
+  name = "golang.org/x/sys"
+  packages = ["unix"]
+  pruneopts = "UT"
+  revision = "81d4e9dc473e5e8c933f2aaeba2a3d81efb9aed2"
+
+[[projects]]
+  digest = "1:a2ab62866c75542dd18d2b069fec854577a20211d7c0ea6ae746072a1dccdd18"
+  name = "golang.org/x/text"
+  packages = [
+    "collate",
+    "collate/build",
+    "internal/colltab",
+    "internal/gen",
+    "internal/tag",
+    "internal/triegen",
+    "internal/ucd",
+    "language",
+    "secure/bidirule",
+    "transform",
+    "unicode/bidi",
+    "unicode/cldr",
+    "unicode/norm",
+    "unicode/rangetable",
+  ]
+  pruneopts = "UT"
+  revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0"
+  version = "v0.3.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:08aea4eff69cb424f2ce0d69f88c2f0d351fc44e82cab057ddf3946f41f02108"
+  name = "google.golang.org/genproto"
+  packages = [
+    "googleapis/api/annotations",
+    "googleapis/rpc/status",
+  ]
+  pruneopts = "UT"
+  revision = "f467c93bbac2133ff463e1f93d18d8f9f3f04451"
+
+[[projects]]
+  digest = "1:a0bdff6a1dd68e7fa6edbffe3bef8e061db79543975d1c53afcae3a8d8a6c445"
+  name = "google.golang.org/grpc"
+  packages = [
+    ".",
+    "balancer",
+    "balancer/base",
+    "balancer/roundrobin",
+    "binarylog/grpc_binarylog_v1",
+    "codes",
+    "connectivity",
+    "credentials",
+    "credentials/internal",
+    "encoding",
+    "encoding/proto",
+    "grpclog",
+    "health/grpc_health_v1",
+    "internal",
+    "internal/backoff",
+    "internal/binarylog",
+    "internal/channelz",
+    "internal/envconfig",
+    "internal/grpcrand",
+    "internal/grpcsync",
+    "internal/syscall",
+    "internal/transport",
+    "keepalive",
+    "metadata",
+    "naming",
+    "peer",
+    "resolver",
+    "resolver/dns",
+    "resolver/passthrough",
+    "stats",
+    "status",
+    "tap",
+  ]
+  pruneopts = "UT"
+  revision = "3507fb8e1a5ad030303c106fef3a47c9fdad16ad"
+  version = "v1.19.1"
+
+[[projects]]
+  digest = "1:82a18170c9c41e36939cb5d26da1546b2cfa786aa030a978d3bf183519849230"
+  name = "gopkg.in/Shopify/sarama.v1"
+  packages = ["."]
+  pruneopts = "UT"
+  revision = "4602b5a8c6e826f9e0737865818dd43b2339a092"
+  version = "v1.21.0"
+
+[solve-meta]
+  analyzer-name = "dep"
+  analyzer-version = 1
+  input-imports = [
+    "github.com/gogo/protobuf/proto",
+    "github.com/golang/protobuf/ptypes",
+    "github.com/opencord/voltha-go/adapters",
+    "github.com/opencord/voltha-go/adapters/common",
+    "github.com/opencord/voltha-go/common/log",
+    "github.com/opencord/voltha-go/db/kvstore",
+    "github.com/opencord/voltha-go/kafka",
+    "github.com/opencord/voltha-protos/go/common",
+    "github.com/opencord/voltha-protos/go/inter_container",
+    "github.com/opencord/voltha-protos/go/openflow_13",
+    "github.com/opencord/voltha-protos/go/openolt",
+    "github.com/opencord/voltha-protos/go/voltha",
+    "google.golang.org/grpc",
+  ]
+  solver-name = "gps-cdcl"
+  solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
new file mode 100644
index 0000000..413989e
--- /dev/null
+++ b/Gopkg.toml
@@ -0,0 +1,54 @@
+# Gopkg.toml example
+#
+# Refer to https://golang.github.io/dep/docs/Gopkg.toml.html
+# for detailed Gopkg.toml documentation.
+#
+# required = ["github.com/user/thing/cmd/thing"]
+# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"]
+#
+# [[constraint]]
+#   name = "github.com/user/project"
+#   version = "1.0.0"
+#
+# [[constraint]]
+#   name = "github.com/user/project2"
+#   branch = "dev"
+#   source = "github.com/myfork/project2"
+#
+# [[override]]
+#   name = "github.com/x/y"
+#   version = "2.4.0"
+#
+# [prune]
+#   non-go = false
+#   go-tests = true
+#   unused-packages = true
+
+
+[[constraint]]
+  name = "github.com/gogo/protobuf"
+  version = "1.2.1"
+
+[[constraint]]
+  name = "github.com/golang/protobuf"
+  version = "1.3.1"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/opencord/voltha-go"
+
+[[constraint]]
+  branch = "master"
+  name = "github.com/opencord/voltha-protos"
+
+[[constraint]]
+  name = "google.golang.org/grpc"
+  version = "1.19.1"
+
+[[override]]
+  branch = "master"
+  name = "go.etcd.io/etcd"
+
+[prune]
+  go-tests = true
+  unused-packages = true
diff --git a/Makefile b/Makefile
index 08fddff..f272c3c 100644
--- a/Makefile
+++ b/Makefile
@@ -82,6 +82,15 @@
 containers: adapter_openolt_go
 
 adapter_openolt_go:
+ifdef LOCAL_PROTOS
+	mkdir -p vendor/github.com/opencord/voltha-protos/go
+	cp -rf ${GOPATH}/src/github.com/opencord/voltha-protos/go/ vendor/github.com/opencord/voltha-protos
+endif
+ifdef LOCAL_VOLTHAGO
+	mkdir -p vendor/github.com/opencord/voltha-go/
+	cp -rf ${GOPATH}/src/github.com/opencord/voltha-go/ vendor/github.com/opencord/
+	rm -rf vendor/github.com/opencord/voltha-go/vendor
+endif
 	docker build $(DOCKER_BUILD_ARGS) -t ${REGISTRY}${REPOSITORY}voltha-openolt-adapter-go:${TAG} -f docker/Dockerfile.openolt .
 
 # end file
diff --git a/README.md b/README.md
index 0c3609b..77d9f22 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,30 @@
 
 Assuming the VOLTHA2.0 environment is made using the quickstart.md in voltha-go.
 
+Dependencies are committed to the repos as per current standard practice.
+If you need to update them you can do so with dep. This includes the voltha-protos and voltha-go dependencies.
+
+Ensure your GOPATH variable is set according to the quickstart
+Create a symbolic link in the $GOPATH/src tree to the voltha-openolt-adapter repository:
+```sh
+mkdir -p $GOPATH/src/github.com/opencord
+ln -s ~/repos/voltha-openolt-adapter $GOPATH/src/github.com/opencord/voltha-openolt-adapter
 ```
+
+Install dep for fetching dependencies
+```sh
+mkdir -p $GOPATH/bin
+curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
+
+Pull and build dependencies for the project.  This may take a while.  This may likely update versions of 3rd party packages in the vendor folder.   This may introduce unexpected changes.   If everything still works feel free to check in the vendor updates as needed.
+
+cd $GOPATH/src/github.com/opencord/voltha-go/
+dep ensure
+
+If you are using a custom local copy of protos or voltha-go
+
+Just export LOCAL_PROTOS=true or LOCAL_VOLTHAGO=true to use that instead, assuming you have set them up on GOPATH. See the quickstart.
+
 cd ~/source/voltha-openolt-adapter
 ```
 
diff --git a/adaptercore/device_handler.go b/adaptercore/device_handler.go
index b626242..1c2fdf9 100644
--- a/adaptercore/device_handler.go
+++ b/adaptercore/device_handler.go
@@ -33,7 +33,7 @@
 	ic "github.com/opencord/voltha-protos/go/inter_container"
 	of "github.com/opencord/voltha-protos/go/openflow_13"
 	oop "github.com/opencord/voltha-protos/go/openolt"
-        rsrcMgr "github.com/opencord/voltha-go/adapters/openolt/adaptercore/resourcemanager"
+	rsrcMgr "github.com/opencord/voltha-openolt-adapter/adaptercore/resourcemanager"
 	"github.com/opencord/voltha-protos/go/voltha"
 	"google.golang.org/grpc"
 )
diff --git a/adaptercore/openolt_flowmgr.go b/adaptercore/openolt_flowmgr.go
index 8d90fd8..18efb33 100644
--- a/adaptercore/openolt_flowmgr.go
+++ b/adaptercore/openolt_flowmgr.go
@@ -23,8 +23,7 @@
 	"errors"
 	"fmt"
 	"math/big"
-
-	rsrcMgr "github.com/opencord/voltha-go/adapters/openolt/adaptercore/resourcemanager"
+	rsrcMgr "github.com/opencord/voltha-openolt-adapter/adaptercore/resourcemanager"
 	"github.com/opencord/voltha-go/common/log"
 	tp "github.com/opencord/voltha-go/common/techprofile"
 	fd "github.com/opencord/voltha-go/rw_core/flow_decomposition"
diff --git a/docker/Dockerfile.openolt b/docker/Dockerfile.openolt
index c975591..ed20905 100644
--- a/docker/Dockerfile.openolt
+++ b/docker/Dockerfile.openolt
@@ -6,44 +6,22 @@
 # Install required packages
 RUN apk add --no-cache wget git make build-base protobuf protobuf-dev
 
-# Install protobuf requirements
-RUN git clone https://github.com/googleapis/googleapis.git /usr/local/include/googleapis
-RUN go get google.golang.org/genproto/googleapis/api/annotations
-
 # Prepare directory structure
-RUN ["mkdir", "-p", "/src", "src/protos"]
+RUN ["mkdir", "-p", "/src"]
 RUN ["mkdir", "-p", "$GOPATH/src", "$GOPATH/pkg", "$GOPATH/bin"]
 RUN ["mkdir", "-p", "$GOPATH/src/github.com/opencord"]
-RUN ["mkdir", "-p", "$GOPATH/src/github.com/opencord/voltha-go"]
+RUN ["mkdir", "-p", "$GOPATH/src/github.com/opencord/voltha-openolt-adapter"]
 
-WORKDIR $GOPATH/src/github.com/opencord
+WORKDIR $GOPATH/src/github.com/opencord/voltha-openolt-adapter
 
-RUN git clone https://gerrit.opencord.org/voltha-go.git
-
-WORKDIR $GOPATH/src/github.com/opencord/voltha-go
-
-
-# Copy files
-ADD adaptercore ./adapters/openolt/adaptercore
-ADD config      ./adapters/openolt/config
-ADD *.go        ./adapters/openolt
-
-RUN ls ./adapters/openolt
-
-# Copy required proto files
-ADD openolt.proto ./protos
-
-# Install the protoc-gen-go
-RUN go install ./vendor/github.com/golang/protobuf/protoc-gen-go
-
-# Compile protobuf files
-RUN sh protos/scripts/build_protos.sh protos
-
-RUN protoc --go_out=Mgoogle/protobuf/descriptor.proto=github.com/golang/protobuf/protoc-gen-go/descriptor,plugins=grpc:$GOPATH/src -I protos -I /usr/local/include/googleapis  protos/openolt.proto
+COPY adaptercore ./adaptercore
+COPY config ./config
+COPY vendor ./vendor
+COPY *.go ./
 
 # Build openolt
+RUN go build -o /src/openolt
 
-RUN cd adapters/openolt && go build -o /src/openolt
 
 # -------------
 # Image creation stage
diff --git a/main.go b/main.go
index b349fa8..e4f22fa 100644
--- a/main.go
+++ b/main.go
@@ -21,8 +21,8 @@
 	"fmt"
 	"github.com/opencord/voltha-go/adapters"
 	com "github.com/opencord/voltha-go/adapters/common"
-	ac "github.com/opencord/voltha-go/adapters/openolt/adaptercore"
-	"github.com/opencord/voltha-go/adapters/openolt/config"
+	ac "github.com/opencord/voltha-openolt-adapter/adaptercore"
+	"github.com/opencord/voltha-openolt-adapter/config"
 	"github.com/opencord/voltha-go/common/log"
 	"github.com/opencord/voltha-go/db/kvstore"
 	"github.com/opencord/voltha-go/kafka"
diff --git a/openolt.proto b/openolt.proto
deleted file mode 100644
index 709105b..0000000
--- a/openolt.proto
+++ /dev/null
@@ -1,560 +0,0 @@
-// Copyright (c) 2018 Open Networking Foundation

-//

-// Licensed under the Apache License, Version 2.0 (the "License");

-// you may not use this file except in compliance with the License.

-// You may obtain a copy of the License at:

-//

-//     http://www.apache.org/licenses/LICENSE-2.0

-//

-// Unless required by applicable law or agreed to in writing, software

-// distributed under the License is distributed on an "AS IS" BASIS,

-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-// See the License for the specific language governing permissions and

-// limitations under the License.

-

-syntax = "proto3";

-

-option go_package = "github.com/opencord/voltha-go/protos/openolt";

-

-package openolt;

-

-import "google/api/annotations.proto";

-

-service Openolt {

-

-    rpc DisableOlt(Empty) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/Disable"

-          body: "*"

-        };

-    }

-

-    rpc ReenableOlt(Empty) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/Reenable"

-          body: "*"

-        };

-    }

-

-    rpc ActivateOnu(Onu) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/EnableOnu"

-          body: "*"

-        };

-    }

-

-    rpc DeactivateOnu(Onu) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/DisableOnu"

-          body: "*"

-        };

-    }

-

-    rpc DeleteOnu(Onu) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/DeleteOnu"

-          body: "*"

-        };

-    }

-

-    rpc OmciMsgOut(OmciMsg) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/OmciMsgOut"

-          body: "*"

-        };

-    }

-

-    rpc OnuPacketOut(OnuPacket) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/OnuPacketOut"

-          body: "*"

-        };

-    }

-

-    rpc UplinkPacketOut(UplinkPacket) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/UplinkPacketOut"

-          body: "*"

-        };

-    }

-

-    rpc FlowAdd(Flow) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/FlowAdd"

-          body: "*"

-        };

-    }

-

-    rpc FlowRemove(Flow) returns (Empty) {

-        option (google.api.http) = {

-          post: "/v1/FlowRemove"

-          body: "*"

-        };

-    }

-

-    rpc HeartbeatCheck(Empty) returns (Heartbeat) {

-        option (google.api.http) = {

-          post: "/v1/HeartbeatCheck"

-          body: "*"

-        };

-    }

-

-    rpc EnablePonIf(Interface) returns (Empty) {

-        option (google.api.http) = {

-            post: "/v1/EnablePonIf"

-            body: "*"

-        };

-    }

-

-    rpc DisablePonIf(Interface) returns (Empty) {

-        option (google.api.http) = {

-            post: "/v1/DisablePonIf"

-            body: "*"

-        };

-    }

-

-    rpc GetDeviceInfo(Empty) returns (DeviceInfo) {

-        option (google.api.http) = {

-            post: "/v1/GetDeviceInfo"

-            body: "*"

-        };

-    }

-

-    rpc Reboot(Empty) returns (Empty) {

-         option (google.api.http) = {

-            post: "/v1/Reboot"

-            body: "*"

-        };

-    }

-

-    rpc CollectStatistics(Empty) returns (Empty) {

-        option (google.api.http) = {

-            post: "/v1/CollectStatistics"

-            body: "*"

-        };

-    }

-

-    rpc CreateTconts(Tconts) returns (Empty) {

-        option (google.api.http) = {

-            post: "/v1/CreateTconts"

-            body: "*"

-        };

-    }

-

-    rpc RemoveTconts(Tconts) returns (Empty) {

-        option (google.api.http) = {

-            post: "/v1/RemoveTconts"

-            body: "*"

-        };

-    }

-

-    rpc EnableIndication(Empty) returns (stream Indication) {}

-}

-

-message Indication {

-    oneof data {

-        OltIndication olt_ind = 1;

-        IntfIndication intf_ind = 2;

-        IntfOperIndication intf_oper_ind = 3;

-        OnuDiscIndication onu_disc_ind = 4;

-        OnuIndication onu_ind = 5;

-        OmciIndication omci_ind = 6;

-        PacketIndication pkt_ind = 7;

-        PortStatistics port_stats = 8;

-        FlowStatistics flow_stats = 9;

-        AlarmIndication alarm_ind= 10;

-    }

-}

-

-message AlarmIndication {

-    oneof data {

-        LosIndication los_ind = 1;

-        DyingGaspIndication dying_gasp_ind = 2;

-        OnuAlarmIndication onu_alarm_ind = 3;

-        OnuStartupFailureIndication onu_startup_fail_ind = 4;

-        OnuSignalDegradeIndication onu_signal_degrade_ind = 5;

-        OnuDriftOfWindowIndication onu_drift_of_window_ind = 6;

-        OnuLossOfOmciChannelIndication onu_loss_omci_ind = 7;

-        OnuSignalsFailureIndication onu_signals_fail_ind = 8;

-        OnuTransmissionInterferenceWarning onu_tiwi_ind = 9;

-        OnuActivationFailureIndication onu_activation_fail_ind = 10;

-        OnuProcessingErrorIndication onu_processing_error_ind = 11;

-    }

-}

-

-message OltIndication {

-    string oper_state = 1;	// up, down

-}

-

-message IntfIndication {

-    fixed32 intf_id = 1;

-    string oper_state = 2;      // up, down

-}

-

-message OnuDiscIndication {

-    fixed32 intf_id = 1;

-    SerialNumber serial_number = 2;

-}

-

-message OnuIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string oper_state = 3;      // up, down

-    string admin_state = 5;     // up, down

-    SerialNumber serial_number = 4;

-}

-

-message IntfOperIndication {

-    string type = 1;		// nni, pon

-    fixed32 intf_id = 2;

-    string oper_state = 3;      // up, down

-}

-

-message OmciIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    bytes pkt = 3;

-}

-

-message PacketIndication {

-    string intf_type = 5;		// nni, pon, unknown

-    fixed32 intf_id = 1;

-    fixed32 gemport_id = 2;

-    fixed32 flow_id = 3;

-    fixed32 port_no = 6;

-    fixed64 cookie = 7;

-    bytes pkt = 4;

-}

-

-message Interface {

-    fixed32 intf_id = 1;

-}

-

-message Heartbeat {

-    fixed32 heartbeat_signature = 1;

-}

-

-message Onu {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    SerialNumber serial_number = 3;

-    fixed32 pir = 4;   // peak information rate assigned to onu

-}

-

-message OmciMsg {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    bytes pkt = 3;

-}

-

-message OnuPacket {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    fixed32 port_no = 4;

-    bytes pkt = 3;

-}

-

-message UplinkPacket {

-    fixed32 intf_id = 1;

-    bytes pkt = 2;

-}

-

-message DeviceInfo {

-    string vendor = 1;

-    string model = 2;

-    string hardware_version = 3;

-    string firmware_version = 4;

-    string device_id = 16;

-    string device_serial_number = 17;

-

-    // Total number of pon intf ports on the device

-    fixed32 pon_ports = 12;

-

-    // If using global per-device technology profile. To be deprecated

-    string technology = 5;

-    fixed32 onu_id_start = 6;

-    fixed32 onu_id_end = 7;

-    fixed32 alloc_id_start = 8;

-    fixed32 alloc_id_end = 9;

-    fixed32 gemport_id_start = 10;

-    fixed32 gemport_id_end = 11;

-    fixed32 flow_id_start = 13;

-    fixed32 flow_id_end = 14;

-

-    message DeviceResourceRanges {

-

-        // List of 0 or more intf_ids that use the same technology and pools.

-        // If 0 intf_ids supplied, it implies ALL interfaces

-        repeated fixed32 intf_ids = 1;

-

-        // Technology profile for this pool

-        string technology = 2;

-

-        message Pool {

-            enum PoolType {

-                ONU_ID = 0;

-                ALLOC_ID = 1;

-                GEMPORT_ID = 2;

-                FLOW_ID = 3;

-            }

-

-            enum SharingType {

-                DEDICATED_PER_INTF = 0;

-                SHARED_BY_ALL_INTF_ALL_TECH = 1; // Shared across all interfaces in all technologies in all ranges

-                SHARED_BY_ALL_INTF_SAME_TECH = 2; // Shared across all interfaces of the same technology used in this range

-            }

-

-            PoolType type = 1;

-	    SharingType sharing = 2;

-	    fixed32 start = 3; // lower bound on IDs allocated from this pool

-	    fixed32 end = 4; // upper bound on IDs allocated from this pool

-	}

-        repeated Pool pools = 3;

-    }

-    repeated DeviceResourceRanges ranges = 15;

-}

-

-message Classifier {

-    fixed32 o_tpid = 1;

-    fixed32 o_vid = 2;

-    fixed32 i_tpid = 3;

-    fixed32 i_vid = 4;

-    fixed32 o_pbits = 5;

-    fixed32 i_pbits = 6;

-    fixed32 eth_type = 7;

-    bytes dst_mac = 8;

-    bytes src_mac = 9;

-    fixed32 ip_proto = 10;

-    fixed32 dst_ip = 11;

-    fixed32 src_ip = 12;

-    fixed32 src_port = 13;

-    fixed32 dst_port = 14;

-    string pkt_tag_type = 15;	// untagged, single_tag, double_tag

-}

-

-message ActionCmd {

-    bool add_outer_tag = 1;

-    bool remove_outer_tag = 2;

-    bool trap_to_host = 3;

-}

-

-message Action {

-    ActionCmd cmd = 1;

-    fixed32 o_vid = 2;

-    fixed32 o_pbits = 3;

-    fixed32 o_tpid = 4;

-    fixed32 i_vid = 5;

-    fixed32 i_pbits = 6;

-    fixed32 i_tpid = 7;

-}

-

-message Flow {

-    sfixed32 access_intf_id = 1;

-    sfixed32 onu_id = 2;

-    sfixed32 uni_id = 11;

-    fixed32 flow_id = 3;

-    string flow_type = 4;	// upstream, downstream, broadcast, multicast

-    sfixed32 alloc_id = 10;

-    sfixed32 network_intf_id = 5;

-    sfixed32 gemport_id = 6;

-    Classifier classifier = 7;

-    Action action = 8;

-    sfixed32 priority = 9;

-    fixed64 cookie = 12; // must be provided for any flow with trap_to_host action. Returned in PacketIndication

-    fixed32 port_no = 13; // must be provided for any flow with trap_to_host action. Returned in PacketIndication

-}

-

-message SerialNumber {

-    bytes vendor_id = 1;

-    bytes vendor_specific = 2;

-}

-

-message PortStatistics {

-    fixed32 intf_id = 1;

-    fixed64 rx_bytes = 2;

-    fixed64 rx_packets = 3;

-    fixed64 rx_ucast_packets = 4;

-    fixed64 rx_mcast_packets = 5;

-    fixed64 rx_bcast_packets = 6;

-    fixed64 rx_error_packets = 7;

-    fixed64 tx_bytes = 8;

-    fixed64 tx_packets = 9;

-    fixed64 tx_ucast_packets = 10;

-    fixed64 tx_mcast_packets = 11;

-    fixed64 tx_bcast_packets = 12;

-    fixed64 tx_error_packets = 13;

-    fixed64 rx_crc_errors = 14;

-    fixed64 bip_errors = 15;

-    fixed32 timestamp = 16;

-}

-

-message FlowStatistics {

-    fixed32 flow_id = 1;

-    fixed64 rx_bytes = 2;

-    fixed64 rx_packets = 3;

-    fixed64 tx_bytes = 8;

-    fixed64 tx_packets = 9;

-    fixed32 timestamp = 16;

-}

-

-message LosIndication {

-    fixed32 intf_id = 1;

-    string status = 2;

-}

-

-message DyingGaspIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-}

-

-message OnuAlarmIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string los_status = 3;

-    string lob_status = 4;

-    string lopc_miss_status = 5;

-    string lopc_mic_error_status = 6;

-}

-

-message OnuStartupFailureIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-}

-

-message OnuSignalDegradeIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-    fixed32 inverse_bit_error_rate = 4;

-}

-

-message OnuDriftOfWindowIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-    fixed32 drift = 4;

-    fixed32 new_eqd = 5;

-}

-

-message OnuLossOfOmciChannelIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-}

-

-message OnuSignalsFailureIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-    fixed32 inverse_bit_error_rate = 4;

-}

-

-message OnuTransmissionInterferenceWarning {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    string status = 3;

-    fixed32 drift = 4;

-}

-

-message OnuActivationFailureIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-}

-

-message OnuProcessingErrorIndication {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-}

-

-enum Direction {

-    UPSTREAM = 0;

-    DOWNSTREAM = 1;

-    BIDIRECTIONAL = 2;

-}

-

-enum SchedulingPolicy {

-    WRR = 0;

-    StrictPriority = 1;

-    Hybrid = 2;

-}

-

-enum AdditionalBW {

-    AdditionalBW_None = 0;

-    AdditionalBW_NA = 1;

-    AdditionalBW_BestEffort = 2;

-    AdditionalBW_Auto = 3;

-}

-

-enum DiscardPolicy {

-    TailDrop = 0;

-    WTailDrop = 1;

-    Red = 2;

-    WRed = 3;

-}

-

-enum InferredAdditionBWIndication {

-    InferredAdditionBWIndication_None = 0;

-    InferredAdditionBWIndication_Assured = 1;

-    InferredAdditionBWIndication_BestEffort = 2;

-}

-

-message Scheduler {

-    Direction direction = 1;

-    AdditionalBW additional_bw = 2; // Valid on for “direction == Upstream”.

-    fixed32 priority = 3;

-    fixed32 weight = 4;

-    SchedulingPolicy sched_policy = 5;

-}

-

-message TrafficShapingInfo {

-    fixed32 cir = 1;

-    fixed32 cbs = 2;

-    fixed32 pir = 3;

-    fixed32 pbs = 4;

-    fixed32 gir = 5; // only if “direction == Upstream ”

-    InferredAdditionBWIndication add_bw_ind = 6; // only if “direction == Upstream”

-}

-

-message Tcont {

-    Direction direction = 1;

-    fixed32 alloc_id = 2; // valid only if “direction == Upstream ”

-    Scheduler scheduler = 3;

-    TrafficShapingInfo traffic_shaping_info = 4;

-}

-

-message Tconts {

-    fixed32 intf_id = 1;

-    fixed32 onu_id = 2;

-    fixed32 uni_id = 4;

-    fixed32 port_no = 5;

-    repeated Tcont tconts = 3;

-}

-

-message TailDropDiscardConfig {

-    fixed32 queue_size = 1;

-}

-

-message RedDiscardConfig {

-    fixed32 min_threshold = 1;

-    fixed32 max_threshold = 2;

-    fixed32 max_probability = 3;

-}

-

-message WRedDiscardConfig {

-    RedDiscardConfig green = 1;

-    RedDiscardConfig yellow = 2;

-    RedDiscardConfig red = 3;

-}

-

-message DiscardConfig {

-    DiscardPolicy discard_policy = 1;

-    oneof discard_config {

-        TailDropDiscardConfig tail_drop_discard_config = 2;

-        RedDiscardConfig red_discard_config = 3;

-        WRedDiscardConfig wred_discard_config = 4;

-    }

-}

-

-message Empty {}

diff --git a/vendor/github.com/DataDog/zstd/.travis.yml b/vendor/github.com/DataDog/zstd/.travis.yml
new file mode 100644
index 0000000..c5aa33d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/.travis.yml
@@ -0,0 +1,30 @@
+language: go
+
+go:
+  - 1.9.x
+  - 1.10.x
+  - 1.11.x
+
+os:
+  - linux
+  - osx
+
+matrix:
+  include:
+    name: "Go 1.11.x CentOS 32bits"
+    language: go
+    go: 1.11.x
+    os: linux
+    services:
+      - docker
+    script:
+      # Please update Go version in travis_test_32 as needed
+      - "docker run -i -v \"${PWD}:/zstd\" toopher/centos-i386:centos6 /bin/bash -c \"linux32 --32bit i386 /zstd/travis_test_32.sh\""
+
+install:
+  - "wget https://github.com/DataDog/zstd/files/2246767/mr.zip"
+  - "unzip mr.zip"
+script:
+  - "go build"
+  - "PAYLOAD=`pwd`/mr go test -v"
+  - "PAYLOAD=`pwd`/mr go test -bench ."
diff --git a/vendor/github.com/DataDog/zstd/LICENSE b/vendor/github.com/DataDog/zstd/LICENSE
new file mode 100644
index 0000000..345c1eb
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/LICENSE
@@ -0,0 +1,27 @@
+Simplified BSD License
+
+Copyright (c) 2016, Datadog <info@datadoghq.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/DataDog/zstd/README.md b/vendor/github.com/DataDog/zstd/README.md
new file mode 100644
index 0000000..6c02e16
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/README.md
@@ -0,0 +1,120 @@
+# Zstd Go Wrapper
+
+[C Zstd Homepage](https://github.com/Cyan4973/zstd)
+
+The current headers and C files are from *v1.3.4* (Commit
+[2555975](https://github.com/facebook/zstd/releases/tag/v1.3.4)).
+
+## Usage
+
+There are two main APIs:
+
+* simple Compress/Decompress
+* streaming API (io.Reader/io.Writer)
+
+The compress/decompress APIs mirror that of lz4, while the streaming API was
+designed to be a drop-in replacement for zlib.
+
+### Simple `Compress/Decompress`
+
+
+```go
+// Compress compresses the byte array given in src and writes it to dst.
+// If you already have a buffer allocated, you can pass it to prevent allocation
+// If not, you can pass nil as dst.
+// If the buffer is too small, it will be reallocated, resized, and returned bu the function
+// If dst is nil, this will allocate the worst case size (CompressBound(src))
+Compress(dst, src []byte) ([]byte, error)
+```
+
+```go
+// CompressLevel is the same as Compress but you can pass another compression level
+CompressLevel(dst, src []byte, level int) ([]byte, error)
+```
+
+```go
+// Decompress will decompress your payload into dst.
+// If you already have a buffer allocated, you can pass it to prevent allocation
+// If not, you can pass nil as dst (allocates a 4*src size as default).
+// If the buffer is too small, it will retry 3 times by doubling the dst size
+// After max retries, it will switch to the slower stream API to be sure to be able
+// to decompress. Currently switches if compression ratio > 4*2**3=32.
+Decompress(dst, src []byte) ([]byte, error)
+```
+
+### Stream API
+
+```go
+// NewWriter creates a new object that can optionally be initialized with
+// a precomputed dictionary. If dict is nil, compress without a dictionary.
+// The dictionary array should not be changed during the use of this object.
+// You MUST CALL Close() to write the last bytes of a zstd stream and free C objects.
+NewWriter(w io.Writer) *Writer
+NewWriterLevel(w io.Writer, level int) *Writer
+NewWriterLevelDict(w io.Writer, level int, dict []byte) *Writer
+
+// Write compresses the input data and write it to the underlying writer
+(w *Writer) Write(p []byte) (int, error)
+
+// Close flushes the buffer and frees C zstd objects
+(w *Writer) Close() error
+```
+
+```go
+// NewReader returns a new io.ReadCloser that will decompress data from the
+// underlying reader.  If a dictionary is provided to NewReaderDict, it must
+// not be modified until Close is called.  It is the caller's responsibility
+// to call Close, which frees up C objects.
+NewReader(r io.Reader) io.ReadCloser
+NewReaderDict(r io.Reader, dict []byte) io.ReadCloser
+```
+
+### Benchmarks (benchmarked with v0.5.0)
+
+The author of Zstd also wrote lz4. Zstd is intended to occupy a speed/ratio
+level similar to what zlib currently provides.  In our tests, the can always
+be made to be better than zlib by chosing an appropriate level while still
+keeping compression and decompression time faster than zlib.
+
+You can run the benchmarks against your own payloads by using the Go benchmarks tool.
+Just export your payload filepath as the `PAYLOAD` environment variable and run the benchmarks:
+
+```go
+go test -bench .
+```
+
+Compression of a 7Mb pdf zstd (this wrapper) vs [czlib](https://github.com/DataDog/czlib):
+```
+BenchmarkCompression               5     221056624 ns/op      67.34 MB/s
+BenchmarkDecompression           100      18370416 ns/op     810.32 MB/s
+
+BenchmarkFzlibCompress             2     610156603 ns/op      24.40 MB/s
+BenchmarkFzlibDecompress          20      81195246 ns/op     183.33 MB/s
+```
+
+Ratio is also better by a margin of ~20%.
+Compression speed is always better than zlib on all the payloads we tested;
+However, [czlib](https://github.com/DataDog/czlib) has optimisations that make it
+faster at decompressiong small payloads:
+
+```
+Testing with size: 11... czlib: 8.97 MB/s, zstd: 3.26 MB/s
+Testing with size: 27... czlib: 23.3 MB/s, zstd: 8.22 MB/s
+Testing with size: 62... czlib: 31.6 MB/s, zstd: 19.49 MB/s
+Testing with size: 141... czlib: 74.54 MB/s, zstd: 42.55 MB/s
+Testing with size: 323... czlib: 155.14 MB/s, zstd: 99.39 MB/s
+Testing with size: 739... czlib: 235.9 MB/s, zstd: 216.45 MB/s
+Testing with size: 1689... czlib: 116.45 MB/s, zstd: 345.64 MB/s
+Testing with size: 3858... czlib: 176.39 MB/s, zstd: 617.56 MB/s
+Testing with size: 8811... czlib: 254.11 MB/s, zstd: 824.34 MB/s
+Testing with size: 20121... czlib: 197.43 MB/s, zstd: 1339.11 MB/s
+Testing with size: 45951... czlib: 201.62 MB/s, zstd: 1951.57 MB/s
+```
+
+zstd starts to shine with payloads > 1KB
+
+### Stability - Current state: STABLE
+
+The C library seems to be pretty stable and according to the author has been tested and fuzzed.
+
+For the Go wrapper, the test cover most usual cases and we have succesfully tested it on all staging and prod data.
diff --git a/vendor/github.com/DataDog/zstd/ZSTD_LICENSE b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE
new file mode 100644
index 0000000..a793a80
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE
@@ -0,0 +1,30 @@
+BSD License
+
+For Zstandard software
+
+Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/DataDog/zstd/bitstream.h b/vendor/github.com/DataDog/zstd/bitstream.h
new file mode 100644
index 0000000..f7f389f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/bitstream.h
@@ -0,0 +1,471 @@
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2017, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "mem.h"            /* unaligned access routines */
+#include "error_private.h"  /* error codes and messages */
+
+
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(BIT_DEBUG) && (BIT_DEBUG>=1)
+#  include <assert.h>
+#else
+#  ifndef assert
+#    define assert(condition) ((void)0)
+#  endif
+#endif
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct
+{
+    size_t bitContainer;
+    unsigned bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+    const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+    assert(val != 0);
+    {
+#   if defined(_MSC_VER)   /* Visual */
+        unsigned long r=0;
+        _BitScanReverse ( &r, val );
+        return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+        return 31 - __builtin_clz (val);
+#   else   /* Software version */
+        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
+                                                 11, 14, 16, 18, 22, 25,  3, 30,
+                                                  8, 12, 20, 28, 15, 17, 24,  7,
+                                                 19, 27, 23,  6, 26,  5,  4, 31 };
+        U32 v = val;
+        v |= v >> 1;
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+#   endif
+    }
+}
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {
+    0,          1,         3,         7,         0xF,       0x1F,
+    0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
+    0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
+    0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
+    0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(size_t)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+    if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+/*! BIT_addBits() :
+ *  can add up to 31 bits into `bitC`.
+ *  Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+                            size_t value, unsigned nbBits)
+{
+    MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+    assert(nbBits < BIT_MASK_SIZE);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+                                size_t value, unsigned nbBits)
+{
+    assert((value>>nbBits) == 0);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    assert(bitC->ptr <= bitC->endPtr);
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* endMark */
+    BIT_flushBits(bitC);
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+    return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+*  bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ *  Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                /* fall-through */
+
+        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                /* fall-through */
+
+        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                /* fall-through */
+
+        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+                /* fall-through */
+
+        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+                /* fall-through */
+
+        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                /* fall-through */
+
+        default: break;
+        }
+        {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
+        }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+{
+    return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008  /* experimental */
+#  if defined(__x86_64__)
+    if (sizeof(bitContainer)==8)
+        return _bextr_u64(bitContainer, start, nbBits);
+    else
+#  endif
+        return _bextr_u32(bitContainer, start, nbBits);
+#else
+    assert(nbBits < BIT_MASK_SIZE);
+    return (bitContainer >> start) & BIT_mask[nbBits];
+#endif
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    assert(nbBits >= 1);
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Pay attention to not read more than nbBits contained into local register.
+ * @return : extracted value. */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_readBitsFast() :
+ *  unsafe version; only works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    assert(nbBits >= 1);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_reloadDStream() :
+ *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+ *  This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->limitPtr) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
diff --git a/vendor/github.com/DataDog/zstd/compiler.h b/vendor/github.com/DataDog/zstd/compiler.h
new file mode 100644
index 0000000..e90a3bc
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/compiler.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+/* force inlining */
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to elimininate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+#  define HINT_INLINE static INLINE_KEYWORD
+#else
+#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  ifdef __GNUC__
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0  /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if (defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \
+      && (defined(__x86_64__) || defined(_M_X86)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/* prefetch */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#elif defined(__GNUC__)
+#  define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+#else
+#  define PREFETCH(ptr)   /* disabled */
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */
diff --git a/vendor/github.com/DataDog/zstd/cover.c b/vendor/github.com/DataDog/zstd/cover.c
new file mode 100644
index 0000000..b5a3957
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/cover.c
@@ -0,0 +1,1048 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/*-*************************************
+*  Constants
+***************************************/
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+/*-*************************************
+* Hash table
+***************************************
+* A small specialized hash map for storing activeDmers.
+* The map does not resize, so if it becomes full it will loop forever.
+* Thus, the map must be large enough to store every value.
+* The map implements linear probing and keeps its load less than 0.5.
+*/
+
+#define MAP_EMPTY_VALUE ((U32)-1)
+typedef struct COVER_map_pair_t_s {
+  U32 key;
+  U32 value;
+} COVER_map_pair_t;
+
+typedef struct COVER_map_s {
+  COVER_map_pair_t *data;
+  U32 sizeLog;
+  U32 size;
+  U32 sizeMask;
+} COVER_map_t;
+
+/**
+ * Clear the map.
+ */
+static void COVER_map_clear(COVER_map_t *map) {
+  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
+}
+
+/**
+ * Initializes a map of the given size.
+ * Returns 1 on success and 0 on failure.
+ * The map must be destroyed with COVER_map_destroy().
+ * The map is only guaranteed to be large enough to hold size elements.
+ */
+static int COVER_map_init(COVER_map_t *map, U32 size) {
+  map->sizeLog = ZSTD_highbit32(size) + 2;
+  map->size = (U32)1 << map->sizeLog;
+  map->sizeMask = map->size - 1;
+  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
+  if (!map->data) {
+    map->sizeLog = 0;
+    map->size = 0;
+    return 0;
+  }
+  COVER_map_clear(map);
+  return 1;
+}
+
+/**
+ * Internal hash function
+ */
+static const U32 prime4bytes = 2654435761U;
+static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
+  return (key * prime4bytes) >> (32 - map->sizeLog);
+}
+
+/**
+ * Helper function that returns the index that a key should be placed into.
+ */
+static U32 COVER_map_index(COVER_map_t *map, U32 key) {
+  const U32 hash = COVER_map_hash(map, key);
+  U32 i;
+  for (i = hash;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *pos = &map->data[i];
+    if (pos->value == MAP_EMPTY_VALUE) {
+      return i;
+    }
+    if (pos->key == key) {
+      return i;
+    }
+  }
+}
+
+/**
+ * Returns the pointer to the value for key.
+ * If key is not in the map, it is inserted and the value is set to 0.
+ * The map must not be full.
+ */
+static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
+  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
+  if (pos->value == MAP_EMPTY_VALUE) {
+    pos->key = key;
+    pos->value = 0;
+  }
+  return &pos->value;
+}
+
+/**
+ * Deletes key from the map if present.
+ */
+static void COVER_map_remove(COVER_map_t *map, U32 key) {
+  U32 i = COVER_map_index(map, key);
+  COVER_map_pair_t *del = &map->data[i];
+  U32 shift = 1;
+  if (del->value == MAP_EMPTY_VALUE) {
+    return;
+  }
+  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
+    COVER_map_pair_t *const pos = &map->data[i];
+    /* If the position is empty we are done */
+    if (pos->value == MAP_EMPTY_VALUE) {
+      del->value = MAP_EMPTY_VALUE;
+      return;
+    }
+    /* If pos can be moved to del do so */
+    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
+      del->key = pos->key;
+      del->value = pos->value;
+      del = pos;
+      shift = 1;
+    } else {
+      ++shift;
+    }
+  }
+}
+
+/**
+ * Destroyes a map that is inited with COVER_map_init().
+ */
+static void COVER_map_destroy(COVER_map_t *map) {
+  if (map->data) {
+    free(map->data);
+  }
+  map->data = NULL;
+  map->size = 0;
+}
+
+/*-*************************************
+* Context
+***************************************/
+
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  U32 *suffix;
+  size_t suffixSize;
+  U32 *freqs;
+  U32 *dmerAt;
+  unsigned d;
+} COVER_ctx_t;
+
+/* We need a global context for qsort... */
+static COVER_ctx_t *g_ctx = NULL;
+
+/*-*************************************
+*  Helper functions
+***************************************/
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t sum = 0;
+  size_t i;
+  for (i = 0; i < nbSamples; ++i) {
+    sum += samplesSizes[i];
+  }
+  return sum;
+}
+
+/**
+ * Returns -1 if the dmer at lp is less than the dmer at rp.
+ * Return 0 if the dmers at lp and rp are equal.
+ * Returns 1 if the dmer at lp is greater than the dmer at rp.
+ */
+static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  U32 const lhs = *(U32 const *)lp;
+  U32 const rhs = *(U32 const *)rp;
+  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
+}
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
+  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
+  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
+  if (lhs < rhs) {
+    return -1;
+  }
+  return (lhs > rhs);
+}
+
+/**
+ * Same as COVER_cmp() except ties are broken by pointer value
+ * NOTE: g_ctx must be set to call this function.  A global is required because
+ * qsort doesn't take an opaque pointer.
+ */
+static int COVER_strict_cmp(const void *lp, const void *rp) {
+  int result = COVER_cmp(g_ctx, lp, rp);
+  if (result == 0) {
+    result = lp < rp ? -1 : 1;
+  }
+  return result;
+}
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_strict_cmp8(const void *lp, const void *rp) {
+  int result = COVER_cmp8(g_ctx, lp, rp);
+  if (result == 0) {
+    result = lp < rp ? -1 : 1;
+  }
+  return result;
+}
+
+/**
+ * Returns the first pointer in [first, last) whose element does not compare
+ * less than value.  If no such element exists it returns last.
+ */
+static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
+                                       size_t value) {
+  size_t count = last - first;
+  while (count != 0) {
+    size_t step = count / 2;
+    const size_t *ptr = first;
+    ptr += step;
+    if (*ptr < value) {
+      first = ++ptr;
+      count -= step + 1;
+    } else {
+      count = step;
+    }
+  }
+  return first;
+}
+
+/**
+ * Generic groupBy function.
+ * Groups an array sorted by cmp into groups with equivalent values.
+ * Calls grp for each group.
+ */
+static void
+COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
+              int (*cmp)(COVER_ctx_t *, const void *, const void *),
+              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
+  const BYTE *ptr = (const BYTE *)data;
+  size_t num = 0;
+  while (num < count) {
+    const BYTE *grpEnd = ptr + size;
+    ++num;
+    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
+      grpEnd += size;
+      ++num;
+    }
+    grp(ctx, ptr, grpEnd);
+    ptr = grpEnd;
+  }
+}
+
+/*-*************************************
+*  Cover functions
+***************************************/
+
+/**
+ * Called on each group of positions with the same dmer.
+ * Counts the frequency of each dmer and saves it in the suffix array.
+ * Fills `ctx->dmerAt`.
+ */
+static void COVER_group(COVER_ctx_t *ctx, const void *group,
+                        const void *groupEnd) {
+  /* The group consists of all the positions with the same first d bytes. */
+  const U32 *grpPtr = (const U32 *)group;
+  const U32 *grpEnd = (const U32 *)groupEnd;
+  /* The dmerId is how we will reference this dmer.
+   * This allows us to map the whole dmer space to a much smaller space, the
+   * size of the suffix array.
+   */
+  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
+  /* Count the number of samples this dmer shows up in */
+  U32 freq = 0;
+  /* Details */
+  const size_t *curOffsetPtr = ctx->offsets;
+  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
+  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
+   * different sample than the last.
+   */
+  size_t curSampleEnd = ctx->offsets[0];
+  for (; grpPtr != grpEnd; ++grpPtr) {
+    /* Save the dmerId for this position so we can get back to it. */
+    ctx->dmerAt[*grpPtr] = dmerId;
+    /* Dictionaries only help for the first reference to the dmer.
+     * After that zstd can reference the match from the previous reference.
+     * So only count each dmer once for each sample it is in.
+     */
+    if (*grpPtr < curSampleEnd) {
+      continue;
+    }
+    freq += 1;
+    /* Binary search to find the end of the sample *grpPtr is in.
+     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
+     * search because the loop is over.
+     */
+    if (grpPtr + 1 != grpEnd) {
+      const size_t *sampleEndPtr =
+          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
+      curSampleEnd = *sampleEndPtr;
+      curOffsetPtr = sampleEndPtr + 1;
+    }
+  }
+  /* At this point we are never going to look at this segment of the suffix
+   * array again.  We take advantage of this fact to save memory.
+   * We store the frequency of the dmer in the first position of the group,
+   * which is dmerId.
+   */
+  ctx->suffix[dmerId] = freq;
+}
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ * Selects the best segment in an epoch.
+ * Segments of are scored according to the function:
+ *
+ * Let F(d) be the frequency of dmer d.
+ * Let S_i be the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer d is in the dictionay we set F(d) = 0.
+ */
+static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
+                                           COVER_map_t *activeDmers, U32 begin,
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 dmersInK = k - d + 1;
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+  /* Reset the activeDmers in the segment */
+  COVER_map_clear(activeDmers);
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* The dmerId for the dmer at the next position */
+    U32 newDmer = ctx->dmerAt[activeSegment.end];
+    /* The entry in activeDmers for this dmerId */
+    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
+    /* If the dmer isn't already present in the segment add its score. */
+    if (*newDmerOcc == 0) {
+      /* The paper suggest using the L-0.5 norm, but experiments show that it
+       * doesn't help.
+       */
+      activeSegment.score += freqs[newDmer];
+    }
+    /* Add the dmer to the segment */
+    activeSegment.end += 1;
+    *newDmerOcc += 1;
+
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      U32 delDmer = ctx->dmerAt[activeSegment.begin];
+      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
+      activeSegment.begin += 1;
+      *delDmerOcc -= 1;
+      /* If this is the last occurence of the dmer, subtract its score */
+      if (*delDmerOcc == 0) {
+        COVER_map_remove(activeDmers, delDmer);
+        activeSegment.score -= freqs[delDmer];
+      }
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+  {
+    /* Trim off the zero frequency head and tail from the segment. */
+    U32 newBegin = bestSegment.end;
+    U32 newEnd = bestSegment.begin;
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      U32 freq = freqs[ctx->dmerAt[pos]];
+      if (freq != 0) {
+        newBegin = MIN(newBegin, pos);
+        newEnd = pos + 1;
+      }
+    }
+    bestSegment.begin = newBegin;
+    bestSegment.end = newEnd;
+  }
+  {
+    /* Zero out the frequency of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      freqs[ctx->dmerAt[pos]] = 0;
+    }
+  }
+  return bestSegment;
+}
+
+/**
+ * Check the validity of the parameters.
+ * Returns non-zero if the parameters are valid and 0 otherwise.
+ */
+static int COVER_checkParameters(ZDICT_cover_params_t parameters,
+                                 size_t maxDictSize) {
+  /* k and d are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  return 1;
+}
+
+/**
+ * Clean up a context initialized with `COVER_ctx_init()`.
+ */
+static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+  if (ctx->suffix) {
+    free(ctx->suffix);
+    ctx->suffix = NULL;
+  }
+  if (ctx->freqs) {
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+  }
+  if (ctx->dmerAt) {
+    free(ctx->dmerAt);
+    ctx->dmerAt = NULL;
+  }
+  if (ctx->offsets) {
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+  }
+}
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can used multiple
+ * times.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `COVER_ctx_destroy()`.
+ */
+static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples,
+                          unsigned d) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
+               (U32)totalSamplesSize);
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  /* Partial suffix array */
+  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* Maps index to the dmerID */
+  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    COVER_ctx_destroy(ctx);
+    return 0;
+  }
+  ctx->freqs = NULL;
+  ctx->d = d;
+
+  /* Fill offsets from the samlesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
+  {
+    /* suffix is a partial suffix array.
+     * It only sorts suffixes by their first parameters.d bytes.
+     * The sort is stable, so each dmer group is sorted by position in input.
+     */
+    U32 i;
+    for (i = 0; i < ctx->suffixSize; ++i) {
+      ctx->suffix[i] = i;
+    }
+    /* qsort doesn't take an opaque pointer, so pass as a global */
+    g_ctx = ctx;
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+  }
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  /* For each dmer group (group of positions with the same first d bytes):
+   * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
+   *    (groupBeginPtr - suffix).  This allows us to go from position to
+   *    dmerID so we can look up values in freq.
+   * 2. We calculate how many samples the dmer occurs in and save it in
+   *    freqs[dmerId].
+   */
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
+                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
+  ctx->freqs = ctx->suffix;
+  ctx->suffix = NULL;
+  return 1;
+}
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
+                                    COVER_map_t *activeDmers, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_cover_params_t parameters) {
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
+  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = COVER_selectSegment(
+        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
+  COVER_ctx_t ctx;
+  COVER_map_t activeDmers;
+
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
+  /* Checks */
+  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
+    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  /* Initialize context and activeDmers */
+  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                      parameters.d)) {
+    return ERROR(GENERIC);
+  }
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    COVER_ctx_destroy(&ctx);
+    return ERROR(GENERIC);
+  }
+
+  DISPLAYLEVEL(2, "Building dictionary\n");
+  {
+    const size_t tail =
+        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
+                              dictBufferCapacity, parameters);
+    const size_t dictionarySize = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+    if (!ZSTD_isError(dictionarySize)) {
+      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                   (U32)dictionarySize);
+    }
+    COVER_ctx_destroy(&ctx);
+    COVER_map_destroy(&activeDmers);
+    return dictionarySize;
+  }
+}
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+static void COVER_best_init(COVER_best_t *best) {
+  if (best==NULL) return; /* compatible with init on NULL */
+  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
+  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
+  best->liveJobs = 0;
+  best->dict = NULL;
+  best->dictSize = 0;
+  best->compressedSize = (size_t)-1;
+  memset(&best->parameters, 0, sizeof(best->parameters));
+}
+
+/**
+ * Wait until liveJobs == 0.
+ */
+static void COVER_best_wait(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  while (best->liveJobs != 0) {
+    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
+  }
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+static void COVER_best_destroy(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  COVER_best_wait(best);
+  if (best->dict) {
+    free(best->dict);
+  }
+  ZSTD_pthread_mutex_destroy(&best->mutex);
+  ZSTD_pthread_cond_destroy(&best->cond);
+}
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+static void COVER_best_start(COVER_best_t *best) {
+  if (!best) {
+    return;
+  }
+  ZSTD_pthread_mutex_lock(&best->mutex);
+  ++best->liveJobs;
+  ZSTD_pthread_mutex_unlock(&best->mutex);
+}
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                              ZDICT_cover_params_t parameters, void *dict,
+                              size_t dictSize) {
+  if (!best) {
+    return;
+  }
+  {
+    size_t liveJobs;
+    ZSTD_pthread_mutex_lock(&best->mutex);
+    --best->liveJobs;
+    liveJobs = best->liveJobs;
+    /* If the new dictionary is better */
+    if (compressedSize < best->compressedSize) {
+      /* Allocate space if necessary */
+      if (!best->dict || best->dictSize < dictSize) {
+        if (best->dict) {
+          free(best->dict);
+        }
+        best->dict = malloc(dictSize);
+        if (!best->dict) {
+          best->compressedSize = ERROR(GENERIC);
+          best->dictSize = 0;
+          return;
+        }
+      }
+      /* Save the dictionary, parameters, and size */
+      memcpy(best->dict, dict, dictSize);
+      best->dictSize = dictSize;
+      best->parameters = parameters;
+      best->compressedSize = compressedSize;
+    }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
+    if (liveJobs == 0) {
+      ZSTD_pthread_cond_broadcast(&best->cond);
+    }
+  }
+}
+
+/**
+ * Parameters for COVER_tryParameters().
+ */
+typedef struct COVER_tryParameters_data_s {
+  const COVER_ctx_t *ctx;
+  COVER_best_t *best;
+  size_t dictBufferCapacity;
+  ZDICT_cover_params_t parameters;
+} COVER_tryParameters_data_t;
+
+/**
+ * Tries a set of parameters and upates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void COVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
+  const COVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Allocate space for hash table, dict, and freqs */
+  COVER_map_t activeDmers;
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
+  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
+    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
+    goto _cleanup;
+  }
+  if (!dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
+                                              dictBufferCapacity, parameters);
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  {
+    /* Pointers */
+    ZSTD_CCtx *cctx;
+    ZSTD_CDict *cdict;
+    void *dst;
+    /* Local variables */
+    size_t dstCapacity;
+    size_t i;
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {
+      size_t maxSampleSize = 0;
+      for (i = 0; i < ctx->nbSamples; ++i) {
+        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
+      }
+      dstCapacity = ZSTD_compressBound(maxSampleSize);
+      dst = malloc(dstCapacity);
+    }
+    /* Create the cctx and cdict */
+    cctx = ZSTD_createCCtx();
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
+    if (!dst || !cctx || !cdict) {
+      goto _compressCleanup;
+    }
+    /* Compress each sample and sum their sizes (or error) */
+    totalCompressedSize = dictBufferCapacity;
+    for (i = 0; i < ctx->nbSamples; ++i) {
+      const size_t size = ZSTD_compress_usingCDict(
+          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
+          ctx->samplesSizes[i], cdict);
+      if (ZSTD_isError(size)) {
+        totalCompressedSize = ERROR(GENERIC);
+        goto _compressCleanup;
+      }
+      totalCompressedSize += size;
+    }
+  _compressCleanup:
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    if (dst) {
+      free(dst);
+    }
+  }
+
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  COVER_map_destroy(&activeDmers);
+  if (dict) {
+    free(dict);
+  }
+  if (freqs) {
+    free(freqs);
+  }
+}
+
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters) {
+  /* constants */
+  const unsigned nbThreads = parameters->nbThreads;
+  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+  const unsigned kIterations =
+      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+  /* Local variables */
+  const int displayLevel = parameters->zParams.notificationLevel;
+  unsigned iteration = 1;
+  unsigned d;
+  unsigned k;
+  COVER_best_t best;
+  POOL_ctx *pool = NULL;
+
+  /* Checks */
+  if (kMinK < kMaxD || kMaxK < kMinK) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
+  if (nbSamples == 0) {
+    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
+    return ERROR(GENERIC);
+  }
+  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                 ZDICT_DICTSIZE_MIN);
+    return ERROR(dstSize_tooSmall);
+  }
+  if (nbThreads > 1) {
+    pool = POOL_create(nbThreads, 1);
+    if (!pool) {
+      return ERROR(memory_allocation);
+    }
+  }
+  /* Initialization */
+  COVER_best_init(&best);
+  /* Turn down global display level to clean up display at level 2 and below */
+  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+  /* Loop through d first because each new value needs a new context */
+  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                    kIterations);
+  for (d = kMinD; d <= kMaxD; d += 2) {
+    /* Initialize the context for this value of d */
+    COVER_ctx_t ctx;
+    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return ERROR(GENERIC);
+    }
+    /* Loop through k reusing the same context */
+    for (k = kMinK; k <= kMaxK; k += kStepSize) {
+      /* Prepare the arguments */
+      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
+          sizeof(COVER_tryParameters_data_t));
+      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+      if (!data) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+        COVER_best_destroy(&best);
+        COVER_ctx_destroy(&ctx);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      data->ctx = &ctx;
+      data->best = &best;
+      data->dictBufferCapacity = dictBufferCapacity;
+      data->parameters = *parameters;
+      data->parameters.k = k;
+      data->parameters.d = d;
+      data->parameters.steps = kSteps;
+      data->parameters.zParams.notificationLevel = g_displayLevel;
+      /* Check the parameters */
+      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
+        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
+        free(data);
+        continue;
+      }
+      /* Call the function and pass ownership of data to it */
+      COVER_best_start(&best);
+      if (pool) {
+        POOL_add(pool, &COVER_tryParameters, data);
+      } else {
+        COVER_tryParameters(data);
+      }
+      /* Print status */
+      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                         (U32)((iteration * 100) / kIterations));
+      ++iteration;
+    }
+    COVER_best_wait(&best);
+    COVER_ctx_destroy(&ctx);
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+  /* Fill the output buffer and parameters with output of the best parameters */
+  {
+    const size_t dictSize = best.dictSize;
+    if (ZSTD_isError(best.compressedSize)) {
+      const size_t compressedSize = best.compressedSize;
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return compressedSize;
+    }
+    *parameters = best.parameters;
+    memcpy(dictBuffer, best.dict, dictSize);
+    COVER_best_destroy(&best);
+    POOL_free(pool);
+    return dictSize;
+  }
+}
diff --git a/vendor/github.com/DataDog/zstd/cpu.h b/vendor/github.com/DataDog/zstd/cpu.h
new file mode 100755
index 0000000..4eb48e3
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/cpu.h
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#ifdef _MSC_VER
+    int reg[4];
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block like the normal cpuid branch below, but gcc
+     * reserves ebx for use of its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1)
+          :);
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\r"
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
diff --git a/vendor/github.com/DataDog/zstd/divsufsort.c b/vendor/github.com/DataDog/zstd/divsufsort.c
new file mode 100644
index 0000000..60cceb0
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/divsufsort.c
@@ -0,0 +1,1913 @@
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Compiler specifics -*/
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4244)
+#  pragma warning(disable : 4127)    /* C4127 : Condition expression is constant */
+#endif
+
+
+/*- Dependencies -*/
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "divsufsort.h"
+
+/*- Constants -*/
+#if defined(INLINE)
+# undef INLINE
+#endif
+#if !defined(INLINE)
+# define INLINE __inline
+#endif
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256)
+#endif
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96)
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#define SS_SMERGE_STACKSIZE (32)
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+
+/*- Macros -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+static const int lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+int
+ss_ilg(int n) {
+#if SS_BLOCKSIZE == 0
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];
+#else
+  return (n & 0xff00) ?
+          8 + lg_table[(n >> 8) & 0xff] :
+          0 + lg_table[(n >> 0) & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+static const int sqq_table[256] = {
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+static INLINE
+int
+ss_isqrt(int x) {
+  int y, e;
+
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  e = (x & 0xffff0000) ?
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
+    y = (y + 1 + x / y) >> 1;
+  } else if(e >= 8) {
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    return sqq_table[x] >> 4;
+  }
+
+  return (x < (y * y)) ? y - 1 : y;
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares two suffixes. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *U1, *U2, *U1n, *U2n;
+
+  for(U1 = T + depth + *p1,
+      U2 = T + depth + *p2,
+      U1n = T + *(p1 + 1) + 2,
+      U2n = T + *(p2 + 1) + 2;
+      (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
+      ++U1, ++U2) {
+  }
+
+  return U1 < U1n ?
+        (U2 < U2n ? *U1 - *U2 : 1) :
+        (U2 < U2n ? -1 : 0);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertionsort for small size groups */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  for(i = last - 2; first <= i; --i) {
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
+      if(last <= j) { break; }
+    }
+    if(r == 0) { *j = ~*j; }
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *t;
+  if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
+  if(Td[PA[*v2]] > Td[PA[*v3]]) {
+    if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return ss_median3(Td, PA, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = ss_median3(Td, PA, first, first + t, first + (t << 1));
+  middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+  last   = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }
+    if(b <= a) { break; }
+    t = ~*b;
+    *b = *a;
+    *a = t;
+  }
+  if(first < a) { *first = ~*first; }
+  return a;
+}
+
+/* Multikey introsort for medium size groups. */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);
+      continue;
+    }
+
+    Td = T + depth;
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+    if(limit < 0) {
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      if((a - first) <= (last - a)) {
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {
+      c = b - 1;
+
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+      a = first + (b - a), c = last - (d - c);
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {
+      limit += 1;
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) {
+  int t;
+  for(; 0 < n; --n, ++a, ++b) {
+    t = *a, *a = *b, *b = t;
+  }
+}
+
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) {
+  int *a, *b, t;
+  int l, r;
+  l = middle - first, r = last - middle;
+  for(; (0 < l) && (0 < r);) {
+    if(l == r) { ss_blockswap(first, middle, l); break; }
+    if(l < r) {
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+    else                { x = 0; p = PA +  *(last - 1); }
+    for(a = first, len = middle - first, half = len >> 1, r = -1;
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      if(r == 0) { *a = ~*a; }
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    --last;
+    if(x != 0) { while(*--last < 0) { } }
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;
+  ss_blockswap(buf, first, middle - first);
+
+  for(t = *(a = first), b = buf, c = middle;;) {
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;
+  ss_blockswap(buf, middle, last - middle);
+
+  x = 0;
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* D&C based merge. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    if((last - middle) <= bufsize) {
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    if((middle - first) <= bufsize) {
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      if((l - first) <= (last - r)) {
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;
+  } else {
+    middle = last, limit = 0;
+  }
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    curbufsize = last - (a + SS_BLOCKSIZE);
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+int
+tr_ilg(int n) {
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups. */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r;
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; }
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = ISAd[SA[k = j++]];
+    if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *t;
+  if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
+  if(ISAd[*v2] > ISAd[*v3]) {
+    if(ISAd[*v1] > ISAd[*v3]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
+  if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+  if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+  if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return tr_median3(ISAd, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = tr_median3(ISAd, first, first + t, first + (t << 1));
+  middle = tr_median3(ISAd, middle - t, middle, middle + t);
+  last   = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return tr_median3(ISAd, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+  int chance;
+  int remain;
+  int incval;
+  int count;
+};
+
+static INLINE
+void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance;
+  budget->remain = budget->incval = incval;
+}
+
+static INLINE
+int
+trbudget_check(trbudget_t *budget, int size) {
+  if(size <= budget->remain) { budget->remain -= size; return 1; }
+  if(budget->chance == 0) { budget->count += size; return 0; }
+  budget->remain += budget->incval - size;
+  budget->chance -= 1;
+  return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last,
+             int **pa, int **pb, int v) {
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+  }
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+  for(; b < c;) {
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) {
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; }
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last;
+}
+
+static
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* sort suffixes of middle partition
+     by using sorted order of suffixes of left and right partition. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA;
+    }
+  }
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+static
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) {
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1;
+  lastrank = -1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+
+  lastrank = -1;
+  for(e = d; first <= e; --e) {
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+static
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) {
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
+  int *a, *b, *c;
+  int t;
+  int v, x = 0;
+  int incr = ISAd - ISA;
+  int limit, next;
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) {
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c;
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
+          first = a;
+        }
+        if(first < last) {
+          a = first; do { *a = ~*a; } while(*++a < 0);
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3;
+      continue;
+    }
+
+    if(limit-- == 0) {
+      tr_heapsort(ISAd, first, last - first);
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) {
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else {
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted;
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; }
+      else {
+        if(skip != 0) { *(first + skip) = skip; skip = 0; }
+        last = SA + ISA[t] + 1;
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; }
+          else { skip = first - last; }
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*. */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n, int openMP) {
+  int *PAb, *ISAb, *buf;
+#ifdef LIBBSC_OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef LIBBSC_OPENMP
+  int d0, d1;
+#endif
+  (void)openMP;
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m;
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    PAb = SA + n - m; ISAb = SA + m;
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef LIBBSC_OPENMP
+    if (openMP)
+    {
+        buf = SA + m;
+        c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(bufsize, curbuf, k, l, d0, d1)
+        {
+          bufsize = (n - (2 * m)) / omp_get_num_threads();
+          curbuf = buf + omp_get_thread_num() * bufsize;
+          k = 0;
+          for(;;) {
+            #pragma omp critical(sssort_lock)
+            {
+              if(0 < (l = j)) {
+                d0 = c0, d1 = c1;
+                do {
+                  k = BUCKET_BSTAR(d0, d1);
+                  if(--d1 <= d0) {
+                    d1 = ALPHABET_SIZE - 1;
+                    if(--d0 < 0) { break; }
+                  }
+                } while(((l - k) <= 1) && (0 < (l = k)));
+                c0 = d0, c1 = d1, j = k;
+              }
+            }
+            if(l == 0) { break; }
+            sssort(T, PAb, SA + k, SA + l,
+                   curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+          }
+        }
+    }
+    else
+    {
+        buf = SA + m, bufsize = n - (2 * m);
+        for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+          for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+            i = BUCKET_BSTAR(c0, c1);
+            if(1 < (j - i)) {
+              sssort(T, PAb, SA + i, SA + j,
+                     buf, bufsize, 2, n, *(SA + i) == (m - 1));
+            }
+          }
+        }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m);
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) {
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j;
+        if(i <= 0) { break; }
+      }
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of tyoe B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array by using the sorted order of type B* suffixes. */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s;
+          c0 = T[--s];
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s;
+    }
+  }
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes. */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0;
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes. */
+static
+int
+construct_BWT_indexes(const unsigned char *T, int *SA,
+                      int *bucket_A, int *bucket_B,
+                      int n, int m,
+                      unsigned char * num_indexes, int * indexes) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  int mod = n / 8;
+  {
+      mod |= mod >> 1;  mod |= mod >> 2;
+      mod |= mod >> 4;  mod |= mod >> 8;
+      mod |= mod >> 16; mod >>= 1;
+
+      *num_indexes = (unsigned char)((n - 1) / (mod + 1));
+  }
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = j - SA;
+
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  if (T[n - 2] < c2) {
+    if (((n - 1) & mod) == 0) indexes[(n - 1) / (mod + 1) - 1] = k - SA;
+    *k++ = ~((int)T[n - 2]);
+  }
+  else {
+    *k++ = n - 1;
+  }
+
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+
+      if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = i - SA;
+
+      c0 = T[--s];
+      *i = c0;
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      if((0 < s) && (T[s - 1] < c0)) {
+          if ((s & mod) == 0) indexes[s / (mod + 1) - 1] = k - SA;
+          *k++ = ~((int)T[s - 1]);
+      } else
+        *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP) {
+  int *bucket_A, *bucket_B;
+  int m;
+  int err = 0;
+
+  /* Check arguments. */
+  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
+  else if(n == 0) { return 0; }
+  else if(n == 1) { SA[0] = 0; return 0; }
+  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
+
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Suffixsort. */
+  if((bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, SA, bucket_A, bucket_B, n, openMP);
+    construct_SA(T, SA, bucket_A, bucket_B, n, m);
+  } else {
+    err = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+
+  return err;
+}
+
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP) {
+  int *B;
+  int *bucket_A, *bucket_B;
+  int m, pidx, i;
+
+  /* Check arguments. */
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); }
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Burrows-Wheeler Transform. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n, openMP);
+
+    if (num_indexes == NULL || indexes == NULL) {
+        pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+    } else {
+        pidx = construct_BWT_indexes(T, B, bucket_A, bucket_B, n, m, num_indexes, indexes);
+    }
+
+    /* Copy to output string. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; }
+    for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; }
+    pidx += 1;
+  } else {
+    pidx = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); }
+
+  return pidx;
+}
diff --git a/vendor/github.com/DataDog/zstd/divsufsort.h b/vendor/github.com/DataDog/zstd/divsufsort.h
new file mode 100644
index 0000000..5440994
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/divsufsort.h
@@ -0,0 +1,67 @@
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string. (can be T)
+ * @param A [0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/vendor/github.com/DataDog/zstd/entropy_common.c b/vendor/github.com/DataDog/zstd/entropy_common.c
new file mode 100644
index 0000000..b37a082
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/entropy_common.c
@@ -0,0 +1,221 @@
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
+#include "huf.h"
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) & (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0 += 24;
+                if (ip < iend-5) {
+                    ip += 2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount   += 16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0 += 3;
+                bitStream >>= 2;
+                bitCount += 2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            } else {
+                bitStream >>= 2;
+        }   }
+        {   int const max = (2*threshold-1) - remaining;
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= count < 0 ? -count : count;   /* -1 means +1 */
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(corruption_detected);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BIT_highbit32(rest);
+            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
diff --git a/vendor/github.com/DataDog/zstd/error_private.c b/vendor/github.com/DataDog/zstd/error_private.c
new file mode 100644
index 0000000..d004ee6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/error_private.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+    static const char* const notErrorCode = "Unspecified error code";
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
+        /* following error codes are not stable and may be removed or changed in a future version */
+    case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+    case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+    case PREFIX(maxCode):
+    default: return notErrorCode;
+    }
+}
diff --git a/vendor/github.com/DataDog/zstd/error_private.h b/vendor/github.com/DataDog/zstd/error_private.h
new file mode 100644
index 0000000..0d2fa7e
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/error_private.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>        /* size_t */
+#include "zstd_errors.h"  /* enum list */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
diff --git a/vendor/github.com/DataDog/zstd/errors.go b/vendor/github.com/DataDog/zstd/errors.go
new file mode 100644
index 0000000..38db0d5
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/errors.go
@@ -0,0 +1,35 @@
+package zstd
+
+/*
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+*/
+import "C"
+
+// ErrorCode is an error returned by the zstd library.
+type ErrorCode int
+
+// Error returns the error string given by zstd
+func (e ErrorCode) Error() string {
+	return C.GoString(C.ZSTD_getErrorName(C.size_t(e)))
+}
+
+func cIsError(code int) bool {
+	return int(C.ZSTD_isError(C.size_t(code))) != 0
+}
+
+// getError returns an error for the return code, or nil if it's not an error
+func getError(code int) error {
+	if code < 0 && cIsError(code) {
+		return ErrorCode(code)
+	}
+	return nil
+}
+
+// IsDstSizeTooSmallError returns whether the error correspond to zstd standard sDstSizeTooSmall error
+func IsDstSizeTooSmallError(e error) bool {
+	if e != nil && e.Error() == "Destination buffer is too small" {
+		return true
+	}
+	return false
+}
diff --git a/vendor/github.com/DataDog/zstd/fse.h b/vendor/github.com/DataDog/zstd/fse.h
new file mode 100644
index 0000000..6a1d272
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse.h
@@ -0,0 +1,704 @@
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define FSE_PUBLIC_API
+#endif
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSE_compress() :
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
+    @return : size of compressed data (<= dstCapacity).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSE_isError() .
+
+    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+    Both parameters can be defined as '0' to mean : use default value
+    @return : size of compressed data
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_count():
+    Provides the precise count of each byte within a table 'count'.
+    'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+    *maxSymbolValuePtr will be updated if detected smaller than initial value.
+    @return : the count of the most frequent symbol (which is not identified).
+              if return == srcSize, there is only one symbol.
+              Can also return an error code, which can be tested with FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+    It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+    @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+    normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+    'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+    @return : tableLog,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+    Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+    Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+    Compactly save 'normalizedCounter' into 'buffer'.
+    @return : size of the compressed table,
+              or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*! Constructor and Destructor of FSE_CTable.
+    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+    Builds `ct`, which must be already allocated, using FSE_createCTable().
+    @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+    Compress `src` using `ct` into `dst` which must be already allocated.
+    @return : size of compressed data (<= `dstCapacity`),
+              or 0 if compressed data could not fit into `dst`,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSE_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+    Builds 'dt', which must be already allocated, using FSE_createDTable().
+    return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+#endif  /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned
+ */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace);
+
+/** FSE_countFast() :
+ *  same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr
+ */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of minimum `1024` unsigned
+ */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace);
+
+/*! FSE_count_simple() :
+ * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+typedef enum {
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their body are included in next section.
+*/
+typedef struct {
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = FSE_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/vendor/github.com/DataDog/zstd/fse_compress.c b/vendor/github.com/DataDog/zstd/fse_compress.c
new file mode 100644
index 0000000..cb8f1fa
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse_compress.c
@@ -0,0 +1,849 @@
+/* ******************************************************************
+   FSE : Finite State Entropy encoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+    U32 highThreshold = tableSize-1;
+
+    /* CTable header */
+    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* For explanations on how to distribute symbol values over the table :
+    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u<=maxSymbolValue+1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+        }   }
+        cumul[maxSymbolValue+1] = tableSize+1;
+    }
+
+    /* Spread symbols */
+    {   U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            int nbOccurences;
+            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0: break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                symbolTT[s].deltaFindState = total - 1;
+                total ++;
+                break;
+            default :
+                {
+                    U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+                    U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                    total +=  normalizedCounter[s];
+    }   }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];   /* memset() is not necessary, even if static analyzer complain about it */
+    return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
+}
+
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                                       unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    bitStream = 0;
+    bitCount  = 0;
+    /* Table Size */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = tableLog+1;
+
+    while (remaining>1) {  /* stops at 1 */
+        if (previous0) {
+            unsigned start = charnum;
+            while (!normalizedCounter[charnum]) charnum++;
+            while (charnum >= start+24) {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (charnum >= start+3) {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (charnum-start) << bitCount;
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[charnum++];
+            int const max = (2*threshold-1)-remaining;
+            remaining -= count < 0 ? -count : count;
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previous0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+}
+
+
+
+/*-**************************************************************
+*  Counting histogram
+****************************************************************/
+/*! FSE_count_simple
+    This function counts byte values within `src`, and store the histogram into table `count`.
+    It doesn't use any additional memory.
+    But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
+    For this reason, prefer using a table `count` with 256 elements.
+    @return : count of most numerous element.
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                        const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+
+    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    { U32 s; for (s=0; s<=maxSymbolValue; s++) if (count[s] > max) max = count[s]; }
+
+    return (size_t)max;
+}
+
+
+/* FSE_count_parallel_wksp() :
+ * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
+ * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`.
+ * @return : largest histogram frequency, or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
+static size_t FSE_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax, unsigned* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, maxSymbolValue + 1);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          unsigned* workSpace)
+{
+    if (sourceSize < 1500) /* heuristic threshold */
+        return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters);
+}
+
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (*maxSymbolValuePtr < 255)
+        return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+    *maxSymbolValuePtr = 255;
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters);
+}
+
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+    FSE_CTable is a variable size structure which contains :
+    `U16 tableLog;`
+    `U16 maxSymbolValue;`
+    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
+    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
+Allocation is manual (C standard does not support variable-size structures).
+*/
+size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t size;
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+
+/* Secondary normalization method.
+   To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = -1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = -1;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            printf("%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+    }   }
+
+    return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*)ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` size must be `(1<<tableLog)`.
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32   count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+    void* scratchBuffer = (void*)(CTable + CTableSize);
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned*)scratchBuffer) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t scratchBuffer;
+    FSE_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/vendor/github.com/DataDog/zstd/fse_decompress.c b/vendor/github.com/DataDog/zstd/fse_decompress.c
new file mode 100644
index 0000000..4c66c3b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/fse_decompress.c
@@ -0,0 +1,309 @@
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; }
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+    /* normal FSE decoding mode */
+    size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(NCountLength)) return NCountLength;
+    //if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+    if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+    ip += NCountLength;
+    cSrcSize -= NCountLength;
+
+    CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+    return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace);   /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/vendor/github.com/DataDog/zstd/huf.h b/vendor/github.com/DataDog/zstd/huf.h
new file mode 100644
index 0000000..b4645b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf.h
@@ -0,0 +1,327 @@
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h>    /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ *        HUF symbols remain "private" (internal symbols for library only).
+ *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+#  define HUF_PUBLIC_API
+#endif
+
+
+/* ========================== */
+/* ***  simple functions  *** */
+/* ========================== */
+
+/** HUF_compress() :
+ *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/** HUF_decompress() :
+ *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ *  into already allocated buffer 'dst', of minimum size 'dstSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ *  Note : in contrast with FSE, HUF_decompress can regenerate
+ *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+ *         because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ *           or an error code, which can be tested using HUF_isError()
+ */
+HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
+
+/* Error Management */
+HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
+
+
+/* ***   Advanced function   *** */
+
+/** HUF_compress2() :
+ *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+ *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
+#define HUF_WORKSPACE_SIZE (6 << 10)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     unsigned maxSymbolValue, unsigned tableLog,
+                                     void* workSpace, size_t wkspSize);
+
+#endif   /* HUF_H_298734234 */
+
+/* ******************************************************************
+ *  WARNING !!
+ *  The following section contains advanced and experimental definitions
+ *  which shall never be used in the context of a dynamic library,
+ *  because they are not guaranteed to remain stable in the future.
+ *  Only consider them in association with static linking.
+ * *****************************************************************/
+#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+#define HUF_H_HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+#include "mem.h"   /* U32 */
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX  255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15  /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue)   ((maxSymbolValue)+1)   /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
+    void* name##hv = &(name##hb); \
+    HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+ *  HUF detailed API
+ * ****************************************/
+
+/*! HUF_compress() does the following:
+ *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ *  2. (optional) refine tableLog using HUF_optimalTableLog()
+ *  3. build Huffman table from count using HUF_buildCTable()
+ *  4. save Huffman table to memory buffer using HUF_writeCTable()
+ *  5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ *  The following API allows targeting specific sub-functions for advanced tasks.
+ *  For example, it's possible to compress several blocks using the same 'CTable',
+ *  or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+
+typedef enum {
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+ *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ *  Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+ *  Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+ *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of sing or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/vendor/github.com/DataDog/zstd/huf_compress.c b/vendor/github.com/DataDog/zstd/huf_compress.c
new file mode 100644
index 0000000..83230b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf_compress.c
@@ -0,0 +1,788 @@
+/* ******************************************************************
+   Huffman encoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#include "compiler.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+
+/* **************************************************************
+*  Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32 maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+    U32 count[HUF_TABLELOG_MAX+1];
+    S16 norm[HUF_TABLELOG_MAX+1];
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize) );
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+
+struct HUF_CElt_s {
+  U16  val;
+  BYTE nbBits;
+};   /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation.
+    @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
+{
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+
+     /* check conditions */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)
+        huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+    /* attempt weights compression by FSE */
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)
+        op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            CTable[n].nbBits = (BYTE)(tableLog + 1 - w);
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        /* determine stating value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+    }
+
+    *maxSymbolValuePtr = nbSymbols - 1;
+    return readSize;
+}
+
+
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    if (largestBits <= maxNbBits) return largestBits;   /* early exit : no elt > maxNbBits */
+
+    /* there are several too large elements (at least >= 2) */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        U32 n = lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }  /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;   /* n end at index of smallest symbol using < maxNbBits */
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost is necessarily a multiple of baseCost */
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+            int pos;
+
+            /* Get pos of last (smallest) symbol per rank */
+            memset(rankLast, 0xF0, sizeof(rankLast));
+            {   U32 currentNbBits = maxNbBits;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
+                    rankLast[maxNbBits-currentNbBits] = pos;
+            }   }
+
+            while (totalCost > 0) {
+                U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 highPos = rankLast[nBitsToDecrease];
+                    U32 lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+                /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+                while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+                    nBitsToDecrease ++;
+                totalCost -= 1 << (nBitsToDecrease-1);
+                if (rankLast[nBitsToDecrease-1] == noSymbol)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];   /* this rank is no longer empty */
+                huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+                if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
+                    rankLast[nBitsToDecrease] = noSymbol;
+                else {
+                    rankLast[nBitsToDecrease]--;
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                        rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
+            }   }   /* while (totalCost > 0) */
+
+            while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
+                if (rankLast[1] == noSymbol) {  /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    huffNode[n+1].nbBits--;
+                    rankLast[1] = n+1;
+                    totalCost++;
+                    continue;
+                }
+                huffNode[ rankLast[1] + 1 ].nbBits--;
+                rankLast[1]++;
+                totalCost ++;
+    }   }   }   /* there are several too large elements (at least >= 2) */
+
+    return maxNbBits;
+}
+
+
+typedef struct {
+    U32 base;
+    U32 current;
+} rankPos;
+
+static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
+{
+    rankPos rank[32];
+    U32 n;
+
+    memset(rank, 0, sizeof(rank));
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 r = BIT_highbit32(count[n] + 1);
+        rank[r].base ++;
+    }
+    for (n=30; n>0; n--) rank[n-1].base += rank[n].base;
+    for (n=0; n<32; n++) rank[n].current = rank[n].base;
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 const c = count[n];
+        U32 const r = BIT_highbit32(c+1) + 1;
+        U32 pos = rank[r].current++;
+        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) {
+            huffNode[pos] = huffNode[pos-1];
+            pos--;
+        }
+        huffNode[pos].count = c;
+        huffNode[pos].byte  = (BYTE)n;
+    }
+}
+
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned.
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+    nodeElt* const huffNode0 = (nodeElt*)workSpace;
+    nodeElt* const huffNode = huffNode0+1;
+    U32 n, nonNullRank;
+    int lowS, lowN;
+    U16 nodeNb = STARTNODE;
+    U32 nodeRoot;
+
+    /* safety checks */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue);
+
+    /* init for parents */
+    nonNullRank = maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+    huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    /* enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+    /* fill result into tree (val, nbBits) */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+        U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+        if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        /* determine stating value per rank */
+        {   U16 min = 0;
+            for (n=maxNbBits; n>0; n--) {
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   /* assign value within rank, symbol order */
+    }
+
+    return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ * @return : maxNbBits
+ *  Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
+{
+    huffNodeTable nodeTable;
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable));
+}
+
+static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += CTable[s].nbBits * count[s];
+    }
+    return nbBits >> 3;
+}
+
+static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+  int bad = 0;
+  int s;
+  for (s = 0; s <= (int)maxSymbolValue; ++s) {
+    bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+  }
+  return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+#define HUF_FLUSHBITS(s)  BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+    size_t n;
+    BIT_CStream_t bitC;
+
+    /* init */
+    if (dstSize < 8) return 0;   /* not enough space to compress */
+    { size_t const initErr = BIT_initCStream(&bitC, op, oend-op);
+      if (HUF_isError(initErr)) return 0; }
+
+    n = srcSize & ~3;  /* join to mod 4 */
+    switch (srcSize & 3)
+    {
+        case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+                 HUF_FLUSHBITS_2(&bitC);
+		 /* fall-through */
+        case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+                 HUF_FLUSHBITS_1(&bitC);
+		 /* fall-through */
+        case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+                 HUF_FLUSHBITS(&bitC);
+		 /* fall-through */
+        case 0 : /* fall-through */
+        default: break;
+    }
+
+    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
+        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+        HUF_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+        HUF_FLUSHBITS(&bitC);
+    }
+
+    return BIT_closeCStream(&bitC);
+}
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+                                      const void* src, size_t srcSize,
+                                      const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    if (bmi2) {
+        return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+    }
+    return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    (void)bmi2;
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, int bmi2)
+{
+    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    if (dstSize < 6 + 1 + 1 + 1 + 8) return 0;   /* minimum space to compress successfully */
+    if (srcSize < 12) return 0;   /* no saving possible : too small input */
+    op += 6;   /* jumpTable */
+
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+2, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
+        if (cSize==0) return 0;
+        assert(cSize <= 65535);
+        MEM_writeLE16(ostart+4, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) );
+        if (cSize==0) return 0;
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t HUF_compressCTable_internal(
+                BYTE* const ostart, BYTE* op, BYTE* const oend,
+                const void* src, size_t srcSize,
+                unsigned singleStream, const HUF_CElt* CTable, const int bmi2)
+{
+    size_t const cSize = singleStream ?
+                         HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) :
+                         HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2);
+    if (HUF_isError(cSize)) { return cSize; }
+    if (cSize==0) { return 0; }   /* uncompressible */
+    op += cSize;
+    /* check compressibility */
+    if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
+    return op-ostart;
+}
+
+typedef struct {
+    U32 count[HUF_SYMBOLVALUE_MAX + 1];
+    HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+    huffNodeTable nodeTable;
+} HUF_compress_tables_t;
+
+/* HUF_compress_internal() :
+ * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+static size_t HUF_compress_internal (
+                void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog,
+                unsigned singleStream,
+                void* workSpace, size_t wkspSize,
+                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+                const int bmi2)
+{
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    /* checks & inits */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
+    if (!srcSize) return 0;  /* Uncompressed */
+    if (!dstSize) return 0;  /* cannot fit anything within dst budget */
+    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
+    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+    /* Heuristic : If old table is valid, use it for small inputs */
+    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
+        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
+        if (largest <= (srcSize >> 7)+1) return 0;   /* heuristic : probably not compressible enough */
+    }
+
+    /* Check validity of previous table */
+    if ( repeat
+      && *repeat == HUF_repeat_check
+      && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+        *repeat = HUF_repeat_none;
+    }
+    /* Heuristic : use existing table for small inputs */
+    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
+    }
+
+    /* Build Huffman Tree */
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp(table->CTable, table->count,
+                                                maxSymbolValue, huffLog,
+                                                table->nodeTable, sizeof(table->nodeTable)) );
+        huffLog = (U32)maxBits;
+        /* Zero unused symbols in CTable, so we can check it for validity */
+        memset(table->CTable + (maxSymbolValue + 1), 0,
+               sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+    }
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+        /* Check if using previous huffman table is beneficial */
+        if (repeat && *repeat != HUF_repeat_none) {
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+                return HUF_compressCTable_internal(ostart, op, oend,
+                                                   src, srcSize,
+                                                   singleStream, oldHufTable, bmi2);
+        }   }
+
+        /* Use the new huffman table */
+        if (hSize + 12ul >= srcSize) { return 0; }
+        op += hSize;
+        if (repeat) { *repeat = HUF_repeat_none; }
+        if (oldHufTable)
+            memcpy(oldHufTable, table->CTable, sizeof(table->CTable));  /* Save new table */
+    }
+    return HUF_compressCTable_internal(ostart, op, oend,
+                                       src, srcSize,
+                                       singleStream, table->CTable, bmi2);
+}
+
+
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize, hufTable,
+                                 repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                 unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /*4 streams*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * re-use an existing huffman compression table */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /* 4 streams */,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
+}
diff --git a/vendor/github.com/DataDog/zstd/huf_decompress.c b/vendor/github.com/DataDog/zstd/huf_decompress.c
new file mode 100644
index 0000000..73f5c46
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/huf_decompress.c
@@ -0,0 +1,1096 @@
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"  /* BIT_* */
+#include "compiler.h"
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "error_private.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
+
+
+/* **************************************************************
+*  Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+    DTableDesc dtd;
+    memcpy(&dtd, table, sizeof(dtd));
+    return dtd;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+{
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+
+    U32* rankVal;
+    BYTE* huffWeight;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+    huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+    HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* Table header */
+    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Calculate starting value for each rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 const current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill DTable */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;
+            U32 u;
+            HUF_DEltX2 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (u = rankVal[w]; u < rankVal[w] + length; u++)
+                dt[u] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_readDTableX2_wksp(DTable, src, srcSize,
+                                 workSpace, sizeof(workSpace));
+}
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BIT_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)  \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* [0-3] symbols remaining */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const void* dtPtr = DTable + 1;
+    const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+    BIT_DStream_t bitD;
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal = BIT_DStream_unfinished;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+            BIT_reloadDStream(&bitD1);
+            BIT_reloadDStream(&bitD2);
+            BIT_reloadDStream(&bitD3);
+            BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        /* note : should not be necessary : op# advance in lock step, and we control op4.
+         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+    }   }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BIT_DStream_t bitD;
+
+    /* Init */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+                                               const void *cSrc,
+                                               size_t cSrcSize,
+                                               const HUF_DTable *DTable);
+#if DYNAMIC_BMI2
+
+#define X(fn)                                                               \
+                                                                            \
+    static size_t fn##_default(                                             \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2(                       \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        if (bmi2) {                                                         \
+            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
+        }                                                                   \
+        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
+    }
+
+#else
+
+#define X(fn)                                                               \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        (void)bmi2;                                                         \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }
+
+#endif
+
+X(HUF_decompress1X2_usingDTable_internal)
+X(HUF_decompress4X2_usingDTable_internal)
+X(HUF_decompress1X4_usingDTable_internal)
+X(HUF_decompress4X4_usingDTable_internal)
+
+#undef X
+
+
+size_t HUF_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
+                                                workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+}
+
+
+size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/* HUF_fillDTableX4Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    {   U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+            const U32 symbol = sortedSymbols[s].symbol;
+            const U32 weight = sortedSymbols[s].weight;
+            const U32 nbBits = nbBitsBaseline - weight;
+            const U32 length = 1 << (sizeLog-nbBits);
+            const U32 start = rankVal[weight];
+            U32 i = start;
+            const U32 end = start + length;
+
+            MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+            DElt.nbBits = (BYTE)(nbBits + consumed);
+            DElt.length = 2;
+            do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+            rankVal[weight] += length;
+    }   }
+}
+
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUF_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 const end = start + length;
+                U32 u;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
+                             size_t srcSize, void* workSpace,
+                             size_t wkspSize)
+{
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr;
+    U32 *rankStart;
+
+    rankValCol_t* rankVal;
+    U32* rankStats;
+    U32* rankStart0;
+    sortedSymbol_t* sortedSymbol;
+    BYTE* weightList;
+    size_t spaceUsed32 = 0;
+
+    rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+    rankStats = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 1;
+    rankStart0 = (U32 *)workSpace + spaceUsed32;
+    spaceUsed32 += HUF_TABLELOG_MAX + 2;
+    sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+    spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+    weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+    rankStart = rankStart0 + 1;
+    memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    /* memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUF_fillDTableX4(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+  U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+  return HUF_readDTableX4_wksp(DTable, src, srcSize,
+                               workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize,
+                                               workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize,
+                                         workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    assert(dstSize > 0);
+    assert(dstSize <= 128 KB);
+    /* decoder timing evaluation */
+    {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
+        U32 const D256 = (U32)(dstSize >> 8);
+        U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+        U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+        DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, to reduce cache eviction */
+        return DTime1 < DTime0;
+}   }
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[2] = { HUF_decompress4X2, HUF_decompress4X4 };
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+}
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                         workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+                                     size_t dstSize, const void* cSrc,
+                                     size_t cSrcSize, void* workSpace,
+                                     size_t wkspSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize):
+                        HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+    }
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                  const void* cSrc, size_t cSrcSize,
+                                  void* workSpace, size_t wkspSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize):
+                        HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                                cSrcSize, workSpace, wkspSize);
+    }
+}
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+                             const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+                                      workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+                        HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    }
+}
diff --git a/vendor/github.com/DataDog/zstd/mem.h b/vendor/github.com/DataDog/zstd/mem.h
new file mode 100644
index 0000000..47d2300
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/mem.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+/* code only tested on 32 and 64 bits systems */
+#define MEM_STATIC_ASSERT(c)   { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef   uint8_t BYTE;
+  typedef  uint16_t U16;
+  typedef   int16_t S16;
+  typedef  uint32_t U32;
+  typedef   int32_t S32;
+  typedef  uint64_t U64;
+  typedef   int64_t S64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) || defined(__GNUC__)
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+    __pragma( pack(push, 1) )
+    typedef struct { U16 v; } unalign16;
+    typedef struct { U32 v; } unalign32;
+    typedef struct { U64 v; } unalign64;
+    typedef struct { size_t v; } unalignArch;
+    __pragma( pack(pop) )
+#else
+    typedef struct { U16 v; } __attribute__((packed)) unalign16;
+    typedef struct { U32 v; } __attribute__((packed)) unalign32;
+    typedef struct { U64 v; } __attribute__((packed)) unalign64;
+    typedef struct { size_t v; } __attribute__((packed)) unalignArch;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+    return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
diff --git a/vendor/github.com/DataDog/zstd/pool.c b/vendor/github.com/DataDog/zstd/pool.c
new file mode 100644
index 0000000..773488b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/pool.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include <stddef.h>  /* size_t */
+#include "pool.h"
+#include "zstd_internal.h"  /* ZSTD_malloc, ZSTD_free */
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "threading.h"   /* pthread adaptation */
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+    POOL_function function;
+    void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+    ZSTD_customMem customMem;
+    /* Keep track of the threads */
+    ZSTD_pthread_t *threads;
+    size_t numThreads;
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;
+    size_t queueTail;
+    size_t queueSize;
+
+    /* The number of threads working on jobs */
+    size_t numThreadsBusy;
+    /* Indicates if the queue is empty */
+    int queueEmpty;
+
+    /* The mutex protects the queue */
+    ZSTD_pthread_mutex_t queueMutex;
+    /* Condition variable for pushers to wait on when the queue is full */
+    ZSTD_pthread_cond_t queuePushCond;
+    /* Condition variables for poppers to wait on when the queue is empty */
+    ZSTD_pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+   Work thread for the thread pool.
+   Waits for jobs and executes them.
+   @returns : NULL on failure else non-null.
+*/
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+        while (ctx->queueEmpty && !ctx->shutdown) {
+            ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* empty => shutting down: so stop */
+        if (ctx->queueEmpty) {
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+            return opaque;
+        }
+        /* Pop a job off the queue */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            ctx->numThreadsBusy++;
+            ctx->queueEmpty = ctx->queueHead == ctx->queueTail;
+            /* Unlock the mutex, signal a pusher, and run the job */
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+
+            job.function(job.opaque);
+
+            /* If the intended queue size was 0, signal after finishing job */
+            if (ctx->queueSize == 1) {
+                ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+                ctx->numThreadsBusy--;
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+        }   }
+    }  /* for (;;) */
+    /* Unreachable */
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+    POOL_ctx* ctx;
+    /* Check the parameters */
+    if (!numThreads) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
+    if (!ctx) { return NULL; }
+    /* Initialize the job queue.
+     * It needs one extra space since one space is wasted to differentiate empty
+     * and full queues.
+     */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    ctx->numThreadsBusy = 0;
+    ctx->queueEmpty = 1;
+    (void)ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+    (void)ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+    (void)ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->numThreads = 0;
+    ctx->customMem = customMem;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->numThreads = i;
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->numThreads = numThreads;
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+    /* Shut down the queue */
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads */
+    ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->numThreads; ++i) {
+            ZSTD_pthread_join(ctx->threads[i], NULL);
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+    if (!ctx) { return; }
+    POOL_join(ctx);
+    ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+    ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+    ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+    ZSTD_free(ctx->queue, ctx->customMem);
+    ZSTD_free(ctx->threads, ctx->customMem);
+    ZSTD_free(ctx, ctx->customMem);
+}
+
+size_t POOL_sizeof(POOL_ctx *ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    return sizeof(*ctx)
+        + ctx->queueSize * sizeof(POOL_job)
+        + ctx->numThreads * sizeof(ZSTD_pthread_t);
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * If the queueSize is 1 (the pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free and no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+    if (ctx->queueSize > 1) {
+        return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
+    } else {
+        return ctx->numThreadsBusy == ctx->numThreads ||
+               !ctx->queueEmpty;
+    }
+}
+
+
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+    POOL_job const job = {function, opaque};
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;
+
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+    ZSTD_pthread_cond_signal(&ctx->queuePopCond);
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (isQueueFull(ctx)) {
+        ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        return 0;
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return 1;
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+    int dummy;
+};
+static POOL_ctx g_ctx;
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+    (void)numThreads;
+    (void)queueSize;
+    (void)customMem;
+    return &g_ctx;
+}
+
+void POOL_free(POOL_ctx* ctx) {
+    assert(!ctx || ctx == &g_ctx);
+    (void)ctx;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+    return 1;
+}
+
+size_t POOL_sizeof(POOL_ctx* ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    assert(ctx == &g_ctx);
+    return sizeof(*ctx);
+}
+
+#endif  /* ZSTD_MULTITHREAD */
diff --git a/vendor/github.com/DataDog/zstd/pool.h b/vendor/github.com/DataDog/zstd/pool.h
new file mode 100644
index 0000000..a57e9b4
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/pool.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+#include "zstd.h"
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ *  Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ *  The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem);
+
+/*! POOL_free() :
+    Free a thread pool returned by POOL_create().
+*/
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_sizeof() :
+    return memory usage of pool returned by POOL_create().
+*/
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+    The function type that can be added to a thread pool.
+*/
+typedef void (*POOL_function)(void*);
+/*! POOL_add_function :
+    The function type for a generic thread pool add function.
+*/
+typedef void (*POOL_add_function)(void*, POOL_function, void*);
+
+/*! POOL_add() :
+    Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+    Possibly blocks until there is room in the queue.
+    Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed.
+*/
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+    Add the job `function(opaque)` to the thread pool if a worker is available.
+    return immediately otherwise.
+   @return : 1 if successful, 0 if not.
+*/
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
diff --git a/vendor/github.com/DataDog/zstd/threading.c b/vendor/github.com/DataDog/zstd/threading.c
new file mode 100644
index 0000000..8be8c8d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/threading.c
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+/**
+ * This file will hold wrapper for systems, which do not support pthreads
+ */
+
+/* create fake symbol to avoid empty trnaslation unit warning */
+int g_ZSTD_threading_useles_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+#include "threading.h"
+
+
+/* ===  Implementation  === */
+
+static unsigned __stdcall worker(void *arg)
+{
+    ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg;
+    thread->arg = thread->start_routine(thread->arg);
+    return 0;
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{
+    (void)unused;
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL);
+
+    if (!thread->handle)
+        return errno;
+    else
+        return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr)
+{
+    DWORD result;
+
+    if (!thread.handle) return 0;
+
+    result = WaitForSingleObject(thread.handle, INFINITE);
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (value_ptr) *value_ptr = thread.arg;
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD */
diff --git a/vendor/github.com/DataDog/zstd/threading.h b/vendor/github.com/DataDog/zstd/threading.h
new file mode 100644
index 0000000..d806c89
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/threading.h
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+    HANDLE handle;
+    void* (*start_routine)(void*);
+    void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b)         pthread_join((a),(b))
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))
+
+/* do not use ZSTD_pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
diff --git a/vendor/github.com/DataDog/zstd/travis_test_32.sh b/vendor/github.com/DataDog/zstd/travis_test_32.sh
new file mode 100755
index 0000000..d29c86c
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/travis_test_32.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Get utilities
+yum -y -q -e 0 install wget tar unzip gcc
+
+# Get Go
+wget -q https://dl.google.com/go/go1.11.1.linux-386.tar.gz
+tar -C /usr/local -xzf go1.11.1.linux-386.tar.gz
+export PATH=$PATH:/usr/local/go/bin
+
+# Get payload
+wget -q https://github.com/DataDog/zstd/files/2246767/mr.zip
+unzip mr.zip
+
+# Build and run tests
+cd zstd
+go build
+PAYLOAD=$(pwd)/mr go test -v
+PAYLOAD=$(pwd)/mr go test -bench .
diff --git a/vendor/github.com/DataDog/zstd/update.txt b/vendor/github.com/DataDog/zstd/update.txt
new file mode 100644
index 0000000..1de939f
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/update.txt
@@ -0,0 +1,56 @@
+./lib/common/bitstream.h
+./lib/common/compiler.h
+./lib/compress/zstd_compress_internal.h
+./lib/compress/zstd_fast.h
+./lib/compress/zstd_double_fast.h
+./lib/compress/zstd_lazy.h
+./lib/compress/zstd_ldm.h
+./lib/dictBuilder/cover.c
+./lib/dictBuilder/divsufsort.c
+./lib/dictBuilder/divsufsort.h
+./lib/common/entropy_common.c
+./lib/common/error_private.c
+./lib/common/error_private.h
+./lib/compress/fse_compress.c
+./lib/common/fse_decompress.c
+./lib/common/fse.h
+./lib/compress/huf_compress.c
+./lib/decompress/huf_decompress.c
+./lib/common/huf.h
+./lib/common/mem.h
+./lib/common/pool.c
+./lib/common/pool.h
+./lib/common/threading.c
+./lib/common/threading.h
+./lib/common/xxhash.c
+./lib/common/xxhash.h
+./lib/deprecated/zbuff_common.c
+./lib/deprecated/zbuff_compress.c
+./lib/deprecated/zbuff_decompress.c
+./lib/deprecated/zbuff.h
+./lib/dictBuilder/zdict.c
+./lib/dictBuilder/zdict.h
+./lib/common/zstd_common.c
+./lib/compress/zstd_compress.c
+./lib/decompress/zstd_decompress.c
+./lib/common/zstd_errors.h
+./lib/zstd.h
+./lib/common/zstd_internal.h
+./lib/legacy/zstd_legacy.h
+./lib/compress/zstd_opt.c
+./lib/compress/zstd_opt.h
+./lib/legacy/zstd_v01.c
+./lib/legacy/zstd_v01.h
+./lib/legacy/zstd_v02.c
+./lib/legacy/zstd_v02.h
+./lib/legacy/zstd_v03.c
+./lib/legacy/zstd_v03.h
+./lib/legacy/zstd_v04.c
+./lib/legacy/zstd_v04.h
+./lib/legacy/zstd_v05.c
+./lib/legacy/zstd_v05.h
+./lib/legacy/zstd_v06.c
+./lib/legacy/zstd_v06.h
+./lib/legacy/zstd_v07.c
+./lib/legacy/zstd_v07.h
+
diff --git a/vendor/github.com/DataDog/zstd/xxhash.c b/vendor/github.com/DataDog/zstd/xxhash.c
new file mode 100644
index 0000000..9d9c0e9
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/xxhash.c
@@ -0,0 +1,875 @@
+/*
+*  xxHash - Fast Hash algorithm
+*  Copyright (C) 2012-2016, Yann Collet
+*
+*  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions are
+*  met:
+*
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following disclaimer
+*  in the documentation and/or other materials provided with the
+*  distribution.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*  You can contact the author at :
+*  - xxHash homepage: http://www.xxhash.com
+*  - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment below define :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
+#  define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY
+#endif
+#include "xxhash.h"
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+    typedef  int32_t S32;
+    typedef uint64_t U64;
+#  else
+    typedef unsigned char      BYTE;
+    typedef unsigned short     U16;
+    typedef unsigned int       U32;
+    typedef   signed int       S32;
+    typedef unsigned long long U64;   /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+#  endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int g_one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/* *************************************
+*  Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 =  668265263U;
+static const U32 PRIME32_5 =  374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 =  1609587929392839161ULL;
+static const U64 PRIME64_4 =  9650029242287828579ULL;
+static const U64 PRIME64_5 =  2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+*  Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+*  Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+    seed += input * PRIME32_2;
+    seed  = XXH_rotl32(seed, 13);
+    seed *= PRIME32_1;
+    return seed;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_CREATESTATE_STATIC(state);
+    XXH32_reset(state, seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+    acc += input * PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= PRIME64_1;
+    return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * PRIME64_1 + PRIME64_4;
+    return acc;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_CREATESTATE_STATIC(state);
+    XXH64_reset(state, seed);
+    XXH64_update(state, input, len);
+    return XXH64_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+*  Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-4);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME32_1 + PRIME32_2;
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME32_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-8);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME64_1 + PRIME64_2;
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME64_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len_32 += (unsigned)len;
+    state->large_len |= (len>=16) | (state->total_len_32>=16);
+
+    if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (unsigned)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* some data left from previous update */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {   const U32* p32 = state->mem32;
+            state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+            state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+            state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do {
+            v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+            v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+            v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+            v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32) {  /* fill in tmp buffer */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* tmp buffer is full */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do {
+            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32) {
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/* **************************
+*  Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
diff --git a/vendor/github.com/DataDog/zstd/xxhash.h b/vendor/github.com/DataDog/zstd/xxhash.h
new file mode 100644
index 0000000..9bad1f5
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/xxhash.h
@@ -0,0 +1,305 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MumurHash 3a    2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+*  API modifier
+******************************/
+/** XXH_PRIVATE_API
+*   This is useful if you want to include xxhash functions in `static` mode
+*   in order to inline them, and remove their symbol from the public list.
+*   Methodology :
+*     #define XXH_PRIVATE_API
+*     #include "xxhash.h"
+*   `xxhash.c` is automatically included.
+*   It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+#    define XXH_PUBLIC_API static   /* this version may generate warnings for unused static functions; disable the relevant warning */
+#  endif
+#else
+#  define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    6
+#define XXH_VERSION_RELEASE  2
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Simple Hash Functions
+******************************/
+typedef unsigned int       XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+*/
+
+
+/* ****************************
+*  Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than single-call functions, due to state management.
+For small input, prefer `XXH32()` and `XXH64()` .
+
+XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, input must be allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced anytime, by using XXH*_digest().
+This function returns the nn-bits hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and generate some new hashes later on, by calling again XXH*_digest().
+
+When done, free XXH state space if it was allocated dynamically.
+*/
+
+
+/* **************************
+*  Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* ! C99 */
+#  define restrict   /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+*  Canonical representation
+****************************/
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+*  The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+*  These functions allow transformation of hash result into and from its canonical format.
+*  This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+   This section contains definitions which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   They shall only be used with static linking.
+   Never use these definitions in association with dynamic linking !
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+   statically, on stack, or in a struct for example.
+   Do not use members directly. */
+
+   struct XXH32_state_s {
+       unsigned total_len_32;
+       unsigned large_len;
+       unsigned v1;
+       unsigned v2;
+       unsigned v3;
+       unsigned v4;
+       unsigned mem32[4];   /* buffer defined as U32 for alignment */
+       unsigned memsize;
+       unsigned reserved;   /* never read nor write, will be removed in a future version */
+   };   /* typedef'd to XXH32_state_t */
+
+   struct XXH64_state_s {
+       unsigned long long total_len;
+       unsigned long long v1;
+       unsigned long long v2;
+       unsigned long long v3;
+       unsigned long long v4;
+       unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+       unsigned memsize;
+       unsigned reserved[2];          /* never read nor write, will be removed in a future version */
+   };   /* typedef'd to XXH64_state_t */
+
+
+#  ifdef XXH_PRIVATE_API
+#    include "xxhash.c"   /* include xxhash functions as `static`, for inlining */
+#  endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/vendor/github.com/DataDog/zstd/zbuff.h b/vendor/github.com/DataDog/zstd/zbuff.h
new file mode 100644
index 0000000..a93115d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* ***************************************************************
+*  NOTES/WARNINGS
+******************************************************************/
+/* The streaming API defined here is deprecated.
+ * Consider migrating towards ZSTD_compressStream() API in `zstd.h`
+ * See 'lib/README.md'.
+ *****************************************************************/
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_BUFFERED_H_23987
+#define ZSTD_BUFFERED_H_23987
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include <stddef.h>      /* size_t */
+#include "zstd.h"        /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
+
+
+/* ***************************************************************
+*  Compiler specifics
+*****************************************************************/
+/* Deprecation warnings */
+/* Should these warnings be a problem,
+   it is generally possible to disable them,
+   typically with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
+#  define ZBUFF_DEPRECATED(message) ZSTDLIB_API  /* disable deprecation warnings */
+#else
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
+#  elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ >= 3)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler")
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API
+#  endif
+#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+/* This is the easier "buffered" streaming API,
+*  using an internal buffer to lift all restrictions on user-provided buffers
+*  which can be any size, any place, for both input and output.
+*  ZBUFF and ZSTD are 100% interoperable,
+*  frames created by one can be decoded by the other one */
+
+typedef ZSTD_CStream ZBUFF_CCtx;
+ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeCStream")   size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initCStream")           size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
+ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+
+ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr);
+ZBUFF_DEPRECATED("use ZSTD_flushStream")    size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+ZBUFF_DEPRECATED("use ZSTD_endStream")      size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+
+/*-*************************************************
+*  Streaming compression - howto
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Start by initializing ZBUF_CCtx.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume input stream.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush().
+*  The nb of bytes written into `dst` will be reported into *dstCapacityPtr.
+*  Note that the function cannot output more than *dstCapacityPtr,
+*  therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  In which case, call again ZBUFF_compressFlush() to complete the flush.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize()
+*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency)
+*  output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
+*  By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
+* **************************************************/
+
+
+typedef ZSTD_DStream ZBUFF_DCtx;
+ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeDStream")   size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initDStream")           size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFF_DCtx object is required to track streaming operations.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            1 when there is still some data left within internal buffer to flush,
+*            >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency),
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize()
+*  output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFF_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZBUFF_DEPRECATED("use ZSTD_isError")      unsigned ZBUFF_isError(size_t errorCode);
+ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZBUFF_DEPRECATED("use ZSTD_CStreamInSize")  size_t ZBUFF_recommendedCInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamInSize")  size_t ZBUFF_recommendedDInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void);
+
+#endif  /* ZSTD_BUFFERED_H_23987 */
+
+
+#ifdef ZBUFF_STATIC_LINKING_ONLY
+#ifndef ZBUFF_STATIC_H_30298098432
+#define ZBUFF_STATIC_H_30298098432
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used in association with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Dependency ---*/
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters, ZSTD_customMem */
+#include "zstd.h"
+
+
+/*--- Custom memory allocator ---*/
+/*! ZBUFF_createCCtx_advanced() :
+ *  Create a ZBUFF compression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZBUFF_createDCtx_advanced() :
+ *  Create a ZBUFF decompression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem);
+
+
+/*--- Advanced Streaming Initialization ---*/
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                               const void* dict, size_t dictSize,
+                                               ZSTD_parameters params, unsigned long long pledgedSrcSize);
+
+
+#endif    /* ZBUFF_STATIC_H_30298098432 */
+#endif    /* ZBUFF_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/vendor/github.com/DataDog/zstd/zbuff_common.c b/vendor/github.com/DataDog/zstd/zbuff_common.c
new file mode 100644
index 0000000..661b9b0
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_common.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "error_private.h"
+#include "zbuff.h"
+
+/*-****************************************
+*  ZBUFF Error Management  (deprecated)
+******************************************/
+
+/*! ZBUFF_isError() :
+*   tells if a return value is an error code */
+unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); }
+/*! ZBUFF_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
diff --git a/vendor/github.com/DataDog/zstd/zbuff_compress.c b/vendor/github.com/DataDog/zstd/zbuff_compress.c
new file mode 100644
index 0000000..f39c60d
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_compress.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+/*-***********************************************************
+*  Streaming compression
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
+*  Note that it will not output more than *dstCapacityPtr.
+*  Therefore, some content might still be left into its internal buffer if dst buffer is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value.
+*  output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed.
+* ***********************************************************/
+
+ZBUFF_CCtx* ZBUFF_createCCtx(void)
+{
+    return ZSTD_createCStream();
+}
+
+ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createCStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc)
+{
+    return ZSTD_freeCStream(zbc);
+}
+
+
+/* ======   Initialization   ====== */
+
+size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                   const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* preserve "0 == unknown" behavior */
+    return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, pledgedSrcSize);
+}
+
+
+size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel);
+}
+
+size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel)
+{
+    return ZSTD_initCStream(zbc, compressionLevel);
+}
+
+/* ======   Compression   ====== */
+
+
+size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
+                              void* dst, size_t* dstCapacityPtr,
+                        const void* src, size_t* srcSizePtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src = src;
+    inBuff.pos = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_compressStream(zbc, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+
+/* ======   Finalize   ====== */
+
+size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_flushStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_endStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedCInSize(void)  { return ZSTD_CStreamInSize(); }
+size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); }
diff --git a/vendor/github.com/DataDog/zstd/zbuff_decompress.c b/vendor/github.com/DataDog/zstd/zbuff_decompress.c
new file mode 100644
index 0000000..923c22b
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zbuff_decompress.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "zbuff.h"
+
+
+ZBUFF_DCtx* ZBUFF_createDCtx(void)
+{
+    return ZSTD_createDStream();
+}
+
+ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_freeDStream(zbd);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    return ZSTD_initDStream_usingDict(zbd, dict, dictSize);
+}
+
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_initDStream(zbd);
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    size_t result;
+    outBuff.dst  = dst;
+    outBuff.pos  = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src  = src;
+    inBuff.pos  = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_decompressStream(zbd, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedDInSize(void)  { return ZSTD_DStreamInSize(); }
+size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); }
diff --git a/vendor/github.com/DataDog/zstd/zdict.c b/vendor/github.com/DataDog/zstd/zdict.c
new file mode 100644
index 0000000..7d24e49
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zdict.c
@@ -0,0 +1,1108 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/*-**************************************
+*  Tuning parameters
+****************************************/
+#define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
+#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
+
+
+/*-**************************************
+*  Compiler Options
+****************************************/
+/* Unix Large Files support (>4GB) */
+#define _FILE_OFFSET_BITS 64
+#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
+#  define _LARGEFILE_SOURCE
+#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
+#  define _LARGEFILE64_SOURCE
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>        /* malloc, free */
+#include <string.h>        /* memset */
+#include <stdio.h>         /* fprintf, fopen, ftello64 */
+#include <time.h>          /* clock */
+
+#include "mem.h"           /* read */
+#include "fse.h"           /* FSE_normalizeCount, FSE_writeNCount */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"           /* HUF_buildCTable, HUF_writeCTable */
+#include "zstd_internal.h" /* includes zstd.h */
+#include "xxhash.h"        /* XXH64 */
+#include "divsufsort.h"
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define DICTLISTSIZE_DEFAULT 10000
+
+#define NOISELENGTH 32
+
+static const int g_compressionLevel_default = 3;
+static const U32 g_selectivity_default = 9;
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
+#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+
+static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
+
+static void ZDICT_printHex(const void* ptr, size_t length)
+{
+    const BYTE* const b = (const BYTE*)ptr;
+    size_t u;
+    for (u=0; u<length; u++) {
+        BYTE c = b[u];
+        if (c<32 || c>126) c = '.';   /* non-printable char */
+        DISPLAY("%c", c);
+    }
+}
+
+
+/*-********************************************************
+*  Helper functions
+**********************************************************/
+unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
+
+const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
+
+unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
+{
+    if (dictSize < 8) return 0;
+    if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
+    return MEM_readLE32((const char*)dictBuffer + 4);
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static unsigned ZDICT_NbCommonBytes (size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
+
+
+/*! ZDICT_count() :
+    Count the nb of common bytes between 2 pointers.
+    Note : this function presumes end of buffer followed by noisy guard band.
+*/
+static size_t ZDICT_count(const void* pIn, const void* pMatch)
+{
+    const char* const pStart = (const char*)pIn;
+    for (;;) {
+        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+        if (!diff) {
+            pIn = (const char*)pIn+sizeof(size_t);
+            pMatch = (const char*)pMatch+sizeof(size_t);
+            continue;
+        }
+        pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
+        return (size_t)((const char*)pIn - pStart);
+    }
+}
+
+
+typedef struct {
+    U32 pos;
+    U32 length;
+    U32 savings;
+} dictItem;
+
+static void ZDICT_initDictItem(dictItem* d)
+{
+    d->pos = 1;
+    d->length = 0;
+    d->savings = (U32)(-1);
+}
+
+
+#define LLIMIT 64          /* heuristic determined experimentally */
+#define MINMATCHLENGTH 7   /* heuristic determined experimentally */
+static dictItem ZDICT_analyzePos(
+                       BYTE* doneMarks,
+                       const int* suffix, U32 start,
+                       const void* buffer, U32 minRatio, U32 notificationLevel)
+{
+    U32 lengthList[LLIMIT] = {0};
+    U32 cumulLength[LLIMIT] = {0};
+    U32 savings[LLIMIT] = {0};
+    const BYTE* b = (const BYTE*)buffer;
+    size_t maxLength = LLIMIT;
+    size_t pos = suffix[start];
+    U32 end = start;
+    dictItem solution;
+
+    /* init */
+    memset(&solution, 0, sizeof(solution));
+    doneMarks[pos] = 1;
+
+    /* trivial repetition cases */
+    if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
+       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
+       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
+        /* skip and mark segment */
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
+            doneMarks[pos+u] = 1;
+        return solution;
+    }
+
+    /* look forward */
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);
+    }
+
+    /* look backward */
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }
+
+    /* exit if not found a minimum nb of repetitions */
+    if (end-start < minRatio) {
+        U32 idx;
+        for(idx=start; idx<end; idx++)
+            doneMarks[suffix[idx]] = 1;
+        return solution;
+    }
+
+    {   int i;
+        U32 searchLength;
+        U32 refinedStart = start;
+        U32 refinedEnd = end;
+
+        DISPLAYLEVEL(4, "\n");
+        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u  ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
+        DISPLAYLEVEL(4, "\n");
+
+        for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
+            BYTE currentChar = 0;
+            U32 currentCount = 0;
+            U32 currentID = refinedStart;
+            U32 id;
+            U32 selectedCount = 0;
+            U32 selectedID = currentID;
+            for (id =refinedStart; id < refinedEnd; id++) {
+                if (b[suffix[id] + searchLength] != currentChar) {
+                    if (currentCount > selectedCount) {
+                        selectedCount = currentCount;
+                        selectedID = currentID;
+                    }
+                    currentID = id;
+                    currentChar = b[ suffix[id] + searchLength];
+                    currentCount = 0;
+                }
+                currentCount ++;
+            }
+            if (currentCount > selectedCount) {  /* for last */
+                selectedCount = currentCount;
+                selectedID = currentID;
+            }
+
+            if (selectedCount < minRatio)
+                break;
+            refinedStart = selectedID;
+            refinedEnd = refinedStart + selectedCount;
+        }
+
+        /* evaluate gain based on new ref */
+        start = refinedStart;
+        pos = suffix[refinedStart];
+        end = start;
+        memset(lengthList, 0, sizeof(lengthList));
+
+        /* look forward */
+        {   size_t length;
+            do {
+                end++;
+                length = ZDICT_count(b + pos, b + suffix[end]);
+                if (length >= LLIMIT) length = LLIMIT-1;
+                lengthList[length]++;
+            } while (length >=MINMATCHLENGTH);
+        }
+
+        /* look backward */
+        {   size_t length = MINMATCHLENGTH;
+            while ((length >= MINMATCHLENGTH) & (start > 0)) {
+                length = ZDICT_count(b + pos, b + suffix[start - 1]);
+                if (length >= LLIMIT) length = LLIMIT - 1;
+                lengthList[length]++;
+                if (length >= MINMATCHLENGTH) start--;
+            }
+        }
+
+        /* largest useful length */
+        memset(cumulLength, 0, sizeof(cumulLength));
+        cumulLength[maxLength-1] = lengthList[maxLength-1];
+        for (i=(int)(maxLength-2); i>=0; i--)
+            cumulLength[i] = cumulLength[i+1] + lengthList[i];
+
+        for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
+        maxLength = i;
+
+        /* reduce maxLength in case of final into repetitive data */
+        {   U32 l = (U32)maxLength;
+            BYTE const c = b[pos + maxLength-1];
+            while (b[pos+l-2]==c) l--;
+            maxLength = l;
+        }
+        if (maxLength < MINMATCHLENGTH) return solution;   /* skip : no long-enough solution */
+
+        /* calculate savings */
+        savings[5] = 0;
+        for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
+            savings[i] = savings[i-1] + (lengthList[i] * (i-3));
+
+        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+                     (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
+
+        solution.pos = (U32)pos;
+        solution.length = (U32)maxLength;
+        solution.savings = savings[maxLength];
+
+        /* mark positions done */
+        {   U32 id;
+            for (id=start; id<end; id++) {
+                U32 p, pEnd, length;
+                U32 const testedPos = suffix[id];
+                if (testedPos == pos)
+                    length = solution.length;
+                else {
+                    length = (U32)ZDICT_count(b+pos, b+testedPos);
+                    if (length > solution.length) length = solution.length;
+                }
+                pEnd = (U32)(testedPos + length);
+                for (p=testedPos; p<pEnd; p++)
+                    doneMarks[p] = 1;
+    }   }   }
+
+    return solution;
+}
+
+
+static int isIncluded(const void* in, const void* container, size_t length)
+{
+    const char* const ip = (const char*) in;
+    const char* const into = (const char*) container;
+    size_t u;
+
+    for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
+        if (ip[u] != into[u]) break;
+    }
+
+    return u==length;
+}
+
+/*! ZDICT_tryMerge() :
+    check if dictItem can be merged, do it if possible
+    @return : id of destination elt, 0 if not merged
+*/
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
+{
+    const U32 tableSize = table->pos;
+    const U32 eltEnd = elt.pos + elt.length;
+    const char* const buf = (const char*) buffer;
+
+    /* tail overlap */
+    U32 u; for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+        if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
+            /* append */
+            U32 const addedLength = table[u].pos - elt.pos;
+            table[u].length += addedLength;
+            table[u].pos = elt.pos;
+            table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            elt = table[u];
+            /* sort : improve rank */
+            while ((u>1) && (table[u-1].savings < elt.savings))
+            table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+    }   }
+
+    /* front overlap */
+    for (u=1; u<tableSize; u++) {
+        if (u==eltNbToSkip) continue;
+
+        if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
+            /* append */
+            int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
+            table[u].savings += elt.length / 8;    /* rough approx bonus */
+            if (addedLength > 0) {   /* otherwise, elt fully included into existing */
+                table[u].length += addedLength;
+                table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
+            }
+            /* sort : improve rank */
+            elt = table[u];
+            while ((u>1) && (table[u-1].savings < elt.savings))
+                table[u] = table[u-1], u--;
+            table[u] = elt;
+            return u;
+        }
+
+        if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
+            if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
+                size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+                table[u].pos = elt.pos;
+                table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+                table[u].length = MIN(elt.length, table[u].length + 1);
+                return u;
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static void ZDICT_removeDictItem(dictItem* table, U32 id)
+{
+    /* convention : table[0].pos stores nb of elts */
+    U32 const max = table[0].pos;
+    U32 u;
+    if (!id) return;   /* protection, should never happen */
+    for (u=id; u<max-1; u++)
+        table[u] = table[u+1];
+    table->pos--;
+}
+
+
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
+{
+    /* merge if possible */
+    U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
+    if (mergeId) {
+        U32 newMerge = 1;
+        while (newMerge) {
+            newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
+            if (newMerge) ZDICT_removeDictItem(table, mergeId);
+            mergeId = newMerge;
+        }
+        return;
+    }
+
+    /* insert */
+    {   U32 current;
+        U32 nextElt = table->pos;
+        if (nextElt >= maxSize) nextElt = maxSize-1;
+        current = nextElt-1;
+        while (table[current].savings < elt.savings) {
+            table[current+1] = table[current];
+            current--;
+        }
+        table[current+1] = elt;
+        table->pos = nextElt+1;
+    }
+}
+
+
+static U32 ZDICT_dictSize(const dictItem* dictList)
+{
+    U32 u, dictSize = 0;
+    for (u=1; u<dictList[0].pos; u++)
+        dictSize += dictList[u].length;
+    return dictSize;
+}
+
+
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
+                            const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
+                            const size_t* fileSizes, unsigned nbFiles,
+                            U32 minRatio, U32 notificationLevel)
+{
+    int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
+    int* const suffix = suffix0+1;
+    U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
+    BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks));   /* +16 for overflow security */
+    U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
+    size_t result = 0;
+    clock_t displayClock = 0;
+    clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
+
+#   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
+            if (ZDICT_clockSpan(displayClock) > refreshRate)  \
+            { displayClock = clock(); DISPLAY(__VA_ARGS__); \
+            if (notificationLevel>=4) fflush(stderr); } }
+
+    /* init */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
+        result = ERROR(memory_allocation);
+        goto _cleanup;
+    }
+    if (minRatio < MINRATIO) minRatio = MINRATIO;
+    memset(doneMarks, 0, bufferSize+16);
+
+    /* limit sample set size (divsufsort limitation)*/
+    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
+    while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
+
+    /* sort */
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
+    {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
+        if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
+    }
+    suffix[bufferSize] = (int)bufferSize;   /* leads into noise */
+    suffix0[0] = (int)bufferSize;           /* leads into noise */
+    /* build reverse suffix sort */
+    {   size_t pos;
+        for (pos=0; pos < bufferSize; pos++)
+            reverseSuffix[suffix[pos]] = (U32)pos;
+        /* note filePos tracks borders between samples.
+           It's not used at this stage, but planned to become useful in a later update */
+        filePos[0] = 0;
+        for (pos=1; pos<nbFiles; pos++)
+            filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
+    }
+
+    DISPLAYLEVEL(2, "finding patterns ... \n");
+    DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
+
+    {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
+            dictItem solution;
+            if (doneMarks[cursor]) { cursor++; continue; }
+            solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
+            if (solution.length==0) { cursor++; continue; }
+            ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
+            cursor += solution.length;
+            DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
+    }   }
+
+_cleanup:
+    free(suffix0);
+    free(reverseSuffix);
+    free(doneMarks);
+    free(filePos);
+    return result;
+}
+
+
+static void ZDICT_fillNoise(void* buffer, size_t length)
+{
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned acc = prime1;
+    size_t p=0;;
+    for (p=0; p<length; p++) {
+        acc *= prime2;
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
+    }
+}
+
+
+typedef struct
+{
+    ZSTD_CCtx* ref;    /* contains reference to dictionary */
+    ZSTD_CCtx* zc;     /* working context */
+    void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
+} EStats_ress_t;
+
+#define MAXREPOFFSET 1024
+
+static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
+                              U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
+{
+    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
+    size_t cSize;
+
+    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
+    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    }
+    cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
+    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
+
+    if (cSize) {  /* if == 0; block is not compressible */
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
+
+        /* literals stats */
+        {   const BYTE* bytePtr;
+            for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
+                countLit[*bytePtr]++;
+        }
+
+        /* seqStats */
+        {   U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+            ZSTD_seqToCodes(seqStorePtr);
+
+            {   const BYTE* codePtr = seqStorePtr->ofCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->mlCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
+            }
+
+            {   const BYTE* codePtr = seqStorePtr->llCode;
+                U32 u;
+                for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
+            }
+
+            if (nbSeq >= 2) { /* rep offsets */
+                const seqDef* const seq = seqStorePtr->sequencesStart;
+                U32 offset1 = seq[0].offset - 3;
+                U32 offset2 = seq[1].offset - 3;
+                if (offset1 >= MAXREPOFFSET) offset1 = 0;
+                if (offset2 >= MAXREPOFFSET) offset2 = 0;
+                repOffsets[offset1] += 3;
+                repOffsets[offset2] += 1;
+    }   }   }
+}
+
+static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
+{
+    size_t total=0;
+    unsigned u;
+    for (u=0; u<nbFiles; u++) total += fileSizes[u];
+    return total;
+}
+
+typedef struct { U32 offset; U32 count; } offsetCount_t;
+
+static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
+{
+    U32 u;
+    table[ZSTD_REP_NUM].offset = val;
+    table[ZSTD_REP_NUM].count = count;
+    for (u=ZSTD_REP_NUM; u>0; u--) {
+        offsetCount_t tmp;
+        if (table[u-1].count >= table[u].count) break;
+        tmp = table[u-1];
+        table[u-1] = table[u];
+        table[u] = tmp;
+    }
+}
+
+/* ZDICT_flatLit() :
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;
+    countLit[0]   = 4;
+    countLit[253] = 1;
+    countLit[254] = 1;
+}
+
+#define OFFCODE_MAX 30  /* only applicable to first block */
+static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
+                                   unsigned compressionLevel,
+                             const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
+                             const void* dictBuffer, size_t  dictBufferSize,
+                                   unsigned notificationLevel)
+{
+    U32 countLit[256];
+    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
+    U32 offcodeCount[OFFCODE_MAX+1];
+    short offcodeNCount[OFFCODE_MAX+1];
+    U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
+    U32 matchLengthCount[MaxML+1];
+    short matchLengthNCount[MaxML+1];
+    U32 litLengthCount[MaxLL+1];
+    short litLengthNCount[MaxLL+1];
+    U32 repOffset[MAXREPOFFSET];
+    offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
+    EStats_ress_t esr;
+    ZSTD_parameters params;
+    U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
+    size_t pos = 0, errorCode;
+    size_t eSize = 0;
+    size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
+    size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
+    BYTE* dstPtr = (BYTE*)dstBuffer;
+
+    /* init */
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
+    esr.ref = ZSTD_createCCtx();
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.ref || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
+    if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
+    for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
+    for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+    for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+    for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
+    memset(repOffset, 0, sizeof(repOffset));
+    repOffset[1] = repOffset[4] = repOffset[8] = 1;
+    memset(bestRepOffset, 0, sizeof(bestRepOffset));
+    if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
+    params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
+    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
+        if (ZSTD_isError(beginResult)) {
+            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
+            eSize = ERROR(GENERIC);
+            goto _cleanup;
+    }   }
+
+    /* collect stats on all samples */
+    for (u=0; u<nbFiles; u++) {
+        ZDICT_countEStats(esr, params,
+                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
+                         (const char*)srcBuffer + pos, fileSizes[u],
+                          notificationLevel);
+        pos += fileSizes[u];
+    }
+
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
+    }
+
+    /* looking for most common first offsets */
+    {   U32 offset;
+        for (offset=1; offset<MAXREPOFFSET; offset++)
+            ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
+    }
+    /* note : the result of this phase should be used to better appreciate the impact on statistics */
+
+    total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
+    errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
+        goto _cleanup;
+    }
+    Offlog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
+    errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
+        goto _cleanup;
+    }
+    mlLog = (U32)errorCode;
+
+    total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
+    errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
+    if (FSE_isError(errorCode)) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
+        goto _cleanup;
+    }
+    llLog = (U32)errorCode;
+
+    /* write result to buffer */
+    {   size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
+        if (HUF_isError(hhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "HUF_writeCTable error \n");
+            goto _cleanup;
+        }
+        dstPtr += hhSize;
+        maxDstSize -= hhSize;
+        eSize += hhSize;
+    }
+
+    {   size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
+        if (FSE_isError(ohSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += ohSize;
+        maxDstSize -= ohSize;
+        eSize += ohSize;
+    }
+
+    {   size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
+        if (FSE_isError(mhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += mhSize;
+        maxDstSize -= mhSize;
+        eSize += mhSize;
+    }
+
+    {   size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
+        if (FSE_isError(lhSize)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
+            goto _cleanup;
+        }
+        dstPtr += lhSize;
+        maxDstSize -= lhSize;
+        eSize += lhSize;
+    }
+
+    if (maxDstSize<12) {
+        eSize = ERROR(GENERIC);
+        DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
+        goto _cleanup;
+    }
+# if 0
+    MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
+    MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
+    MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
+#else
+    /* at this stage, we don't use the result of "most common first offset",
+       as the impact of statistics is not properly evaluated */
+    MEM_writeLE32(dstPtr+0, repStartValue[0]);
+    MEM_writeLE32(dstPtr+4, repStartValue[1]);
+    MEM_writeLE32(dstPtr+8, repStartValue[2]);
+#endif
+    eSize += 12;
+
+_cleanup:
+    ZSTD_freeCCtx(esr.ref);
+    ZSTD_freeCCtx(esr.zc);
+    free(esr.workPlace);
+
+    return eSize;
+}
+
+
+
+size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                          const void* customDictContent, size_t dictContentSize,
+                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                          ZDICT_params_t params)
+{
+    size_t hSize;
+#define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
+    BYTE header[HBUFFSIZE];
+    int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+
+    /* check conditions */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
+    if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
+    if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
+
+    /* dictionary header */
+    MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+        U32 const dictID = params.dictID ? params.dictID : compliantID;
+        MEM_writeLE32(header+4, dictID);
+    }
+    hSize = 8;
+
+    /* entropy tables */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  customDictContent, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* copy elements in final buffer ; note : src and dst buffer can overlap */
+    if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
+    {   size_t const dictSize = hSize + dictContentSize;
+        char* dictEnd = (char*)dictBuffer + dictSize;
+        memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
+        memcpy(dictBuffer, header, hSize);
+        return dictSize;
+    }
+}
+
+
+size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                                 ZDICT_params_t params)
+{
+    int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
+    U32 const notificationLevel = params.notificationLevel;
+    size_t hSize = 8;
+
+    /* calculate entropy tables */
+    DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
+    DISPLAYLEVEL(2, "statistics ... \n");
+    {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
+                                  compressionLevel,
+                                  samplesBuffer, samplesSizes, nbSamples,
+                                  (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
+                                  notificationLevel);
+        if (ZDICT_isError(eSize)) return eSize;
+        hSize += eSize;
+    }
+
+    /* add dictionary header (after entropy tables) */
+    MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
+    {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+        U32 const dictID = params.dictID ? params.dictID : compliantID;
+        MEM_writeLE32((char*)dictBuffer+4, dictID);
+    }
+
+    if (hSize + dictContentSize < dictBufferCapacity)
+        memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
+    return MIN(dictBufferCapacity, hSize+dictContentSize);
+}
+
+
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
+*   Warning : `samplesBuffer` must be followed by noisy guard band.
+*   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
+*/
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params)
+{
+    U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
+    dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
+    unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
+    unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
+    size_t const targetDictSize = maxDictSize;
+    size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    size_t dictSize = 0;
+    U32 const notificationLevel = params.zParams.notificationLevel;
+
+    /* checks */
+    if (!dictList) return ERROR(memory_allocation);
+    if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
+
+    /* init */
+    ZDICT_initDictItem(dictList);
+
+    /* build dictionary */
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
+
+    /* display best matches */
+    if (params.zParams.notificationLevel>= 3) {
+        U32 const nb = MIN(25, dictList[0].pos);
+        U32 const dictContentSize = ZDICT_dictSize(dictList);
+        U32 u;
+        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
+        DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
+        for (u=1; u<nb; u++) {
+            U32 const pos = dictList[u].pos;
+            U32 const length = dictList[u].length;
+            U32 const printedLength = MIN(40, length);
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+                return ERROR(GENERIC);   /* should never happen */
+            DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+                         u, length, pos, dictList[u].savings);
+            ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
+            DISPLAYLEVEL(3, "| \n");
+    }   }
+
+
+    /* create dictionary */
+    {   U32 dictContentSize = ZDICT_dictSize(dictList);
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
+        if (dictContentSize < targetDictSize/4) {
+            DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+            if (samplesBuffSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
+            if (minRep > MINRATIO) {
+                DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+                DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+            }
+        }
+
+        if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
+            U32 proposedSelectivity = selectivity-1;
+            while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
+            DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
+            DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
+            DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
+        }
+
+        /* limit dictionary size */
+        {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
+            U32 currentSize = 0;
+            U32 n; for (n=1; n<max; n++) {
+                currentSize += dictList[n].length;
+                if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
+            }
+            dictList->pos = n;
+            dictContentSize = currentSize;
+        }
+
+        /* build dict content */
+        {   U32 u;
+            BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
+            for (u=1; u<dictList->pos; u++) {
+                U32 l = dictList[u].length;
+                ptr -= l;
+                if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); }   /* should not happen */
+                memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
+        }   }
+
+        dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
+                                                             samplesBuffer, samplesSizes, nbSamples,
+                                                             params.zParams);
+    }
+
+    /* clean up */
+    free(dictList);
+    return dictSize;
+}
+
+
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
+{
+    size_t result;
+    void* newBuff;
+    size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+    if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
+
+    newBuff = malloc(sBuffSize + NOISELENGTH);
+    if (!newBuff) return ERROR(memory_allocation);
+
+    memcpy(newBuff, samplesBuffer, sBuffSize);
+    ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+
+    result =
+        ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+                                            samplesSizes, nbSamples, params);
+    free(newBuff);
+    return result;
+}
+
+
+size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{
+    ZDICT_cover_params_t params;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
+    memset(&params, 0, sizeof(params));
+    params.d = 8;
+    params.steps = 4;
+    /* Default to level 6 since no compression level information is available */
+    params.zParams.compressionLevel = 6;
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+    params.zParams.notificationLevel = ZSTD_DEBUG;
+#endif
+    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+                                               samplesBuffer, samplesSizes, nbSamples,
+                                               &params);
+}
+
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+{
+    ZDICT_params_t params;
+    memset(&params, 0, sizeof(params));
+    return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
+                                                     samplesBuffer, samplesSizes, nbSamples,
+                                                     params);
+}
diff --git a/vendor/github.com/DataDog/zstd/zdict.h b/vendor/github.com/DataDog/zstd/zdict.h
new file mode 100644
index 0000000..ad459c2
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zdict.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef DICTBUILDER_H_001
+#define DICTBUILDER_H_001
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*======  Dependencies  ======*/
+#include <stddef.h>  /* size_t */
+
+
+/* =====   ZDICTLIB_API : control library symbols visibility   ===== */
+#ifndef ZDICTLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZDICTLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZDICTLIB_API ZDICTLIB_VISIBILITY
+#endif
+
+
+/*! ZDICT_trainFromBuffer():
+ *  Train a dictionary from an array of samples.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+/*======   Helper functions   ======*/
+ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);  /**< extracts dictID; @return zero if error (not a valid dictionary) */
+ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
+ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
+
+
+
+#ifdef ZDICT_STATIC_LINKING_ONLY
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+typedef struct {
+    int      compressionLevel;   /* optimize for a specific zstd compression level; 0 means default */
+    unsigned notificationLevel;  /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID;             /* force dictID value; 0 means auto mode (32-bits random value) */
+} ZDICT_params_t;
+
+/*! ZDICT_cover_params_t:
+ *  k and d are the only required parameters.
+ *  For others, value 0 means default.
+ */
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    ZDICT_params_t zParams;
+} ZDICT_cover_params_t;
+
+
+/*! ZDICT_trainFromBuffer_cover():
+ *  Train a dictionary from an array of samples using the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+          void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_cover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations and picks the best parameters.
+ * `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ *
+ * All of the parameters d, k, steps are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+          void* dictBuffer, size_t dictBufferCapacity,
+    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t* parameters);
+
+/*! ZDICT_finalizeDictionary():
+ * Given a custom content as a basis for dictionary, and a set of samples,
+ * finalize dictionary by adding headers and statistics.
+ *
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
+ *
+ * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+ * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
+ *           or an error code, which can be tested by ZDICT_isError().
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
+ * Note 2: dictBuffer and dictContent can overlap
+ */
+#define ZDICT_CONTENTSIZE_MIN 128
+#define ZDICT_DICTSIZE_MIN    256
+ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* dictContent, size_t dictContentSize,
+                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_params_t parameters);
+
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;
+} ZDICT_legacy_params_t;
+
+/*! ZDICT_trainFromBuffer_legacy():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ *  Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_legacy_params_t parameters);
+
+/* Deprecation warnings */
+/* It is generally possible to disable deprecation warnings from compiler,
+   for example with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual.
+   Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
+#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
+#  define ZDICT_DEPRECATED(message) ZDICTLIB_API   /* disable deprecation warnings */
+#else
+#  define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
+#  elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
+#  elif (ZDICT_GCC_VERSION >= 301)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
+#    define ZDICT_DEPRECATED(message) ZDICTLIB_API
+#  endif
+#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
+
+ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
+size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+
+
+#endif   /* ZDICT_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* DICTBUILDER_H_001 */
diff --git a/vendor/github.com/DataDog/zstd/zstd.go b/vendor/github.com/DataDog/zstd/zstd.go
new file mode 100644
index 0000000..e9953d6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd.go
@@ -0,0 +1,144 @@
+package zstd
+
+/*
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "stdint.h"  // for uintptr_t
+
+// The following *_wrapper function are used for removing superflouos
+// memory allocations when calling the wrapped functions from Go code.
+// See https://github.com/golang/go/issues/24450 for details.
+
+static size_t ZSTD_compress_wrapper(uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize, int compressionLevel) {
+	return ZSTD_compress((void*)dst, maxDstSize, (const void*)src, srcSize, compressionLevel);
+}
+
+static size_t ZSTD_decompress_wrapper(uintptr_t dst, size_t maxDstSize, uintptr_t src, size_t srcSize) {
+	return ZSTD_decompress((void*)dst, maxDstSize, (const void *)src, srcSize);
+}
+
+*/
+import "C"
+import (
+	"bytes"
+	"errors"
+	"io/ioutil"
+	"unsafe"
+)
+
+// Defines best and standard values for zstd cli
+const (
+	BestSpeed          = 1
+	BestCompression    = 20
+	DefaultCompression = 5
+)
+
+var (
+	// ErrEmptySlice is returned when there is nothing to compress
+	ErrEmptySlice = errors.New("Bytes slice is empty")
+)
+
+// CompressBound returns the worst case size needed for a destination buffer,
+// which can be used to preallocate a destination buffer or select a previously
+// allocated buffer from a pool.
+// See zstd.h to mirror implementation of ZSTD_COMPRESSBOUND
+func CompressBound(srcSize int) int {
+	lowLimit := 128 << 10 // 128 kB
+	var margin int
+	if srcSize < lowLimit {
+		margin = (lowLimit - srcSize) >> 11
+	}
+	return srcSize + (srcSize >> 8) + margin
+}
+
+// cCompressBound is a cgo call to check the go implementation above against the c code.
+func cCompressBound(srcSize int) int {
+	return int(C.ZSTD_compressBound(C.size_t(srcSize)))
+}
+
+// Compress src into dst.  If you have a buffer to use, you can pass it to
+// prevent allocation.  If it is too small, or if nil is passed, a new buffer
+// will be allocated and returned.
+func Compress(dst, src []byte) ([]byte, error) {
+	return CompressLevel(dst, src, DefaultCompression)
+}
+
+// CompressLevel is the same as Compress but you can pass a compression level
+func CompressLevel(dst, src []byte, level int) ([]byte, error) {
+	bound := CompressBound(len(src))
+	if cap(dst) >= bound {
+		dst = dst[0:bound] // Reuse dst buffer
+	} else {
+		dst = make([]byte, bound)
+	}
+
+	srcPtr := C.uintptr_t(uintptr(0)) // Do not point anywhere, if src is empty
+	if len(src) > 0 {
+		srcPtr = C.uintptr_t(uintptr(unsafe.Pointer(&src[0])))
+	}
+
+	cWritten := C.ZSTD_compress_wrapper(
+		C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))),
+		C.size_t(len(dst)),
+		srcPtr,
+		C.size_t(len(src)),
+		C.int(level))
+
+	written := int(cWritten)
+	// Check if the return is an Error code
+	if err := getError(written); err != nil {
+		return nil, err
+	}
+	return dst[:written], nil
+}
+
+// Decompress src into dst.  If you have a buffer to use, you can pass it to
+// prevent allocation.  If it is too small, or if nil is passed, a new buffer
+// will be allocated and returned.
+func Decompress(dst, src []byte) ([]byte, error) {
+	if len(src) == 0 {
+		return []byte{}, ErrEmptySlice
+	}
+	decompress := func(dst, src []byte) ([]byte, error) {
+
+		cWritten := C.ZSTD_decompress_wrapper(
+			C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))),
+			C.size_t(len(dst)),
+			C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))),
+			C.size_t(len(src)))
+
+		written := int(cWritten)
+		// Check error
+		if err := getError(written); err != nil {
+			return nil, err
+		}
+		return dst[:written], nil
+	}
+
+	if dst == nil {
+		// Attempt to use zStd to determine decompressed size (may result in error or 0)
+		size := int(C.size_t(C.ZSTD_getDecompressedSize(unsafe.Pointer(&src[0]), C.size_t(len(src)))))
+
+		if err := getError(size); err != nil {
+			return nil, err
+		}
+
+		if size > 0 {
+			dst = make([]byte, size)
+		} else {
+			dst = make([]byte, len(src)*3) // starting guess
+		}
+	}
+	for i := 0; i < 3; i++ { // 3 tries to allocate a bigger buffer
+		result, err := decompress(dst, src)
+		if !IsDstSizeTooSmallError(err) {
+			return result, err
+		}
+		dst = make([]byte, len(dst)*2) // Grow buffer by 2
+	}
+
+	// We failed getting a dst buffer of correct size, use stream API
+	r := NewReader(bytes.NewReader(src))
+	defer r.Close()
+	return ioutil.ReadAll(r)
+}
diff --git a/vendor/github.com/DataDog/zstd/zstd.h b/vendor/github.com/DataDog/zstd/zstd.h
new file mode 100644
index 0000000..6405da6
--- /dev/null
+++ b/vendor/github.com/DataDog/zstd/zstd.h
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ======   Dependency   ======*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm,
+  targeting real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression functions.
+  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
+  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  Compression can be done in:
+    - a single step (described as Simple API)
+    - a single step, reusing a context (described as Explicit context)
+    - unbounded multiple steps (described as Streaming compression)
+  The compression ratio achievable on small data can be highly improved using a dictionary in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
+
+  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
+  Advanced experimental APIs shall never be used with a dynamic library.
+  They are not "stable", their definition may change in the future. Only static linking is allowed.
+*********************************************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    3
+#define ZSTD_VERSION_RELEASE  4
+
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* added in v1.3.0 */
+
+
+/***************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize to regenerate.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : added in v1.3.0
+ *  `src` should point to the start of a ZSTD encoded frame.
+ *  `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ *  @return : - decompressed size of the frame in `src`, if known
+ *            - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *            - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 4 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * `src` is the start of a zstd compressed frame.
+ * @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+
+/*======  Helper functions  ======*/
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+
+
+/***************************************
+*  Explicit context
+***************************************/
+/*= Compression context
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution in multi-threaded environments. */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()) */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a predefined Dictionary (see dictBuilder/zdict.h).
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/**********************************
+ *  Bulk processing dictionary API
+ *********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+ *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression level is decided during dictionary creation.
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
+
+/****************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use ZSTD_CStream in situations where many streaming operations will be achieved consecutively,
+*  since it will play nicer with system's memory, by re-using already allocated memory.
+*  Use one separate ZSTD_CStream per thread for parallel execution.
+*
+*  Start a new compression by initializing ZSTD_CStream.
+*  Use ZSTD_initCStream() to start a new compression operation.
+*  Use ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for a compression which requires a dictionary (experimental section)
+*
+*  Use ZSTD_compressStream() repetitively to consume input stream.
+*  The function will automatically update both `pos` fields.
+*  Note that it may not consume the entire input, in which case `pos < size`,
+*  and it's up to the caller to present again remaining data.
+*  @return : a size hint, preferred nb of bytes to use as input for next function call
+*            or an error code, which can be tested using ZSTD_isError().
+*            Note 1 : it's just a hint, to help latency a little, any other value will work fine.
+*            Note 2 : size hint is guaranteed to be <= ZSTD_CStreamInSize()
+*
+*  At any moment, it's possible to flush whatever data remains within internal buffer, using ZSTD_flushStream().
+*  `output->pos` will be updated.
+*  Note that some content might still be left within internal buffer if `output->size` is too small.
+*  @return : nb of bytes still present within internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  ZSTD_endStream() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  ZSTD_endStream() may not be able to flush full data if `output->size` is too small.
+*  In which case, call again ZSTD_endStream() to complete the flush.
+*  @return : 0 if frame fully completed and fully flushed,
+             or >0 if some data is still present within internal buffer
+                  (value is minimum size estimation for remaining data to flush, but it could be more)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block in all circumstances. */
+
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation,
+*   or ZSTD_initDStream_usingDict() if decompression requires a dictionary.
+*   @return : recommended first input size
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            an error code, which can be tested using ZSTD_isError(),
+*            any other value > 0, which means there is still some decoding to do to complete current frame.
+*            The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, continue to consider them separated. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
+
+#endif  /* ZSTD_H_235446 */
+
+
+
+/****************************************************************************************
+ * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * They are provided for advanced scenarios.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* >= v0.7.0 */
+
+#define ZSTD_WINDOWLOG_MAX_32   30
+#define ZSTD_WINDOWLOG_MAX_64   31
+#define ZSTD_WINDOWLOG_MAX    ((unsigned)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN      10
+#define ZSTD_HASHLOG_MAX      ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN         6
+#define ZSTD_CHAINLOG_MAX_32    29
+#define ZSTD_CHAINLOG_MAX_64    30
+#define ZSTD_CHAINLOG_MAX     ((unsigned)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN       ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX       17
+#define ZSTD_SEARCHLOG_MAX     (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN       1
+#define ZSTD_SEARCHLENGTH_MAX    7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MIN    3   /* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MIN    1   /* only used by btopt, btultra and btfast */
+#define ZSTD_LDM_MINMATCH_MIN    4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX 5   /* minimum input size to know frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN    6
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* for static allocation */
+static const size_t ZSTD_frameHeaderSize_prefix = ZSTD_FRAMEHEADERSIZE_PREFIX;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/*--- Advanced types ---*/
+typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
+               ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   /* from faster to stronger */
+
+typedef struct {
+    unsigned windowLog;      /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;       /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;        /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;      /**< nb of searches : larger == more compression, slower */
+    unsigned searchLength;   /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;   /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;
+} ZSTD_compressionParameters;
+
+typedef struct {
+    unsigned contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    unsigned checksumFlag;    /**< 1: generate a 32-bits checksum at end of frame, for error detection */
+    unsigned noDictIDFlag;    /**< 1: no dictID will be saved into frame header (if dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef enum {
+    ZSTD_dct_auto=0,      /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent,  /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict     /* refuses to load a dictionary if it does not respect Zstandard's specification */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef,      /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+
+
+/***************************************
+*  Frame size functions
+***************************************/
+
+/*! ZSTD_findFrameCompressedSize() :