[VOL-5292] Implementation for fetching the GEM port history Data from the ONT
Change-Id: I4cf22555cbd13bcd5e49e620c8aa8b67cbd2891c
Signed-off-by: Akash Reddy Kankanala <akash.kankanala@radisys.com>
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index e7d7eb0..beb7fa8 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -16,8 +16,7 @@
Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
-Godoc Documentation: https://godoc.org/github.com/klauspost/compress/zstd
-
+[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/compress/zstd.svg)](https://pkg.go.dev/github.com/klauspost/compress/zstd)
## Compressor
@@ -79,6 +78,9 @@
in the future. So if you want to limit concurrency for future updates, specify the concurrency
you would like.
+If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`
+which will compress input as each block is completed, blocking on writes until each has completed.
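
As a rough sketch of that synchronous path (assuming the usual `io` and `zstd` imports; error handling kept minimal and names illustrative):

```Go
// CompressSync compresses in to out without spawning async encoder goroutines.
func CompressSync(in io.Reader, out io.Writer) error {
	enc, err := zstd.NewWriter(out, zstd.WithEncoderConcurrency(1))
	if err != nil {
		return err
	}
	// Each Write compresses completed blocks on the calling goroutine,
	// blocking until the output has been written.
	if _, err := io.Copy(enc, in); err != nil {
		enc.Close()
		return err
	}
	// Close flushes remaining data and writes the frame epilogue.
	return enc.Close()
}
```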
+
You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined
compression settings can be specified.
@@ -105,7 +107,8 @@
For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
`EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently, but each call will only run on a single goroutine.
+This function can be called concurrently.
+Each call will only run on the same goroutine as the caller.
Encoded blocks can be concatenated and the result will be the combined input stream.
Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
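
As a sketch of concurrent small-buffer compression (assuming a shared `*zstd.Encoder` created with `zstd.NewWriter(nil)` and the standard `sync` package; the helper name is illustrative):

```Go
// compressMany compresses each buffer on its own goroutine using one shared encoder.
func compressMany(enc *zstd.Encoder, buffers [][]byte) [][]byte {
	out := make([][]byte, len(buffers))
	var wg sync.WaitGroup
	for i := range buffers {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			// EncodeAll runs on this goroutine; the shared encoder is safe for concurrent use.
			out[i] = enc.EncodeAll(buffers[i], nil)
		}(i)
	}
	wg.Wait()
	return out
}
```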
@@ -150,10 +153,10 @@
This package:
file out level insize outsize millis mb/s
-silesia.tar zskp 1 211947520 73101992 643 313.87
-silesia.tar zskp 2 211947520 67504318 969 208.38
-silesia.tar zskp 3 211947520 64595893 2007 100.68
-silesia.tar zskp 4 211947520 60995370 7691 26.28
+silesia.tar zskp 1 211947520 73821326 634 318.47
+silesia.tar zskp 2 211947520 67655404 1508 133.96
+silesia.tar zskp 3 211947520 64746933 3000 67.37
+silesia.tar zskp 4 211947520 60073508 16926 11.94
cgo zstd:
silesia.tar zstd 1 211947520 73605392 543 371.56
@@ -162,84 +165,94 @@
silesia.tar zstd 9 211947520 60212393 5063 39.92
gzip, stdlib/this package:
-silesia.tar gzstd 1 211947520 80007735 1654 122.21
-silesia.tar gzkp 1 211947520 80369488 1168 173.06
+silesia.tar gzstd 1 211947520 80007735 1498 134.87
+silesia.tar gzkp 1 211947520 80088272 1009 200.31
GOB stream of binary data. Highly compressible.
https://files.klauspost.com/compress/gob-stream.7z
file out level insize outsize millis mb/s
-gob-stream zskp 1 1911399616 235022249 3088 590.30
-gob-stream zskp 2 1911399616 205669791 3786 481.34
-gob-stream zskp 3 1911399616 175034659 9636 189.17
-gob-stream zskp 4 1911399616 167273881 29337 62.13
+gob-stream zskp 1 1911399616 233948096 3230 564.34
+gob-stream zskp 2 1911399616 203997694 4997 364.73
+gob-stream zskp 3 1911399616 173526523 13435 135.68
+gob-stream zskp 4 1911399616 162195235 47559 38.33
+
gob-stream zstd 1 1911399616 249810424 2637 691.26
gob-stream zstd 3 1911399616 208192146 3490 522.31
gob-stream zstd 6 1911399616 193632038 6687 272.56
gob-stream zstd 9 1911399616 177620386 16175 112.70
-gob-stream gzstd 1 1911399616 357382641 10251 177.82
-gob-stream gzkp 1 1911399616 362156523 5695 320.08
+
+gob-stream gzstd 1 1911399616 357382013 9046 201.49
+gob-stream gzkp 1 1911399616 359136669 4885 373.08
The test data for the Large Text Compression Benchmark is the first
10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
http://mattmahoney.net/dc/textdata.html
file out level insize outsize millis mb/s
-enwik9 zskp 1 1000000000 343848582 3609 264.18
-enwik9 zskp 2 1000000000 317276632 5746 165.97
-enwik9 zskp 3 1000000000 292243069 12162 78.41
-enwik9 zskp 4 1000000000 275241169 36430 26.18
+enwik9 zskp 1 1000000000 343833605 3687 258.64
+enwik9 zskp 2 1000000000 317001237 7672 124.29
+enwik9 zskp 3 1000000000 291915823 15923 59.89
+enwik9 zskp 4 1000000000 261710291 77697 12.27
+
enwik9 zstd 1 1000000000 358072021 3110 306.65
enwik9 zstd 3 1000000000 313734672 4784 199.35
enwik9 zstd 6 1000000000 295138875 10290 92.68
enwik9 zstd 9 1000000000 278348700 28549 33.40
-enwik9 gzstd 1 1000000000 382578136 9604 99.30
-enwik9 gzkp 1 1000000000 383825945 6544 145.73
+
+enwik9 gzstd 1 1000000000 382578136 8608 110.78
+enwik9 gzkp 1 1000000000 382781160 5628 169.45
Highly compressible JSON file.
https://files.klauspost.com/compress/github-june-2days-2019.json.zst
file out level insize outsize millis mb/s
-github-june-2days-2019.json zskp 1 6273951764 699045015 10620 563.40
-github-june-2days-2019.json zskp 2 6273951764 617881763 11687 511.96
-github-june-2days-2019.json zskp 3 6273951764 524340691 34043 175.75
-github-june-2days-2019.json zskp 4 6273951764 503314661 93811 63.78
+github-june-2days-2019.json zskp 1 6273951764 697439532 9789 611.17
+github-june-2days-2019.json zskp 2 6273951764 610876538 18553 322.49
+github-june-2days-2019.json zskp 3 6273951764 517662858 44186 135.41
+github-june-2days-2019.json zskp 4 6273951764 464617114 165373 36.18
+
github-june-2days-2019.json zstd 1 6273951764 766284037 8450 708.00
github-june-2days-2019.json zstd 3 6273951764 661889476 10927 547.57
github-june-2days-2019.json zstd 6 6273951764 642756859 22996 260.18
github-june-2days-2019.json zstd 9 6273951764 601974523 52413 114.16
-github-june-2days-2019.json gzstd 1 6273951764 1164400847 29948 199.79
-github-june-2days-2019.json gzkp 1 6273951764 1128755542 19236 311.03
+
+github-june-2days-2019.json gzstd 1 6273951764 1164397768 26793 223.32
+github-june-2days-2019.json gzkp 1 6273951764 1120631856 17693 338.16
VM Image, Linux mint with a few installed applications:
https://files.klauspost.com/compress/rawstudio-mint14.7z
file out level insize outsize millis mb/s
-rawstudio-mint14.tar zskp 1 8558382592 3667489370 20210 403.84
-rawstudio-mint14.tar zskp 2 8558382592 3364592300 31873 256.07
-rawstudio-mint14.tar zskp 3 8558382592 3158085214 77675 105.08
-rawstudio-mint14.tar zskp 4 8558382592 3020370044 404956 20.16
+rawstudio-mint14.tar zskp 1 8558382592 3718400221 18206 448.29
+rawstudio-mint14.tar zskp 2 8558382592 3326118337 37074 220.15
+rawstudio-mint14.tar zskp 3 8558382592 3163842361 87306 93.49
+rawstudio-mint14.tar zskp 4 8558382592 2970480650 783862 10.41
+
rawstudio-mint14.tar zstd 1 8558382592 3609250104 17136 476.27
rawstudio-mint14.tar zstd 3 8558382592 3341679997 29262 278.92
rawstudio-mint14.tar zstd 6 8558382592 3235846406 77904 104.77
rawstudio-mint14.tar zstd 9 8558382592 3160778861 140946 57.91
-rawstudio-mint14.tar gzstd 1 8558382592 3926257486 57722 141.40
-rawstudio-mint14.tar gzkp 1 8558382592 3970463184 41749 195.49
+
+rawstudio-mint14.tar gzstd 1 8558382592 3926234992 51345 158.96
+rawstudio-mint14.tar gzkp 1 8558382592 3960117298 36722 222.26
CSV data:
https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
file out level insize outsize millis mb/s
-nyc-taxi-data-10M.csv zskp 1 3325605752 641339945 8925 355.35
-nyc-taxi-data-10M.csv zskp 2 3325605752 591748091 11268 281.44
-nyc-taxi-data-10M.csv zskp 3 3325605752 530289687 25239 125.66
-nyc-taxi-data-10M.csv zskp 4 3325605752 490907191 65939 48.10
+nyc-taxi-data-10M.csv zskp 1 3325605752 641319332 9462 335.17
+nyc-taxi-data-10M.csv zskp 2 3325605752 588976126 17570 180.50
+nyc-taxi-data-10M.csv zskp 3 3325605752 529329260 32432 97.79
+nyc-taxi-data-10M.csv zskp 4 3325605752 474949772 138025 22.98
+
nyc-taxi-data-10M.csv zstd 1 3325605752 687399637 8233 385.18
nyc-taxi-data-10M.csv zstd 3 3325605752 598514411 10065 315.07
nyc-taxi-data-10M.csv zstd 6 3325605752 570522953 20038 158.27
nyc-taxi-data-10M.csv zstd 9 3325605752 517554797 64565 49.12
-nyc-taxi-data-10M.csv gzstd 1 3325605752 928656485 23876 132.83
-nyc-taxi-data-10M.csv gzkp 1 3325605752 924718719 16388 193.53
+
+nyc-taxi-data-10M.csv gzstd 1 3325605752 928654908 21270 149.11
+nyc-taxi-data-10M.csv gzkp 1 3325605752 922273214 13929 227.68
```
## Decompressor
@@ -274,8 +287,13 @@
}
```
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines.
-See "Allocation-less operation" below.
+When running with default settings, it is important to use the "Close" function when you no longer need
+the Reader, so that its goroutines are stopped.
+Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
+
+Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
+However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)`, which will decompress data
+only as it is requested.
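
As a minimal sketch of that synchronous path (assuming the usual `io` and `zstd` imports):

```Go
// DecompressSync decompresses in to out without async read-ahead goroutines.
func DecompressSync(in io.Reader, out io.Writer) error {
	dec, err := zstd.NewReader(in, zstd.WithDecoderConcurrency(1))
	if err != nil {
		return err
	}
	// Close must still be called to release resources held by the decoder.
	defer dec.Close()
	_, err = io.Copy(out, dec)
	return err
}
```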
For decoding buffers, it could look something like this:
@@ -284,7 +302,7 @@
// Create a reader that caches decompressors.
// For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil)
+var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
// Decompress a buffer. We don't supply a destination buffer,
// so it will be allocated by the decoder.
@@ -294,9 +312,12 @@
```
Both of these cases should provide the functionality needed.
-The decoder can be used for *concurrent* decompression of multiple buffers.
+The decoder can be used for *concurrent* decompression of multiple buffers.
+By default 4 decompressors will be created.
+
It will only allow a certain number of concurrent operations to run.
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
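
Building on the shared `decoder` above, a sketch of concurrent buffer decompression could look like the following (the `sync` usage and helper name are illustrative, not part of the package docs):

```Go
// decompressMany decompresses independent buffers concurrently with the shared decoder.
func decompressMany(buffers [][]byte) ([][]byte, error) {
	out := make([][]byte, len(buffers))
	errs := make([]error, len(buffers))
	var wg sync.WaitGroup
	for i := range buffers {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			// Calls beyond the configured concurrency wait for a free decompressor.
			out[i], errs[i] = decoder.DecodeAll(buffers[i], nil)
		}(i)
	}
	wg.Wait()
	for _, err := range errs {
		if err != nil {
			return nil, err
		}
	}
	return out, nil
}
```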
### Dictionaries
@@ -348,70 +369,71 @@
The buffer decoder does everything on the same goroutine and does nothing concurrently.
It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
-The stream decoder operates on
+The stream decoder will create goroutines that:
-* One goroutine reads input and splits the input to several block decoders.
-* A number of decoders will decode blocks.
-* A goroutine coordinates these blocks and sends history from one to the next.
+1) Reads input and splits the input into blocks.
+2) Decompression of literals.
+3) Decompression of sequences.
+4) Reconstruction of output stream.
So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
+The concurrency level will, for streams, determine how many blocks ahead decompression will start.
+
Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency.
-In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
-
-
+In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
+
### Benchmarks
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
The first two are streaming decodes and the last are smaller inputs.
-
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
```
-BenchmarkDecoderSilesia-8 3 385000067 ns/op 550.51 MB/s 5498 B/op 8 allocs/op
-BenchmarkDecoderSilesiaCgo-8 6 197666567 ns/op 1072.25 MB/s 270672 B/op 8 allocs/op
+BenchmarkDecoderSilesia-32 5 206878840 ns/op 1024.50 MB/s 49808 B/op 43 allocs/op
+BenchmarkDecoderEnwik9-32 1 1271809000 ns/op 786.28 MB/s 72048 B/op 52 allocs/op
-BenchmarkDecoderEnwik9-8 1 2027001600 ns/op 493.34 MB/s 10496 B/op 18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8 2 979499200 ns/op 1020.93 MB/s 270672 B/op 8 allocs/op
+Concurrent blocks, performance:
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16 28915 42469 ns/op 4340.07 MB/s 114 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16 116505 9965 ns/op 11900.16 MB/s 16 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16 8952 134272 ns/op 3588.70 MB/s 915 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16 11820 102538 ns/op 4161.90 MB/s 594 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16 34782 34184 ns/op 3661.88 MB/s 60 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16 27712 43447 ns/op 3500.58 MB/s 99 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16 62826 18750 ns/op 21845.10 MB/s 104 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16 631545 1794 ns/op 57078.74 MB/s 2 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16 1690140 712 ns/op 172938.13 MB/s 1 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16 10432 113593 ns/op 6180.73 MB/s 1143 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16 113206 10671 ns/op 9596.27 MB/s 15 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16 1530615 779 ns/op 5229.49 MB/s 0 B/op 0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16 65217 16192 ns/op 11383.34 MB/s 46 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16 292671 4039 ns/op 29363.19 MB/s 6 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16 26314 46021 ns/op 10470.43 MB/s 293 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16 33897 34900 ns/op 12227.96 MB/s 205 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16 104348 11433 ns/op 10949.01 MB/s 20 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16 75949 15510 ns/op 9805.60 MB/s 32 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16 173910 6756 ns/op 60624.29 MB/s 37 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16 923076 1339 ns/op 76474.87 MB/s 1 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16 922920 1351 ns/op 91102.57 MB/s 2 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16 27649 43618 ns/op 16096.19 MB/s 407 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16 279073 4160 ns/op 24614.18 MB/s 6 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16 749938 1579 ns/op 2581.71 MB/s 0 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 67356 17857 ns/op 10321.96 MB/s 22.48 pct 102 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 266656 4421 ns/op 26823.21 MB/s 11.89 pct 19 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 20992 56842 ns/op 8477.17 MB/s 39.90 pct 754 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 27456 43932 ns/op 9714.01 MB/s 33.27 pct 524 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 78432 15047 ns/op 8319.15 MB/s 40.34 pct 66 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 65800 18436 ns/op 8249.63 MB/s 37.75 pct 88 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 102993 11523 ns/op 35546.09 MB/s 3.637 pct 143 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1000000 1070 ns/op 95720.98 MB/s 80.53 pct 3 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 749802 1752 ns/op 70272.35 MB/s 100.0 pct 5 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 22640 52934 ns/op 13263.37 MB/s 26.25 pct 1014 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32 226412 5232 ns/op 19572.27 MB/s 14.49 pct 20 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 923041 1276 ns/op 3194.71 MB/s 31.26 pct 0 B/op 0 allocs/op
```
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
+
+## Zstd inside ZIP files
+
+It is possible to use zstandard to compress individual files inside zip archives.
+While this isn't widely supported, it can be useful for internal files.
+
+To support the compression and decompression of these files you must register a compressor and decompressor.
+
+It is highly recommended to register the (de)compressors on individual zip Readers/Writers and NOT
+to use the global registration functions. The main reason for this is that 2 registrations from
+different packages will result in a panic.
+
+It is a good idea to only have a single compressor and decompressor, since they can be used for multiple zip
+files concurrently, and using a single instance will allow reusing some resources.
+
+See [this example](https://pkg.go.dev/github.com/klauspost/compress/zstd#example-ZipCompressor) for
+how to compress and decompress files inside zip archives.
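
A minimal sketch of per-writer registration (assuming `archive/zip` and the package's `ZipCompressor`/`ZipMethodWinZip` helpers; the function name and error handling are illustrative):

```Go
import (
	"archive/zip"
	"io"

	"github.com/klauspost/compress/zstd"
)

// writeZstdZip writes a single zstd-compressed file entry to w.
func writeZstdZip(w io.Writer, name string, data []byte) error {
	zw := zip.NewWriter(w)
	// Register on this writer only; avoid the global zip.RegisterCompressor.
	zw.RegisterCompressor(zstd.ZipMethodWinZip, zstd.ZipCompressor())
	f, err := zw.CreateHeader(&zip.FileHeader{Name: name, Method: zstd.ZipMethodWinZip})
	if err != nil {
		return err
	}
	if _, err := f.Write(data); err != nil {
		return err
	}
	// On the reading side, zip.Reader.RegisterDecompressor(zstd.ZipMethodWinZip, zstd.ZipDecompressor())
	// mirrors this registration.
	return zw.Close()
}
```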
# Contributions
Contributions are always welcome.
For new features/fixes, remember to add tests and for performance enhancements include benchmarks.
-For sending files for reproducing errors use a service like [goobox](https://goobox.io/#/upload) or similar to share your files.
-
For general feedback and experience reports, feel free to open an issue or write me on [Twitter](https://twitter.com/sh0dan).
This package includes the excellent [`github.com/cespare/xxhash`](https://github.com/cespare/xxhash) package Copyright (c) 2016 Caleb Spare.
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 8544585..97299d4 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -7,6 +7,7 @@
import (
"encoding/binary"
"errors"
+ "fmt"
"io"
"math/bits"
)
@@ -50,16 +51,16 @@
if n == 0 /*|| b.bitsRead >= 64 */ {
return 0
}
- return b.getBitsFast(n)
+ return int(b.get32BitsFast(n))
}
-// getBitsFast requires that at least one bit is requested every time.
+// get32BitsFast requires that at least one bit is requested every time.
// There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) int {
+func (b *bitReader) get32BitsFast(n uint8) uint32 {
const regMask = 64 - 1
v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
b.bitsRead += n
- return int(v)
+ return v
}
// fillFast() will make sure at least 32 bits are available.
@@ -125,6 +126,9 @@
func (b *bitReader) close() error {
// Release reference.
b.in = nil
+ if !b.finished() {
+ return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
+ }
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index 303ae90..78b3c61 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -5,8 +5,6 @@
package zstd
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -38,7 +36,7 @@
b.nBits += bits
}
-// addBits32NC will add up to 32 bits.
+// addBits32NC will add up to 31 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
@@ -46,6 +44,26 @@
b.nBits += bits
}
+// addBits64NC will add up to 64 bits.
+// There must be space for 32 bits.
+func (b *bitWriter) addBits64NC(value uint64, bits uint8) {
+ if bits <= 31 {
+ b.addBits32Clean(uint32(value), bits)
+ return
+ }
+ b.addBits32Clean(uint32(value), 32)
+ b.flush32()
+ b.addBits32Clean(uint32(value>>32), bits-32)
+}
+
+// addBits32Clean will add up to 32 bits.
+// It will not check if there is space for them.
+// The input must not contain more bits than specified.
+func (b *bitWriter) addBits32Clean(value uint32, bits uint8) {
+ b.bitContainer |= uint64(value) << (b.nBits & 63)
+ b.nBits += bits
+}
+
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -53,80 +71,6 @@
b.nBits += bits
}
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.bitContainer >>= v << 3
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index b51d922..7eed729 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,9 +5,14 @@
package zstd
import (
+ "bytes"
+ "encoding/binary"
"errors"
"fmt"
"io"
+ "io/ioutil"
+ "os"
+ "path/filepath"
"sync"
"github.com/klauspost/compress/huff0"
@@ -38,14 +43,14 @@
// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
maxCompressedBlockSize = 128 << 10
+ compressedBlockOverAlloc = 16
+ maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
// Maximum possible block size (all Raw+Uncompressed).
maxBlockSize = (1 << 21) - 1
- // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
- maxCompressedLiteralSize = 1 << 18
- maxRLELiteralSize = 1 << 20
- maxMatchLen = 131074
- maxSequences = 0x7f00 + 0xffff
+ maxMatchLen = 131074
+ maxSequences = 0x7f00 + 0xffff
// We support slightly less than the reference decoder to be able to
// use ints on 32 bit archs.
@@ -76,20 +81,27 @@
// Window size of the block.
WindowSize uint64
- history chan *history
- input chan struct{}
- result chan decodeOutput
- sequenceBuf []seq
- err error
- decWG sync.WaitGroup
+ err error
+
+ // Check against this crc
+ checkCRC []byte
// Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame.
localFrame *frameDec
+ sequence []seqVals
+
+ async struct {
+ newHist *history
+ literals []byte
+ seqData []byte
+ seqSize int // Size of uncompressed sequences
+ fcs uint64
+ }
+
// Block is RLE, this is the size.
RLESize uint32
- tmp [4]byte
Type blockType
@@ -109,13 +121,8 @@
func newBlockDec(lowMem bool) *blockDec {
b := blockDec{
- lowMem: lowMem,
- result: make(chan decodeOutput, 1),
- input: make(chan struct{}, 1),
- history: make(chan *history, 1),
+ lowMem: lowMem,
}
- b.decWG.Add(1)
- go b.startDecoder()
return &b
}
@@ -123,44 +130,60 @@
// Input must be a start of a block and will be at the end of the block when returned.
func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
b.WindowSize = windowSize
- tmp := br.readSmall(3)
- if tmp == nil {
- if debug {
- println("Reading block header:", io.ErrUnexpectedEOF)
- }
- return io.ErrUnexpectedEOF
+ tmp, err := br.readSmall(3)
+ if err != nil {
+ println("Reading block header:", err)
+ return err
}
bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
b.Last = bh&1 != 0
b.Type = blockType((bh >> 1) & 3)
// find size.
cSize := int(bh >> 3)
- maxSize := maxBlockSize
+ maxSize := maxCompressedBlockSizeAlloc
switch b.Type {
case blockTypeReserved:
return ErrReservedBlockType
case blockTypeRLE:
+ if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+ if debugDecoder {
+ printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+ }
+ return ErrWindowSizeExceeded
+ }
b.RLESize = uint32(cSize)
if b.lowMem {
maxSize = cSize
}
cSize = 1
case blockTypeCompressed:
- if debug {
+ if debugDecoder {
println("Data size on stream:", cSize)
}
b.RLESize = 0
- maxSize = maxCompressedBlockSize
+ maxSize = maxCompressedBlockSizeAlloc
if windowSize < maxCompressedBlockSize && b.lowMem {
- maxSize = int(windowSize)
+ maxSize = int(windowSize) + compressedBlockOverAlloc
}
if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
- if debug {
+ if debugDecoder {
printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
}
return ErrCompressedSizeTooBig
}
+ // Empty compressed blocks must at least be 2 bytes
+ // for Literals_Block_Type and one for Sequences_Section_Header.
+ if cSize < 2 {
+ return ErrBlockTooSmall
+ }
case blockTypeRaw:
+ if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+ if debugDecoder {
+ printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+ }
+ return ErrWindowSizeExceeded
+ }
+
b.RLESize = 0
// We do not need a destination for raw blocks.
maxSize = -1
@@ -170,19 +193,18 @@
// Read block data.
if cap(b.dataStorage) < cSize {
- if b.lowMem {
- b.dataStorage = make([]byte, 0, cSize)
+ if b.lowMem || cSize > maxCompressedBlockSize {
+ b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
- b.dataStorage = make([]byte, 0, maxBlockSize)
+ b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
- var err error
b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil {
- if debug {
+ if debugDecoder {
println("Reading block:", err, "(", cSize, ")", len(b.data))
printf("%T", br)
}
@@ -196,85 +218,14 @@
b.Last = true
b.Type = blockTypeReserved
b.err = err
- b.input <- struct{}{}
}
// Close will release resources.
// Closed blockDec cannot be reset.
func (b *blockDec) Close() {
- close(b.input)
- close(b.history)
- close(b.result)
- b.decWG.Wait()
}
-// decodeAsync will prepare decoding the block when it receives input.
-// This will separate output and history.
-func (b *blockDec) startDecoder() {
- defer b.decWG.Done()
- for range b.input {
- //println("blockDec: Got block input")
- switch b.Type {
- case blockTypeRLE:
- if cap(b.dst) < int(b.RLESize) {
- if b.lowMem {
- b.dst = make([]byte, b.RLESize)
- } else {
- b.dst = make([]byte, maxBlockSize)
- }
- }
- o := decodeOutput{
- d: b,
- b: b.dst[:b.RLESize],
- err: nil,
- }
- v := b.data[0]
- for i := range o.b {
- o.b[i] = v
- }
- hist := <-b.history
- hist.append(o.b)
- b.result <- o
- case blockTypeRaw:
- o := decodeOutput{
- d: b,
- b: b.data,
- err: nil,
- }
- hist := <-b.history
- hist.append(o.b)
- b.result <- o
- case blockTypeCompressed:
- b.dst = b.dst[:0]
- err := b.decodeCompressed(nil)
- o := decodeOutput{
- d: b,
- b: b.dst,
- err: err,
- }
- if debug {
- println("Decompressed to", len(b.dst), "bytes, error:", err)
- }
- b.result <- o
- case blockTypeReserved:
- // Used for returning errors.
- <-b.history
- b.result <- decodeOutput{
- d: b,
- b: nil,
- err: b.err,
- }
- default:
- panic("Invalid block type")
- }
- if debug {
- println("blockDec: Finished block")
- }
- }
-}
-
-// decodeAsync will prepare decoding the block when it receives the history.
-// If history is provided, it will not fetch it from the channel.
+// decodeBuf
func (b *blockDec) decodeBuf(hist *history) error {
switch b.Type {
case blockTypeRLE:
@@ -297,14 +248,23 @@
return nil
case blockTypeCompressed:
saved := b.dst
- b.dst = hist.b
- hist.b = nil
+ // Append directly to history
+ if hist.ignoreBuffer == 0 {
+ b.dst = hist.b
+ hist.b = nil
+ } else {
+ b.dst = b.dst[:0]
+ }
err := b.decodeCompressed(hist)
- if debug {
+ if debugDecoder {
println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
}
- hist.b = b.dst
- b.dst = saved
+ if hist.ignoreBuffer == 0 {
+ hist.b = b.dst
+ b.dst = saved
+ } else {
+ hist.appendKeep(b.dst)
+ }
return err
case blockTypeReserved:
// Used for returning errors.
@@ -314,30 +274,18 @@
}
}
-// decodeCompressed will start decompressing a block.
-// If no history is supplied the decoder will decodeAsync as much as possible
-// before fetching from blockDec.history
-func (b *blockDec) decodeCompressed(hist *history) error {
- in := b.data
- delayedHistory := hist == nil
-
- if delayedHistory {
- // We must always grab history.
- defer func() {
- if hist == nil {
- <-b.history
- }
- }()
- }
+func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
if len(in) < 2 {
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
+
litType := literalsBlockType(in[0] & 3)
var litRegenSize int
var litCompSize int
sizeFormat := (in[0] >> 2) & 3
var fourStreams bool
+ var literals []byte
switch litType {
case literalsBlockRaw, literalsBlockRLE:
switch sizeFormat {
@@ -353,7 +301,7 @@
// Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
if len(in) < 3 {
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
in = in[3:]
@@ -364,7 +312,7 @@
// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
if len(in) < 3 {
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
litRegenSize = int(n & 1023)
@@ -375,7 +323,7 @@
fourStreams = true
if len(in) < 4 {
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
litRegenSize = int(n & 16383)
@@ -385,7 +333,7 @@
fourStreams = true
if len(in) < 5 {
println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
litRegenSize = int(n & 262143)
@@ -393,16 +341,18 @@
in = in[5:]
}
}
- if debug {
+ if debugDecoder {
println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
}
- var literals []byte
- var huff *huff0.Scratch
+ if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
+ return in, ErrWindowSizeExceeded
+ }
+
switch litType {
case literalsBlockRaw:
if len(in) < litRegenSize {
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
literals = in[:litRegenSize]
in = in[litRegenSize:]
@@ -410,19 +360,13 @@
case literalsBlockRLE:
if len(in) < 1 {
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, litRegenSize)
+ b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
} else {
- if litRegenSize > maxCompressedLiteralSize {
- // Exceptional
- b.literalBuf = make([]byte, litRegenSize)
- } else {
- b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
-
- }
+ b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
literals = b.literalBuf[:litRegenSize]
@@ -431,45 +375,79 @@
literals[i] = v
}
in = in[1:]
- if debug {
+ if debugDecoder {
printf("Found %d RLE compressed literals\n", litRegenSize)
}
case literalsBlockTreeless:
if len(in) < litCompSize {
println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
- return ErrBlockTooSmall
+ return in, ErrBlockTooSmall
}
// Store compressed literals, so we defer decoding until we get history.
literals = in[:litCompSize]
in = in[litCompSize:]
- if debug {
+ if debugDecoder {
printf("Found %d compressed literals\n", litCompSize)
}
- case literalsBlockCompressed:
- if len(in) < litCompSize {
- println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
- return ErrBlockTooSmall
+ huff := hist.huffTree
+ if huff == nil {
+ return in, errors.New("literal block was treeless, but no history was defined")
}
- literals = in[:litCompSize]
- in = in[litCompSize:]
- huff = huffDecoderPool.Get().(*huff0.Scratch)
- var err error
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
- b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
- if huff == nil {
- huff = &huff0.Scratch{}
+ var err error
+ // Use our out buffer.
+ huff.MaxDecodedSize = litRegenSize
+ if fourStreams {
+ literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
+ } else {
+ literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
}
+ // Make sure we don't leak our literals buffer
+ if err != nil {
+ println("decompressing literals:", err)
+ return in, err
+ }
+ if len(literals) != litRegenSize {
+ return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+ }
+
+ case literalsBlockCompressed:
+ if len(in) < litCompSize {
+ println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
+ return in, ErrBlockTooSmall
+ }
+ literals = in[:litCompSize]
+ in = in[litCompSize:]
+ // Ensure we have space to store it.
+ if cap(b.literalBuf) < litRegenSize {
+ if b.lowMem {
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
+ } else {
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
+ }
+ }
+ huff := hist.huffTree
+ if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
+ huff = huffDecoderPool.Get().(*huff0.Scratch)
+ if huff == nil {
+ huff = &huff0.Scratch{}
+ }
+ }
+ var err error
huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil {
println("reading huffman table:", err)
- return err
+ return in, err
}
+ hist.huffTree = huff
+ huff.MaxDecodedSize = litRegenSize
// Use our out buffer.
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -478,27 +456,63 @@
}
if err != nil {
println("decoding compressed literals:", err)
- return err
+ return in, err
}
// Make sure we don't leak our literals buffer
if len(literals) != litRegenSize {
- return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+ return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
}
- if debug {
+ // Re-cap to get extra size.
+ literals = b.literalBuf[:len(literals)]
+ if debugDecoder {
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
}
}
+ hist.decoders.literals = literals
+ return in, nil
+}
+// decodeCompressed will start decompressing a block.
+func (b *blockDec) decodeCompressed(hist *history) error {
+ in := b.data
+ in, err := b.decodeLiterals(in, hist)
+ if err != nil {
+ return err
+ }
+ err = b.prepareSequences(in, hist)
+ if err != nil {
+ return err
+ }
+ if hist.decoders.nSeqs == 0 {
+ b.dst = append(b.dst, hist.decoders.literals...)
+ return nil
+ }
+ before := len(hist.decoders.out)
+ err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
+ if err != nil {
+ return err
+ }
+ if hist.decoders.maxSyncLen > 0 {
+ hist.decoders.maxSyncLen += uint64(before)
+ hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+ }
+ b.dst = hist.decoders.out
+ hist.recentOffsets = hist.decoders.prevOffset
+ return nil
+}
+
+func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
+ if debugDecoder {
+ printf("prepareSequences: %d byte(s) input\n", len(in))
+ }
// Decode Sequences
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
if len(in) < 1 {
return ErrBlockTooSmall
}
+ var nSeqs int
seqHeader := in[0]
- nSeqs := 0
switch {
- case seqHeader == 0:
- in = in[1:]
case seqHeader < 128:
nSeqs = int(seqHeader)
in = in[1:]
@@ -515,19 +529,16 @@
nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
in = in[3:]
}
- // Allocate sequences
- if cap(b.sequenceBuf) < nSeqs {
- if b.lowMem {
- b.sequenceBuf = make([]seq, nSeqs)
- } else {
- // Allocate max
- b.sequenceBuf = make([]seq, nSeqs, maxSequences)
+ if nSeqs == 0 && len(in) != 0 {
+ // When no sequences, there should not be any more data...
+ if debugDecoder {
+ printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
}
- } else {
- // Reuse buffer
- b.sequenceBuf = b.sequenceBuf[:nSeqs]
+ return ErrUnexpectedBlockSize
}
- var seqs = &sequenceDecs{}
+
+ var seqs = &hist.decoders
+ seqs.nSeqs = nSeqs
if nSeqs > 0 {
if len(in) < 1 {
return ErrBlockTooSmall
@@ -535,12 +546,12 @@
br := byteReader{b: in, off: 0}
compMode := br.Uint8()
br.advance(1)
- if debug {
+ if debugDecoder {
printf("Compression modes: 0b%b", compMode)
}
for i := uint(0); i < 3; i++ {
mode := seqCompMode((compMode >> (6 - i*2)) & 3)
- if debug {
+ if debugDecoder {
println("Table", tableIndex(i), "is", mode)
}
var seq *sequenceDec
@@ -556,6 +567,9 @@
}
switch mode {
case compModePredefined:
+ if seq.fse != nil && !seq.fse.preDefined {
+ fseDecoderPool.Put(seq.fse)
+ }
seq.fse = &fsePredef[i]
case compModeRLE:
if br.remain() < 1 {
@@ -563,34 +577,36 @@
}
v := br.Uint8()
br.advance(1)
- dec := fseDecoderPool.Get().(*fseDecoder)
+ if seq.fse == nil || seq.fse.preDefined {
+ seq.fse = fseDecoderPool.Get().(*fseDecoder)
+ }
symb, err := decSymbolValue(v, symbolTableX[i])
if err != nil {
printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
return err
}
- dec.setRLE(symb)
- seq.fse = dec
- if debug {
+ seq.fse.setRLE(symb)
+ if debugDecoder {
printf("RLE set to %+v, code: %v", symb, v)
}
case compModeFSE:
println("Reading table for", tableIndex(i))
- dec := fseDecoderPool.Get().(*fseDecoder)
- err := dec.readNCount(&br, uint16(maxTableSymbol[i]))
+ if seq.fse == nil || seq.fse.preDefined {
+ seq.fse = fseDecoderPool.Get().(*fseDecoder)
+ }
+ err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
if err != nil {
println("Read table error:", err)
return err
}
- err = dec.transform(symbolTableX[i])
+ err = seq.fse.transform(symbolTableX[i])
if err != nil {
println("Transform table error:", err)
return err
}
- if debug {
- println("Read table ok", "symbolLen:", dec.symbolLen)
+ if debugDecoder {
+ println("Read table ok", "symbolLen:", seq.fse.symbolLen)
}
- seq.fse = dec
case compModeRepeat:
seq.repeat = true
}
@@ -600,140 +616,106 @@
}
in = br.unread()
}
-
- // Wait for history.
- // All time spent after this is critical since it is strictly sequential.
- if hist == nil {
- hist = <-b.history
- if hist.error {
- return ErrDecoderClosed
- }
- }
-
- // Decode treeless literal block.
- if litType == literalsBlockTreeless {
- // TODO: We could send the history early WITHOUT the stream history.
- // This would allow decoding treeless literals before the byte history is available.
- // Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
- // So not much obvious gain here.
-
- if hist.huffTree == nil {
- return errors.New("literal block was treeless, but no history was defined")
- }
- // Ensure we have space to store it.
- if cap(b.literalBuf) < litRegenSize {
- if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
- } else {
- b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
- }
- }
- var err error
- // Use our out buffer.
- huff = hist.huffTree
- if fourStreams {
- literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
- } else {
- literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
- }
- // Make sure we don't leak our literals buffer
- if err != nil {
- println("decompressing literals:", err)
- return err
- }
- if len(literals) != litRegenSize {
- return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
- }
- } else {
- if hist.huffTree != nil && huff != nil {
- if hist.dict == nil || hist.dict.litEnc != hist.huffTree {
- huffDecoderPool.Put(hist.huffTree)
- }
- hist.huffTree = nil
- }
- }
- if huff != nil {
- hist.huffTree = huff
- }
- if debug {
- println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.")
+ if debugDecoder {
+ println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
}
if nSeqs == 0 {
- // Decompressed content is defined entirely as Literals Section content.
- b.dst = append(b.dst, literals...)
- if delayedHistory {
- hist.append(literals)
+ if len(b.sequence) > 0 {
+ b.sequence = b.sequence[:0]
}
return nil
}
-
- seqs, err := seqs.mergeHistory(&hist.decoders)
- if err != nil {
- return err
+ br := seqs.br
+ if br == nil {
+ br = &bitReader{}
}
- if debug {
- println("History merged ok")
- }
- br := &bitReader{}
if err := br.init(in); err != nil {
return err
}
- // TODO: Investigate if sending history without decoders are faster.
- // This would allow the sequences to be decoded async and only have to construct stream history.
- // If only recent offsets were not transferred, this would be an obvious win.
- // Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+ if err := seqs.initialize(br, hist, b.dst); err != nil {
+ println("initializing sequences:", err)
+ return err
+ }
+ // Extract blocks...
+ if false && hist.dict == nil {
+ fatalErr := func(err error) {
+ if err != nil {
+ panic(err)
+ }
+ }
+ fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+ var buf bytes.Buffer
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+ buf.Write(in)
+ ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
+ }
+ return nil
+}
+
+func (b *blockDec) decodeSequences(hist *history) error {
+ if cap(b.sequence) < hist.decoders.nSeqs {
+ if b.lowMem {
+ b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
+ } else {
+ b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
+ }
+ }
+ b.sequence = b.sequence[:hist.decoders.nSeqs]
+ if hist.decoders.nSeqs == 0 {
+ hist.decoders.seqSize = len(hist.decoders.literals)
+ return nil
+ }
+ hist.decoders.windowSize = hist.windowSize
+ hist.decoders.prevOffset = hist.recentOffsets
+
+ err := hist.decoders.decode(b.sequence)
+ hist.recentOffsets = hist.decoders.prevOffset
+ return err
+}
+
+func (b *blockDec) executeSequences(hist *history) error {
hbytes := hist.b
if len(hbytes) > hist.windowSize {
hbytes = hbytes[len(hbytes)-hist.windowSize:]
- // We do not need history any more.
+ // We do not need history anymore.
if hist.dict != nil {
hist.dict.content = nil
}
}
-
- if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
- println("initializing sequences:", err)
- return err
- }
-
- err = seqs.decode(nSeqs, br, hbytes)
+ hist.decoders.windowSize = hist.windowSize
+ hist.decoders.out = b.dst[:0]
+ err := hist.decoders.execute(b.sequence, hbytes)
if err != nil {
return err
}
- if !br.finished() {
- return fmt.Errorf("%d extra bits on block, should be 0", br.remain())
- }
+ return b.updateHistory(hist)
+}
- err = br.close()
- if err != nil {
- printf("Closing sequences: %v, %+v\n", err, *br)
- }
+func (b *blockDec) updateHistory(hist *history) error {
if len(b.data) > maxCompressedBlockSize {
return fmt.Errorf("compressed block size too large (%d)", len(b.data))
}
// Set output and release references.
- b.dst = seqs.out
- seqs.out, seqs.literals, seqs.hist = nil, nil, nil
+ b.dst = hist.decoders.out
+ hist.recentOffsets = hist.decoders.prevOffset
- if !delayedHistory {
- // If we don't have delayed history, no need to update.
- hist.recentOffsets = seqs.prevOffset
- return nil
- }
if b.Last {
// if last block we don't care about history.
println("Last block, no history returned")
hist.b = hist.b[:0]
return nil
+ } else {
+ hist.append(b.dst)
+ if debugDecoder {
+ println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
+ }
}
- hist.append(b.dst)
- hist.recentOffsets = seqs.prevOffset
- if debug {
- println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.")
- }
+ hist.decoders.out, hist.decoders.literals = nil, nil
return nil
}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index e1be092..12e8f6f 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -51,7 +51,7 @@
if cap(b.literals) < maxCompressedBlockSize {
b.literals = make([]byte, 0, maxCompressedBlockSize)
}
- const defSeqs = 200
+ const defSeqs = 2000
if cap(b.sequences) < defSeqs {
b.sequences = make([]seq, 0, defSeqs)
}
@@ -156,7 +156,7 @@
switch {
case inBits < 5:
lh |= (uint64(regenLen) << 3) | (1 << 60)
- if debug {
+ if debugEncoder {
got := int(lh>>3) & 0xff
if got != regenLen {
panic(fmt.Sprint("litRegenSize = ", regenLen, "(want) != ", got, "(got)"))
@@ -184,7 +184,7 @@
lh |= 1 << 2
}
lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
- if debug {
+ if debugEncoder {
const mmask = (1 << 24) - 1
n := (lh >> 4) & mmask
if int(n&1023) != inLen {
@@ -312,7 +312,7 @@
bh.setType(blockTypeRaw)
b.output = bh.appendTo(b.output[:0])
b.output = append(b.output, a...)
- if debug {
+ if debugEncoder {
println("Adding RAW block, length", len(a), "last:", b.last)
}
}
@@ -325,7 +325,7 @@
bh.setType(blockTypeRaw)
dst = bh.appendTo(dst)
dst = append(dst, src...)
- if debug {
+ if debugEncoder {
println("Adding RAW block, length", len(src), "last:", b.last)
}
return dst
@@ -339,7 +339,7 @@
// Don't compress extremely small blocks
if len(lits) < 8 || (len(lits) < 32 && b.dictLitEnc == nil) || raw {
- if debug {
+ if debugEncoder {
println("Adding RAW block, length", len(lits), "last:", b.last)
}
bh.setType(blockTypeRaw)
@@ -371,7 +371,7 @@
switch err {
case huff0.ErrIncompressible:
- if debug {
+ if debugEncoder {
println("Adding RAW block, length", len(lits), "last:", b.last)
}
bh.setType(blockTypeRaw)
@@ -379,7 +379,7 @@
b.output = append(b.output, lits...)
return nil
case huff0.ErrUseRLE:
- if debug {
+ if debugEncoder {
println("Adding RLE block, length", len(lits))
}
bh.setType(blockTypeRLE)
@@ -396,12 +396,12 @@
bh.setType(blockTypeCompressed)
var lh literalsHeader
if reUsed {
- if debug {
+ if debugEncoder {
println("Reused tree, compressed to", len(out))
}
lh.setType(literalsBlockTreeless)
} else {
- if debug {
+ if debugEncoder {
println("New tree, compressed to", len(out), "tree size:", len(b.litEnc.OutTable))
}
lh.setType(literalsBlockCompressed)
@@ -426,7 +426,7 @@
return 0
}
enc := fseEncoder{}
- hist := enc.Histogram()[:256]
+ hist := enc.Histogram()
maxSym := uint8(0)
for i, v := range data {
v = v & 63
@@ -517,7 +517,7 @@
lh.setSize(len(b.literals))
b.output = lh.appendTo(b.output)
b.output = append(b.output, b.literals...)
- if debug {
+ if debugEncoder {
println("Adding literals RAW, length", len(b.literals))
}
case huff0.ErrUseRLE:
@@ -525,22 +525,22 @@
lh.setSize(len(b.literals))
b.output = lh.appendTo(b.output)
b.output = append(b.output, b.literals[0])
- if debug {
+ if debugEncoder {
println("Adding literals RLE")
}
case nil:
// Compressed litLen...
if reUsed {
- if debug {
+ if debugEncoder {
println("reused tree")
}
lh.setType(literalsBlockTreeless)
} else {
- if debug {
+ if debugEncoder {
println("new tree, size:", len(b.litEnc.OutTable))
}
lh.setType(literalsBlockCompressed)
- if debug {
+ if debugEncoder {
_, _, err := huff0.ReadTable(out, nil)
if err != nil {
panic(err)
@@ -548,18 +548,18 @@
}
}
lh.setSizes(len(out), len(b.literals), single)
- if debug {
+ if debugEncoder {
printf("Compressed %d literals to %d bytes", len(b.literals), len(out))
println("Adding literal header:", lh)
}
b.output = lh.appendTo(b.output)
b.output = append(b.output, out...)
b.litEnc.Reuse = huff0.ReusePolicyAllow
- if debug {
+ if debugEncoder {
println("Adding literals compressed")
}
default:
- if debug {
+ if debugEncoder {
println("Adding literals ERROR:", err)
}
return err
@@ -577,7 +577,7 @@
n := len(b.sequences) - 0x7f00
b.output = append(b.output, 255, uint8(n), uint8(n>>8))
}
- if debug {
+ if debugEncoder {
println("Encoding", len(b.sequences), "sequences")
}
b.genCodes()
@@ -611,17 +611,17 @@
nSize = nSize + (nSize+2*8*16)>>4
switch {
case predefSize <= prevSize && predefSize <= nSize || forcePreDef:
- if debug {
+ if debugEncoder {
println("Using predefined", predefSize>>3, "<=", nSize>>3)
}
return preDef, compModePredefined
case prevSize <= nSize:
- if debug {
+ if debugEncoder {
println("Using previous", prevSize>>3, "<=", nSize>>3)
}
return prev, compModeRepeat
default:
- if debug {
+ if debugEncoder {
println("Using new, predef", predefSize>>3, ". previous:", prevSize>>3, ">", nSize>>3, "header max:", cur.maxHeaderSize()>>3, "bytes")
println("tl:", cur.actualTableLog, "symbolLen:", cur.symbolLen, "norm:", cur.norm[:cur.symbolLen], "hist", cur.count[:cur.symbolLen])
}
@@ -634,7 +634,7 @@
if llEnc.useRLE {
mode |= uint8(compModeRLE) << 6
llEnc.setRLE(b.sequences[0].llCode)
- if debug {
+ if debugEncoder {
println("llEnc.useRLE")
}
} else {
@@ -645,7 +645,7 @@
if ofEnc.useRLE {
mode |= uint8(compModeRLE) << 4
ofEnc.setRLE(b.sequences[0].ofCode)
- if debug {
+ if debugEncoder {
println("ofEnc.useRLE")
}
} else {
@@ -657,7 +657,7 @@
if mlEnc.useRLE {
mode |= uint8(compModeRLE) << 2
mlEnc.setRLE(b.sequences[0].mlCode)
- if debug {
+ if debugEncoder {
println("mlEnc.useRLE, code: ", b.sequences[0].mlCode, "value", b.sequences[0].matchLen)
}
} else {
@@ -666,7 +666,7 @@
mode |= uint8(m) << 2
}
b.output = append(b.output, mode)
- if debug {
+ if debugEncoder {
printf("Compression modes: 0b%b", mode)
}
b.output, err = llEnc.writeCount(b.output)
@@ -722,52 +722,53 @@
println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB)
}
seq--
- if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 {
- // No need to flush (common)
- for seq >= 0 {
- s = b.sequences[seq]
- wr.flush32()
- llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
- // tabelog max is 8 for all.
- of.encode(ofB)
- ml.encode(mlB)
- ll.encode(llB)
- wr.flush32()
+ // Store sequences in reverse...
+ for seq >= 0 {
+ s = b.sequences[seq]
- // We checked that all can stay within 32 bits
- wr.addBits32NC(s.litLen, llB.outBits)
- wr.addBits32NC(s.matchLen, mlB.outBits)
- wr.addBits32NC(s.offset, ofB.outBits)
+ ofB := ofTT[s.ofCode]
+ wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits.
+ //of.encode(ofB)
+ nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16
+ dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState)
+ wr.addBits16NC(of.state, uint8(nbBitsOut))
+ of.state = of.stateTable[dstState]
- if debugSequences {
- println("Encoded seq", seq, s)
- }
+ // Accumulate extra bits.
+ outBits := ofB.outBits & 31
+ extraBits := uint64(s.offset & bitMask32[outBits])
+ extraBitsN := outBits
- seq--
+ mlB := mlTT[s.mlCode]
+ //ml.encode(mlB)
+ nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16
+ dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState)
+ wr.addBits16NC(ml.state, uint8(nbBitsOut))
+ ml.state = ml.stateTable[dstState]
+
+ outBits = mlB.outBits & 31
+ extraBits = extraBits<<outBits | uint64(s.matchLen&bitMask32[outBits])
+ extraBitsN += outBits
+
+ llB := llTT[s.llCode]
+ //ll.encode(llB)
+ nbBitsOut = (uint32(ll.state) + llB.deltaNbBits) >> 16
+ dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState)
+ wr.addBits16NC(ll.state, uint8(nbBitsOut))
+ ll.state = ll.stateTable[dstState]
+
+ outBits = llB.outBits & 31
+ extraBits = extraBits<<outBits | uint64(s.litLen&bitMask32[outBits])
+ extraBitsN += outBits
+
+ wr.flush32()
+ wr.addBits64NC(extraBits, extraBitsN)
+
+ if debugSequences {
+ println("Encoded seq", seq, s)
}
- } else {
- for seq >= 0 {
- s = b.sequences[seq]
- wr.flush32()
- llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
- // tabelog max is below 8 for each.
- of.encode(ofB)
- ml.encode(mlB)
- ll.encode(llB)
- wr.flush32()
- // ml+ll = max 32 bits total
- wr.addBits32NC(s.litLen, llB.outBits)
- wr.addBits32NC(s.matchLen, mlB.outBits)
- wr.flush32()
- wr.addBits32NC(s.offset, ofB.outBits)
-
- if debugSequences {
- println("Encoded seq", seq, s)
- }
-
- seq--
- }
+ seq--
}
ml.flush(mlEnc.actualTableLog)
of.flush(ofEnc.actualTableLog)
@@ -786,7 +787,7 @@
// Size is output minus block header.
bh.setSize(uint32(len(b.output)-bhOffset) - 3)
- if debug {
+ if debugEncoder {
println("Rewriting block header", bh)
}
_ = bh.appendTo(b.output[bhOffset:bhOffset])
@@ -801,14 +802,13 @@
// nothing to do
return
}
-
if len(b.sequences) > math.MaxUint16 {
panic("can only encode up to 64K sequences")
}
// No bounds checks after here:
- llH := b.coders.llEnc.Histogram()[:256]
- ofH := b.coders.ofEnc.Histogram()[:256]
- mlH := b.coders.mlEnc.Histogram()[:256]
+ llH := b.coders.llEnc.Histogram()
+ ofH := b.coders.ofEnc.Histogram()
+ mlH := b.coders.mlEnc.Histogram()
for i := range llH {
llH[i] = 0
}
@@ -820,7 +820,8 @@
}
var llMax, ofMax, mlMax uint8
- for i, seq := range b.sequences {
+ for i := range b.sequences {
+ seq := &b.sequences[i]
v := llCode(seq.litLen)
seq.llCode = v
llH[v]++
@@ -844,7 +845,6 @@
panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
}
}
- b.sequences[i] = seq
}
maxCount := func(a []uint32) int {
var max uint32
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 658ef78..2ad0207 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -12,8 +12,8 @@
type byteBuffer interface {
// Read up to 8 bytes.
- // Returns nil if no more input is available.
- readSmall(n int) []byte
+ // Returns io.ErrUnexpectedEOF if this cannot be satisfied.
+ readSmall(n int) ([]byte, error)
// Read >8 bytes.
// MAY use the destination slice.
@@ -23,23 +23,23 @@
readByte() (byte, error)
// Skip n bytes.
- skipN(n int) error
+ skipN(n int64) error
}
// in-memory buffer
type byteBuf []byte
-func (b *byteBuf) readSmall(n int) []byte {
+func (b *byteBuf) readSmall(n int) ([]byte, error) {
if debugAsserts && n > 8 {
panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
}
bb := *b
if len(bb) < n {
- return nil
+ return nil, io.ErrUnexpectedEOF
}
r := bb[:n]
*b = bb[n:]
- return r
+ return r, nil
}
func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
@@ -52,10 +52,6 @@
return r, nil
}
-func (b *byteBuf) remain() []byte {
- return *b
-}
-
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
@@ -66,9 +62,12 @@
return r, nil
}
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
bb := *b
- if len(bb) < n {
+ if n < 0 {
+ return fmt.Errorf("negative skip (%d) requested", n)
+ }
+ if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@@ -81,19 +80,22 @@
tmp [8]byte
}
-func (r *readerWrapper) readSmall(n int) []byte {
+func (r *readerWrapper) readSmall(n int) ([]byte, error) {
if debugAsserts && n > 8 {
panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
}
n2, err := io.ReadFull(r.r, r.tmp[:n])
// We only really care about the actual bytes read.
- if n2 != n {
- if debug {
+ if err != nil {
+ if err == io.EOF {
+ return nil, io.ErrUnexpectedEOF
+ }
+ if debugDecoder {
println("readSmall: got", n2, "want", n, "err", err)
}
- return nil
+ return nil, err
}
- return r.tmp[:n]
+ return r.tmp[:n], nil
}
func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
@@ -110,6 +112,9 @@
func (r *readerWrapper) readByte() (byte, error) {
n2, err := r.r.Read(r.tmp[:1])
if err != nil {
+ if err == io.EOF {
+ err = io.ErrUnexpectedEOF
+ }
return 0, err
}
if n2 != 1 {
@@ -118,9 +123,9 @@
return r.tmp[0], nil
}
-func (r *readerWrapper) skipN(n int) error {
- n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
- if n2 != int64(n) {
+func (r *readerWrapper) skipN(n int64) error {
+ n2, err := io.CopyN(ioutil.Discard, r.r, n)
+ if n2 != n {
err = io.ErrUnexpectedEOF
}
return err
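
The byteBuffer changes above make failures explicit: readSmall now returns `([]byte, error)` instead of signalling with a nil slice, and skipN takes an `int64` so large skippable payloads cannot overflow `int` on 32-bit platforms. A small sketch of an in-memory buffer written against the new contract (simplified; the real byteBuf additionally asserts that small reads are at most 8 bytes):

```go
package main

import (
	"fmt"
	"io"
)

// memBuf mirrors the updated byteBuffer contract: explicit errors instead of
// sentinel nil slices, and 64-bit skip lengths.
type memBuf []byte

func (b *memBuf) readSmall(n int) ([]byte, error) {
	bb := *b
	if len(bb) < n {
		return nil, io.ErrUnexpectedEOF
	}
	r := bb[:n]
	*b = bb[n:]
	return r, nil
}

func (b *memBuf) skipN(n int64) error {
	bb := *b
	if n < 0 {
		return fmt.Errorf("negative skip (%d) requested", n)
	}
	if int64(len(bb)) < n {
		return io.ErrUnexpectedEOF
	}
	*b = bb[n:]
	return nil
}

func main() {
	buf := memBuf([]byte{1, 2, 3, 4, 5})
	head, err := buf.readSmall(4)
	fmt.Println(head, err)     // [1 2 3 4] <nil>
	fmt.Println(buf.skipN(10)) // unexpected EOF instead of a silent nil
}
```
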
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index 2c4fca1..0e59a24 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -13,12 +13,6 @@
off int
}
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
- b.b = in
- b.off = 0
-}
-
// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
index 69736e8..5022e71 100644
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -5,6 +5,7 @@
import (
"bytes"
+ "encoding/binary"
"errors"
"io"
)
@@ -15,18 +16,50 @@
// Header contains information about the first frame and block within that.
type Header struct {
- // Window Size the window of data to keep while decoding.
- // Will only be set if HasFCS is false.
- WindowSize uint64
+ // SingleSegment specifies whether the data is to be decompressed into a
+ // single contiguous memory segment.
+ // It implies that WindowSize is invalid and that FrameContentSize is valid.
+ SingleSegment bool
- // Frame content size.
- // Expected size of the entire frame.
- FrameContentSize uint64
+ // WindowSize is the window of data to keep while decoding.
+ // Will only be set if SingleSegment is false.
+ WindowSize uint64
// Dictionary ID.
// If 0, no dictionary.
DictionaryID uint32
+ // HasFCS specifies whether FrameContentSize has a valid value.
+ HasFCS bool
+
+ // FrameContentSize is the expected uncompressed size of the entire frame.
+ FrameContentSize uint64
+
+ // Skippable will be true if the frame is meant to be skipped.
+ // This implies that FirstBlock.OK is false.
+ Skippable bool
+
+ // SkippableID is the user-specific ID for the skippable frame.
+ // Valid values are between 0 to 15, inclusive.
+ SkippableID int
+
+ // SkippableSize is the length of the user data to skip following
+ // the header.
+ SkippableSize uint32
+
+ // HeaderSize is the raw size of the frame header.
+ //
+ // For normal frames, it includes the size of the magic number and
+ // the size of the header (per section 3.1.1.1).
+ // It does not include the size for any data blocks (section 3.1.1.2) nor
+ // the size for the trailing content checksum.
+ //
+ // For skippable frames, this counts the size of the magic number
+ // along with the size of the size field of the payload.
+ // It does not include the size of the skippable payload itself.
+ // The total frame size is the HeaderSize plus the SkippableSize.
+ HeaderSize int
+
// First block information.
FirstBlock struct {
// OK will be set if first block could be decoded.
@@ -51,17 +84,9 @@
CompressedSize int
}
- // Skippable will be true if the frame is meant to be skipped.
- // No other information will be populated.
- Skippable bool
-
// If set there is a checksum present for the block content.
+ // The checksum field at the end is always 4 bytes long.
HasCheckSum bool
-
- // If this is true FrameContentSize will have a valid value
- HasFCS bool
-
- SingleSegment bool
}
// Decode the header from the beginning of the stream.
@@ -71,39 +96,46 @@
// If there isn't enough input, io.ErrUnexpectedEOF is returned.
// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
func (h *Header) Decode(in []byte) error {
+ *h = Header{}
if len(in) < 4 {
return io.ErrUnexpectedEOF
}
+ h.HeaderSize += 4
b, in := in[:4], in[4:]
if !bytes.Equal(b, frameMagic) {
if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch
}
- *h = Header{Skippable: true}
+ if len(in) < 4 {
+ return io.ErrUnexpectedEOF
+ }
+ h.HeaderSize += 4
+ h.Skippable = true
+ h.SkippableID = int(b[0] & 0xf)
+ h.SkippableSize = binary.LittleEndian.Uint32(in)
return nil
}
- if len(in) < 1 {
- return io.ErrUnexpectedEOF
- }
-
- // Clear output
- *h = Header{}
- fhd, in := in[0], in[1:]
- h.SingleSegment = fhd&(1<<5) != 0
- h.HasCheckSum = fhd&(1<<2) != 0
-
- if fhd&(1<<3) != 0 {
- return errors.New("reserved bit set on frame header")
- }
// Read Window_Descriptor
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+ if len(in) < 1 {
+ return io.ErrUnexpectedEOF
+ }
+ fhd, in := in[0], in[1:]
+ h.HeaderSize++
+ h.SingleSegment = fhd&(1<<5) != 0
+ h.HasCheckSum = fhd&(1<<2) != 0
+ if fhd&(1<<3) != 0 {
+ return errors.New("reserved bit set on frame header")
+ }
+
if !h.SingleSegment {
if len(in) < 1 {
return io.ErrUnexpectedEOF
}
var wd byte
wd, in = in[0], in[1:]
+ h.HeaderSize++
windowLog := 10 + (wd >> 3)
windowBase := uint64(1) << windowLog
windowAdd := (windowBase / 8) * uint64(wd&0x7)
@@ -120,9 +152,7 @@
return io.ErrUnexpectedEOF
}
b, in = in[:size], in[size:]
- if b == nil {
- return io.ErrUnexpectedEOF
- }
+ h.HeaderSize += int(size)
switch size {
case 1:
h.DictionaryID = uint32(b[0])
@@ -152,9 +182,7 @@
return io.ErrUnexpectedEOF
}
b, in = in[:fcsSize], in[fcsSize:]
- if b == nil {
- return io.ErrUnexpectedEOF
- }
+ h.HeaderSize += int(fcsSize)
switch fcsSize {
case 1:
h.FrameContentSize = uint64(b[0])
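
With the decodeheader.go changes above, Header.Decode resets the receiver before parsing, tracks the raw header length in HeaderSize, and populates SkippableID/SkippableSize for skippable frames. A usage sketch for probing a frame header; the input bytes below are a placeholder, not a real frame:

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func describeFrame(raw []byte) {
	var h zstd.Header
	if err := h.Decode(raw); err != nil {
		fmt.Println("not a decodable header:", err)
		return
	}
	if h.Skippable {
		// For skippable frames the total frame size is HeaderSize + SkippableSize.
		fmt.Printf("skippable frame, id=%d, payload=%d bytes\n", h.SkippableID, h.SkippableSize)
		return
	}
	fmt.Println("header bytes:", h.HeaderSize)
	if h.HasFCS {
		fmt.Println("frame content size:", h.FrameContentSize)
	} else {
		fmt.Println("window size:", h.WindowSize)
	}
}

func main() {
	// A real zstd frame prefix would be passed here; this placeholder simply
	// reports a magic-number mismatch.
	describeFrame([]byte{0x00, 0x00, 0x00, 0x00})
}
```
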
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index f593e46..d212f47 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,9 +5,13 @@
package zstd
import (
- "errors"
+ "bytes"
+ "context"
+ "encoding/binary"
"io"
"sync"
+
+ "github.com/klauspost/compress/zstd/internal/xxhash"
)
// Decoder provides decoding of zstandard streams.
@@ -22,12 +26,19 @@
// Unreferenced decoders, ready for use.
decoders chan *blockDec
- // Streams ready to be decoded.
- stream chan decodeStream
-
// Current read position used for Reader functionality.
current decoderState
+ // sync stream decoding
+ syncStream struct {
+ decodedFrame uint64
+ br readerWrapper
+ enabled bool
+ inFrame bool
+ }
+
+ frame *frameDec
+
// Custom dictionaries.
// Always uses copies.
dicts map[uint32]dict
@@ -46,7 +57,10 @@
output chan decodeOutput
// cancel remaining output.
- cancel chan struct{}
+ cancel context.CancelFunc
+
+ // crc of current frame
+ crc *xxhash.Digest
flushed bool
}
@@ -81,7 +95,7 @@
return nil, err
}
}
- d.current.output = make(chan decodeOutput, d.o.concurrent)
+ d.current.crc = xxhash.New()
d.current.flushed = true
if r == nil {
@@ -113,9 +127,6 @@
// Returns the number of bytes written and any error that occurred.
// When the stream is done, io.EOF will be returned.
func (d *Decoder) Read(p []byte) (int, error) {
- if d.stream == nil {
- return 0, ErrDecoderNilInput
- }
var n int
for {
if len(d.current.b) > 0 {
@@ -133,12 +144,12 @@
break
}
if !d.nextBlock(n == 0) {
- return n, nil
+ return n, d.current.err
}
}
}
if len(d.current.b) > 0 {
- if debug {
+ if debugDecoder {
println("returning", n, "still bytes left:", len(d.current.b))
}
// Only return error at end of block
@@ -147,7 +158,7 @@
if d.current.err != nil {
d.drainOutput()
}
- if debug {
+ if debugDecoder {
println("returning", n, d.current.err, len(d.decoders))
}
return n, d.current.err
@@ -165,22 +176,20 @@
d.drainOutput()
+ d.syncStream.br.r = nil
if r == nil {
d.current.err = ErrDecoderNilInput
+ if len(d.current.b) > 0 {
+ d.current.b = d.current.b[:0]
+ }
d.current.flushed = true
return nil
}
- if d.stream == nil {
- d.stream = make(chan decodeStream, 1)
- d.streamWg.Add(1)
- go d.startStreamDecoder(d.stream)
- }
-
- // If bytes buffer and < 1MB, do sync decoding anyway.
- if bb, ok := r.(byter); ok && bb.Len() < 1<<20 {
+ // If bytes buffer and < 5MB, do sync decoding anyway.
+ if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
bb2 := bb
- if debug {
+ if debugDecoder {
println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
}
b := bb2.Bytes()
@@ -196,36 +205,48 @@
d.current.b = dst
d.current.err = err
d.current.flushed = true
- if debug {
+ if debugDecoder {
println("sync decode to", len(dst), "bytes, err:", err)
}
return nil
}
-
// Remove current block.
+ d.stashDecoder()
d.current.decodeOutput = decodeOutput{}
d.current.err = nil
- d.current.cancel = make(chan struct{})
d.current.flushed = false
d.current.d = nil
- d.stream <- decodeStream{
- r: r,
- output: d.current.output,
- cancel: d.current.cancel,
+ // Ensure no-one else is still running...
+ d.streamWg.Wait()
+ if d.frame == nil {
+ d.frame = newFrameDec(d.o)
}
+
+ if d.o.concurrent == 1 {
+ return d.startSyncDecoder(r)
+ }
+
+ d.current.output = make(chan decodeOutput, d.o.concurrent)
+ ctx, cancel := context.WithCancel(context.Background())
+ d.current.cancel = cancel
+ d.streamWg.Add(1)
+ go d.startStreamDecoder(ctx, r, d.current.output)
+
return nil
}
// drainOutput will drain the output until errEndOfStream is sent.
func (d *Decoder) drainOutput() {
if d.current.cancel != nil {
- println("cancelling current")
- close(d.current.cancel)
+ if debugDecoder {
+ println("cancelling current")
+ }
+ d.current.cancel()
d.current.cancel = nil
}
if d.current.d != nil {
- if debug {
+ if debugDecoder {
printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders))
}
d.decoders <- d.current.d
@@ -238,34 +259,29 @@
}
for v := range d.current.output {
if v.d != nil {
- if debug {
+ if debugDecoder {
printf("re-adding decoder %p", v.d)
}
d.decoders <- v.d
}
- if v.err == errEndOfStream {
- println("current flushed")
- d.current.flushed = true
- return
- }
}
+ d.current.output = nil
+ d.current.flushed = true
}
// WriteTo writes data to w until there's no more data to write or when an error occurs.
// The return value n is the number of bytes written.
// Any error encountered during the write is also returned.
func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
- if d.stream == nil {
- return 0, ErrDecoderNilInput
- }
var n int64
for {
if len(d.current.b) > 0 {
n2, err2 := w.Write(d.current.b)
n += int64(n2)
- if err2 != nil && d.current.err == nil {
+ if err2 != nil && (d.current.err == nil || d.current.err == io.EOF) {
d.current.err = err2
- break
+ } else if n2 != len(d.current.b) {
+ d.current.err = io.ErrShortWrite
}
}
if d.current.err != nil {
@@ -289,7 +305,7 @@
// DecodeAll can be used concurrently.
// The Decoder concurrency limits will be respected.
func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
- if d.current.err == ErrDecoderClosed {
+ if d.decoders == nil {
return dst, ErrDecoderClosed
}
@@ -297,11 +313,14 @@
block := <-d.decoders
frame := block.localFrame
defer func() {
- if debug {
+ if debugDecoder {
printf("re-adding decoder: %p", block)
}
frame.rawInput = nil
frame.bBuf = nil
+ if frame.history.decoders.br != nil {
+ frame.history.decoders.br.in = nil
+ }
d.decoders <- block
}()
frame.bBuf = input
@@ -309,33 +328,42 @@
for {
frame.history.reset()
err := frame.reset(&frame.bBuf)
- if err == io.EOF {
- if debug {
- println("frame reset return EOF")
+ if err != nil {
+ if err == io.EOF {
+ if debugDecoder {
+ println("frame reset return EOF")
+ }
+ return dst, nil
}
- return dst, nil
+ return dst, err
}
if frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
return nil, ErrUnknownDictionary
}
+ if debugDecoder {
+ println("setting dict", frame.DictionaryID)
+ }
frame.history.setDict(&dict)
}
- if err != nil {
- return dst, err
+ if frame.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
+ }
+ return dst, ErrWindowSizeExceeded
}
- if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
- return dst, ErrDecoderSizeExceeded
- }
- if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
- // Never preallocate moe than 1 GB up front.
+ if frame.FrameContentSize != fcsUnknown {
+ if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+ return dst, ErrDecoderSizeExceeded
+ }
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
- dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+ dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
copy(dst2, dst)
dst = dst2
}
}
+
if cap(dst) == 0 {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
@@ -355,7 +383,7 @@
return dst, err
}
if len(frame.bBuf) == 0 {
- if debug {
+ if debugDecoder {
println("frame dbuf empty")
}
break
@@ -370,31 +398,174 @@
// If non-blocking mode is used the returned boolean will be false
// if no data was available without blocking.
func (d *Decoder) nextBlock(blocking bool) (ok bool) {
+ if d.current.err != nil {
+ // Keep error state.
+ return false
+ }
+ d.current.b = d.current.b[:0]
+
+ // SYNC:
+ if d.syncStream.enabled {
+ if !blocking {
+ return false
+ }
+ ok = d.nextBlockSync()
+ if !ok {
+ d.stashDecoder()
+ }
+ return ok
+ }
+
+ //ASYNC:
+ d.stashDecoder()
+ if blocking {
+ d.current.decodeOutput, ok = <-d.current.output
+ } else {
+ select {
+ case d.current.decodeOutput, ok = <-d.current.output:
+ default:
+ return false
+ }
+ }
+ if !ok {
+ // This should not happen, so signal error state...
+ d.current.err = io.ErrUnexpectedEOF
+ return false
+ }
+ next := d.current.decodeOutput
+ if next.d != nil && next.d.async.newHist != nil {
+ d.current.crc.Reset()
+ }
+ if debugDecoder {
+ var tmp [4]byte
+ binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b)))
+ println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
+ }
+
+ if !d.o.ignoreChecksum && len(next.b) > 0 {
+ n, err := d.current.crc.Write(next.b)
+ if err == nil {
+ if n != len(next.b) {
+ d.current.err = io.ErrShortWrite
+ }
+ }
+ }
+ if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
+ got := d.current.crc.Sum64()
+ var tmp [4]byte
+ binary.LittleEndian.PutUint32(tmp[:], uint32(got))
+ if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
+ if debugDecoder {
+ println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+ }
+ d.current.err = ErrCRCMismatch
+ } else {
+ if debugDecoder {
+ println("CRC ok", tmp[:])
+ }
+ }
+ }
+
+ return true
+}
+
+func (d *Decoder) nextBlockSync() (ok bool) {
+ if d.current.d == nil {
+ d.current.d = <-d.decoders
+ }
+ for len(d.current.b) == 0 {
+ if !d.syncStream.inFrame {
+ d.frame.history.reset()
+ d.current.err = d.frame.reset(&d.syncStream.br)
+ if d.current.err != nil {
+ return false
+ }
+ if d.frame.DictionaryID != nil {
+ dict, ok := d.dicts[*d.frame.DictionaryID]
+ if !ok {
+ d.current.err = ErrUnknownDictionary
+ return false
+ } else {
+ d.frame.history.setDict(&dict)
+ }
+ }
+ if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
+ d.current.err = ErrDecoderSizeExceeded
+ return false
+ }
+
+ d.syncStream.decodedFrame = 0
+ d.syncStream.inFrame = true
+ }
+ d.current.err = d.frame.next(d.current.d)
+ if d.current.err != nil {
+ return false
+ }
+ d.frame.history.ensureBlock()
+ if debugDecoder {
+ println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame)
+ }
+ histBefore := len(d.frame.history.b)
+ d.current.err = d.current.d.decodeBuf(&d.frame.history)
+
+ if d.current.err != nil {
+ println("error after:", d.current.err)
+ return false
+ }
+ d.current.b = d.frame.history.b[histBefore:]
+ if debugDecoder {
+ println("history after:", len(d.frame.history.b))
+ }
+
+ // Check frame size (before CRC)
+ d.syncStream.decodedFrame += uint64(len(d.current.b))
+ if d.syncStream.decodedFrame > d.frame.FrameContentSize {
+ if debugDecoder {
+ printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+ }
+ d.current.err = ErrFrameSizeExceeded
+ return false
+ }
+
+ // Check FCS
+ if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
+ if debugDecoder {
+ printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+ }
+ d.current.err = ErrFrameSizeMismatch
+ return false
+ }
+
+ // Update/Check CRC
+ if d.frame.HasCheckSum {
+ if !d.o.ignoreChecksum {
+ d.frame.crc.Write(d.current.b)
+ }
+ if d.current.d.Last {
+ if !d.o.ignoreChecksum {
+ d.current.err = d.frame.checkCRC()
+ } else {
+ d.current.err = d.frame.consumeCRC()
+ }
+ if d.current.err != nil {
+ println("CRC error:", d.current.err)
+ return false
+ }
+ }
+ }
+ d.syncStream.inFrame = !d.current.d.Last
+ }
+ return true
+}
+
+func (d *Decoder) stashDecoder() {
if d.current.d != nil {
- if debug {
+ if debugDecoder {
printf("re-adding current decoder %p", d.current.d)
}
d.decoders <- d.current.d
d.current.d = nil
}
- if d.current.err != nil {
- // Keep error state.
- return blocking
- }
-
- if blocking {
- d.current.decodeOutput = <-d.current.output
- } else {
- select {
- case d.current.decodeOutput = <-d.current.output:
- default:
- return false
- }
- }
- if debug {
- println("got", len(d.current.b), "bytes, error:", d.current.err)
- }
- return true
}
// Close will release all resources.
@@ -404,10 +575,10 @@
return
}
d.drainOutput()
- if d.stream != nil {
- close(d.stream)
+ if d.current.cancel != nil {
+ d.current.cancel()
d.streamWg.Wait()
- d.stream = nil
+ d.current.cancel = nil
}
if d.decoders != nil {
close(d.decoders)
@@ -458,100 +629,296 @@
err error
}
-type decodeStream struct {
- r io.Reader
-
- // Blocks ready to be written to output.
- output chan decodeOutput
-
- // cancel reading from the input
- cancel chan struct{}
+func (d *Decoder) startSyncDecoder(r io.Reader) error {
+ d.frame.history.reset()
+ d.syncStream.br = readerWrapper{r: r}
+ d.syncStream.inFrame = false
+ d.syncStream.enabled = true
+ d.syncStream.decodedFrame = 0
+ return nil
}
-// errEndOfStream indicates that everything from the stream was read.
-var errEndOfStream = errors.New("end-of-stream")
-
// Create Decoder:
-// Spawn n block decoders. These accept tasks to decode a block.
-// Create goroutine that handles stream processing, this will send history to decoders as they are available.
-// Decoders update the history as they decode.
-// When a block is returned:
-// a) history is sent to the next decoder,
-// b) content written to CRC.
-// c) return data to WRITER.
-// d) wait for next block to return data.
-// Once WRITTEN, the decoders reused by the writer frame decoder for re-use.
-func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
+// ASYNC:
+// Spawn 3 go routines.
+// 0: Read frames and decode block literals.
+// 1: Decode sequences.
+// 2: Execute sequences, send to output.
+func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
defer d.streamWg.Done()
- frame := newFrameDec(d.o)
- for stream := range inStream {
- if debug {
- println("got new stream")
+ br := readerWrapper{r: r}
+
+ var seqDecode = make(chan *blockDec, d.o.concurrent)
+ var seqExecute = make(chan *blockDec, d.o.concurrent)
+
+ // Async 1: Decode sequences...
+ go func() {
+ var hist history
+ var hasErr bool
+
+ for block := range seqDecode {
+ if hasErr {
+ if block != nil {
+ seqExecute <- block
+ }
+ continue
+ }
+ if block.async.newHist != nil {
+ if debugDecoder {
+ println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
+ }
+ hist.decoders = block.async.newHist.decoders
+ hist.recentOffsets = block.async.newHist.recentOffsets
+ hist.windowSize = block.async.newHist.windowSize
+ if block.async.newHist.dict != nil {
+ hist.setDict(block.async.newHist.dict)
+ }
+ }
+ if block.err != nil || block.Type != blockTypeCompressed {
+ hasErr = block.err != nil
+ seqExecute <- block
+ continue
+ }
+
+ hist.decoders.literals = block.async.literals
+ block.err = block.prepareSequences(block.async.seqData, &hist)
+ if debugDecoder && block.err != nil {
+ println("prepareSequences returned:", block.err)
+ }
+ hasErr = block.err != nil
+ if block.err == nil {
+ block.err = block.decodeSequences(&hist)
+ if debugDecoder && block.err != nil {
+ println("decodeSequences returned:", block.err)
+ }
+ hasErr = block.err != nil
+ // block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs]
+ block.async.seqSize = hist.decoders.seqSize
+ }
+ seqExecute <- block
}
- br := readerWrapper{r: stream.r}
- decodeStream:
- for {
- frame.history.reset()
- err := frame.reset(&br)
- if debug && err != nil {
- println("Frame decoder returned", err)
+ close(seqExecute)
+ }()
+
+ var wg sync.WaitGroup
+ wg.Add(1)
+
+ // Async 3: Execute sequences...
+ frameHistCache := d.frame.history.b
+ go func() {
+ var hist history
+ var decodedFrame uint64
+ var fcs uint64
+ var hasErr bool
+ for block := range seqExecute {
+ out := decodeOutput{err: block.err, d: block}
+ if block.err != nil || hasErr {
+ hasErr = true
+ output <- out
+ continue
}
- if err == nil && frame.DictionaryID != nil {
- dict, ok := d.dicts[*frame.DictionaryID]
- if !ok {
- err = ErrUnknownDictionary
+ if block.async.newHist != nil {
+ if debugDecoder {
+ println("Async 2: new history")
+ }
+ hist.windowSize = block.async.newHist.windowSize
+ hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
+ if block.async.newHist.dict != nil {
+ hist.setDict(block.async.newHist.dict)
+ }
+
+ if cap(hist.b) < hist.allocFrameBuffer {
+ if cap(frameHistCache) >= hist.allocFrameBuffer {
+ hist.b = frameHistCache
+ } else {
+ hist.b = make([]byte, 0, hist.allocFrameBuffer)
+ println("Alloc history sized", hist.allocFrameBuffer)
+ }
+ }
+ hist.b = hist.b[:0]
+ fcs = block.async.fcs
+ decodedFrame = 0
+ }
+ do := decodeOutput{err: block.err, d: block}
+ switch block.Type {
+ case blockTypeRLE:
+ if debugDecoder {
+ println("add rle block length:", block.RLESize)
+ }
+
+ if cap(block.dst) < int(block.RLESize) {
+ if block.lowMem {
+ block.dst = make([]byte, block.RLESize)
+ } else {
+ block.dst = make([]byte, maxBlockSize)
+ }
+ }
+ block.dst = block.dst[:block.RLESize]
+ v := block.data[0]
+ for i := range block.dst {
+ block.dst[i] = v
+ }
+ hist.append(block.dst)
+ do.b = block.dst
+ case blockTypeRaw:
+ if debugDecoder {
+ println("add raw block length:", len(block.data))
+ }
+ hist.append(block.data)
+ do.b = block.data
+ case blockTypeCompressed:
+ if debugDecoder {
+ println("execute with history length:", len(hist.b), "window:", hist.windowSize)
+ }
+ hist.decoders.seqSize = block.async.seqSize
+ hist.decoders.literals = block.async.literals
+ do.err = block.executeSequences(&hist)
+ hasErr = do.err != nil
+ if debugDecoder && hasErr {
+ println("executeSequences returned:", do.err)
+ }
+ do.b = block.dst
+ }
+ if !hasErr {
+ decodedFrame += uint64(len(do.b))
+ if decodedFrame > fcs {
+ println("fcs exceeded", block.Last, fcs, decodedFrame)
+ do.err = ErrFrameSizeExceeded
+ hasErr = true
+ } else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
+ do.err = ErrFrameSizeMismatch
+ hasErr = true
} else {
- frame.history.setDict(&dict)
+ if debugDecoder {
+ println("fcs ok", block.Last, fcs, decodedFrame)
+ }
}
}
- if err != nil {
- stream.output <- decodeOutput{
- err: err,
+ output <- do
+ }
+ close(output)
+ frameHistCache = hist.b
+ wg.Done()
+ if debugDecoder {
+ println("decoder goroutines finished")
+ }
+ }()
+
+decodeStream:
+ for {
+ var hist history
+ var hasErr bool
+
+ decodeBlock := func(block *blockDec) {
+ if hasErr {
+ if block != nil {
+ seqDecode <- block
}
+ return
+ }
+ if block.err != nil || block.Type != blockTypeCompressed {
+ hasErr = block.err != nil
+ seqDecode <- block
+ return
+ }
+
+ remain, err := block.decodeLiterals(block.data, &hist)
+ block.err = err
+ hasErr = block.err != nil
+ if err == nil {
+ block.async.literals = hist.decoders.literals
+ block.async.seqData = remain
+ } else if debugDecoder {
+ println("decodeLiterals error:", err)
+ }
+ seqDecode <- block
+ }
+ frame := d.frame
+ if debugDecoder {
+ println("New frame...")
+ }
+ var historySent bool
+ frame.history.reset()
+ err := frame.reset(&br)
+ if debugDecoder && err != nil {
+ println("Frame decoder returned", err)
+ }
+ if err == nil && frame.DictionaryID != nil {
+ dict, ok := d.dicts[*frame.DictionaryID]
+ if !ok {
+ err = ErrUnknownDictionary
+ } else {
+ frame.history.setDict(&dict)
+ }
+ }
+ if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+ err = ErrDecoderSizeExceeded
+ }
+ if err != nil {
+ select {
+ case <-ctx.Done():
+ case dec := <-d.decoders:
+ dec.sendErr(err)
+ decodeBlock(dec)
+ }
+ break decodeStream
+ }
+
+ // Go through all blocks of the frame.
+ for {
+ var dec *blockDec
+ select {
+ case <-ctx.Done():
+ break decodeStream
+ case dec = <-d.decoders:
+ // Once we have a decoder, we MUST return it.
+ }
+ err := frame.next(dec)
+ if !historySent {
+ h := frame.history
+ if debugDecoder {
+ println("Alloc History:", h.allocFrameBuffer)
+ }
+ hist.reset()
+ if h.dict != nil {
+ hist.setDict(h.dict)
+ }
+ dec.async.newHist = &h
+ dec.async.fcs = frame.FrameContentSize
+ historySent = true
+ } else {
+ dec.async.newHist = nil
+ }
+ if debugDecoder && err != nil {
+ println("next block returned error:", err)
+ }
+ dec.err = err
+ dec.checkCRC = nil
+ if dec.Last && frame.HasCheckSum && err == nil {
+ crc, err := frame.rawInput.readSmall(4)
+ if err != nil {
+ println("CRC missing?", err)
+ dec.err = err
+ }
+ var tmp [4]byte
+ copy(tmp[:], crc)
+ dec.checkCRC = tmp[:]
+ if debugDecoder {
+ println("found crc to check:", dec.checkCRC)
+ }
+ }
+ err = dec.err
+ last := dec.Last
+ decodeBlock(dec)
+ if err != nil {
+ break decodeStream
+ }
+ if last {
break
}
- if debug {
- println("starting frame decoder")
- }
-
- // This goroutine will forward history between frames.
- frame.frameDone.Add(1)
- frame.initAsync()
-
- go frame.startDecoder(stream.output)
- decodeFrame:
- // Go through all blocks of the frame.
- for {
- dec := <-d.decoders
- select {
- case <-stream.cancel:
- if !frame.sendErr(dec, io.EOF) {
- // To not let the decoder dangle, send it back.
- stream.output <- decodeOutput{d: dec}
- }
- break decodeStream
- default:
- }
- err := frame.next(dec)
- switch err {
- case io.EOF:
- // End of current frame, no error
- println("EOF on next block")
- break decodeFrame
- case nil:
- continue
- default:
- println("block decoder returned", err)
- break decodeStream
- }
- }
- // All blocks have started decoding, check if there are more frames.
- println("waiting for done")
- frame.frameDone.Wait()
- println("done waiting...")
}
- frame.frameDone.Wait()
- println("Sending EOS")
- stream.output <- decodeOutput{err: errEndOfStream}
}
+ close(seqDecode)
+ wg.Wait()
+ d.frame.history.b = frameHistCache
}
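
Summary of the decoder.go rework above: the long-lived `decodeStream` channel is gone. `Reset` now either starts a synchronous decoder (when the decoder was built with `WithDecoderConcurrency(1)`) or launches a per-stream goroutine pipeline that is torn down through a `context.CancelFunc`. From the caller's side nothing changes; a minimal streaming sketch, assuming an illustrative input path:

```go
package main

import (
	"io"
	"os"

	"github.com/klauspost/compress/zstd"
)

func decompress(dst io.Writer, src io.Reader) error {
	// Concurrency 1 selects the synchronous path added in this change:
	// blocks are decoded inline as they are read, with no helper goroutines.
	dec, err := zstd.NewReader(src, zstd.WithDecoderConcurrency(1))
	if err != nil {
		return err
	}
	defer dec.Close()
	_, err = dec.WriteTo(dst)
	return err
}

func main() {
	f, err := os.Open("payload.zst") // illustrative path
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := decompress(os.Stdout, f); err != nil {
		panic(err)
	}
}
```
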
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index c0fd058..c70e6fa 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -17,16 +17,22 @@
lowMem bool
concurrent int
maxDecodedSize uint64
+ maxWindowSize uint64
dicts []dict
+ ignoreChecksum bool
}
func (o *decoderOptions) setDefault() {
*o = decoderOptions{
// use less ram: true for now, but may change.
- lowMem: true,
- concurrent: runtime.GOMAXPROCS(0),
+ lowMem: true,
+ concurrent: runtime.GOMAXPROCS(0),
+ maxWindowSize: MaxWindowSize,
}
- o.maxDecodedSize = 1 << 63
+ if o.concurrent > 4 {
+ o.concurrent = 4
+ }
+ o.maxDecodedSize = 64 << 30
}
// WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -35,16 +41,25 @@
return func(o *decoderOptions) error { o.lowMem = b; return nil }
}
-// WithDecoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
-// The value supplied must be at least 1.
-// By default this will be set to GOMAXPROCS.
+// WithDecoderConcurrency sets the number of created decoders.
+// When decoding block with DecodeAll, this will limit the number
+// of possible concurrently running decodes.
+// When decoding streams, this will limit the number of
+// inflight blocks.
+// When decoding streams and setting maximum to 1,
+// no async decoding will be done.
+// When a value of 0 is provided GOMAXPROCS will be used.
+// By default this will be set to 4 or GOMAXPROCS, whatever is lower.
func WithDecoderConcurrency(n int) DOption {
return func(o *decoderOptions) error {
- if n <= 0 {
+ if n < 0 {
return errors.New("concurrency must be at least 1")
}
- o.concurrent = n
+ if n == 0 {
+ o.concurrent = runtime.GOMAXPROCS(0)
+ } else {
+ o.concurrent = n
+ }
return nil
}
}
@@ -52,8 +67,7 @@
// WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
// non-streaming operations or maximum window size for streaming operations.
// This can be used to control memory usage of potentially hostile content.
-// For streaming operations, the maximum window size is capped at 1<<30 bytes.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
func WithDecoderMaxMemory(n uint64) DOption {
return func(o *decoderOptions) error {
if n == 0 {
@@ -81,3 +95,29 @@
return nil
}
}
+
+// WithDecoderMaxWindow allows to set a maximum window size for decodes.
+// This allows rejecting packets that will cause big memory usage.
+// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
+// If WithDecoderMaxMemory is set to a lower value, that will be used.
+// Default is 512MB, Maximum is ~3.75 TB as per zstandard spec.
+func WithDecoderMaxWindow(size uint64) DOption {
+ return func(o *decoderOptions) error {
+ if size < MinWindowSize {
+ return errors.New("WithMaxWindowSize must be at least 1KB, 1024 bytes")
+ }
+ if size > (1<<41)+7*(1<<38) {
+ return errors.New("WithMaxWindowSize must be less than (1<<41) + 7*(1<<38) ~ 3.75TB")
+ }
+ o.maxWindowSize = size
+ return nil
+ }
+}
+
+// IgnoreChecksum allows to forcibly ignore checksum checking.
+func IgnoreChecksum(b bool) DOption {
+ return func(o *decoderOptions) error {
+ o.ignoreChecksum = b
+ return nil
+ }
+}
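
The decoder options above also change defaults: concurrency is capped at 4, the default maximum decoded size drops to 64 GiB, and window size is limited separately via WithDecoderMaxWindow. A sketch combining the new knobs (the limits chosen here are illustrative):

```go
package main

import "github.com/klauspost/compress/zstd"

func newHardenedDecoder() (*zstd.Decoder, error) {
	return zstd.NewReader(nil,
		// Cap total decoded size for DecodeAll-style use.
		zstd.WithDecoderMaxMemory(1<<30), // 1 GiB, illustrative
		// Reject frames declaring an oversized window when streaming.
		zstd.WithDecoderMaxWindow(64<<20), // 64 MiB, illustrative
		// Checksums can be skipped when the transport already guarantees integrity.
		zstd.IgnoreChecksum(false),
	)
}

func main() {
	dec, err := newHardenedDecoder()
	if err != nil {
		panic(err)
	}
	defer dec.Close()
}
```
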
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
index fa25a18..a36ae83 100644
--- a/vendor/github.com/klauspost/compress/zstd/dict.go
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -82,7 +82,7 @@
println("Transform table error:", err)
return err
}
- if debug {
+ if debugDecoder || debugEncoder {
println("Read table ok", "symbolLen:", dec.symbolLen)
}
// Set decoders as predefined so they aren't reused.
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
index 60f2986..15ae8ee 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -38,8 +38,8 @@
// WindowSize returns the window size of the encoder,
// or a window size small enough to contain the input size, if > 0.
-func (e *fastBase) WindowSize(size int) int32 {
- if size > 0 && size < int(e.maxMatchOff) {
+func (e *fastBase) WindowSize(size int64) int32 {
+ if size > 0 && size < int64(e.maxMatchOff) {
b := int32(1) << uint(bits.Len(uint(size)))
// Keep minimum window.
if b < 1024 {
@@ -108,11 +108,6 @@
e.blk = enc
}
-func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
- // Extend the match to be as long as possible.
- return int32(matchLen(src[s:], src[t:]))
-}
-
func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
if debugAsserts {
if s < 0 {
@@ -131,9 +126,24 @@
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
}
}
+ a := src[s:]
+ b := src[t:]
+ b = b[:len(a)]
+ end := int32((len(a) >> 3) << 3)
+ for i := int32(0); i < end; i += 8 {
+ if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
+ return i + int32(bits.TrailingZeros64(diff)>>3)
+ }
+ }
- // Extend the match to be as long as possible.
- return int32(matchLen(src[s:], src[t:]))
+ a = a[end:]
+ b = b[end:]
+ for i := range a {
+ if a[i] != b[i] {
+ return int32(i) + end
+ }
+ }
+ return int32(len(a)) + end
}
// Reset the encoding table.
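
The inlined matchlen above compares 8 bytes at a time: XOR the two words and use TrailingZeros64 to locate the first differing byte, then fall back to a byte loop for the tail. A standalone sketch of the same idea, with the package's load6432 helper replaced by encoding/binary for self-containment:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen returns the number of leading bytes a and b have in common,
// mirroring the 8-bytes-at-a-time loop added to fastBase.matchlen.
func matchLen(a, b []byte) int {
	if len(b) < len(a) {
		a = a[:len(b)]
	}
	b = b[:len(a)]
	end := (len(a) >> 3) << 3
	for i := 0; i < end; i += 8 {
		if diff := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:]); diff != 0 {
			// The lowest set bit of the XOR marks the first differing byte.
			return i + bits.TrailingZeros64(diff)>>3
		}
	}
	for i := end; i < len(a); i++ {
		if a[i] != b[i] {
			return i
		}
	}
	return len(a)
}

func main() {
	fmt.Println(matchLen([]byte("zstandard!!"), []byte("zstandard??"))) // 9
}
```
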
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
index dc1eed5..96028ec 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -5,22 +5,61 @@
package zstd
import (
+ "bytes"
"fmt"
- "math/bits"
+
+ "github.com/klauspost/compress"
)
const (
- bestLongTableBits = 20 // Bits used in the long match table
+ bestLongTableBits = 22 // Bits used in the long match table
bestLongTableSize = 1 << bestLongTableBits // Size of the table
+ bestLongLen = 8 // Bytes used for table hash
// Note: Increasing the short table bits or making the hash shorter
// can actually lead to compression degradation since it will 'steal' more from the
// long match table and match offsets are quite big.
// This greatly depends on the type of input.
- bestShortTableBits = 16 // Bits used in the short match table
+ bestShortTableBits = 18 // Bits used in the short match table
bestShortTableSize = 1 << bestShortTableBits // Size of the table
+ bestShortLen = 4 // Bytes used for table hash
+
)
+type match struct {
+ offset int32
+ s int32
+ length int32
+ rep int32
+ est int32
+}
+
+const highScore = 25000
+
+// estBits will estimate output bits from predefined tables.
+func (m *match) estBits(bitsPerByte int32) {
+ mlc := mlCode(uint32(m.length - zstdMinMatch))
+ var ofc uint8
+ if m.rep < 0 {
+ ofc = ofCode(uint32(m.s-m.offset) + 3)
+ } else {
+ ofc = ofCode(uint32(m.rep))
+ }
+ // Cost, excluding
+ ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
+
+ // Add cost of match encoding...
+ m.est = int32(ofTT.outBits + mlTT.outBits)
+ m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
+ // Subtract savings compared to literal encoding...
+ m.est -= (m.length * bitsPerByte) >> 10
+ if m.est > 0 {
+ // Unlikely gain..
+ m.length = 0
+ m.est = highScore
+ }
+}
+
// bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
// The long match table contains the previous entry with the same hash,
// effectively making it a "chain" of length 2.
@@ -109,6 +148,14 @@
return
}
+ // Use this to estimate literal cost.
+ // Scaled by 10 bits.
+ bitsPerByte := int32((compress.ShannonEntropyBits(src) * 1024) / len(src))
+ // Huffman can never go < 1 bit/byte
+ if bitsPerByte < 1024 {
+ bitsPerByte = 1024
+ }
+
// Override src
src = e.hist
sLimit := int32(len(src)) - inputMargin
@@ -132,7 +179,7 @@
}
_ = addLiterals
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -145,51 +192,49 @@
panic("offset0 was 0")
}
- type match struct {
- offset int32
- s int32
- length int32
- rep int32
- }
- matchAt := func(offset int32, s int32, first uint32, rep int32) match {
- if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
- return match{offset: offset, s: s}
- }
- return match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
- }
-
bestOf := func(a, b match) match {
- aScore := b.s - a.s + a.length
- bScore := a.s - b.s + b.length
- if a.rep < 0 {
- aScore = aScore - int32(bits.Len32(uint32(a.offset)))/8
- }
- if b.rep < 0 {
- bScore = bScore - int32(bits.Len32(uint32(b.offset)))/8
- }
- if aScore >= bScore {
+ if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
}
const goodEnough = 100
- nextHashL := hash8(cv, bestLongTableBits)
- nextHashS := hash4x64(cv, bestShortTableBits)
+ nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
+ nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
+ matchAt := func(offset int32, s int32, first uint32, rep int32) match {
+ if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
+ return match{s: s, est: highScore}
+ }
+ if debugAsserts {
+ if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
+ panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
+ }
+ }
+ m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+ m.estBits(bitsPerByte)
+ return m
+ }
+
best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+
if canRepeat && best.length < goodEnough {
- best = bestOf(best, matchAt(s-offset1+1, s+1, uint32(cv>>8), 1))
- best = bestOf(best, matchAt(s-offset2+1, s+1, uint32(cv>>8), 2))
- best = bestOf(best, matchAt(s-offset3+1, s+1, uint32(cv>>8), 3))
+ cv32 := uint32(cv >> 8)
+ spp := s + 1
+ best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
+ best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
+ best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
if best.length > 0 {
- best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1))
- best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2))
- best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3))
+ cv32 = uint32(cv >> 24)
+ spp += 2
+ best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
+ best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
+ best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
}
}
// Load next and check...
@@ -209,22 +254,28 @@
}
s++
- candidateS = e.table[hash4x64(cv>>8, bestShortTableBits)]
+ candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
cv = load6432(src, s)
cv2 := load6432(src, s+1)
- candidateL = e.longTable[hash8(cv, bestLongTableBits)]
- candidateL2 := e.longTable[hash8(cv2, bestLongTableBits)]
+ candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
+ candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
+ // Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+ // Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
-
+ if false {
+ // Short at s+3.
+ // Too often worse...
+ best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
+ }
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
- nextHashL := hash8(load6432(src, sAt), bestLongTableBits)
+ nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
@@ -236,6 +287,12 @@
}
}
+ if debugAsserts {
+ if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) {
+ panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]))
+ }
+ }
+
// We have a match, we can store the forward value
if best.rep > 0 {
s = best.s
@@ -274,7 +331,7 @@
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, best.length)
}
@@ -284,8 +341,8 @@
off := index0 + e.cur
for index0 < s-1 {
cv0 := load6432(src, index0)
- h0 := hash8(cv0, bestLongTableBits)
- h1 := hash4x64(cv0, bestShortTableBits)
+ h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
+ h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
off++
@@ -311,7 +368,7 @@
panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
}
- if debugAsserts && canRepeat && int(offset1) > len(src) {
+ if debugAsserts && int(offset1) > len(src) {
panic("invalid offset")
}
@@ -352,8 +409,8 @@
// every entry
for index0 < s-1 {
cv0 := load6432(src, index0)
- h0 := hash8(cv0, bestLongTableBits)
- h1 := hash4x64(cv0, bestShortTableBits)
+ h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
+ h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
@@ -374,8 +431,8 @@
}
// Store this, since we have it.
- nextHashS := hash4x64(cv, bestShortTableBits)
- nextHashL := hash8(cv, bestLongTableBits)
+ nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
+ nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -412,7 +469,7 @@
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
blk.recentOffsets[2] = uint32(offset3)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -425,7 +482,7 @@
e.Encode(blk, src)
}
-// ResetDict will reset and set a dictionary if not nil
+// Reset will reset and set a dictionary if not nil
func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) {
e.resetBase(d, singleBlock)
if d == nil {
@@ -441,10 +498,10 @@
const hashLog = bestShortTableBits
cv := load6432(d.content, i-e.maxMatchOff)
- nextHash := hash4x64(cv, hashLog) // 0 -> 4
- nextHash1 := hash4x64(cv>>8, hashLog) // 1 -> 5
- nextHash2 := hash4x64(cv>>16, hashLog) // 2 -> 6
- nextHash3 := hash4x64(cv>>24, hashLog) // 3 -> 7
+ nextHash := hashLen(cv, hashLog, bestShortLen) // 0 -> 4
+ nextHash1 := hashLen(cv>>8, hashLog, bestShortLen) // 1 -> 5
+ nextHash2 := hashLen(cv>>16, hashLog, bestShortLen) // 2 -> 6
+ nextHash3 := hashLen(cv>>24, hashLog, bestShortLen) // 3 -> 7
e.dictTable[nextHash] = prevEntry{
prev: e.dictTable[nextHash].offset,
offset: i,
@@ -472,7 +529,7 @@
}
if len(d.content) >= 8 {
cv := load6432(d.content, 0)
- h := hash8(cv, bestLongTableBits)
+ h := hashLen(cv, bestLongTableBits, bestLongLen)
e.dictLongTable[h] = prevEntry{
offset: e.maxMatchOff,
prev: e.dictLongTable[h].offset,
@@ -482,7 +539,7 @@
off := 8 // First to read
for i := e.maxMatchOff + 1; i < end; i++ {
cv = cv>>8 | (uint64(d.content[off]) << 56)
- h := hash8(cv, bestLongTableBits)
+ h := hashLen(cv, bestLongTableBits, bestLongLen)
e.dictLongTable[h] = prevEntry{
offset: i,
prev: e.dictLongTable[h].offset,
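
In the enc_best.go changes above, candidate matches are now scored by estBits, which prices the match header from the predefined FSE tables and subtracts the literal bytes it replaces at a Shannon-entropy-derived rate (bits per byte, scaled by 1024); bestOf then compares candidates after charging the later-starting one for the extra literals it leaves behind. A simplified, hypothetical scoring sketch that keeps the arithmetic but replaces the FSE-derived header cost with a plain parameter:

```go
package main

import "fmt"

type match struct {
	s      int32 // start position in the source
	length int32
	est    int32 // estimated cost in bits (lower is better)
}

// scoreMatch is a stand-in for estBits: a fixed header cost for the match,
// minus the literal bytes it saves, priced at bitsPerByte (scaled by 1024).
func scoreMatch(m *match, matchCostBits, bitsPerByte int32) {
	m.est = matchCostBits - (m.length*bitsPerByte)>>10
}

// bestOf mirrors the updated comparison: each candidate is charged for the
// literals the other one would have covered before its start.
func bestOf(a, b match, bitsPerByte int32) match {
	if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
		return a
	}
	return b
}

func main() {
	const bitsPerByte = 3 << 10 // ~3 bits per literal byte, illustrative
	a := match{s: 100, length: 12}
	b := match{s: 103, length: 8}
	scoreMatch(&a, 24, bitsPerByte)
	scoreMatch(&b, 20, bitsPerByte)
	fmt.Printf("picked match at s=%d\n", bestOf(a, b, bitsPerByte).s)
}
```
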
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index 6049542..c769f69 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -9,6 +9,7 @@
const (
betterLongTableBits = 19 // Bits used in the long match table
betterLongTableSize = 1 << betterLongTableBits // Size of the table
+ betterLongLen = 8 // Bytes used for table hash
// Note: Increasing the short table bits or making the hash shorter
// can actually lead to compression degradation since it will 'steal' more from the
@@ -16,6 +17,7 @@
// This greatly depends on the type of input.
betterShortTableBits = 13 // Bits used in the short match table
betterShortTableSize = 1 << betterShortTableBits // Size of the table
+ betterShortLen = 5 // Bytes used for table hash
betterLongTableShardCnt = 1 << (betterLongTableBits - dictShardBits) // Number of shards in the table
betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
@@ -138,7 +140,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -154,8 +156,8 @@
panic("offset0 was 0")
}
- nextHashS := hash5(cv, betterShortTableBits)
- nextHashL := hash8(cv, betterLongTableBits)
+ nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -204,7 +206,7 @@
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -214,10 +216,10 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
- e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+ e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
index0 += 2
}
cv = load6432(src, s)
@@ -264,7 +266,7 @@
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -275,10 +277,10 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
- e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+ e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
index0 += 2
}
cv = load6432(src, s)
@@ -353,7 +355,7 @@
// See if we can find a long match at s+1
const checkAt = 1
cv := load6432(src, s+checkAt)
- nextHashL = hash8(cv, betterLongTableBits)
+ nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
candidateL = e.longTable[nextHashL]
coffsetL = candidateL.offset - e.cur
@@ -413,8 +415,8 @@
}
// Try to find a better match by searching for a long match at the end of the current best match
- if true && s+matched < sLimit {
- nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+ if s+matched < sLimit {
+ nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
cv := load3232(src, s)
candidateL := e.longTable[nextHashL]
coffsetL := candidateL.offset - e.cur - matched
@@ -495,10 +497,10 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
- e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+ e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
index0 += 2
}
@@ -516,8 +518,8 @@
}
// Store this, since we have it.
- nextHashS := hash5(cv, betterShortTableBits)
- nextHashL := hash8(cv, betterLongTableBits)
+ nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -553,7 +555,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -656,7 +658,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -672,8 +674,8 @@
panic("offset0 was 0")
}
- nextHashS := hash5(cv, betterShortTableBits)
- nextHashL := hash8(cv, betterLongTableBits)
+ nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -724,7 +726,7 @@
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -734,11 +736,11 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.markLongShardDirty(h0)
- h1 := hash5(cv1, betterShortTableBits)
+ h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
e.markShortShardDirty(h1)
index0 += 2
@@ -787,7 +789,7 @@
s += lenght + repOff2
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -798,11 +800,11 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.markLongShardDirty(h0)
- h1 := hash5(cv1, betterShortTableBits)
+ h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
e.markShortShardDirty(h1)
index0 += 2
@@ -879,7 +881,7 @@
// See if we can find a long match at s+1
const checkAt = 1
cv := load6432(src, s+checkAt)
- nextHashL = hash8(cv, betterLongTableBits)
+ nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
candidateL = e.longTable[nextHashL]
coffsetL = candidateL.offset - e.cur
@@ -940,7 +942,7 @@
}
// Try to find a better match by searching for a long match at the end of the current best match
if s+matched < sLimit {
- nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+ nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
cv := load3232(src, s)
candidateL := e.longTable[nextHashL]
coffsetL := candidateL.offset - e.cur - matched
@@ -1021,11 +1023,11 @@
for index0 < s-1 {
cv0 := load6432(src, index0)
cv1 := cv0 >> 8
- h0 := hash8(cv0, betterLongTableBits)
+ h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
off := index0 + e.cur
e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
e.markLongShardDirty(h0)
- h1 := hash5(cv1, betterShortTableBits)
+ h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
e.markShortShardDirty(h1)
index0 += 2
@@ -1045,8 +1047,8 @@
}
// Store this, since we have it.
- nextHashS := hash5(cv, betterShortTableBits)
- nextHashL := hash8(cv, betterLongTableBits)
+ nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -1084,7 +1086,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -1113,10 +1115,10 @@
const hashLog = betterShortTableBits
cv := load6432(d.content, i-e.maxMatchOff)
- nextHash := hash5(cv, hashLog) // 0 -> 4
- nextHash1 := hash5(cv>>8, hashLog) // 1 -> 5
- nextHash2 := hash5(cv>>16, hashLog) // 2 -> 6
- nextHash3 := hash5(cv>>24, hashLog) // 3 -> 7
+ nextHash := hashLen(cv, hashLog, betterShortLen) // 0 -> 4
+ nextHash1 := hashLen(cv>>8, hashLog, betterShortLen) // 1 -> 5
+ nextHash2 := hashLen(cv>>16, hashLog, betterShortLen) // 2 -> 6
+ nextHash3 := hashLen(cv>>24, hashLog, betterShortLen) // 3 -> 7
e.dictTable[nextHash] = tableEntry{
val: uint32(cv),
offset: i,
@@ -1145,7 +1147,7 @@
}
if len(d.content) >= 8 {
cv := load6432(d.content, 0)
- h := hash8(cv, betterLongTableBits)
+ h := hashLen(cv, betterLongTableBits, betterLongLen)
e.dictLongTable[h] = prevEntry{
offset: e.maxMatchOff,
prev: e.dictLongTable[h].offset,
@@ -1155,7 +1157,7 @@
off := 8 // First to read
for i := e.maxMatchOff + 1; i < end; i++ {
cv = cv>>8 | (uint64(d.content[off]) << 56)
- h := hash8(cv, betterLongTableBits)
+ h := hashLen(cv, betterLongTableBits, betterLongLen)
e.dictLongTable[h] = prevEntry{
offset: i,
prev: e.dictLongTable[h].offset,
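
Throughout enc_better.go (and the other encoders) the fixed hash5/hash8 helpers are replaced by hashLen, which takes both the table bits and the number of input bytes to hash (betterShortLen=5, betterLongLen=8), so each table declares its hash width next to its size. The sketch below is a hypothetical illustration of such a length-parameterised multiplicative hash; it is not the package's actual hashLen, which dispatches to per-length prime constants:

```go
package main

import "fmt"

// hashBytes hashes the low `length` bytes of u down to `tableBits` bits.
// Hypothetical sketch only.
func hashBytes(u uint64, tableBits, length uint8) uint32 {
	const prime8 = 0xcf1bbcdcb7a56463 // 64-bit multiplicative constant
	if length < 8 {
		u &= (uint64(1) << (8 * length)) - 1 // keep only `length` input bytes
	}
	return uint32((u * prime8) >> (64 - tableBits))
}

func main() {
	cv := uint64(0x0102030405060708)
	fmt.Println(hashBytes(cv, 13, 5)) // short-table style: 5 bytes -> 13 bits
	fmt.Println(hashBytes(cv, 19, 8)) // long-table style: 8 bytes -> 19 bits
}
```
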
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 8629d43..7ff0c64 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -10,6 +10,7 @@
dFastLongTableBits = 17 // Bits used in the long match table
dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
dFastLongTableMask = dFastLongTableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
+ dFastLongLen = 8 // Bytes used for table hash
dLongTableShardCnt = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
dLongTableShardSize = dFastLongTableSize / tableShardCnt // Size of an individual shard
@@ -17,6 +18,8 @@
dFastShortTableBits = tableBits // Bits used in the short match table
dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
dFastShortTableMask = dFastShortTableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
+ dFastShortLen = 5 // Bytes used for table hash
+
)
type doubleFastEncoder struct {
@@ -109,7 +112,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -124,8 +127,8 @@
panic("offset0 was 0")
}
- nextHashS := hash5(cv, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -170,7 +173,7 @@
s += lenght + repOff
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -208,7 +211,7 @@
// See if we can find a long match at s+1
const checkAt = 1
cv := load6432(src, s+checkAt)
- nextHashL = hash8(cv, dFastLongTableBits)
+ nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
candidateL = e.longTable[nextHashL]
coffsetL = s - (candidateL.offset - e.cur) + checkAt
@@ -304,16 +307,16 @@
cv1 := load6432(src, index1)
te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
- e.longTable[hash8(cv0, dFastLongTableBits)] = te0
- e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+ e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
+ e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
cv0 >>= 8
cv1 >>= 8
te0.offset++
te1.offset++
te0.val = uint32(cv0)
te1.val = uint32(cv1)
- e.table[hash5(cv0, dFastShortTableBits)] = te0
- e.table[hash5(cv1, dFastShortTableBits)] = te1
+ e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
+ e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
cv = load6432(src, s)
@@ -330,8 +333,8 @@
}
// Store this, since we have it.
- nextHashS := hash5(cv, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -368,7 +371,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -427,7 +430,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -436,8 +439,8 @@
var t int32
for {
- nextHashS := hash5(cv, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -483,7 +486,7 @@
s += length + repOff
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, length)
}
@@ -521,7 +524,7 @@
// See if we can find a long match at s+1
const checkAt = 1
cv := load6432(src, s+checkAt)
- nextHashL = hash8(cv, dFastLongTableBits)
+ nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
candidateL = e.longTable[nextHashL]
coffsetL = s - (candidateL.offset - e.cur) + checkAt
@@ -614,16 +617,16 @@
cv1 := load6432(src, index1)
te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
- e.longTable[hash8(cv0, dFastLongTableBits)] = te0
- e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+ e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
+ e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
cv0 >>= 8
cv1 >>= 8
te0.offset++
te1.offset++
te0.val = uint32(cv0)
te1.val = uint32(cv1)
- e.table[hash5(cv0, dFastShortTableBits)] = te0
- e.table[hash5(cv1, dFastShortTableBits)] = te1
+ e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
+ e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
cv = load6432(src, s)
@@ -640,8 +643,8 @@
}
// Store this, since we have it.
- nextHashS := hash5(cv1>>8, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashS := hashLen(cv1>>8, dFastShortTableBits, dFastShortLen)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -677,7 +680,7 @@
blk.literals = append(blk.literals, src[nextEmit:]...)
blk.extraLits = len(src) - int(nextEmit)
}
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
@@ -767,7 +770,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -782,8 +785,8 @@
panic("offset0 was 0")
}
- nextHashS := hash5(cv, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -830,7 +833,7 @@
s += lenght + repOff
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, lenght)
}
@@ -868,7 +871,7 @@
// See if we can find a long match at s+1
const checkAt = 1
cv := load6432(src, s+checkAt)
- nextHashL = hash8(cv, dFastLongTableBits)
+ nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
candidateL = e.longTable[nextHashL]
coffsetL = s - (candidateL.offset - e.cur) + checkAt
@@ -965,8 +968,8 @@
cv1 := load6432(src, index1)
te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
- longHash1 := hash8(cv0, dFastLongTableBits)
- longHash2 := hash8(cv0, dFastLongTableBits)
+ longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+ longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
e.longTable[longHash1] = te0
e.longTable[longHash2] = te1
e.markLongShardDirty(longHash1)
@@ -977,8 +980,8 @@
te1.offset++
te0.val = uint32(cv0)
te1.val = uint32(cv1)
- hashVal1 := hash5(cv0, dFastShortTableBits)
- hashVal2 := hash5(cv1, dFastShortTableBits)
+ hashVal1 := hashLen(cv0, dFastShortTableBits, dFastShortLen)
+ hashVal2 := hashLen(cv1, dFastShortTableBits, dFastShortLen)
e.table[hashVal1] = te0
e.markShardDirty(hashVal1)
e.table[hashVal2] = te1
@@ -999,8 +1002,8 @@
}
// Store this, since we have it.
- nextHashS := hash5(cv, dFastShortTableBits)
- nextHashL := hash8(cv, dFastLongTableBits)
+ nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -1039,7 +1042,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// If we encoded more than 64K mark all dirty.
@@ -1071,14 +1074,14 @@
}
if len(d.content) >= 8 {
cv := load6432(d.content, 0)
- e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{
+ e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
val: uint32(cv),
offset: e.maxMatchOff,
}
end := int32(len(d.content)) - 8 + e.maxMatchOff
for i := e.maxMatchOff + 1; i < end; i++ {
cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56)
- e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{
+ e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
val: uint32(cv),
offset: i,
}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index ba4a17e..f51ab52 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -6,17 +6,16 @@
import (
"fmt"
- "math"
- "math/bits"
)
const (
- tableBits = 15 // Bits used in the table
- tableSize = 1 << tableBits // Size of the table
- tableShardCnt = 1 << (tableBits - dictShardBits) // Number of shards in the table
- tableShardSize = tableSize / tableShardCnt // Size of an individual shard
- tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
- maxMatchLength = 131074
+ tableBits = 15 // Bits used in the table
+ tableSize = 1 << tableBits // Size of the table
+ tableShardCnt = 1 << (tableBits - dictShardBits) // Number of shards in the table
+ tableShardSize = tableSize / tableShardCnt // Size of an individual shard
+ tableFastHashLen = 6
+ tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
+ maxMatchLength = 131074
)
type tableEntry struct {
@@ -86,7 +85,7 @@
// TEMPLATE
const hashLog = tableBits
// seems global, but would be nice to tweak.
- const kSearchStrength = 7
+ const kSearchStrength = 6
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := s
@@ -103,7 +102,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -122,8 +121,8 @@
panic("offset0 was 0")
}
- nextHash := hash6(cv, hashLog)
- nextHash2 := hash6(cv>>8, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
+ nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
candidate := e.table[nextHash]
candidate2 := e.table[nextHash2]
repIndex := s - offset1 + 2
@@ -135,20 +134,7 @@
// Consider history as well.
var seq seq
var length int32
- // length = 4 + e.matchlen(s+6, repIndex+4, src)
- {
- a := src[s+6:]
- b := src[repIndex+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- length = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
-
+ length = 4 + e.matchlen(s+6, repIndex+4, src)
seq.matchLen = uint32(length - zstdMinMatch)
// We might be able to match backwards.
@@ -178,7 +164,7 @@
s += length + 2
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, length)
}
@@ -235,20 +221,7 @@
}
// Extend the 4-byte match as long as possible.
- //l := e.matchlen(s+4, t+4, src) + 4
- var l int32
- {
- a := src[s+4:]
- b := src[t+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := e.matchlen(s+4, t+4, src) + 4
// Extend backwards
tMin := s - e.maxMatchOff
@@ -285,23 +258,10 @@
if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
- //l := 4 + e.matchlen(s+4, o2+4, src)
- var l int32
- {
- a := src[s+4:]
- b := src[o2+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := 4 + e.matchlen(s+4, o2+4, src)
// Store this, since we have it.
- nextHash := hash6(cv, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
seq.matchLen = uint32(l) - zstdMinMatch
seq.litLen = 0
@@ -330,7 +290,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -343,7 +303,7 @@
inputMargin = 8
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
- if debug {
+ if debugEncoder {
if len(src) > maxBlockSize {
panic("src too big")
}
@@ -374,7 +334,7 @@
// TEMPLATE
const hashLog = tableBits
// seems global, but would be nice to tweak.
- const kSearchStrength = 8
+ const kSearchStrength = 6
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := s
@@ -391,7 +351,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -405,8 +365,8 @@
// By not using them for the first 3 matches
for {
- nextHash := hash6(cv, hashLog)
- nextHash2 := hash6(cv>>8, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
+ nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
candidate := e.table[nextHash]
candidate2 := e.table[nextHash2]
repIndex := s - offset1 + 2
@@ -417,21 +377,7 @@
if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
// Consider history as well.
var seq seq
- // length := 4 + e.matchlen(s+6, repIndex+4, src)
- // length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
- var length int32
- {
- a := src[s+6:]
- b := src[repIndex+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- length = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ length := 4 + e.matchlen(s+6, repIndex+4, src)
seq.matchLen = uint32(length - zstdMinMatch)
@@ -462,7 +408,7 @@
s += length + 2
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, length)
}
@@ -521,21 +467,7 @@
panic(fmt.Sprintf("t (%d) < 0 ", t))
}
// Extend the 4-byte match as long as possible.
- //l := e.matchlenNoHist(s+4, t+4, src) + 4
- // l := int32(matchLen(src[s+4:], src[t+4:])) + 4
- var l int32
- {
- a := src[s+4:]
- b := src[t+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := e.matchlen(s+4, t+4, src) + 4
// Extend backwards
tMin := s - e.maxMatchOff
@@ -572,24 +504,10 @@
if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) {
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
- //l := 4 + e.matchlenNoHist(s+4, o2+4, src)
- // l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
- var l int32
- {
- a := src[s+4:]
- b := src[o2+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := 4 + e.matchlen(s+4, o2+4, src)
// Store this, since we have it.
- nextHash := hash6(cv, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
seq.matchLen = uint32(l) - zstdMinMatch
seq.litLen = 0
@@ -616,7 +534,7 @@
blk.literals = append(blk.literals, src[nextEmit:]...)
blk.extraLits = len(src) - int(nextEmit)
}
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
@@ -696,7 +614,7 @@
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
- if debug {
+ if debugEncoder {
println("recent offsets:", blk.recentOffsets)
}
@@ -715,8 +633,8 @@
panic("offset0 was 0")
}
- nextHash := hash6(cv, hashLog)
- nextHash2 := hash6(cv>>8, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
+ nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
candidate := e.table[nextHash]
candidate2 := e.table[nextHash2]
repIndex := s - offset1 + 2
@@ -730,19 +648,7 @@
// Consider history as well.
var seq seq
var length int32
- // length = 4 + e.matchlen(s+6, repIndex+4, src)
- {
- a := src[s+6:]
- b := src[repIndex+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- length = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ length = 4 + e.matchlen(s+6, repIndex+4, src)
seq.matchLen = uint32(length - zstdMinMatch)
@@ -773,7 +679,7 @@
s += length + 2
nextEmit = s
if s >= sLimit {
- if debug {
+ if debugEncoder {
println("repeat ended", s, length)
}
@@ -830,20 +736,7 @@
}
// Extend the 4-byte match as long as possible.
- //l := e.matchlen(s+4, t+4, src) + 4
- var l int32
- {
- a := src[s+4:]
- b := src[t+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := e.matchlen(s+4, t+4, src) + 4
// Extend backwards
tMin := s - e.maxMatchOff
@@ -880,23 +773,10 @@
if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
- //l := 4 + e.matchlen(s+4, o2+4, src)
- var l int32
- {
- a := src[s+4:]
- b := src[o2+4:]
- endI := len(a) & (math.MaxInt32 - 7)
- l = int32(endI) + 4
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
- break
- }
- }
- }
+ l := 4 + e.matchlen(s+4, o2+4, src)
// Store this, since we have it.
- nextHash := hash6(cv, hashLog)
+ nextHash := hashLen(cv, hashLog, tableFastHashLen)
e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
e.markShardDirty(nextHash)
seq.matchLen = uint32(l) - zstdMinMatch
@@ -926,7 +806,7 @@
}
blk.recentOffsets[0] = uint32(offset1)
blk.recentOffsets[1] = uint32(offset2)
- if debug {
+ if debugEncoder {
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
}
@@ -957,9 +837,9 @@
const hashLog = tableBits
cv := load6432(d.content, i-e.maxMatchOff)
- nextHash := hash6(cv, hashLog) // 0 -> 5
- nextHash1 := hash6(cv>>8, hashLog) // 1 -> 6
- nextHash2 := hash6(cv>>16, hashLog) // 2 -> 7
+ nextHash := hashLen(cv, hashLog, tableFastHashLen) // 0 -> 5
+ nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 6
+ nextHash2 := hashLen(cv>>16, hashLog, tableFastHashLen) // 2 -> 7
e.dictTable[nextHash] = tableEntry{
val: uint32(cv),
offset: i,
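
The enc_fast.go hunks above replace the hand-inlined match-length loops with calls to e.matchlen. A self-contained sketch of the underlying technique, using assumed helper names rather than the encoder's internal load64/matchlen, is:

    package main

    import (
        "encoding/binary"
        "fmt"
        "math/bits"
    )

    // matchLen returns how many leading bytes a and b have in common.
    // It compares 8 bytes per iteration; the trailing-zero count of the XOR
    // of the first differing words locates the mismatching byte.
    func matchLen(a, b []byte) int {
        n := 0
        for len(a) >= 8 && len(b) >= 8 {
            if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
                return n + bits.TrailingZeros64(diff)>>3
            }
            a, b, n = a[8:], b[8:], n+8
        }
        for i := 0; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
            n++
        }
        return n
    }

    func main() {
        fmt.Println(matchLen([]byte("abcdefgh12XY"), []byte("abcdefgh12Z"))) // prints 10
    }
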
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index 4871dd0..7aaaedb 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -33,7 +33,7 @@
Block() *blockEnc
CRC() *xxhash.Digest
AppendCRC([]byte) []byte
- WindowSize(size int) int32
+ WindowSize(size int64) int32
UseBlock(*blockEnc)
Reset(d *dict, singleBlock bool)
}
@@ -48,6 +48,8 @@
err error
writeErr error
nWritten int64
+ nInput int64
+ frameContentSize int64
headerWritten bool
eofWritten bool
fullFrameWritten bool
@@ -96,23 +98,25 @@
if cap(s.filling) == 0 {
s.filling = make([]byte, 0, e.o.blockSize)
}
- if cap(s.current) == 0 {
- s.current = make([]byte, 0, e.o.blockSize)
- }
- if cap(s.previous) == 0 {
- s.previous = make([]byte, 0, e.o.blockSize)
+ if e.o.concurrent > 1 {
+ if cap(s.current) == 0 {
+ s.current = make([]byte, 0, e.o.blockSize)
+ }
+ if cap(s.previous) == 0 {
+ s.previous = make([]byte, 0, e.o.blockSize)
+ }
+ s.current = s.current[:0]
+ s.previous = s.previous[:0]
+ if s.writing == nil {
+ s.writing = &blockEnc{lowMem: e.o.lowMem}
+ s.writing.init()
+ }
+ s.writing.initNewEncode()
}
if s.encoder == nil {
s.encoder = e.o.encoder()
}
- if s.writing == nil {
- s.writing = &blockEnc{lowMem: e.o.lowMem}
- s.writing.init()
- }
- s.writing.initNewEncode()
s.filling = s.filling[:0]
- s.current = s.current[:0]
- s.previous = s.previous[:0]
s.encoder.Reset(e.o.dict, false)
s.headerWritten = false
s.eofWritten = false
@@ -120,7 +124,21 @@
s.w = w
s.err = nil
s.nWritten = 0
+ s.nInput = 0
s.writeErr = nil
+ s.frameContentSize = 0
+}
+
+// ResetContentSize will reset and set a content size for the next stream.
+// If the bytes written do not match the given size, an error will be returned
+// when calling Close().
+// This is removed when Reset is called.
+// Sizes <= 0 result in no content size set.
+func (e *Encoder) ResetContentSize(w io.Writer, size int64) {
+ e.Reset(w)
+ if size >= 0 {
+ e.state.frameContentSize = size
+ }
}
// Write data to the encoder.
@@ -190,6 +208,7 @@
return s.err
}
s.nWritten += int64(n2)
+ s.nInput += int64(len(s.filling))
s.current = s.current[:0]
s.filling = s.filling[:0]
s.headerWritten = true
@@ -200,8 +219,8 @@
var tmp [maxHeaderSize]byte
fh := frameHeader{
- ContentSize: 0,
- WindowSize: uint32(s.encoder.WindowSize(0)),
+ ContentSize: uint64(s.frameContentSize),
+ WindowSize: uint32(s.encoder.WindowSize(s.frameContentSize)),
SingleSegment: false,
Checksum: e.o.crc,
DictID: e.o.dict.ID(),
@@ -241,11 +260,52 @@
return s.err
}
+ // SYNC:
+ if e.o.concurrent == 1 {
+ src := s.filling
+ s.nInput += int64(len(s.filling))
+ if debugEncoder {
+ println("Adding sync block,", len(src), "bytes, final:", final)
+ }
+ enc := s.encoder
+ blk := enc.Block()
+ blk.reset(nil)
+ enc.Encode(blk, src)
+ blk.last = final
+ if final {
+ s.eofWritten = true
+ }
+
+ err := errIncompressible
+ // If we got the exact same number of literals as input,
+ // assume the literals cannot be compressed.
+ if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+ err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+ }
+ switch err {
+ case errIncompressible:
+ if debugEncoder {
+ println("Storing incompressible block as raw")
+ }
+ blk.encodeRaw(src)
+ // In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
+ case nil:
+ default:
+ s.err = err
+ return err
+ }
+ _, s.err = s.w.Write(blk.output)
+ s.nWritten += int64(len(blk.output))
+ s.filling = s.filling[:0]
+ return s.err
+ }
+
// Move blocks forward.
s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
+ s.nInput += int64(len(s.current))
s.wg.Add(1)
go func(src []byte) {
- if debug {
+ if debugEncoder {
println("Adding block,", len(src), "bytes, final:", final)
}
defer func() {
@@ -290,7 +350,7 @@
}
switch err {
case errIncompressible:
- if debug {
+ if debugEncoder {
println("Storing incompressible block as raw")
}
blk.encodeRaw(src)
@@ -313,7 +373,7 @@
//
// The Copy function uses ReaderFrom if available.
func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
- if debug {
+ if debugEncoder {
println("Using ReadFrom")
}
@@ -336,20 +396,20 @@
switch err {
case io.EOF:
e.state.filling = e.state.filling[:len(e.state.filling)-len(src)]
- if debug {
+ if debugEncoder {
println("ReadFrom: got EOF final block:", len(e.state.filling))
}
return n, nil
case nil:
default:
- if debug {
+ if debugEncoder {
println("ReadFrom: got error:", err)
}
e.state.err = err
return n, err
}
if len(src) > 0 {
- if debug {
+ if debugEncoder {
println("ReadFrom: got space left in source:", len(src))
}
continue
@@ -394,6 +454,11 @@
if err != nil {
return err
}
+ if s.frameContentSize > 0 {
+ if s.nInput != s.frameContentSize {
+ return fmt.Errorf("frame content size %d given, but %d bytes was written", s.frameContentSize, s.nInput)
+ }
+ }
if e.state.fullFrameWritten {
return s.err
}
@@ -463,14 +528,14 @@
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
- // Use single segments when above minimum window and below 1MB.
- single := len(src) < 1<<20 && len(src) > MinWindowSize
+ // Use single segments when above minimum window and below window size.
+ single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
fh := frameHeader{
ContentSize: uint64(len(src)),
- WindowSize: uint32(enc.WindowSize(len(src))),
+ WindowSize: uint32(enc.WindowSize(int64(len(src)))),
SingleSegment: single,
Checksum: e.o.crc,
DictID: e.o.dict.ID(),
@@ -486,7 +551,7 @@
}
// If we can do everything in one block, prefer that.
- if len(src) <= maxCompressedBlockSize {
+ if len(src) <= e.o.blockSize {
enc.Reset(e.o.dict, true)
// Slightly faster with no history and everything in one block.
if e.o.crc {
@@ -512,7 +577,7 @@
switch err {
case errIncompressible:
- if debug {
+ if debugEncoder {
println("Storing incompressible block as raw")
}
dst = blk.encodeRawTo(dst, src)
@@ -548,7 +613,7 @@
switch err {
case errIncompressible:
- if debug {
+ if debugEncoder {
println("Storing incompressible block as raw")
}
dst = blk.encodeRawTo(dst, todo)
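
Taken together, the encoder.go changes above add a fully synchronous streaming path (selected by WithEncoderConcurrency(1)) and a ResetContentSize method that records the expected frame size and verifies it on Close. A usage sketch based only on the API shown in this hunk, with error handling kept minimal:

    package main

    import (
        "bytes"
        "fmt"

        "github.com/klauspost/compress/zstd"
    )

    func main() {
        payload := []byte("example payload")
        var out bytes.Buffer

        // Concurrency 1 selects the new synchronous block path: each block is
        // compressed and written inline, without spawning goroutines.
        enc, err := zstd.NewWriter(&out, zstd.WithEncoderConcurrency(1))
        if err != nil {
            panic(err)
        }

        // ResetContentSize records the expected frame content size in the
        // header; Close returns an error if the written byte count differs.
        enc.ResetContentSize(&out, int64(len(payload)))
        if _, err := enc.Write(payload); err != nil {
            panic(err)
        }
        if err := enc.Close(); err != nil {
            panic(err)
        }
        fmt.Println("compressed bytes:", out.Len())
    }
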
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 16d4ab6..a7c5e1a 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -24,6 +24,7 @@
allLitEntropy bool
customWindow bool
customALEntropy bool
+ customBlockSize bool
lowMem bool
dict *dict
}
@@ -33,7 +34,7 @@
concurrent: runtime.GOMAXPROCS(0),
crc: true,
single: nil,
- blockSize: 1 << 16,
+ blockSize: maxCompressedBlockSize,
windowSize: 8 << 20,
level: SpeedDefault,
allLitEntropy: true,
@@ -75,6 +76,7 @@
// WithEncoderConcurrency will set the concurrency,
// meaning the maximum number of encoders to run concurrently.
// The value supplied must be at least 1.
+// For streams, setting a value of 1 will disable async compression.
// By default this will be set to GOMAXPROCS.
func WithEncoderConcurrency(n int) EOption {
return func(o *encoderOptions) error {
@@ -106,6 +108,7 @@
o.customWindow = true
if o.blockSize > o.windowSize {
o.blockSize = o.windowSize
+ o.customBlockSize = true
}
return nil
}
@@ -188,10 +191,9 @@
return SpeedDefault
case level >= 6 && level < 10:
return SpeedBetterCompression
- case level >= 10:
- return SpeedBetterCompression
+ default:
+ return SpeedBestCompression
}
- return SpeedDefault
}
// String provides a string representation of the compression level.
@@ -222,6 +224,9 @@
switch o.level {
case SpeedFastest:
o.windowSize = 4 << 20
+ if !o.customBlockSize {
+ o.blockSize = 1 << 16
+ }
case SpeedDefault:
o.windowSize = 8 << 20
case SpeedBetterCompression:
@@ -278,7 +283,7 @@
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
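
The level-mapping hunk above (assumed to be inside EncoderLevelFromZstd; the enclosing function is not shown in this diff) now sends levels 10 and higher to SpeedBestCompression instead of SpeedBetterCompression. A small hedged check:

    package main

    import (
        "fmt"

        "github.com/klauspost/compress/zstd"
    )

    func main() {
        // Levels 1, 5 and 9 map as before; 19 now selects SpeedBestCompression.
        for _, lvl := range []int{1, 5, 9, 19} {
            fmt.Println(lvl, "->", zstd.EncoderLevelFromZstd(lvl))
        }
    }
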
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 693c5f0..9568a4b 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -8,27 +8,17 @@
"bytes"
"encoding/hex"
"errors"
- "hash"
"io"
- "sync"
"github.com/klauspost/compress/zstd/internal/xxhash"
)
type frameDec struct {
- o decoderOptions
- crc hash.Hash64
- offset int64
+ o decoderOptions
+ crc *xxhash.Digest
WindowSize uint64
- // maxWindowSize is the maximum windows size to support.
- // should never be bigger than max-int.
- maxWindowSize uint64
-
- // In order queue of blocks being decoded.
- decoding chan *blockDec
-
// Frame history passed between blocks
history history
@@ -38,20 +28,18 @@
bBuf byteBuf
FrameContentSize uint64
- frameDone sync.WaitGroup
DictionaryID *uint32
HasCheckSum bool
SingleSegment bool
-
- // asyncRunning indicates whether the async routine processes input on 'decoding'.
- asyncRunningMu sync.Mutex
- asyncRunning bool
}
const (
- // The minimum Window_Size is 1 KB.
+ // MinWindowSize is the minimum Window Size, which is 1 KB.
MinWindowSize = 1 << 10
+
+ // MaxWindowSize is the maximum encoder window size
+ // and the default decoder maximum window size.
MaxWindowSize = 1 << 29
)
@@ -61,12 +49,11 @@
)
func newFrameDec(o decoderOptions) *frameDec {
- d := frameDec{
- o: o,
- maxWindowSize: MaxWindowSize,
+ if o.maxWindowSize > o.maxDecodedSize {
+ o.maxWindowSize = o.maxDecodedSize
}
- if d.maxWindowSize > o.maxDecodedSize {
- d.maxWindowSize = o.maxDecodedSize
+ d := frameDec{
+ o: o,
}
return &d
}
@@ -78,44 +65,68 @@
func (d *frameDec) reset(br byteBuffer) error {
d.HasCheckSum = false
d.WindowSize = 0
- var b []byte
+ var signature [4]byte
for {
- b = br.readSmall(4)
- if b == nil {
+ var err error
+ // Check if we can read more...
+ b, err := br.readSmall(1)
+ switch err {
+ case io.EOF, io.ErrUnexpectedEOF:
return io.EOF
+ default:
+ return err
+ case nil:
+ signature[0] = b[0]
}
- if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
- if debug {
- println("Not skippable", hex.EncodeToString(b), hex.EncodeToString(skippableFrameMagic))
+ // Read the rest, don't allow io.ErrUnexpectedEOF
+ b, err = br.readSmall(3)
+ switch err {
+ case io.EOF:
+ return io.EOF
+ default:
+ return err
+ case nil:
+ copy(signature[1:], b)
+ }
+
+ if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
+ if debugDecoder {
+ println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
}
// Break if not skippable frame.
break
}
// Read size to skip
- b = br.readSmall(4)
- if b == nil {
- println("Reading Frame Size EOF")
- return io.ErrUnexpectedEOF
+ b, err = br.readSmall(4)
+ if err != nil {
+ if debugDecoder {
+ println("Reading Frame Size", err)
+ }
+ return err
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
- err := br.skipN(int(n))
+ err = br.skipN(int64(n))
if err != nil {
- if debug {
+ if debugDecoder {
println("Reading discarded frame", err)
}
return err
}
}
- if !bytes.Equal(b, frameMagic) {
- println("Got magic numbers: ", b, "want:", frameMagic)
+ if !bytes.Equal(signature[:], frameMagic) {
+ if debugDecoder {
+ println("Got magic numbers: ", signature, "want:", frameMagic)
+ }
return ErrMagicMismatch
}
// Read Frame_Header_Descriptor
fhd, err := br.readByte()
if err != nil {
- println("Reading Frame_Header_Descriptor", err)
+ if debugDecoder {
+ println("Reading Frame_Header_Descriptor", err)
+ }
return err
}
d.SingleSegment = fhd&(1<<5) != 0
@@ -130,7 +141,9 @@
if !d.SingleSegment {
wd, err := br.readByte()
if err != nil {
- println("Reading Window_Descriptor", err)
+ if debugDecoder {
+ println("Reading Window_Descriptor", err)
+ }
return err
}
printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
@@ -147,12 +160,11 @@
if size == 3 {
size = 4
}
- b = br.readSmall(int(size))
- if b == nil {
- if debug {
- println("Reading Dictionary_ID", io.ErrUnexpectedEOF)
- }
- return io.ErrUnexpectedEOF
+
+ b, err := br.readSmall(int(size))
+ if err != nil {
+ println("Reading Dictionary_ID", err)
+ return err
}
var id uint32
switch size {
@@ -163,7 +175,7 @@
case 4:
id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
}
- if debug {
+ if debugDecoder {
println("Dict size", size, "ID:", id)
}
if id > 0 {
@@ -185,12 +197,12 @@
default:
fcsSize = 1 << v
}
- d.FrameContentSize = 0
+ d.FrameContentSize = fcsUnknown
if fcsSize > 0 {
- b := br.readSmall(fcsSize)
- if b == nil {
- println("Reading Frame content", io.ErrUnexpectedEOF)
- return io.ErrUnexpectedEOF
+ b, err := br.readSmall(fcsSize)
+ if err != nil {
+ println("Reading Frame content", err)
+ return err
}
switch fcsSize {
case 1:
@@ -205,10 +217,11 @@
d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
}
- if debug {
- println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize)
+ if debugDecoder {
+ println("Read FCS:", d.FrameContentSize)
}
}
+
// Move this to shared.
d.HasCheckSum = fhd&(1<<2) != 0
if d.HasCheckSum {
@@ -218,29 +231,47 @@
d.crc.Reset()
}
+ if d.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrWindowSizeExceeded
+ }
+
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
+ if d.WindowSize > d.o.maxDecodedSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrDecoderSizeExceeded
+ }
}
- if d.WindowSize > d.maxWindowSize {
- printf("window size %d > max %d\n", d.WindowSize, d.maxWindowSize)
- return ErrWindowSizeExceeded
- }
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
- println("got window size: ", d.WindowSize)
+ if debugDecoder {
+ println("got window size: ", d.WindowSize)
+ }
return ErrWindowSizeTooSmall
}
d.history.windowSize = int(d.WindowSize)
- if d.o.lowMem && d.history.windowSize < maxBlockSize {
- d.history.maxSize = d.history.windowSize * 2
+ if !d.o.lowMem || d.history.windowSize < maxBlockSize {
+ // Alloc 2x window size if not low-mem, or very small window size.
+ d.history.allocFrameBuffer = d.history.windowSize * 2
} else {
- d.history.maxSize = d.history.windowSize + maxBlockSize
+ // Alloc with one additional block
+ d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
+
+ if debugDecoder {
+ println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum)
+ }
+
// history contains input - maybe we do something
d.rawInput = br
return nil
@@ -248,56 +279,37 @@
// next will start decoding the next block from stream.
func (d *frameDec) next(block *blockDec) error {
- if debug {
- printf("decoding new block %p:%p", block, block.data)
+ if debugDecoder {
+ println("decoding new block")
}
err := block.reset(d.rawInput, d.WindowSize)
if err != nil {
println("block error:", err)
// Signal the frame decoder we have a problem.
- d.sendErr(block, err)
+ block.sendErr(err)
return err
}
- block.input <- struct{}{}
- if debug {
- println("next block:", block)
- }
- d.asyncRunningMu.Lock()
- defer d.asyncRunningMu.Unlock()
- if !d.asyncRunning {
- return nil
- }
- if block.Last {
- // We indicate the frame is done by sending io.EOF
- d.decoding <- block
- return io.EOF
- }
- d.decoding <- block
return nil
}
-// sendEOF will queue an error block on the frame.
-// This will cause the frame decoder to return when it encounters the block.
-// Returns true if the decoder was added.
-func (d *frameDec) sendErr(block *blockDec, err error) bool {
- d.asyncRunningMu.Lock()
- defer d.asyncRunningMu.Unlock()
- if !d.asyncRunning {
- return false
- }
-
- println("sending error", err.Error())
- block.sendErr(err)
- d.decoding <- block
- return true
-}
-
// checkCRC will check the checksum if the frame has one.
// Will return ErrCRCMismatch if crc check failed, otherwise nil.
func (d *frameDec) checkCRC() error {
if !d.HasCheckSum {
return nil
}
+
+ // We can overwrite upper tmp now
+ want, err := d.rawInput.readSmall(4)
+ if err != nil {
+ println("CRC missing?", err)
+ return err
+ }
+
+ if d.o.ignoreChecksum {
+ return nil
+ }
+
var tmp [4]byte
got := d.crc.Sum64()
// Flip to match file order.
@@ -306,142 +318,29 @@
tmp[2] = byte(got >> 16)
tmp[3] = byte(got >> 24)
- // We can overwrite upper tmp now
- want := d.rawInput.readSmall(4)
- if want == nil {
- println("CRC missing?")
- return io.ErrUnexpectedEOF
- }
-
if !bytes.Equal(tmp[:], want) {
- if debug {
+ if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
}
return ErrCRCMismatch
}
- if debug {
+ if debugDecoder {
println("CRC ok", tmp[:])
}
return nil
}
-func (d *frameDec) initAsync() {
- if !d.o.lowMem && !d.SingleSegment {
- // set max extra size history to 10MB.
- d.history.maxSize = d.history.windowSize + maxBlockSize*5
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+ if d.HasCheckSum {
+ _, err := d.rawInput.readSmall(4)
+ if err != nil {
+ println("CRC missing?", err)
+ return err
+ }
}
- // re-alloc if more than one extra block size.
- if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
- d.history.b = make([]byte, 0, d.history.maxSize)
- }
- if cap(d.history.b) < d.history.maxSize {
- d.history.b = make([]byte, 0, d.history.maxSize)
- }
- if cap(d.decoding) < d.o.concurrent {
- d.decoding = make(chan *blockDec, d.o.concurrent)
- }
- if debug {
- h := d.history
- printf("history init. len: %d, cap: %d", len(h.b), cap(h.b))
- }
- d.asyncRunningMu.Lock()
- d.asyncRunning = true
- d.asyncRunningMu.Unlock()
-}
-// startDecoder will start decoding blocks and write them to the writer.
-// The decoder will stop as soon as an error occurs or at end of frame.
-// When the frame has finished decoding the *bufio.Reader
-// containing the remaining input will be sent on frameDec.frameDone.
-func (d *frameDec) startDecoder(output chan decodeOutput) {
- written := int64(0)
-
- defer func() {
- d.asyncRunningMu.Lock()
- d.asyncRunning = false
- d.asyncRunningMu.Unlock()
-
- // Drain the currently decoding.
- d.history.error = true
- flushdone:
- for {
- select {
- case b := <-d.decoding:
- b.history <- &d.history
- output <- <-b.result
- default:
- break flushdone
- }
- }
- println("frame decoder done, signalling done")
- d.frameDone.Done()
- }()
- // Get decoder for first block.
- block := <-d.decoding
- block.history <- &d.history
- for {
- var next *blockDec
- // Get result
- r := <-block.result
- if r.err != nil {
- println("Result contained error", r.err)
- output <- r
- return
- }
- if debug {
- println("got result, from ", d.offset, "to", d.offset+int64(len(r.b)))
- d.offset += int64(len(r.b))
- }
- if !block.Last {
- // Send history to next block
- select {
- case next = <-d.decoding:
- if debug {
- println("Sending ", len(d.history.b), "bytes as history")
- }
- next.history <- &d.history
- default:
- // Wait until we have sent the block, so
- // other decoders can potentially get the decoder.
- next = nil
- }
- }
-
- // Add checksum, async to decoding.
- if d.HasCheckSum {
- n, err := d.crc.Write(r.b)
- if err != nil {
- r.err = err
- if n != len(r.b) {
- r.err = io.ErrShortWrite
- }
- output <- r
- return
- }
- }
- written += int64(len(r.b))
- if d.SingleSegment && uint64(written) > d.FrameContentSize {
- println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize)
- r.err = ErrFrameSizeExceeded
- output <- r
- return
- }
- if block.Last {
- r.err = d.checkCRC()
- output <- r
- return
- }
- output <- r
- if next == nil {
- // There was no decoder available, we wait for one now that we have sent to the writer.
- if debug {
- println("Sending ", len(d.history.b), " bytes as history")
- }
- next = <-d.decoding
- next.history <- &d.history
- }
- block = next
- }
+ return nil
}
// runDecoder will create a sync decoder that will decode a block of data.
@@ -450,41 +349,67 @@
// We use the history for output to avoid copying it.
d.history.b = dst
+ d.history.ignoreBuffer = len(dst)
// Store input length, so we only check new data.
crcStart := len(dst)
+ d.history.decoders.maxSyncLen = 0
+ if d.FrameContentSize != fcsUnknown {
+ d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+ if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+ return dst, ErrDecoderSizeExceeded
+ }
+ if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+ // Alloc for output
+ dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+ copy(dst2, dst)
+ dst = dst2
+ }
+ }
var err error
for {
err = dec.reset(d.rawInput, d.WindowSize)
if err != nil {
break
}
- if debug {
+ if debugDecoder {
println("next block:", dec)
}
err = dec.decodeBuf(&d.history)
- if err != nil || dec.Last {
+ if err != nil {
break
}
if uint64(len(d.history.b)) > d.o.maxDecodedSize {
err = ErrDecoderSizeExceeded
break
}
- if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
- println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+ if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+ println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
err = ErrFrameSizeExceeded
break
}
+ if dec.Last {
+ break
+ }
+ if debugDecoder {
+ println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
+ }
}
dst = d.history.b
if err == nil {
- if d.HasCheckSum {
- var n int
- n, err = d.crc.Write(dst[crcStart:])
- if err == nil {
- if n != len(dst)-crcStart {
- err = io.ErrShortWrite
- } else {
- err = d.checkCRC()
+ if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+ err = ErrFrameSizeMismatch
+ } else if d.HasCheckSum {
+ if d.o.ignoreChecksum {
+ err = d.consumeCRC()
+ } else {
+ var n int
+ n, err = d.crc.Write(dst[crcStart:])
+ if err == nil {
+ if n != len(dst)-crcStart {
+ err = io.ErrShortWrite
+ } else {
+ err = d.checkCRC()
+ }
}
}
}
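
The framedec.go rework above drops the per-frame maxWindowSize field in favour of the decoder option value, rejects frames whose window exceeds that limit, and fails a frame whose decoded length disagrees with its declared FrameContentSize. A round-trip sketch; WithDecoderMaxWindow is assumed to be the option that feeds o.maxWindowSize, as it is not shown in this diff:

    package main

    import (
        "fmt"

        "github.com/klauspost/compress/zstd"
    )

    func main() {
        payload := []byte("frame with a declared content size")

        enc, _ := zstd.NewWriter(nil)
        compressed := enc.EncodeAll(payload, nil) // header carries ContentSize
        _ = enc.Close()

        // Frames requesting a window above the configured maximum fail with
        // ErrWindowSizeExceeded; a decoded size that disagrees with the
        // declared FrameContentSize now fails the frame as well.
        dec, err := zstd.NewReader(nil, zstd.WithDecoderMaxWindow(8<<20))
        if err != nil {
            panic(err)
        }
        defer dec.Close()

        out, err := dec.DecodeAll(compressed, nil)
        fmt.Println(string(out), err)
    }
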
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e6d3d49..2f8860a 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
package zstd
import (
+ "encoding/binary"
"errors"
"fmt"
+ "io"
)
const (
@@ -178,10 +180,32 @@
return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
}
b.advance((bitCount + 7) >> 3)
- // println(s.norm[:s.symbolLen], s.symbolLen)
return s.buildDtable()
}
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+ fatalErr := func(err error) {
+ if err != nil {
+ panic(err)
+ }
+ }
+ // dt [maxTablesize]decSymbol // Decompression table.
+ // symbolLen uint16 // Length of active part of the symbol table.
+ // actualTableLog uint8 // Selected tablelog.
+ // maxBits uint8 // Maximum number of additional bits
+ // // used for table creation to avoid allocations.
+ // stateTable [256]uint16
+ // norm [maxSymbolValue + 1]int16
+ // preDefined bool
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
// decSymbol contains information about a state entry,
// Including the state offset base, the output symbol and
// the number of bits to read for the low part of the destination state.
@@ -204,18 +228,10 @@
return uint16(d >> 16)
}
-func (d decSymbol) baseline() uint32 {
- return uint32(d >> 32)
-}
-
func (d decSymbol) baselineInt() int {
return int(d >> 32)
}
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
- *d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
func (d *decSymbol) setNBits(nBits uint8) {
const mask = 0xffffffffffffff00
*d = (*d & mask) | decSymbol(nBits)
@@ -231,11 +247,6 @@
*d = (*d & mask) | decSymbol(state)<<16
}
-func (d *decSymbol) setBaseline(baseline uint32) {
- const mask = 0xffffffff
- *d = (*d & mask) | decSymbol(baseline)<<32
-}
-
func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
const mask = 0xffff00ff
*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -257,68 +268,6 @@
s.dt[0] = symbol
}
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
- tableSize := uint32(1 << s.actualTableLog)
- highThreshold := tableSize - 1
- symbolNext := s.stateTable[:256]
-
- // Init, lay down lowprob symbols
- {
- for i, v := range s.norm[:s.symbolLen] {
- if v == -1 {
- s.dt[highThreshold].setAddBits(uint8(i))
- highThreshold--
- symbolNext[i] = 1
- } else {
- symbolNext[i] = uint16(v)
- }
- }
- }
- // Spread symbols
- {
- tableMask := tableSize - 1
- step := tableStep(tableSize)
- position := uint32(0)
- for ss, v := range s.norm[:s.symbolLen] {
- for i := 0; i < int(v); i++ {
- s.dt[position].setAddBits(uint8(ss))
- position = (position + step) & tableMask
- for position > highThreshold {
- // lowprob area
- position = (position + step) & tableMask
- }
- }
- }
- if position != 0 {
- // position must reach all cells once, otherwise normalizedCounter is incorrect
- return errors.New("corrupted input (position != 0)")
- }
- }
-
- // Build Decoding table
- {
- tableSize := uint16(1 << s.actualTableLog)
- for u, v := range s.dt[:tableSize] {
- symbol := v.addBits()
- nextState := symbolNext[symbol]
- symbolNext[symbol] = nextState + 1
- nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
- s.dt[u&maxTableMask].setNBits(nBits)
- newState := (nextState << nBits) - tableSize
- if newState > tableSize {
- return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
- }
- if newState == uint16(u) && nBits == 0 {
- // Seems weird that this is possible with nbits > 0.
- return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
- }
- s.dt[u&maxTableMask].setNewState(newState)
- }
- }
- return nil
-}
-
// transform will transform the decoder table into a table usable for
// decoding without having to apply the transformation while decoding.
// The state will contain the base value and the number of bits to read.
@@ -352,34 +301,7 @@
s.state = dt[br.getBits(tableLog)]
}
-// next returns the current symbol and sets the next state.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) next(br *bitReader) {
- lowBits := uint16(br.getBits(s.state.nbBits()))
- s.state = s.dt[s.state.newState()+lowBits]
-}
-
-// finished returns true if all bits have been read from the bitstream
-// and the next state would require reading bits from the input.
-func (s *fseState) finished(br *bitReader) bool {
- return br.finished() && s.state.nbBits() > 0
-}
-
-// final returns the current state symbol without decoding the next.
-func (s *fseState) final() (int, uint8) {
- return s.state.baselineInt(), s.state.addBits()
-}
-
// final returns the current state symbol without decoding the next.
func (s decSymbol) final() (int, uint8) {
return s.baselineInt(), s.addBits()
}
-
-// nextFast returns the next symbol and sets the next state.
-// This can only be used if no symbols are 0 bits.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
- lowBits := uint16(br.getBitsFast(s.state.nbBits()))
- s.state = s.dt[s.state.newState()+lowBits]
- return s.state.baseline(), s.state.addBits()
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
new file mode 100644
index 0000000..c881d28
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -0,0 +1,64 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+ "fmt"
+)
+
+type buildDtableAsmContext struct {
+ // inputs
+ stateTable *uint16
+ norm *int16
+ dt *uint64
+
+ // outputs --- set by the procedure in the case of error;
+ // for interpretation please see the error handling part below
+ errParam1 uint64
+ errParam2 uint64
+}
+
+// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
+// Function returns non-zero exit code on error.
+// go:noescape
+func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+
+// please keep in sync with _generate/gen_fse.go
+const (
+ errorCorruptedNormalizedCounter = 1
+ errorNewStateTooBig = 2
+ errorNewStateNoBits = 3
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ ctx := buildDtableAsmContext{
+ stateTable: &s.stateTable[0],
+ norm: &s.norm[0],
+ dt: (*uint64)(&s.dt[0]),
+ }
+ code := buildDtable_asm(s, &ctx)
+
+ if code != 0 {
+ switch code {
+ case errorCorruptedNormalizedCounter:
+ position := ctx.errParam1
+ return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
+
+ case errorNewStateTooBig:
+ newState := decSymbol(ctx.errParam1)
+ size := ctx.errParam2
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
+
+ case errorNewStateNoBits:
+ newState := decSymbol(ctx.errParam1)
+ oldState := decSymbol(ctx.errParam2)
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
+
+ default:
+ return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
new file mode 100644
index 0000000..da32b44
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@@ -0,0 +1,127 @@
+// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+TEXT ·buildDtable_asm(SB), $0-24
+ MOVQ ctx+8(FP), CX
+ MOVQ s+0(FP), DI
+
+ // Load values
+ MOVBQZX 4098(DI), DX
+ XORQ AX, AX
+ BTSQ DX, AX
+ MOVQ (CX), BX
+ MOVQ 16(CX), SI
+ LEAQ -1(AX), R8
+ MOVQ 8(CX), CX
+ MOVWQZX 4096(DI), DI
+
+ // End load values
+ // Init, lay down lowprob symbols
+ XORQ R9, R9
+ JMP init_main_loop_condition
+
+init_main_loop:
+ MOVWQSX (CX)(R9*2), R10
+ CMPW R10, $-1
+ JNE do_not_update_high_threshold
+ MOVB R9, 1(SI)(R8*8)
+ DECQ R8
+ MOVQ $0x0000000000000001, R10
+
+do_not_update_high_threshold:
+ MOVW R10, (BX)(R9*2)
+ INCQ R9
+
+init_main_loop_condition:
+ CMPQ R9, DI
+ JL init_main_loop
+
+ // Spread symbols
+ // Calculate table step
+ MOVQ AX, R9
+ SHRQ $0x01, R9
+ MOVQ AX, R10
+ SHRQ $0x03, R10
+ LEAQ 3(R9)(R10*1), R9
+
+ // Fill add bits values
+ LEAQ -1(AX), R10
+ XORQ R11, R11
+ XORQ R12, R12
+ JMP spread_main_loop_condition
+
+spread_main_loop:
+ XORQ R13, R13
+ MOVWQSX (CX)(R12*2), R14
+ JMP spread_inner_loop_condition
+
+spread_inner_loop:
+ MOVB R12, 1(SI)(R11*8)
+
+adjust_position:
+ ADDQ R9, R11
+ ANDQ R10, R11
+ CMPQ R11, R8
+ JG adjust_position
+ INCQ R13
+
+spread_inner_loop_condition:
+ CMPQ R13, R14
+ JL spread_inner_loop
+ INCQ R12
+
+spread_main_loop_condition:
+ CMPQ R12, DI
+ JL spread_main_loop
+ TESTQ R11, R11
+ JZ spread_check_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R11, 24(AX)
+ MOVQ $+1, ret+16(FP)
+ RET
+
+spread_check_ok:
+ // Build Decoding table
+ XORQ DI, DI
+
+build_table_main_table:
+ MOVBQZX 1(SI)(DI*8), CX
+ MOVWQZX (BX)(CX*2), R8
+ LEAQ 1(R8), R9
+ MOVW R9, (BX)(CX*2)
+ MOVQ R8, R9
+ BSRQ R9, R9
+ MOVQ DX, CX
+ SUBQ R9, CX
+ SHLQ CL, R8
+ SUBQ AX, R8
+ MOVB CL, (SI)(DI*8)
+ MOVW R8, 2(SI)(DI*8)
+ CMPQ R8, AX
+ JLE build_table_check1_ok
+ MOVQ ctx+8(FP), CX
+ MOVQ R8, 24(CX)
+ MOVQ AX, 32(CX)
+ MOVQ $+2, ret+16(FP)
+ RET
+
+build_table_check1_ok:
+ TESTB CL, CL
+ JNZ build_table_check2_ok
+ CMPW R8, DI
+ JNE build_table_check2_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R8, 24(AX)
+ MOVQ DI, 32(AX)
+ MOVQ $+3, ret+16(FP)
+ RET
+
+build_table_check2_ok:
+ INCQ DI
+ CMPQ DI, AX
+ JL build_table_main_table
+ MOVQ $+0, ret+16(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
new file mode 100644
index 0000000..332e51f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
@@ -0,0 +1,72 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+ "errors"
+ "fmt"
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ tableSize := uint32(1 << s.actualTableLog)
+ highThreshold := tableSize - 1
+ symbolNext := s.stateTable[:256]
+
+ // Init, lay down lowprob symbols
+ {
+ for i, v := range s.norm[:s.symbolLen] {
+ if v == -1 {
+ s.dt[highThreshold].setAddBits(uint8(i))
+ highThreshold--
+ symbolNext[i] = 1
+ } else {
+ symbolNext[i] = uint16(v)
+ }
+ }
+ }
+
+ // Spread symbols
+ {
+ tableMask := tableSize - 1
+ step := tableStep(tableSize)
+ position := uint32(0)
+ for ss, v := range s.norm[:s.symbolLen] {
+ for i := 0; i < int(v); i++ {
+ s.dt[position].setAddBits(uint8(ss))
+ position = (position + step) & tableMask
+ for position > highThreshold {
+ // lowprob area
+ position = (position + step) & tableMask
+ }
+ }
+ }
+ if position != 0 {
+ // position must reach all cells once, otherwise normalizedCounter is incorrect
+ return errors.New("corrupted input (position != 0)")
+ }
+ }
+
+ // Build Decoding table
+ {
+ tableSize := uint16(1 << s.actualTableLog)
+ for u, v := range s.dt[:tableSize] {
+ symbol := v.addBits()
+ nextState := symbolNext[symbol]
+ symbolNext[symbol] = nextState + 1
+ nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+ s.dt[u&maxTableMask].setNBits(nBits)
+ newState := (nextState << nBits) - tableSize
+ if newState > tableSize {
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+ }
+ if newState == uint16(u) && nBits == 0 {
+ // Seems weird that this is possible with nbits > 0.
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+ }
+ s.dt[u&maxTableMask].setNewState(newState)
+ }
+ }
+ return nil
+}
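
Both buildDtable implementations above (assembly and generic) spread symbols with the same step, step = (tableSize>>1) + (tableSize>>3) + 3, which is coprime with the power-of-two table sizes used here and therefore visits every cell exactly once. A small verification sketch of that formula:

    package main

    import "fmt"

    // tableStep mirrors the spread step used by buildDtable:
    // (tableSize >> 1) + (tableSize >> 3) + 3.
    func tableStep(tableSize uint32) uint32 {
        return (tableSize >> 1) + (tableSize >> 3) + 3
    }

    func main() {
        const size = 64
        step := tableStep(size)
        seen := make(map[uint32]bool, size)
        pos := uint32(0)
        for i := 0; i < size; i++ {
            seen[pos] = true
            pos = (pos + step) & (size - 1)
        }
        // With size 64 the step is 43: all 64 cells are visited and the walk
        // returns to position 0, the condition buildDtable checks for.
        fmt.Println("step:", step, "cells visited:", len(seen), "back at 0:", pos == 0)
    }
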
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index c74681b..ab26326 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -62,9 +62,8 @@
// To indicate that you have populated the histogram call HistogramFinished
// with the value of the highest populated symbol, as well as the number of entries
// in the most populated entry. These are accepted at face value.
-// The returned slice will always be length 256.
-func (s *fseEncoder) Histogram() []uint32 {
- return s.count[:]
+func (s *fseEncoder) Histogram() *[256]uint32 {
+ return &s.count
}
// HistogramFinished can be called to indicate that the histogram has been populated.
@@ -77,21 +76,6 @@
s.clearCount = maxCount != 0
}
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
-func (s *fseEncoder) prepare() (*fseEncoder, error) {
- if s == nil {
- s = &fseEncoder{}
- }
- s.useRLE = false
- if s.clearCount && s.maxCount == 0 {
- for i := range s.count {
- s.count[i] = 0
- }
- s.clearCount = false
- }
- return s, nil
-}
-
// allocCtable will allocate tables needed for compression.
// If existing tables are big enough, they are simply re-used.
func (s *fseEncoder) allocCtable() {
@@ -229,7 +213,7 @@
deltaFindState: 0,
deltaNbBits: 0,
}
- if debug {
+ if debugEncoder {
println("setRLE: val", val, "symbolTT", s.ct.symbolTT[val])
}
s.rleVal = val
@@ -710,14 +694,6 @@
c.state = c.stateTable[lu]
}
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encode(symbolTT symbolTransform) {
- nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
- dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
- c.bw.addBits16NC(c.state, uint8(nbBitsOut))
- c.state = c.stateTable[dstState]
-}
-
// flush will write the tablelog to the output and flush the remaining full bytes.
func (c *cState) flush(tableLog uint8) {
c.bw.flush32()
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
index 4a75206..5d73c21 100644
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -13,65 +13,23 @@
prime8bytes = 0xcf1bbcdcb7a56463
)
-// hashLen returns a hash of the lowest l bytes of u for a size size of h bytes.
-// l must be >=4 and <=8. Any other value will return hash for 4 bytes.
-// h should always be <32.
-// Preferably h and l should be a constant.
-// FIXME: This does NOT get resolved, if 'mls' is constant,
-// so this cannot be used.
-func hashLen(u uint64, hashLog, mls uint8) uint32 {
+// hashLen returns a hash of the lowest mls bytes of u, with length output bits.
+// mls must be >=3 and <=8. Any other value will return hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be a constant for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
switch mls {
+ case 3:
+ return (uint32(u<<8) * prime3bytes) >> (32 - length)
case 5:
- return hash5(u, hashLog)
+ return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
case 6:
- return hash6(u, hashLog)
+ return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
case 7:
- return hash7(u, hashLog)
+ return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
case 8:
- return hash8(u, hashLog)
+ return uint32((u * prime8bytes) >> (64 - length))
default:
- return hash4x64(u, hashLog)
+ return (uint32(u) * prime4bytes) >> (32 - length)
}
}
-
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash3(u uint32, h uint8) uint32 {
- return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
-}
-
-// hash4 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4(u uint32, h uint8) uint32 {
- return (u * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4x64(u uint64, h uint8) uint32 {
- return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash5(u uint64, h uint8) uint32 {
- return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
-}
-
-// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash6(u uint64, h uint8) uint32 {
- return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
-}
-
-// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash7(u uint64, h uint8) uint32 {
- return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
-}
-
-// hash8 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash8(u uint64, h uint8) uint32 {
- return uint32((u * prime8bytes) >> ((64 - h) & 63))
-}
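
For reference, the consolidated hashLen above relies on a single multiply-shift per size: the low mls bytes are moved to the top of a 64-bit word, multiplied by a size-specific prime, and the top `length` bits are kept. A minimal standalone sketch of the mls == 8 case follows; the wrapper name hashLen8 and the table size are illustrative, and only prime8bytes is taken from the hunk above.

package main

import (
	"encoding/binary"
	"fmt"
)

// prime8bytes is the 8-byte prime declared in hash.go above.
const prime8bytes = 0xcf1bbcdcb7a56463

// hashLen8 folds the 8 input bytes of u into `length` output bits,
// mirroring the mls == 8 branch of hashLen.
func hashLen8(u uint64, length uint8) uint32 {
	return uint32((u * prime8bytes) >> (64 - length))
}

func main() {
	const hashLog = 17 // an assumed, typical encoder table size: 1<<17 buckets
	var table [1 << hashLog]int32

	src := []byte("example input data!!")
	u := binary.LittleEndian.Uint64(src) // the 8 bytes at position 0
	table[hashLen8(u, hashLog)] = 0      // remember that position 0 hashed to this bucket
	fmt.Println("bucket:", hashLen8(u, hashLog), "of", len(table))
}
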
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index f783e32..28b4015 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -10,20 +10,31 @@
// history contains the information transferred between blocks.
type history struct {
- b []byte
- huffTree *huff0.Scratch
- recentOffsets [3]int
+ // Literal decompression
+ huffTree *huff0.Scratch
+
+ // Sequence decompression
decoders sequenceDecs
- windowSize int
- maxSize int
- error bool
- dict *dict
+ recentOffsets [3]int
+
+ // History buffer...
+ b []byte
+
+ // ignoreBuffer is the number of bytes to ignore
+ // when checking for matches in history
+ ignoreBuffer int
+
+ windowSize int
+ allocFrameBuffer int // needed?
+ error bool
+ dict *dict
}
// reset will reset the history to initial state of a frame.
// The history must already have been initialized to the desired size.
func (h *history) reset() {
h.b = h.b[:0]
+ h.ignoreBuffer = 0
h.error = false
h.recentOffsets = [3]int{1, 4, 8}
if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
@@ -35,7 +46,7 @@
if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
fseDecoderPool.Put(f)
}
- h.decoders = sequenceDecs{}
+ h.decoders = sequenceDecs{br: h.decoders.br}
if h.huffTree != nil {
if h.dict == nil || h.dict.litEnc != h.huffTree {
huffDecoderPool.Put(h.huffTree)
@@ -54,6 +65,7 @@
h.decoders.litLengths = dict.llDec
h.decoders.offsets = dict.ofDec
h.decoders.matchLengths = dict.mlDec
+ h.decoders.dict = dict.content
h.recentOffsets = dict.offsets
h.huffTree = dict.litEnc
}
@@ -83,6 +95,24 @@
copy(h.b[h.windowSize-len(b):], b)
}
+// ensureBlock will ensure there is space for at least one block...
+func (h *history) ensureBlock() {
+ if cap(h.b) < h.allocFrameBuffer {
+ h.b = make([]byte, 0, h.allocFrameBuffer)
+ return
+ }
+
+ avail := cap(h.b) - len(h.b)
+ if avail >= h.windowSize || avail > maxCompressedBlockSize {
+ return
+ }
+ // Move data down so we only have window size left.
+ // We know we have less than window size in b at this point.
+ discard := len(h.b) - h.windowSize
+ copy(h.b, h.b[discard:])
+ h.b = h.b[:h.windowSize]
+}
+
// append bytes to history without ever discarding anything.
func (h *history) appendKeep(b []byte) {
h.b = append(h.b, b...)
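
The new ensureBlock helper keeps decoder allocations bounded: before a block is decoded it either grows h.b to allocFrameBuffer or slides the buffer down so only the last windowSize bytes of history remain. A self-contained sketch of that sliding-window trim, using illustrative names and none of the decoder's types:

package main

import "fmt"

// trimToWindow keeps only the last windowSize bytes of b, the same
// copy-down that history.ensureBlock performs above.
func trimToWindow(b []byte, windowSize int) []byte {
	if len(b) <= windowSize {
		return b
	}
	discard := len(b) - windowSize
	copy(b, b[discard:])
	return b[:windowSize]
}

func main() {
	hist := []byte("0123456789abcdef")
	fmt.Printf("%s\n", trimToWindow(hist, 8)) // prints: 89abcdef
}
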
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
index 426b9ca..2c112a0 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -195,7 +195,6 @@
b, d.v4 = consumeUint64(b)
b, d.total = consumeUint64(b)
copy(d.mem[:], b)
- b = b[len(d.mem):]
d.n = int(d.total % uint64(len(d.mem)))
return nil
}
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
deleted file mode 100644
index 35318d7..0000000
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !purego
-
-package xxhash
-
-// Sum64 computes the 64-bit xxHash digest of b.
-//
-//go:noescape
-func Sum64(b []byte) uint64
-
-//go:noescape
-func writeBlocks(*Digest, []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index 2c9c535..cea1785 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,12 +1,13 @@
// +build !appengine
// +build gc
// +build !purego
+// +build !noasm
#include "textflag.h"
// Register allocation:
// AX h
-// CX pointer to advance through b
+// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
@@ -16,39 +17,39 @@
// R12 tmp
// R13 prime1v
// R14 prime2v
-// R15 prime4v
+// DI prime4v
-// round reads from and advances the buffer pointer in CX.
+// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
- MOVQ (CX), R12 \
- ADDQ $8, CX \
+ MOVQ (SI), R12 \
+ ADDQ $8, SI \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
// mergeRound applies a merge round on the two registers acc and val.
-// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
+// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
- ADDQ R15, acc
+ ADDQ DI, acc
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
- MOVQ ·prime4v(SB), R15
+ MOVQ ·prime4v(SB), DI
// Load slice.
- MOVQ b_base+0(FP), CX
+ MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), DX
- LEAQ (CX)(DX*1), BX
+ LEAQ (SI)(DX*1), BX
// The first loop limit will be len(b)-32.
SUBQ $32, BX
@@ -65,14 +66,14 @@
XORQ R11, R11
SUBQ R13, R11
- // Loop until CX > BX.
+ // Loop until SI > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
- CMPQ CX, BX
+ CMPQ SI, BX
JLE blockLoop
MOVQ R8, AX
@@ -100,16 +101,16 @@
afterBlocks:
ADDQ DX, AX
- // Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
+ // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
ADDQ $24, BX
- CMPQ CX, BX
+ CMPQ SI, BX
JG fourByte
wordLoop:
// Calculate k1.
- MOVQ (CX), R8
- ADDQ $8, CX
+ MOVQ (SI), R8
+ ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
@@ -117,18 +118,18 @@
XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
- ADDQ R15, AX
+ ADDQ DI, AX
- CMPQ CX, BX
+ CMPQ SI, BX
JLE wordLoop
fourByte:
ADDQ $4, BX
- CMPQ CX, BX
+ CMPQ SI, BX
JG singles
- MOVL (CX), R8
- ADDQ $4, CX
+ MOVL (SI), R8
+ ADDQ $4, SI
IMULQ R13, R8
XORQ R8, AX
@@ -138,19 +139,19 @@
singles:
ADDQ $4, BX
- CMPQ CX, BX
+ CMPQ SI, BX
JGE finalize
singlesLoop:
- MOVBQZX (CX), R12
- ADDQ $1, CX
+ MOVBQZX (SI), R12
+ ADDQ $1, SI
IMULQ ·prime5v(SB), R12
XORQ R12, AX
ROLQ $11, AX
IMULQ R13, AX
- CMPQ CX, BX
+ CMPQ SI, BX
JL singlesLoop
finalize:
@@ -179,13 +180,13 @@
MOVQ ·prime2v(SB), R14
// Load slice.
- MOVQ arg1_base+8(FP), CX
- MOVQ arg1_len+16(FP), DX
- LEAQ (CX)(DX*1), BX
+ MOVQ b_base+8(FP), SI
+ MOVQ b_len+16(FP), DX
+ LEAQ (SI)(DX*1), BX
SUBQ $32, BX
// Load vN from d.
- MOVQ arg+0(FP), AX
+ MOVQ d+0(FP), AX
MOVQ 0(AX), R8 // v1
MOVQ 8(AX), R9 // v2
MOVQ 16(AX), R10 // v3
@@ -199,7 +200,7 @@
round(R10)
round(R11)
- CMPQ CX, BX
+ CMPQ SI, BX
JLE blockLoop
// Copy vN back to d.
@@ -208,8 +209,8 @@
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)
- // The number of bytes written is CX minus the old base pointer.
- SUBQ arg1_base+8(FP), CX
- MOVQ CX, ret+32(FP)
+ // The number of bytes written is SI minus the old base pointer.
+ SUBQ b_base+8(FP), SI
+ MOVQ SI, ret+32(FP)
RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
new file mode 100644
index 0000000..4d64a17
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -0,0 +1,186 @@
+// +build gc,!purego,!noasm
+
+#include "textflag.h"
+
+// Register allocation.
+#define digest R1
+#define h R2 // Return value.
+#define p R3 // Input pointer.
+#define len R4
+#define nblocks R5 // len / 32.
+#define prime1 R7
+#define prime2 R8
+#define prime3 R9
+#define prime4 R10
+#define prime5 R11
+#define v1 R12
+#define v2 R13
+#define v3 R14
+#define v4 R15
+#define x1 R20
+#define x2 R21
+#define x3 R22
+#define x4 R23
+
+#define round(acc, x) \
+ MADD prime2, acc, x, acc \
+ ROR $64-31, acc \
+ MUL prime1, acc \
+
+// x = round(0, x).
+#define round0(x) \
+ MUL prime2, x \
+ ROR $64-31, x \
+ MUL prime1, x \
+
+#define mergeRound(x) \
+ round0(x) \
+ EOR x, h \
+ MADD h, prime4, prime1, h \
+
+// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
+#define blocksLoop() \
+ LSR $5, len, nblocks \
+ PCALIGN $16 \
+ loop: \
+ LDP.P 32(p), (x1, x2) \
+ round(v1, x1) \
+ LDP -16(p), (x3, x4) \
+ round(v2, x2) \
+ SUB $1, nblocks \
+ round(v3, x3) \
+ round(v4, x4) \
+ CBNZ nblocks, loop \
+
+// The primes are repeated here to ensure that they're stored
+// in a contiguous array, so we can load them with LDP.
+DATA primes<> +0(SB)/8, $11400714785074694791
+DATA primes<> +8(SB)/8, $14029467366897019727
+DATA primes<>+16(SB)/8, $1609587929392839161
+DATA primes<>+24(SB)/8, $9650029242287828579
+DATA primes<>+32(SB)/8, $2870177450012600261
+GLOBL primes<>(SB), NOPTR+RODATA, $40
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
+ LDP b_base+0(FP), (p, len)
+
+ LDP primes<> +0(SB), (prime1, prime2)
+ LDP primes<>+16(SB), (prime3, prime4)
+ MOVD primes<>+32(SB), prime5
+
+ CMP $32, len
+ CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
+ BLO afterLoop
+
+ ADD prime1, prime2, v1
+ MOVD prime2, v2
+ MOVD $0, v3
+ NEG prime1, v4
+
+ blocksLoop()
+
+ ROR $64-1, v1, x1
+ ROR $64-7, v2, x2
+ ADD x1, x2
+ ROR $64-12, v3, x3
+ ROR $64-18, v4, x4
+ ADD x3, x4
+ ADD x2, x4, h
+
+ mergeRound(v1)
+ mergeRound(v2)
+ mergeRound(v3)
+ mergeRound(v4)
+
+afterLoop:
+ ADD len, h
+
+ TBZ $4, len, try8
+ LDP.P 16(p), (x1, x2)
+
+ round0(x1)
+ ROR $64-27, h
+ EOR x1 @> 64-27, h, h
+ MADD h, prime4, prime1, h
+
+ round0(x2)
+ ROR $64-27, h
+ EOR x2 @> 64-27, h
+ MADD h, prime4, prime1, h
+
+try8:
+ TBZ $3, len, try4
+ MOVD.P 8(p), x1
+
+ round0(x1)
+ ROR $64-27, h
+ EOR x1 @> 64-27, h
+ MADD h, prime4, prime1, h
+
+try4:
+ TBZ $2, len, try2
+ MOVWU.P 4(p), x2
+
+ MUL prime1, x2
+ ROR $64-23, h
+ EOR x2 @> 64-23, h
+ MADD h, prime3, prime2, h
+
+try2:
+ TBZ $1, len, try1
+ MOVHU.P 2(p), x3
+ AND $255, x3, x1
+ LSR $8, x3, x2
+
+ MUL prime5, x1
+ ROR $64-11, h
+ EOR x1 @> 64-11, h
+ MUL prime1, h
+
+ MUL prime5, x2
+ ROR $64-11, h
+ EOR x2 @> 64-11, h
+ MUL prime1, h
+
+try1:
+ TBZ $0, len, end
+ MOVBU (p), x4
+
+ MUL prime5, x4
+ ROR $64-11, h
+ EOR x4 @> 64-11, h
+ MUL prime1, h
+
+end:
+ EOR h >> 33, h
+ MUL prime2, h
+ EOR h >> 29, h
+ MUL prime3, h
+ EOR h >> 32, h
+
+ MOVD h, ret+24(FP)
+ RET
+
+// func writeBlocks(d *Digest, b []byte) int
+//
+// Assumes len(b) >= 32.
+TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
+ LDP primes<>(SB), (prime1, prime2)
+
+ // Load state. Assume v[1-4] are stored contiguously.
+ MOVD d+0(FP), digest
+ LDP 0(digest), (v1, v2)
+ LDP 16(digest), (v3, v4)
+
+ LDP b_base+8(FP), (p, len)
+
+ blocksLoop()
+
+ // Store updated state.
+ STP (v1, v2), 0(digest)
+ STP (v3, v4), 16(digest)
+
+ BIC $31, len
+ MOVD len, ret+32(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
new file mode 100644
index 0000000..1a1fac9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -0,0 +1,16 @@
+//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !purego
+// +build !noasm
+
+package xxhash
+
+// Sum64 computes the 64-bit xxHash digest of b.
+//
+//go:noescape
+func Sum64(b []byte) uint64
+
+//go:noescape
+func writeBlocks(d *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 4a5a821..209cb4a 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -1,4 +1,5 @@
-// +build !amd64 appengine !gc purego
+//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm
+// +build !amd64,!arm64 appengine !gc purego noasm
package xxhash
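
With the build constraints above, the assembly Sum64 now covers both amd64 and arm64, and the new noasm tag selects the portable Go implementation in xxhash_other.go on any platform. As a usage note (assuming a standard Go toolchain in the consuming project), the pure-Go path can be forced when debugging the assembly versions with:

go build -tags noasm ./...
go test -tags noasm ./...
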
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 1dd39e6..df04472 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -20,6 +20,10 @@
llCode, mlCode, ofCode uint8
}
+type seqVals struct {
+ ll, ml, mo int
+}
+
func (s seq) String() string {
if s.offset <= 3 {
if s.offset == 0 {
@@ -61,16 +65,19 @@
offsets sequenceDec
matchLengths sequenceDec
prevOffset [3]int
- hist []byte
dict []byte
literals []byte
out []byte
+ nSeqs int
+ br *bitReader
+ seqSize int
windowSize int
maxBits uint8
+ maxSyncLen uint64
}
// initialize all 3 decoders from the stream input.
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error {
+func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error {
if err := s.litLengths.init(br); err != nil {
return errors.New("litLengths:" + err.Error())
}
@@ -80,8 +87,7 @@
if err := s.matchLengths.init(br); err != nil {
return errors.New("matchLengths:" + err.Error())
}
- s.literals = literals
- s.hist = hist.b
+ s.br = br
s.prevOffset = hist.recentOffsets
s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
s.windowSize = hist.windowSize
@@ -93,12 +99,127 @@
return nil
}
+// execute will execute the decoded sequence with the provided history.
+// The sequence must be evaluated before being sent.
+func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+ if len(s.dict) == 0 {
+ return s.executeSimple(seqs, hist)
+ }
+
+ // Ensure we have enough output size...
+ if len(s.out)+s.seqSize > cap(s.out) {
+ addBytes := s.seqSize + len(s.out)
+ s.out = append(s.out, make([]byte, addBytes)...)
+ s.out = s.out[:len(s.out)-addBytes]
+ }
+
+ if debugDecoder {
+ printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize)
+ }
+
+ var t = len(s.out)
+ out := s.out[:t+s.seqSize]
+
+ for _, seq := range seqs {
+ // Add literals
+ copy(out[t:], s.literals[:seq.ll])
+ t += seq.ll
+ s.literals = s.literals[seq.ll:]
+
+ // Copy from dictionary...
+ if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+ if len(s.dict) == 0 {
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+ }
+
+ // we may be in dictionary.
+ dictO := len(s.dict) - (seq.mo - (t + len(hist)))
+ if dictO < 0 || dictO >= len(s.dict) {
+ return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict))
+ }
+ end := dictO + seq.ml
+ if end > len(s.dict) {
+ n := len(s.dict) - dictO
+ copy(out[t:], s.dict[dictO:])
+ t += n
+ seq.ml -= n
+ } else {
+ copy(out[t:], s.dict[dictO:end])
+ t += end - dictO
+ continue
+ }
+ }
+
+ // Copy from history.
+ if v := seq.mo - t; v > 0 {
+ // v is the start position in history from end.
+ start := len(hist) - v
+ if seq.ml > v {
+ // Some goes into current block.
+ // Copy remainder of history
+ copy(out[t:], hist[start:])
+ t += v
+ seq.ml -= v
+ } else {
+ copy(out[t:], hist[start:start+seq.ml])
+ t += seq.ml
+ continue
+ }
+ }
+ // We must be in current buffer now
+ if seq.ml > 0 {
+ start := t - seq.mo
+ if seq.ml <= t-start {
+ // No overlap
+ copy(out[t:], out[start:start+seq.ml])
+ t += seq.ml
+ continue
+ } else {
+ // Overlapping copy
+ // Extend destination slice and copy one byte at a time.
+ src := out[start : start+seq.ml]
+ dst := out[t:]
+ dst = dst[:len(src)]
+ t += len(src)
+ // Destination is the space we just added.
+ for i := range src {
+ dst[i] = src[i]
+ }
+ }
+ }
+ }
+
+ // Add final literals
+ copy(out[t:], s.literals)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+ }
+ }
+ s.out = out
+
+ return nil
+}
+
// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+ supported, err := s.decodeSyncSimple(hist)
+ if supported {
+ return err
+ }
+
+ br := s.br
+ seqs := s.nSeqs
startSize := len(s.out)
// Grab full sizes tables, to avoid bounds checks.
llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+ out := s.out
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
for i := seqs - 1; i >= 0; i-- {
if br.overread() {
@@ -151,7 +272,7 @@
if temp == 0 {
// 0 is not valid; input is corrupted; force offset to 1
- println("temp was 0")
+ println("WARNING: temp was 0")
temp = 1
}
@@ -176,51 +297,49 @@
if ll > len(s.literals) {
return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
}
- size := ll + ml + len(s.out)
+ size := ll + ml + len(out)
if size-startSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size", size)
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
}
- if size > cap(s.out) {
+ if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
// but could be if destination slice is too small for sync operations.
// over-allocating here can create a large amount of GC pressure so we try to keep
// it as contained as possible
- used := len(s.out) - startSize
+ used := len(out) - startSize
addBytes := 256 + ll + ml + used>>2
// Clamp to max block size.
if used+addBytes > maxBlockSize {
addBytes = maxBlockSize - used
}
- s.out = append(s.out, make([]byte, addBytes)...)
- s.out = s.out[:len(s.out)-addBytes]
+ out = append(out, make([]byte, addBytes)...)
+ out = out[:len(out)-addBytes]
}
if ml > maxMatchLen {
return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
}
// Add literals
- s.out = append(s.out, s.literals[:ll]...)
+ out = append(out, s.literals[:ll]...)
s.literals = s.literals[ll:]
- out := s.out
if mo == 0 && ml > 0 {
return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
}
- if mo > len(s.out)+len(hist) || mo > s.windowSize {
+ if mo > len(out)+len(hist) || mo > s.windowSize {
if len(s.dict) == 0 {
- return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
}
// we may be in dictionary.
- dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+ dictO := len(s.dict) - (mo - (len(out) + len(hist)))
if dictO < 0 || dictO >= len(s.dict) {
- return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
}
end := dictO + ml
if end > len(s.dict) {
out = append(out, s.dict[dictO:]...)
- mo -= len(s.dict) - dictO
ml -= len(s.dict) - dictO
} else {
out = append(out, s.dict[dictO:end]...)
@@ -231,26 +350,25 @@
// Copy from history.
// TODO: Blocks without history could be made to ignore this completely.
- if v := mo - len(s.out); v > 0 {
+ if v := mo - len(out); v > 0 {
// v is the start position in history from end.
- start := len(s.hist) - v
+ start := len(hist) - v
if ml > v {
// Some goes into current block.
// Copy remainder of history
- out = append(out, s.hist[start:]...)
- mo -= v
+ out = append(out, hist[start:]...)
ml -= v
} else {
- out = append(out, s.hist[start:start+ml]...)
+ out = append(out, hist[start:start+ml]...)
ml = 0
}
}
// We must be in current buffer now
if ml > 0 {
- start := len(s.out) - mo
- if ml <= len(s.out)-start {
+ start := len(out) - mo
+ if ml <= len(out)-start {
// No overlap
- out = append(out, s.out[start:start+ml]...)
+ out = append(out, out[start:start+ml]...)
} else {
// Overlapping copy
// Extend destination slice and copy one byte at a time.
@@ -264,7 +382,6 @@
}
}
}
- s.out = out
if i == 0 {
// This is the last sequence, so we shouldn't update state.
break
@@ -278,7 +395,8 @@
mlState = mlTable[mlState.newState()&maxTableMask]
ofState = ofTable[ofState.newState()&maxTableMask]
} else {
- bits := br.getBitsFast(nBits)
+ bits := br.get32BitsFast(nBits)
+
lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
llState = llTable[(llState.newState()+lowBits)&maxTableMask]
@@ -291,19 +409,14 @@
}
}
- // Add final literals
- s.out = append(s.out, s.literals...)
- return nil
-}
+ // Check if space for literals
+ if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+ }
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) update(br *bitReader) {
- // Max 8 bits
- s.litLengths.state.next(br)
- // Max 9 bits
- s.matchLengths.state.next(br)
- // Max 8 bits
- s.offsets.state.next(br)
+ // Add final literals
+ s.out = append(out, s.literals...)
+ return br.close()
}
var bitMask [16]uint16
@@ -314,87 +427,6 @@
}
}
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) updateAlt(br *bitReader) {
- // Update all 3 states at once. Approx 20% faster.
- a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-
- nBits := a.nbBits() + b.nbBits() + c.nbBits()
- if nBits == 0 {
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
- s.offsets.state.state = s.offsets.state.dt[c.newState()]
- return
- }
- bits := br.getBitsFast(nBits)
- lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
-
- lowBits = uint16(bits >> (c.nbBits() & 31))
- lowBits &= bitMask[b.nbBits()&15]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
-
- lowBits = uint16(bits) & bitMask[c.nbBits()&15]
- s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
-}
-
-// nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
-func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
- // Final will not read from stream.
- ll, llB := llState.final()
- ml, mlB := mlState.final()
- mo, moB := ofState.final()
-
- // extra bits are stored in reverse order.
- br.fillFast()
- mo += br.getBits(moB)
- if s.maxBits > 32 {
- br.fillFast()
- }
- ml += br.getBits(mlB)
- ll += br.getBits(llB)
-
- if moB > 1 {
- s.prevOffset[2] = s.prevOffset[1]
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = mo
- return
- }
- // mo = s.adjustOffset(mo, ll, moB)
- // Inlined for rather big speedup
- if ll == 0 {
- // There is an exception though, when current sequence's literals_length = 0.
- // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
- // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
- mo++
- }
-
- if mo == 0 {
- mo = s.prevOffset[0]
- return
- }
- var temp int
- if mo == 3 {
- temp = s.prevOffset[0] - 1
- } else {
- temp = s.prevOffset[mo]
- }
-
- if temp == 0 {
- // 0 is not valid; input is corrupted; force offset to 1
- println("temp was 0")
- temp = 1
- }
-
- if mo != 1 {
- s.prevOffset[2] = s.prevOffset[1]
- }
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = temp
- mo = temp
- return
-}
-
func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
// Final will not read from stream.
ll, llB := llState.final()
@@ -457,36 +489,3 @@
s.prevOffset[0] = temp
return temp
}
-
-// mergeHistory will merge history.
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) {
- for i := uint(0); i < 3; i++ {
- var sNew, sHist *sequenceDec
- switch i {
- default:
- // same as "case 0":
- sNew = &s.litLengths
- sHist = &hist.litLengths
- case 1:
- sNew = &s.offsets
- sHist = &hist.offsets
- case 2:
- sNew = &s.matchLengths
- sHist = &hist.matchLengths
- }
- if sNew.repeat {
- if sHist.fse == nil {
- return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i)
- }
- continue
- }
- if sNew.fse == nil {
- return nil, fmt.Errorf("sequence stream %d, no fse found", i)
- }
- if sHist.fse != nil && !sHist.fse.preDefined {
- fseDecoderPool.Put(sHist.fse)
- }
- sHist.fse = sNew.fse
- }
- return hist, nil
-}
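
The refactor above splits sequence handling in two: decode/decodeSync turn the bitstream into (ll, ml, mo) triples, and execute materializes output from the literals, the history window, and output already written in the current block. Below is a minimal, self-contained sketch of the execute-side match resolution, with illustrative names, no dictionary handling, and byte-wise copying where the real code uses a bulk copy for non-overlapping matches.

package main

import "fmt"

// seqVal mirrors the ll/ml/mo triple produced by the decode step.
type seqVal struct {
	ll, ml, mo int
}

// executeOne appends one sequence to out: first ll literal bytes, then an
// ml-byte match starting mo bytes back from the current position, which may
// reach into the history window hist.
func executeOne(out, hist, lits []byte, s seqVal) ([]byte, []byte) {
	out = append(out, lits[:s.ll]...)
	lits = lits[s.ll:]

	// Part of the match may come from history.
	if v := s.mo - len(out); v > 0 {
		start := len(hist) - v
		n := s.ml
		if n > v {
			n = v
		}
		out = append(out, hist[start:start+n]...)
		s.ml -= n
	}
	// The rest comes from output already written in this block; copying one
	// byte at a time lets overlapping matches (mo < ml) repeat correctly.
	for i := 0; i < s.ml; i++ {
		out = append(out, out[len(out)-s.mo])
	}
	return out, lits
}

func main() {
	hist := []byte("abcd") // output of previous blocks
	lits := []byte("XY")   // literals for this sequence
	out, _ := executeOne(nil, hist, lits, seqVal{ll: 2, ml: 4, mo: 5})
	fmt.Printf("%s\n", out) // prints: XYbcdX
}
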
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
new file mode 100644
index 0000000..7598c10
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -0,0 +1,368 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+ "fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+ llTable []decSymbol
+ mlTable []decSymbol
+ ofTable []decSymbol
+ llState uint64
+ mlState uint64
+ ofState uint64
+ iteration int
+ litRemain int
+ out []byte
+ outPosition int
+ literals []byte
+ litPosition int
+ history []byte
+ windowSize int
+ ll int // set on error (not for all errors, please refer to _generate/gen.go)
+ ml int // set on error (not for all errors, please refer to _generate/gen.go)
+ mo int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+ if len(s.dict) > 0 {
+ return false, nil
+ }
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+ return false, nil
+ }
+
+ // FIXME: Using unsafe memory copies leads to rare, random crashes
+ // with fuzz testing. It is therefore disabled for now.
+ const useSafe = true
+ /*
+ useSafe := false
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+ useSafe = true
+ }
+ if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+ useSafe = true
+ }
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ useSafe = true
+ }
+ */
+
+ br := s.br
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+
+ ctx := decodeSyncAsmContext{
+ llTable: s.litLengths.fse.dt[:maxTablesize],
+ mlTable: s.matchLengths.fse.dt[:maxTablesize],
+ ofTable: s.offsets.fse.dt[:maxTablesize],
+ llState: uint64(s.litLengths.state.state),
+ mlState: uint64(s.matchLengths.state.state),
+ ofState: uint64(s.offsets.state.state),
+ iteration: s.nSeqs - 1,
+ litRemain: len(s.literals),
+ out: s.out,
+ outPosition: len(s.out),
+ literals: s.literals,
+ windowSize: s.windowSize,
+ history: hist,
+ }
+
+ s.seqSize = 0
+ startSize := len(s.out)
+
+ var errCode int
+ if cpuinfo.HasBMI2() {
+ if useSafe {
+ errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+ }
+ } else {
+ if useSafe {
+ errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+ }
+ }
+ switch errCode {
+ case noError:
+ break
+
+ case errorMatchLenOfsMismatch:
+ return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+ case errorMatchLenTooBig:
+ return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+ case errorMatchOffTooBig:
+ return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+ ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+ case errorNotEnoughLiterals:
+ return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+ ctx.ll, ctx.litRemain+ctx.ll)
+
+ case errorNotEnoughSpace:
+ size := ctx.outPosition + ctx.ll + ctx.ml
+ if debugDecoder {
+ println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+ }
+ return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+ default:
+ return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+ }
+
+ s.seqSize += ctx.litRemain
+ if s.seqSize > maxBlockSize {
+ return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ return true, err
+ }
+
+ s.literals = s.literals[ctx.litPosition:]
+ t := ctx.outPosition
+ s.out = s.out[:t]
+
+ // Add final literals
+ s.out = append(s.out, s.literals...)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(s.out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+ }
+ }
+
+ return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+ llTable []decSymbol
+ mlTable []decSymbol
+ ofTable []decSymbol
+ llState uint64
+ mlState uint64
+ ofState uint64
+ iteration int
+ seqs []seqVals
+ litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+ br := s.br
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+
+ ctx := decodeAsmContext{
+ llTable: s.litLengths.fse.dt[:maxTablesize],
+ mlTable: s.matchLengths.fse.dt[:maxTablesize],
+ ofTable: s.offsets.fse.dt[:maxTablesize],
+ llState: uint64(s.litLengths.state.state),
+ mlState: uint64(s.matchLengths.state.state),
+ ofState: uint64(s.offsets.state.state),
+ seqs: seqs,
+ iteration: len(seqs) - 1,
+ litRemain: len(s.literals),
+ }
+
+ s.seqSize = 0
+ lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+ var errCode int
+ if cpuinfo.HasBMI2() {
+ if lte56bits {
+ errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+ }
+ } else {
+ if lte56bits {
+ errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+ }
+ }
+ if errCode != 0 {
+ i := len(seqs) - ctx.iteration - 1
+ switch errCode {
+ case errorMatchLenOfsMismatch:
+ ml := ctx.seqs[i].ml
+ return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+ case errorMatchLenTooBig:
+ ml := ctx.seqs[i].ml
+ return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+ case errorNotEnoughLiterals:
+ ll := ctx.seqs[i].ll
+ return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+ }
+
+ return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+ }
+
+ if ctx.litRemain < 0 {
+ return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+ len(s.literals), len(s.literals)-ctx.litRemain)
+ }
+
+ s.seqSize += ctx.litRemain
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ }
+ return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+ seqs []seqVals
+ seqIndex int
+ out []byte
+ history []byte
+ literals []byte
+ outPosition int
+ litPosition int
+ windowSize int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// Same as above, but with safe memcopies
+//go:noescape
+func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when a dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+ // Ensure we have enough output size...
+ if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+ addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+ s.out = append(s.out, make([]byte, addBytes)...)
+ s.out = s.out[:len(s.out)-addBytes]
+ }
+
+ if debugDecoder {
+ printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+ }
+
+ var t = len(s.out)
+ out := s.out[:t+s.seqSize]
+
+ ctx := executeAsmContext{
+ seqs: seqs,
+ seqIndex: 0,
+ out: out,
+ history: hist,
+ outPosition: t,
+ litPosition: 0,
+ literals: s.literals,
+ windowSize: s.windowSize,
+ }
+ var ok bool
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
+ } else {
+ ok = sequenceDecs_executeSimple_amd64(&ctx)
+ }
+ if !ok {
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+ seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+ }
+ s.literals = s.literals[ctx.litPosition:]
+ t = ctx.outPosition
+
+ // Add final literals
+ copy(out[t:], s.literals)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+ }
+ }
+ s.out = out
+
+ return nil
+}
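
The Go wrappers above dispatch to one of several assembly kernels and translate their small integer status codes (noError through errorNotEnoughSpace) into descriptive errors. A sketch of that translation pattern, reusing the code values declared in the hunk above; the function name asmErrToError is illustrative and the messages are simplified:

package main

import (
	"errors"
	"fmt"
)

// Status codes mirroring the constants in seqdec_amd64.go above.
const (
	noError                  = 0
	errorMatchLenOfsMismatch = 1
	errorMatchLenTooBig      = 2
	errorMatchOffTooBig      = 3
	errorNotEnoughLiterals   = 4
	errorNotEnoughSpace      = 5
)

// asmErrToError converts a status code returned by an assembly kernel
// into a Go error, mirroring the switch in decodeSyncSimple.
func asmErrToError(code int) error {
	switch code {
	case noError:
		return nil
	case errorMatchLenOfsMismatch:
		return errors.New("zero match offset with non-zero match length")
	case errorMatchLenTooBig:
		return errors.New("match length bigger than max allowed length")
	case errorMatchOffTooBig:
		return errors.New("match offset bigger than current history")
	case errorNotEnoughLiterals:
		return errors.New("literal count exceeds available literals")
	case errorNotEnoughSpace:
		return errors.New("output buffer too small for the block")
	default:
		return fmt.Errorf("unknown error code %d", code)
	}
}

func main() {
	fmt.Println(asmErrToError(errorMatchLenTooBig))
}
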
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
new file mode 100644
index 0000000..27e7677
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -0,0 +1,4100 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ MOVQ 104(AX), R10
+ MOVQ s+0(FP), AX
+ MOVQ 144(AX), R11
+ MOVQ 152(AX), R12
+ MOVQ 160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+ MOVQ (SP), R14
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ll_update_zero:
+ MOVQ AX, (R10)
+
+ // Fill bitreader for state updates
+ MOVQ R14, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R14
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R14
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R14
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+ // Adjust offset
+ MOVQ 16(R10), CX
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+ MOVQ R12, R13
+ MOVQ R11, R12
+ MOVQ CX, R11
+ JMP sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+ CMPQ (R10), $0x00000000
+ JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
+ MOVQ R11, CX
+ JMP sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_amd64_adjust_zero
+ JEQ sequenceDecs_decode_amd64_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_amd64_adjust_three
+ JMP sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+ MOVQ R11, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+ MOVQ R12, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+ MOVQ R13, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+ LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_amd64_adjust_temp_valid
+ MOVQ $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R12, R13
+ MOVQ R11, R12
+ MOVQ AX, R11
+ MOVQ AX, CX
+
+sequenceDecs_decode_amd64_after_adjust:
+ MOVQ CX, 16(R10)
+
+ // Check values
+ MOVQ 8(R10), AX
+ MOVQ (R10), R14
+ LEAQ (AX)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decode_amd64_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+ ADDQ $0x18, R10
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decode_amd64_main_loop
+ MOVQ s+0(FP), AX
+ MOVQ R11, 144(AX)
+ MOVQ R12, 152(AX)
+ MOVQ R13, 160(AX)
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ MOVQ 104(AX), R10
+ MOVQ s+0(FP), AX
+ MOVQ 144(AX), R11
+ MOVQ 152(AX), R12
+ MOVQ 160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+ MOVQ (SP), R14
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_56_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_56_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
+
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ll_update_zero:
+ MOVQ AX, (R10)
+
+ // Fill bitreader for state updates
+ MOVQ R14, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_56_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R14
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R14
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R14
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+ // Adjust offset
+ MOVQ 16(R10), CX
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
+ MOVQ R12, R13
+ MOVQ R11, R12
+ MOVQ CX, R11
+ JMP sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+ CMPQ (R10), $0x00000000
+ JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+ MOVQ R11, CX
+ JMP sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_56_amd64_adjust_zero
+ JEQ sequenceDecs_decode_56_amd64_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_56_amd64_adjust_three
+ JMP sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+ MOVQ R11, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+ MOVQ R12, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+ MOVQ R13, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+ LEAQ -1(R11), AX
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
+ MOVQ $0x00000001, AX
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R12, R13
+ MOVQ R11, R12
+ MOVQ AX, R11
+ MOVQ AX, CX
+
+sequenceDecs_decode_56_amd64_after_adjust:
+ MOVQ CX, 16(R10)
+
+ // Check values
+ MOVQ 8(R10), AX
+ MOVQ (R10), R14
+ LEAQ (AX)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decode_56_amd64_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+ ADDQ $0x18, R10
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decode_56_amd64_main_loop
+ MOVQ s+0(FP), AX
+ MOVQ R11, 144(AX)
+ MOVQ R12, 152(AX)
+ MOVQ R13, 160(AX)
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ MOVQ 104(CX), R9
+ MOVQ s+0(FP), CX
+ MOVQ 144(CX), R10
+ MOVQ 152(CX), R11
+ MOVQ 160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 16(R9)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 8(R9)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, (R9)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
+ LEAQ (DX)(R14*1), CX
+ MOVQ AX, R15
+ MOVQ CX, DX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_bmi2_skip_update:
+ // Adjust offset
+ MOVQ 16(R9), CX
+ CMPQ R13, $0x01
+ JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+ MOVQ R11, R12
+ MOVQ R10, R11
+ MOVQ CX, R10
+ JMP sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+ CMPQ (R9), $0x00000000
+ JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
+ MOVQ R10, CX
+ JMP sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_bmi2_adjust_zero
+ JEQ sequenceDecs_decode_bmi2_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_bmi2_adjust_three
+ JMP sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+ MOVQ R10, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+ MOVQ R11, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+ MOVQ R12, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+ LEAQ -1(R10), R13
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R13
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R11, R12
+ MOVQ R10, R11
+ MOVQ R13, R10
+ MOVQ R13, CX
+
+sequenceDecs_decode_bmi2_after_adjust:
+ MOVQ CX, 16(R9)
+
+ // Check values
+ MOVQ 8(R9), R13
+ MOVQ (R9), R14
+ LEAQ (R13)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ R13, $0x00020002
+ JA sequenceDecs_decode_bmi2_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+ ADDQ $0x18, R9
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decode_bmi2_main_loop
+ MOVQ s+0(FP), CX
+ MOVQ R10, 144(CX)
+ MOVQ R11, 152(CX)
+ MOVQ R12, 160(CX)
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ MOVQ 104(CX), R9
+ MOVQ s+0(FP), CX
+ MOVQ 144(CX), R10
+ MOVQ 152(CX), R11
+ MOVQ 160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_56_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_56_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 16(R9)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 8(R9)
+
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, (R9)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_56_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
+ LEAQ (DX)(R14*1), CX
+ MOVQ AX, R15
+ MOVQ CX, DX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_56_bmi2_skip_update:
+ // Adjust offset
+ MOVQ 16(R9), CX
+ CMPQ R13, $0x01
+ JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
+ MOVQ R11, R12
+ MOVQ R10, R11
+ MOVQ CX, R10
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+ CMPQ (R9), $0x00000000
+ JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+ MOVQ R10, CX
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_56_bmi2_adjust_zero
+ JEQ sequenceDecs_decode_56_bmi2_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_56_bmi2_adjust_three
+ JMP sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+ MOVQ R10, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+ MOVQ R11, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+ MOVQ R12, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+ LEAQ -1(R10), R13
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R13
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R11, R12
+ MOVQ R10, R11
+ MOVQ R13, R10
+ MOVQ R13, CX
+
+sequenceDecs_decode_56_bmi2_after_adjust:
+ MOVQ CX, 16(R9)
+
+ // Check values
+ MOVQ 8(R9), R13
+ MOVQ (R9), R14
+ LEAQ (R13)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ R13, $0x00020002
+ JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+ ADDQ $0x18, R9
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decode_56_bmi2_main_loop
+ MOVQ s+0(FP), CX
+ MOVQ R10, 144(CX)
+ MOVQ R11, 152(CX)
+ MOVQ R12, 160(CX)
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
+ MOVQ ctx+0(FP), R10
+ MOVQ 8(R10), CX
+ TESTQ CX, CX
+ JZ empty_seqs
+ MOVQ (R10), AX
+ MOVQ 24(R10), DX
+ MOVQ 32(R10), BX
+ MOVQ 80(R10), SI
+ MOVQ 104(R10), DI
+ MOVQ 120(R10), R8
+ MOVQ 56(R10), R9
+ MOVQ 64(R10), R10
+ ADDQ R10, R9
+
+ // seqsBase += 24 * seqIndex
+ LEAQ (DX)(DX*2), R11
+ SHLQ $0x03, R11
+ ADDQ R11, AX
+
+ // outBase += outPosition
+ ADDQ DI, BX
+
+main_loop:
+ MOVQ (AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 8(AX), R13
+
+ // Copy literals
+ TESTQ R11, R11
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (SI)(R14*1), X0
+ MOVUPS X0, (BX)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, R11
+ JB copy_1
+ ADDQ R11, SI
+ ADDQ R11, BX
+ ADDQ R11, DI
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ LEAQ (DI)(R10*1), R11
+ CMPQ R12, R11
+ JG error_match_off_too_big
+ CMPQ R12, R8
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+
+copy_4_end:
+ ADDQ R13, DI
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
+ ADDQ R11, DI
+ SUBQ R11, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ BX, R11
+ SUBQ R12, R11
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, DI
+ MOVQ BX, R12
+ ADDQ R13, BX
+
+copy_2:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R12
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, DI
+
+copy_slow_3:
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+
+loop_finished:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+error_match_off_too_big:
+ // Return value
+ MOVB $0x00, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+empty_seqs:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+ RET
+
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
+ MOVQ ctx+0(FP), R10
+ MOVQ 8(R10), CX
+ TESTQ CX, CX
+ JZ empty_seqs
+ MOVQ (R10), AX
+ MOVQ 24(R10), DX
+ MOVQ 32(R10), BX
+ MOVQ 80(R10), SI
+ MOVQ 104(R10), DI
+ MOVQ 120(R10), R8
+ MOVQ 56(R10), R9
+ MOVQ 64(R10), R10
+ ADDQ R10, R9
+
+ // seqsBase += 24 * seqIndex
+ LEAQ (DX)(DX*2), R11
+ SHLQ $0x03, R11
+ ADDQ R11, AX
+
+ // outBase += outPosition
+ ADDQ DI, BX
+
+main_loop:
+ MOVQ (AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 8(AX), R13
+
+ // Copy literals
+ TESTQ R11, R11
+ JZ check_offset
+ MOVQ R11, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (SI), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, SI
+ ADDQ $0x10, BX
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(SI)(R14*1), SI
+ LEAQ 16(BX)(R14*1), BX
+ MOVUPS -16(SI), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ R11, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ R11, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (SI), R14
+ MOVB -1(SI)(R11*1), R15
+ MOVB R14, (BX)
+ MOVB R15, -1(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (SI), R14
+ MOVB 2(SI), R15
+ MOVW R14, (BX)
+ MOVB R15, 2(BX)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (SI), R14
+ MOVL -4(SI)(R11*1), R15
+ MOVL R14, (BX)
+ MOVL R15, -4(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (SI), R14
+ MOVQ -8(SI)(R11*1), R15
+ MOVQ R14, (BX)
+ MOVQ R15, -8(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+
+copy_1_end:
+ ADDQ R11, DI
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ LEAQ (DI)(R10*1), R11
+ CMPQ R12, R11
+ JG error_match_off_too_big
+ CMPQ R12, R8
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+
+copy_4_end:
+ ADDQ R13, DI
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
+ ADDQ R11, DI
+ SUBQ R11, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ BX, R11
+ SUBQ R12, R11
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, DI
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R11
+ ADDQ $0x10, BX
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(R11)(R12*1), R11
+ LEAQ 16(BX)(R12*1), BX
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (R11), R12
+ MOVB -1(R11)(R13*1), R14
+ MOVB R12, (BX)
+ MOVB R14, -1(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (R11), R12
+ MOVB 2(R11), R14
+ MOVW R12, (BX)
+ MOVB R14, 2(BX)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (R11), R12
+ MOVL -4(R11)(R13*1), R14
+ MOVL R12, (BX)
+ MOVL R14, -4(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (R11), R12
+ MOVQ -8(R11)(R13*1), R14
+ MOVQ R12, (BX)
+ MOVQ R14, -8(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, DI
+
+copy_slow_3:
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+
+loop_finished:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+error_match_off_too_big:
+ // Return value
+ MOVB $0x00, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+empty_seqs:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+ RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
+ MOVQ 112(AX), R10
+ MOVQ 128(AX), CX
+ MOVQ CX, 32(SP)
+ MOVQ 144(AX), R11
+ MOVQ 136(AX), R12
+ MOVQ 200(AX), CX
+ MOVQ CX, 56(SP)
+ MOVQ 176(AX), CX
+ MOVQ CX, 48(SP)
+ MOVQ 184(AX), AX
+ MOVQ AX, 40(SP)
+ MOVQ 40(SP), AX
+ ADDQ AX, 48(SP)
+
+ // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R10, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining fields
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R13
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R13
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R13
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+ MOVQ R13, AX
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, AX
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(AX*8), R14
+ JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_amd64_adjust_skip
+ MOVQ 152(CX), AX
+ MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+ MOVQ 144(CX), AX
+ MOVQ AX, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), AX
+ MOVQ 24(SP), CX
+ LEAQ (AX)(CX*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ CX, 104(R14)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+ MOVQ 24(SP), AX
+ MOVQ 8(SP), CX
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (AX)(R13*1), R14
+ ADDQ R10, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ AX, AX
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (R11)(R14*1), X0
+ MOVUPS X0, (R10)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, AX
+ JB copy_1
+ ADDQ AX, R11
+ ADDQ AX, R10
+ ADDQ AX, R12
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ MOVQ R12, AX
+ ADDQ 40(SP), AX
+ CMPQ CX, AX
+ JG error_match_off_too_big
+ CMPQ CX, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
+ ADDQ AX, R12
+ SUBQ AX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R10, AX
+ SUBQ CX, AX
+
+ // ml <= mo
+ CMPQ R13, CX
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R12
+ MOVQ R10, CX
+ ADDQ R13, R10
+
+copy_2:
+ MOVUPS (AX), X0
+ MOVUPS X0, (CX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, CX
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R12
+
+copy_slow_3:
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decodeSync_amd64_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R12, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R11
+ MOVQ R11, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
+ MOVQ 112(CX), R9
+ MOVQ 128(CX), R10
+ MOVQ R10, 32(SP)
+ MOVQ 144(CX), R10
+ MOVQ 136(CX), R11
+ MOVQ 200(CX), R12
+ MOVQ R12, 56(SP)
+ MOVQ 176(CX), R12
+ MOVQ R12, 48(SP)
+ MOVQ 184(CX), CX
+ MOVQ CX, 40(SP)
+ MOVQ 40(SP), CX
+ ADDQ CX, 48(SP)
+
+ // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R9, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+ MOVQ (SP), R12
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 8(SP)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining fields
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
+ LEAQ (DX)(R13*1), CX
+ MOVQ AX, R14
+ MOVQ CX, DX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ R12, $0x01
+ JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+ MOVQ R13, R12
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, R12
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(R12*8), R14
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_bmi2_adjust_skip
+ MOVQ 152(CX), R12
+ MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+ MOVQ 144(CX), R12
+ MOVQ R12, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), CX
+ MOVQ 24(SP), R12
+ LEAQ (CX)(R12*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ R12, 104(R14)
+ JS error_not_enough_literals
+ CMPQ CX, $0x00020002
+ JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+ TESTQ CX, CX
+ JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+ MOVQ 24(SP), CX
+ MOVQ 8(SP), R12
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (CX)(R13*1), R14
+ ADDQ R9, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ CX, CX
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (R10)(R14*1), X0
+ MOVUPS X0, (R9)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, CX
+ JB copy_1
+ ADDQ CX, R10
+ ADDQ CX, R9
+ ADDQ CX, R11
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ MOVQ R11, CX
+ ADDQ 40(SP), CX
+ CMPQ R12, CX
+ JG error_match_off_too_big
+ CMPQ R12, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+
+copy_4_end:
+ ADDQ R13, R11
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
+ ADDQ CX, R11
+ SUBQ CX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R9, CX
+ SUBQ R12, CX
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R11
+ MOVQ R9, R12
+ ADDQ R13, R9
+
+copy_2:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R12
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R11
+
+copy_slow_3:
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R11, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R10
+ MOVQ R10, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
+ MOVQ 112(AX), R10
+ MOVQ 128(AX), CX
+ MOVQ CX, 32(SP)
+ MOVQ 144(AX), R11
+ MOVQ 136(AX), R12
+ MOVQ 200(AX), CX
+ MOVQ CX, 56(SP)
+ MOVQ 176(AX), CX
+ MOVQ CX, 48(SP)
+ MOVQ 184(AX), AX
+ MOVQ AX, 40(SP)
+ MOVQ 40(SP), AX
+ ADDQ AX, 48(SP)
+
+ // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R10, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining fields
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_safe_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R13
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R13
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R13
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+ MOVQ R13, AX
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, AX
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(AX*8), R14
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
+ MOVQ 152(CX), AX
+ MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+ MOVQ 144(CX), AX
+ MOVQ AX, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), AX
+ MOVQ 24(SP), CX
+ LEAQ (AX)(CX*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ CX, 104(R14)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+ MOVQ 24(SP), AX
+ MOVQ 8(SP), CX
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (AX)(R13*1), R14
+ ADDQ R10, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ AX, AX
+ JZ check_offset
+ MOVQ AX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R10
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R11)(R14*1), R11
+ LEAQ 16(R10)(R14*1), R10
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ AX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ AX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R11), R14
+ MOVB -1(R11)(AX*1), R15
+ MOVB R14, (R10)
+ MOVB R15, -1(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R11), R14
+ MOVB 2(R11), R15
+ MOVW R14, (R10)
+ MOVB R15, 2(R10)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R11), R14
+ MOVL -4(R11)(AX*1), R15
+ MOVL R14, (R10)
+ MOVL R15, -4(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (R11), R14
+ MOVQ -8(R11)(AX*1), R15
+ MOVQ R14, (R10)
+ MOVQ R15, -8(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+
+copy_1_end:
+ ADDQ AX, R12
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ MOVQ R12, AX
+ ADDQ 40(SP), AX
+ CMPQ CX, AX
+ JG error_match_off_too_big
+ CMPQ CX, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
+ ADDQ AX, R12
+ SUBQ AX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R10, AX
+ SUBQ CX, AX
+
+ // ml <= mo
+ CMPQ R13, CX
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R12
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (AX), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, AX
+ ADDQ $0x10, R10
+ SUBQ $0x10, CX
+ JAE copy_2_loop
+ LEAQ 16(AX)(CX*1), AX
+ LEAQ 16(R10)(CX*1), R10
+ MOVUPS -16(AX), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (AX), CL
+ MOVB -1(AX)(R13*1), R14
+ MOVB CL, (R10)
+ MOVB R14, -1(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (AX), CX
+ MOVB 2(AX), R14
+ MOVW CX, (R10)
+ MOVB R14, 2(R10)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (AX), CX
+ MOVL -4(AX)(R13*1), R14
+ MOVL CX, (R10)
+ MOVL R14, -4(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (AX), CX
+ MOVQ -8(AX)(R13*1), R14
+ MOVQ CX, (R10)
+ MOVQ R14, -8(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R12
+
+copy_slow_3:
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R12, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R11
+ MOVQ R11, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
+ MOVQ 112(CX), R9
+ MOVQ 128(CX), R10
+ MOVQ R10, 32(SP)
+ MOVQ 144(CX), R10
+ MOVQ 136(CX), R11
+ MOVQ 200(CX), R12
+ MOVQ R12, 56(SP)
+ MOVQ 176(CX), R12
+ MOVQ R12, 48(SP)
+ MOVQ 184(CX), CX
+ MOVQ CX, 40(SP)
+ MOVQ 40(SP), CX
+ ADDQ CX, 48(SP)
+
+ // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R9, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+ MOVQ (SP), R12
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 8(SP)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining fields
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
+ LEAQ (DX)(R13*1), CX
+ MOVQ AX, R14
+ MOVQ CX, DX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ R12, $0x01
+ JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+ MOVQ R13, R12
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, R12
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(R12*8), R14
+ JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+ MOVQ 152(CX), R12
+ MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+ MOVQ 144(CX), R12
+ MOVQ R12, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), CX
+ MOVQ 24(SP), R12
+ LEAQ (CX)(R12*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ R12, 104(R14)
+ JS error_not_enough_literals
+ CMPQ CX, $0x00020002
+ JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+ TESTQ CX, CX
+ JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+ MOVQ 24(SP), CX
+ MOVQ 8(SP), R12
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (CX)(R13*1), R14
+ ADDQ R9, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ CX, CX
+ JZ check_offset
+ MOVQ CX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R10), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R10
+ ADDQ $0x10, R9
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R10)(R14*1), R10
+ LEAQ 16(R9)(R14*1), R9
+ MOVUPS -16(R10), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ CX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ CX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R10), R14
+ MOVB -1(R10)(CX*1), R15
+ MOVB R14, (R9)
+ MOVB R15, -1(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R10), R14
+ MOVB 2(R10), R15
+ MOVW R14, (R9)
+ MOVB R15, 2(R9)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R10), R14
+ MOVL -4(R10)(CX*1), R15
+ MOVL R14, (R9)
+ MOVL R15, -4(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (R10), R14
+ MOVQ -8(R10)(CX*1), R15
+ MOVQ R14, (R9)
+ MOVQ R15, -8(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+
+copy_1_end:
+ ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+ MOVQ R11, CX
+ ADDQ 40(SP), CX
+ CMPQ R12, CX
+ JG error_match_off_too_big
+ CMPQ R12, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+
+copy_4_end:
+ ADDQ R13, R11
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
+ ADDQ CX, R11
+ SUBQ CX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R9, CX
+ SUBQ R12, CX
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R11
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R9
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(CX)(R12*1), CX
+ LEAQ 16(R9)(R12*1), R9
+ MOVUPS -16(CX), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (CX), R12
+ MOVB -1(CX)(R13*1), R14
+ MOVB R12, (R9)
+ MOVB R14, -1(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (CX), R12
+ MOVB 2(CX), R14
+ MOVW R12, (R9)
+ MOVB R14, 2(R9)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (CX), R12
+ MOVL -4(CX)(R13*1), R14
+ MOVL R12, (R9)
+ MOVL R14, -4(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (CX), R12
+ MOVQ -8(CX)(R13*1), R14
+ MOVQ R12, (R9)
+ MOVQ R14, -8(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R11
+
+copy_slow_3:
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R11, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R10
+ MOVQ R10, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
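
The routine above reports its outcome through the integer it writes to ret+24(FP) at the labels near the end (loop_finished and the error_* blocks). A minimal Go sketch of that mapping, reconstructed only from those labels; the Go-side handling lives in seqdec_amd64.go, which is not part of this hunk, so every identifier below is illustrative:

```go
package main

import "fmt"

// Return codes written by the decodeSync assembly at its exit labels.
const (
	retOK                  = 0 // loop_finished: all sequences decoded
	retMatchLenOfsMismatch = 1 // non-zero match length while the offset is zero
	retMatchLenTooBig      = 2 // match length above the allowed maximum
	retMatchOffTooBig      = 3 // offset exceeds history+output or the window size
	retNotEnoughLiterals   = 4 // literal length exceeds the remaining literals
	retNotEnoughSpace      = 5 // decoded data would not fit in the output buffer
)

func describeRet(code int) string {
	switch code {
	case retOK:
		return "success"
	case retMatchLenOfsMismatch:
		return "zero match offset with non-zero match length"
	case retMatchLenTooBig:
		return "match length too big"
	case retMatchOffTooBig:
		return "match offset too big"
	case retNotEnoughLiterals:
		return "not enough literals"
	case retNotEnoughSpace:
		return "not enough output space"
	default:
		return fmt.Sprintf("unknown return code %d", code)
	}
}

func main() {
	fmt.Println(describeRet(3)) // prints: match offset too big
}
```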
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
new file mode 100644
index 0000000..c3452bc
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+ "fmt"
+ "io"
+)
+
+// decodeSyncSimple is a stub on builds without the assembly implementation; returning false tells the caller to fall back to the generic decode/execute path below.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+ return false, nil
+}
+
+// decode reads all sequences from the stream into seqs without executing them.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+ br := s.br
+
+ // Grab full sizes tables, to avoid bounds checks.
+ llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+ llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+ s.seqSize = 0
+ litRemain := len(s.literals)
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+ for i := range seqs {
+ var ll, mo, ml int
+ if br.off > 4+((maxOffsetBits+16+16)>>3) {
+ // inlined function:
+ // ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+ // Final will not read from stream.
+ var llB, mlB, moB uint8
+ ll, llB = llState.final()
+ ml, mlB = mlState.final()
+ mo, moB = ofState.final()
+
+ // extra bits are stored in reverse order.
+ br.fillFast()
+ mo += br.getBits(moB)
+ if s.maxBits > 32 {
+ br.fillFast()
+ }
+ ml += br.getBits(mlB)
+ ll += br.getBits(llB)
+
+ if moB > 1 {
+ s.prevOffset[2] = s.prevOffset[1]
+ s.prevOffset[1] = s.prevOffset[0]
+ s.prevOffset[0] = mo
+ } else {
+ // mo = s.adjustOffset(mo, ll, moB)
+ // Inlined for rather big speedup
+ if ll == 0 {
+ // There is an exception though, when current sequence's literals_length = 0.
+ // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+ // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+ mo++
+ }
+
+ if mo == 0 {
+ mo = s.prevOffset[0]
+ } else {
+ var temp int
+ if mo == 3 {
+ temp = s.prevOffset[0] - 1
+ } else {
+ temp = s.prevOffset[mo]
+ }
+
+ if temp == 0 {
+ // 0 is not valid; input is corrupted; force offset to 1
+ println("WARNING: temp was 0")
+ temp = 1
+ }
+
+ if mo != 1 {
+ s.prevOffset[2] = s.prevOffset[1]
+ }
+ s.prevOffset[1] = s.prevOffset[0]
+ s.prevOffset[0] = temp
+ mo = temp
+ }
+ }
+ br.fillFast()
+ } else {
+ if br.overread() {
+ if debugDecoder {
+ printf("reading sequence %d, exceeded available data\n", i)
+ }
+ return io.ErrUnexpectedEOF
+ }
+ ll, mo, ml = s.next(br, llState, mlState, ofState)
+ br.fill()
+ }
+
+ if debugSequences {
+ println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+ }
+ // Evaluate.
+ // We might be doing this async, so do it early.
+ if mo == 0 && ml > 0 {
+ return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+ }
+ if ml > maxMatchLen {
+ return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+ }
+ s.seqSize += ll + ml
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ litRemain -= ll
+ if litRemain < 0 {
+ return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+ }
+ seqs[i] = seqVals{
+ ll: ll,
+ ml: ml,
+ mo: mo,
+ }
+ if i == len(seqs)-1 {
+ // This is the last sequence, so we shouldn't update state.
+ break
+ }
+
+ // Manually inlined, ~ 5-20% faster
+ // Update all 3 states at once. Approx 20% faster.
+ nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+ if nBits == 0 {
+ llState = llTable[llState.newState()&maxTableMask]
+ mlState = mlTable[mlState.newState()&maxTableMask]
+ ofState = ofTable[ofState.newState()&maxTableMask]
+ } else {
+ bits := br.get32BitsFast(nBits)
+ lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+ llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+ lowBits = uint16(bits >> (ofState.nbBits() & 31))
+ lowBits &= bitMask[mlState.nbBits()&15]
+ mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+ lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+ ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+ }
+ }
+ s.seqSize += litRemain
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ }
+ return err
+}
+
+// executeSimple handles cases when a dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+ // Ensure we have enough output size...
+ if len(s.out)+s.seqSize > cap(s.out) {
+ addBytes := s.seqSize + len(s.out)
+ s.out = append(s.out, make([]byte, addBytes)...)
+ s.out = s.out[:len(s.out)-addBytes]
+ }
+
+ if debugDecoder {
+ printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+ }
+
+ var t = len(s.out)
+ out := s.out[:t+s.seqSize]
+
+ for _, seq := range seqs {
+ // Add literals
+ copy(out[t:], s.literals[:seq.ll])
+ t += seq.ll
+ s.literals = s.literals[seq.ll:]
+
+ // Malformed input
+ if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+ }
+
+ // Copy from history.
+ if v := seq.mo - t; v > 0 {
+ // v is the start position in history from end.
+ start := len(hist) - v
+ if seq.ml > v {
+ // Some goes into the current block.
+ // Copy remainder of history
+ copy(out[t:], hist[start:])
+ t += v
+ seq.ml -= v
+ } else {
+ copy(out[t:], hist[start:start+seq.ml])
+ t += seq.ml
+ continue
+ }
+ }
+
+ // We must be in the current buffer now
+ if seq.ml > 0 {
+ start := t - seq.mo
+ if seq.ml <= t-start {
+ // No overlap
+ copy(out[t:], out[start:start+seq.ml])
+ t += seq.ml
+ } else {
+ // Overlapping copy
+				// Copy one byte at a time so bytes already written for this match are available as source.
+ src := out[start : start+seq.ml]
+ dst := out[t:]
+ dst = dst[:len(src)]
+ t += len(src)
+				// Destination overlaps the source region within out.
+ for i := range src {
+ dst[i] = src[i]
+ }
+ }
+ }
+ }
+ // Add final literals
+ copy(out[t:], s.literals)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+ }
+ }
+ s.out = out
+
+ return nil
+}
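
The inlined repeat-offset handling in decode() above (the block that replaces the s.adjustOffset call) is easier to read as a standalone helper. The sketch below mirrors that inlined logic; the name and signature are illustrative, not the vendored API:

```go
package main

import "fmt"

// adjustRepeatOffset re-derives the inlined "adjust offset" logic from decode() above.
func adjustRepeatOffset(prev *[3]int, offset, litLen int, offsetBits uint8) int {
	if offsetBits > 1 {
		// A regular offset: shift the repeat-offset history and use it as-is.
		prev[2], prev[1], prev[0] = prev[1], prev[0], offset
		return offset
	}
	if litLen == 0 {
		// With literals_length == 0 the repeat codes are shifted by one.
		offset++
	}
	if offset == 0 {
		return prev[0] // most recent offset, history unchanged
	}
	var temp int
	if offset == 3 {
		temp = prev[0] - 1 // most recent offset minus one byte
	} else {
		temp = prev[offset] // second or third most recent offset
	}
	if temp == 0 {
		temp = 1 // zero is invalid; corrupted input is forced to offset 1
	}
	if offset != 1 {
		prev[2] = prev[1]
	}
	prev[1] = prev[0]
	prev[0] = temp
	return temp
}

func main() {
	prev := [3]int{8, 4, 2}
	fmt.Println(adjustRepeatOffset(&prev, 100, 5, 5), prev) // a regular offset replaces the most recent entry
}
```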
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 9d9d1d5..9e1baad 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -10,8 +10,8 @@
"hash/crc32"
"io"
- "github.com/golang/snappy"
"github.com/klauspost/compress/huff0"
+ snappy "github.com/klauspost/compress/internal/snapref"
)
const (
@@ -203,7 +203,7 @@
written += int64(n)
continue
case chunkTypeUncompressedData:
- if debug {
+ if debugEncoder {
println("Uncompressed, chunklen", chunkLen)
}
// Section 4.3. Uncompressed data (chunk type 0x01).
@@ -246,7 +246,7 @@
continue
case chunkTypeStreamIdentifier:
- if debug {
+ if debugEncoder {
println("stream id", chunkLen, len(snappyMagicBody))
}
// Section 4.1. Stream identifier (chunk type 0xff).
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index e35a0a2..29c15c8 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -13,40 +13,63 @@
// See https://www.winzip.com/win/en/comp_info.html
const ZipMethodWinZip = 93
-// ZipMethodPKWare is the method number used by PKWARE to indicate Zstandard compression.
-// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.7.TXT
+// ZipMethodPKWare is the original method number used by PKWARE to indicate Zstandard compression.
+// Deprecated: This has been deprecated by PKWARE, use ZipMethodWinZip instead for compression.
+// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
const ZipMethodPKWare = 20
-var zipReaderPool sync.Pool
-
-// newZipReader cannot be used since we would leak goroutines...
-func newZipReader(r io.Reader) io.ReadCloser {
- dec, ok := zipReaderPool.Get().(*Decoder)
- if ok {
- dec.Reset(r)
- } else {
- d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
- if err != nil {
- panic(err)
- }
- dec = d
+// zipReaderPool is the default reader pool.
+var zipReaderPool = sync.Pool{New: func() interface{} {
+ z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
+ if err != nil {
+ panic(err)
}
- return &pooledZipReader{dec: dec}
+ return z
+}}
+
+// newZipReader creates a pooled zip decompressor.
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+ pool := &zipReaderPool
+ if len(opts) > 0 {
+ opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+ // Force concurrency 1
+ opts = append(opts, WithDecoderConcurrency(1))
+ // Create our own pool
+ pool = &sync.Pool{}
+ }
+ return func(r io.Reader) io.ReadCloser {
+ dec, ok := pool.Get().(*Decoder)
+ if ok {
+ dec.Reset(r)
+ } else {
+ d, err := NewReader(r, opts...)
+ if err != nil {
+ panic(err)
+ }
+ dec = d
+ }
+ return &pooledZipReader{dec: dec, pool: pool}
+ }
}
type pooledZipReader struct {
- mu sync.Mutex // guards Close and Read
- dec *Decoder
+ mu sync.Mutex // guards Close and Read
+ pool *sync.Pool
+ dec *Decoder
}
func (r *pooledZipReader) Read(p []byte) (n int, err error) {
r.mu.Lock()
defer r.mu.Unlock()
if r.dec == nil {
- return 0, errors.New("Read after Close")
+ return 0, errors.New("read after close or EOF")
}
dec, err := r.dec.Read(p)
-
+ if err == io.EOF {
+ r.dec.Reset(nil)
+ r.pool.Put(r.dec)
+ r.dec = nil
+ }
return dec, err
}
@@ -56,15 +79,16 @@
var err error
if r.dec != nil {
err = r.dec.Reset(nil)
- zipReaderPool.Put(r.dec)
+ r.pool.Put(r.dec)
r.dec = nil
}
return err
}
type pooledZipWriter struct {
- mu sync.Mutex // guards Close and Read
- enc *Encoder
+ mu sync.Mutex // guards Close and Read
+ enc *Encoder
+ pool *sync.Pool
}
func (w *pooledZipWriter) Write(p []byte) (n int, err error) {
@@ -82,7 +106,7 @@
var err error
if w.enc != nil {
err = w.enc.Close()
- zipReaderPool.Put(w.enc)
+ w.pool.Put(w.enc)
w.enc = nil
}
return err
@@ -103,18 +127,15 @@
return nil, err
}
}
- return &pooledZipWriter{enc: enc}, nil
+ return &pooledZipWriter{enc: enc, pool: &pool}, nil
}
}
// ZipDecompressor returns a decompressor that can be registered with zip libraries.
// See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
- return func(r io.Reader) io.ReadCloser {
- d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
- if err != nil {
- panic(err)
- }
- return d.IOReadCloser()
- }
+// Options can be specified. WithDecoderConcurrency(1) is forced,
+// and by default a 128MB maximum decompression window is specified.
+// The window size can be overridden if required.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+ return newZipReader(opts...)
}
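
For context on how the exported zip helpers changed above are consumed, a minimal usage sketch with the standard archive/zip package; ZipCompressor is assumed to follow the same option pattern as ZipDecompressor, and error handling is kept minimal:

```go
package main

import (
	"archive/zip"
	"bytes"
	"io"
	"log"

	"github.com/klauspost/compress/zstd"
)

func main() {
	var buf bytes.Buffer

	// Write one entry compressed with zstd (WinZip method 93).
	zw := zip.NewWriter(&buf)
	zw.RegisterCompressor(zstd.ZipMethodWinZip, zstd.ZipCompressor())
	w, err := zw.CreateHeader(&zip.FileHeader{Name: "hello.txt", Method: zstd.ZipMethodWinZip})
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write([]byte("hello zstd in zip")); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}

	// Read it back using the pooled decompressor from the diff above.
	zr, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
	if err != nil {
		log.Fatal(err)
	}
	zr.RegisterDecompressor(zstd.ZipMethodWinZip, zstd.ZipDecompressor())
	rc, err := zr.File[0].Open()
	if err != nil {
		log.Fatal(err)
	}
	defer rc.Close()
	data, err := io.ReadAll(rc)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("decoded: %s", data)
}
```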
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 1ba308c..3eb3f1c 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -15,6 +15,12 @@
// enable debug printing
const debug = false
+// enable encoding debug printing
+const debugEncoder = debug
+
+// enable decoding debug printing
+const debugDecoder = debug
+
// Enable extra assertions.
const debugAsserts = debug || false
@@ -33,6 +39,9 @@
// Reset the buffer offset when reaching this.
const bufferReset = math.MaxInt32 - MaxWindowSize
+// fcsUnknown is used for unknown frame content size.
+const fcsUnknown = math.MaxUint64
+
var (
// ErrReservedBlockType is returned when a reserved block type is found.
// Typically this indicates wrong or corrupted input.
@@ -46,6 +55,10 @@
// Typically returned on invalid input.
ErrBlockTooSmall = errors.New("block too small")
+ // ErrUnexpectedBlockSize is returned when a block has unexpected size.
+ // Typically returned on invalid input.
+ ErrUnexpectedBlockSize = errors.New("unexpected block size")
+
// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
// Typically this indicates wrong or corrupted input.
ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
@@ -69,6 +82,10 @@
// This is only returned if SingleSegment is specified on the frame.
ErrFrameSizeExceeded = errors.New("frame size exceeded")
+ // ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size.
+ // This is only returned if SingleSegment is specified on the frame.
+ ErrFrameSizeMismatch = errors.New("frame size does not match size on stream")
+
// ErrCRCMismatch is returned if CRC mismatches.
ErrCRCMismatch = errors.New("CRC check failed")
@@ -82,28 +99,17 @@
)
func println(a ...interface{}) {
- if debug {
+ if debug || debugDecoder || debugEncoder {
log.Println(a...)
}
}
func printf(format string, a ...interface{}) {
- if debug {
+ if debug || debugDecoder || debugEncoder {
log.Printf(format, a...)
}
}
-// matchLenFast does matching, but will not match the last up to 7 bytes.
-func matchLenFast(a, b []byte) int {
- endI := len(a) & (math.MaxInt32 - 7)
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- return i + bits.TrailingZeros64(diff)>>3
- }
- }
- return endI
-}
-
// matchLen returns the maximum length.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.