blob: 53249df0561ff74d6fb10628bd90a462978a9a3b [file] [log] [blame]
// Package huff0 provides fast huffman encoding as used in zstd.
//
// See README.md at https://github.com/klauspost/compress/tree/master/huff0 for details.
package huff0
5
6import (
7 "errors"
8 "fmt"
9 "math"
10 "math/bits"
11
12 "github.com/klauspost/compress/fse"
13)
14
const (
	// maxSymbolValue is the highest symbol value that can occur (a full byte).
	maxSymbolValue = 255

	// zstandard limits tablelog to 11, see:
	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-description
	tableLogMax     = 11
	tableLogDefault = 11
	// minTablelog is the smallest tablelog accepted by prepare.
	minTablelog = 5
	// huffNodesLen sizes the node scratch buffer used when building the tree.
	huffNodesLen = 512

	// BlockSizeMax is maximum input size for a single block uncompressed.
	BlockSizeMax = 1<<18 - 1
)
28
var (
	// ErrIncompressible is returned when input is judged to be too hard to compress.
	ErrIncompressible = errors.New("input is not compressible")

	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
	ErrUseRLE = errors.New("input is single value repeated")

	// ErrTooBig is returned if the input is too large for a single block.
	ErrTooBig = errors.New("input too big")

	// ErrMaxDecodedSizeExceeded is returned when the decoded output would
	// exceed the configured (or default) MaxDecodedSize.
	ErrMaxDecodedSizeExceeded = errors.New("maximum output size exceeded")
)
42
// ReusePolicy controls whether the compressor may reuse a previously
// generated table for subsequent blocks.
type ReusePolicy uint8

const (
	// ReusePolicyAllow will allow reuse if it produces smaller output.
	ReusePolicyAllow ReusePolicy = iota

	// ReusePolicyPrefer will re-use aggressively if possible.
	// This will not check if a new table will produce smaller output,
	// except if the current table is impossible to use or
	// compressed output is bigger than input.
	ReusePolicyPrefer

	// ReusePolicyNone will disable re-use of tables.
	// This is slightly faster than ReusePolicyAllow but may produce larger output.
	ReusePolicyNone
)
59
// Scratch provides temporary storage and per-block parameters shared by
// compression and decompression. Reusing a Scratch between calls avoids
// repeated allocations.
type Scratch struct {
	// count is the symbol histogram of the current input block.
	count [maxSymbolValue + 1]uint32

	// Per block parameters.
	// These can be used to override compression parameters of the block.
	// Do not touch, unless you know what you are doing.

	// Out is output buffer.
	// If the scratch is re-used before the caller is done processing the output,
	// set this field to nil.
	// Otherwise the output buffer will be re-used for next Compression/Decompression step
	// and allocation will be avoided.
	Out []byte

	// OutTable will contain the table data only, if a new table has been generated.
	// Slice of the returned data.
	OutTable []byte

	// OutData will contain the compressed data.
	// Slice of the returned data.
	OutData []byte

	// MaxSymbolValue will override the maximum symbol value of the next block.
	MaxSymbolValue uint8

	// TableLog will attempt to override the tablelog for the next block.
	// Must be <= 11 and >= 5.
	TableLog uint8

	// Reuse will specify the reuse policy
	Reuse ReusePolicy

	// WantLogLess allows to specify a log 2 reduction that should at least be achieved,
	// otherwise the block will be returned as incompressible.
	// The reduction should then at least be (input size >> WantLogLess)
	// If WantLogLess == 0 any improvement will do.
	WantLogLess uint8

	// MaxDecodedSize will set the maximum allowed output size.
	// This value will automatically be set to BlockSizeMax if not set.
	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
	MaxDecodedSize int

	br             byteReader // reader over the current input block (set by prepare)
	symbolLen      uint16     // Length of active part of the symbol table.
	maxCount       int        // count of the most probable symbol
	clearCount     bool       // clear count
	actualTableLog uint8      // Selected tablelog.
	prevTableLog   uint8      // Tablelog for previous table
	prevTable      cTable     // Table used for previous compression.
	cTable         cTable     // compression table
	dt             dTable     // decompression table
	nodes          []nodeElt  // scratch nodes, sized huffNodesLen+1 by prepare
	tmpOut         [4][]byte  // NOTE(review): presumably per-stream output scratch — confirm against callers
	fse            *fse.Scratch             // FSE scratch used to compress table weights in cTable.write
	huffWeight     [maxSymbolValue + 1]byte // per-symbol weights used when serializing the table
}
117
118func (s *Scratch) prepare(in []byte) (*Scratch, error) {
119 if len(in) > BlockSizeMax {
120 return nil, ErrTooBig
121 }
122 if s == nil {
123 s = &Scratch{}
124 }
125 if s.MaxSymbolValue == 0 {
126 s.MaxSymbolValue = maxSymbolValue
127 }
128 if s.TableLog == 0 {
129 s.TableLog = tableLogDefault
130 }
131 if s.TableLog > tableLogMax || s.TableLog < minTablelog {
132 return nil, fmt.Errorf(" invalid tableLog %d (%d -> %d)", s.TableLog, minTablelog, tableLogMax)
133 }
134 if s.MaxDecodedSize <= 0 || s.MaxDecodedSize > BlockSizeMax {
135 s.MaxDecodedSize = BlockSizeMax
136 }
137 if s.clearCount && s.maxCount == 0 {
138 for i := range s.count {
139 s.count[i] = 0
140 }
141 s.clearCount = false
142 }
143 if cap(s.Out) == 0 {
144 s.Out = make([]byte, 0, len(in))
145 }
146 s.Out = s.Out[:0]
147
148 s.OutTable = nil
149 s.OutData = nil
150 if cap(s.nodes) < huffNodesLen+1 {
151 s.nodes = make([]nodeElt, 0, huffNodesLen+1)
152 }
153 s.nodes = s.nodes[:0]
154 if s.fse == nil {
155 s.fse = &fse.Scratch{}
156 }
157 s.br.init(in)
158
159 return s, nil
160}
161
// cTable is the Huffman compression table, one entry per symbol value.
type cTable []cTableEntry

// write serializes the table to s.Out: first it tries FSE-compressing the
// symbol weights (emitted as a size byte followed by the FSE stream); if
// that is not smaller, it falls back to raw 4-bit packed weights with a
// leading 128|count byte. Returns ErrIncompressible if the raw form cannot
// represent the table.
func (c cTable) write(s *Scratch) error {
	var (
		// precomputed conversion table
		bitsToWeight [tableLogMax + 1]byte
		huffLog      = s.actualTableLog
		// last weight is not saved.
		maxSymbolValue = uint8(s.symbolLen - 1)
		huffWeight     = s.huffWeight[:256]
	)
	const (
		maxFSETableLog = 6
	)
	// convert to weight: longer codes (more bits) map to smaller weights.
	bitsToWeight[0] = 0
	for n := uint8(1); n < huffLog+1; n++ {
		bitsToWeight[n] = huffLog + 1 - n
	}

	// Acquire histogram for FSE.
	hist := s.fse.Histogram()
	hist = hist[:256]
	// Only the first 16 slots are used: weights fit in 4 bits.
	for i := range hist[:16] {
		hist[i] = 0
	}
	for n := uint8(0); n < maxSymbolValue; n++ {
		v := bitsToWeight[c[n].nBits] & 15
		huffWeight[n] = v
		hist[v]++
	}

	// FSE compress if feasible.
	if maxSymbolValue >= 2 {
		huffMaxCnt := uint32(0)
		huffMax := uint8(0)
		// Find the highest used weight and the largest count for the FSE header.
		for i, v := range hist[:16] {
			if v == 0 {
				continue
			}
			huffMax = byte(i)
			if v > huffMaxCnt {
				huffMaxCnt = v
			}
		}
		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
		s.fse.TableLog = maxFSETableLog
		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
		// Only accept the FSE form when it beats half the symbol count,
		// i.e. it is smaller than the raw 4-bit packing below.
		if err == nil && len(b) < int(s.symbolLen>>1) {
			s.Out = append(s.Out, uint8(len(b)))
			s.Out = append(s.Out, b...)
			return nil
		}
		// Unable to compress (RLE/uncompressible)
	}
	// write raw values as 4-bits (max : 15)
	if maxSymbolValue > (256 - 128) {
		// should not happen : likely means source cannot be compressed
		return ErrIncompressible
	}
	op := s.Out
	// special case, pack weights 4 bits/weight.
	op = append(op, 128|(maxSymbolValue-1))
	// be sure it doesn't cause msan issue in final combination
	huffWeight[maxSymbolValue] = 0
	for n := uint16(0); n < uint16(maxSymbolValue); n += 2 {
		op = append(op, (huffWeight[n]<<4)|huffWeight[n+1])
	}
	s.Out = op
	return nil
}
233
234// estimateSize returns the estimated size in bytes of the input represented in the
235// histogram supplied.
236func (c cTable) estimateSize(hist []uint32) int {
237 nbBits := uint32(7)
238 for i, v := range c[:len(hist)] {
239 nbBits += uint32(v.nBits) * hist[i]
240 }
241 return int(nbBits >> 3)
242}
243
244// minSize returns the minimum possible size considering the shannon limit.
245func (s *Scratch) minSize(total int) int {
246 nbBits := float64(7)
247 fTotal := float64(total)
248 for _, v := range s.count[:s.symbolLen] {
249 n := float64(v)
250 if n > 0 {
251 nbBits += math.Log2(fTotal/n) * n
252 }
253 }
254 return int(nbBits) >> 3
255}
256
// highBit32 returns the index of the highest set bit in val.
// For val == 0 the result wraps around to the maximum uint32 value.
func highBit32(val uint32) (n uint32) {
	return uint32(31 - bits.LeadingZeros32(val))
}