Scott Baker | ed4efab | 2020-01-13 19:12:25 -0800 | [diff] [blame] | 1 | // Copyright 2018 Klaus Post. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | // Based on work Copyright (c) 2013, Yann Collet, released under BSD License. |
| 5 | |
| 6 | // Package fse provides Finite State Entropy encoding and decoding. |
| 7 | // |
| 8 | // Finite State Entropy encoding provides a fast near-optimal symbol encoding/decoding |
| 9 | // for byte blocks as implemented in zstd. |
| 10 | // |
| 11 | // See https://github.com/klauspost/compress/tree/master/fse for more information. |
| 12 | package fse |
| 13 | |
| 14 | import ( |
| 15 | "errors" |
| 16 | "fmt" |
| 17 | "math/bits" |
| 18 | ) |
| 19 | |
| 20 | const ( |
| 21 | /*!MEMORY_USAGE : |
| 22 | * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) |
| 23 | * Increasing memory usage improves compression ratio |
| 24 | * Reduced memory usage can improve speed, due to cache effect |
| 25 | * Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ |
| 26 | maxMemoryUsage = 14 |
| 27 | defaultMemoryUsage = 13 |
| 28 | |
| 29 | maxTableLog = maxMemoryUsage - 2 |
| 30 | maxTablesize = 1 << maxTableLog |
| 31 | defaultTablelog = defaultMemoryUsage - 2 |
| 32 | minTablelog = 5 |
| 33 | maxSymbolValue = 255 |
| 34 | ) |
| 35 | |
| 36 | var ( |
| 37 | // ErrIncompressible is returned when input is judged to be too hard to compress. |
| 38 | ErrIncompressible = errors.New("input is not compressible") |
| 39 | |
| 40 | // ErrUseRLE is returned from the compressor when the input is a single byte value repeated. |
| 41 | ErrUseRLE = errors.New("input is single value repeated") |
| 42 | ) |
| 43 | |
| 44 | // Scratch provides temporary storage for compression and decompression. |
| 45 | type Scratch struct { |
| 46 | // Private |
David K. Bainbridge | bd6b288 | 2021-08-26 13:31:02 +0000 | [diff] [blame] | 47 | count [maxSymbolValue + 1]uint32 |
| 48 | norm [maxSymbolValue + 1]int16 |
| 49 | br byteReader |
| 50 | bits bitReader |
| 51 | bw bitWriter |
| 52 | ct cTable // Compression tables. |
| 53 | decTable []decSymbol // Decompression table. |
| 54 | maxCount int // count of the most probable symbol |
Scott Baker | ed4efab | 2020-01-13 19:12:25 -0800 | [diff] [blame] | 55 | |
| 56 | // Per block parameters. |
| 57 | // These can be used to override compression parameters of the block. |
| 58 | // Do not touch, unless you know what you are doing. |
| 59 | |
| 60 | // Out is output buffer. |
| 61 | // If the scratch is re-used before the caller is done processing the output, |
| 62 | // set this field to nil. |
| 63 | // Otherwise the output buffer will be re-used for next Compression/Decompression step |
| 64 | // and allocation will be avoided. |
| 65 | Out []byte |
| 66 | |
Scott Baker | ed4efab | 2020-01-13 19:12:25 -0800 | [diff] [blame] | 67 | // DecompressLimit limits the maximum decoded size acceptable. |
| 68 | // If > 0 decompression will stop when approximately this many bytes |
| 69 | // has been decoded. |
| 70 | // If 0, maximum size will be 2GB. |
| 71 | DecompressLimit int |
David K. Bainbridge | bd6b288 | 2021-08-26 13:31:02 +0000 | [diff] [blame] | 72 | |
| 73 | symbolLen uint16 // Length of active part of the symbol table. |
| 74 | actualTableLog uint8 // Selected tablelog. |
| 75 | zeroBits bool // no bits has prob > 50%. |
| 76 | clearCount bool // clear count |
| 77 | |
| 78 | // MaxSymbolValue will override the maximum symbol value of the next block. |
| 79 | MaxSymbolValue uint8 |
| 80 | |
| 81 | // TableLog will attempt to override the tablelog for the next block. |
| 82 | TableLog uint8 |
Scott Baker | ed4efab | 2020-01-13 19:12:25 -0800 | [diff] [blame] | 83 | } |
| 84 | |
| 85 | // Histogram allows to populate the histogram and skip that step in the compression, |
| 86 | // It otherwise allows to inspect the histogram when compression is done. |
| 87 | // To indicate that you have populated the histogram call HistogramFinished |
| 88 | // with the value of the highest populated symbol, as well as the number of entries |
| 89 | // in the most populated entry. These are accepted at face value. |
| 90 | // The returned slice will always be length 256. |
| 91 | func (s *Scratch) Histogram() []uint32 { |
| 92 | return s.count[:] |
| 93 | } |
| 94 | |
| 95 | // HistogramFinished can be called to indicate that the histogram has been populated. |
| 96 | // maxSymbol is the index of the highest set symbol of the next data segment. |
| 97 | // maxCount is the number of entries in the most populated entry. |
| 98 | // These are accepted at face value. |
| 99 | func (s *Scratch) HistogramFinished(maxSymbol uint8, maxCount int) { |
| 100 | s.maxCount = maxCount |
| 101 | s.symbolLen = uint16(maxSymbol) + 1 |
| 102 | s.clearCount = maxCount != 0 |
| 103 | } |
| 104 | |
| 105 | // prepare will prepare and allocate scratch tables used for both compression and decompression. |
| 106 | func (s *Scratch) prepare(in []byte) (*Scratch, error) { |
| 107 | if s == nil { |
| 108 | s = &Scratch{} |
| 109 | } |
| 110 | if s.MaxSymbolValue == 0 { |
| 111 | s.MaxSymbolValue = 255 |
| 112 | } |
| 113 | if s.TableLog == 0 { |
| 114 | s.TableLog = defaultTablelog |
| 115 | } |
| 116 | if s.TableLog > maxTableLog { |
| 117 | return nil, fmt.Errorf("tableLog (%d) > maxTableLog (%d)", s.TableLog, maxTableLog) |
| 118 | } |
| 119 | if cap(s.Out) == 0 { |
| 120 | s.Out = make([]byte, 0, len(in)) |
| 121 | } |
| 122 | if s.clearCount && s.maxCount == 0 { |
| 123 | for i := range s.count { |
| 124 | s.count[i] = 0 |
| 125 | } |
| 126 | s.clearCount = false |
| 127 | } |
| 128 | s.br.init(in) |
| 129 | if s.DecompressLimit == 0 { |
| 130 | // Max size 2GB. |
| 131 | s.DecompressLimit = (2 << 30) - 1 |
| 132 | } |
| 133 | |
| 134 | return s, nil |
| 135 | } |
| 136 | |
| 137 | // tableStep returns the next table index. |
| 138 | func tableStep(tableSize uint32) uint32 { |
| 139 | return (tableSize >> 1) + (tableSize >> 3) + 3 |
| 140 | } |
| 141 | |
| 142 | func highBits(val uint32) (n uint32) { |
| 143 | return uint32(bits.Len32(val) - 1) |
| 144 | } |