blob: ea5a692d5130f55b62d741e753d264e2ba3ac598 [file] [log] [blame]
kesavandc71914f2022-03-25 11:19:03 +05301package compress
2
3import "math"
4
5// Estimate returns a normalized compressibility estimate of block b.
6// Values close to zero are likely uncompressible.
7// Values above 0.1 are likely to be compressible.
8// Values above 0.5 are very compressible.
9// Very small lengths will return 0.
10func Estimate(b []byte) float64 {
11 if len(b) < 16 {
12 return 0
13 }
14
15 // Correctly predicted order 1
16 hits := 0
17 lastMatch := false
18 var o1 [256]byte
19 var hist [256]int
20 c1 := byte(0)
21 for _, c := range b {
22 if c == o1[c1] {
23 // We only count a hit if there was two correct predictions in a row.
24 if lastMatch {
25 hits++
26 }
27 lastMatch = true
28 } else {
29 lastMatch = false
30 }
31 o1[c1] = c
32 c1 = c
33 hist[c]++
34 }
35
36 // Use x^0.6 to give better spread
37 prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
38
39 // Calculate histogram distribution
40 variance := float64(0)
41 avg := float64(len(b)) / 256
42
43 for _, v := range hist {
44 Δ := float64(v) - avg
45 variance += Δ * Δ
46 }
47
48 stddev := math.Sqrt(float64(variance)) / float64(len(b))
49 exp := math.Sqrt(1 / float64(len(b)))
50
51 // Subtract expected stddev
52 stddev -= exp
53 if stddev < 0 {
54 stddev = 0
55 }
56 stddev *= 1 + exp
57
58 // Use x^0.4 to give better spread
59 entropy := math.Pow(stddev, 0.4)
60
61 // 50/50 weight between prediction and histogram distribution
62 return math.Pow((prediction+entropy)/2, 0.9)
63}
64
65// ShannonEntropyBits returns the number of bits minimum required to represent
66// an entropy encoding of the input bytes.
67// https://en.wiktionary.org/wiki/Shannon_entropy
68func ShannonEntropyBits(b []byte) int {
69 if len(b) == 0 {
70 return 0
71 }
72 var hist [256]int
73 for _, c := range b {
74 hist[c]++
75 }
76 shannon := float64(0)
77 invTotal := 1.0 / float64(len(b))
78 for _, v := range hist[:] {
79 if v > 0 {
80 n := float64(v)
81 shannon += math.Ceil(-math.Log2(n*invTotal) * n)
82 }
83 }
84 return int(math.Ceil(shannon))
85}