Blame - vendor/github.com/klauspost/compress/compressible.go - kafka-topic-exporter

blob: ea5a692d5130f55b62d741e753d264e2ba3ac598 [file] [log] [blame]

kesavand	c71914f	2022-03-25 11:19:03 +0530	[diff] [blame^]	1	package compress
				2
				3	import "math"
				4
				5	// Estimate returns a normalized compressibility estimate of block b.
				6	// Values close to zero are likely uncompressible.
				7	// Values above 0.1 are likely to be compressible.
				8	// Values above 0.5 are very compressible.
				9	// Very small lengths will return 0.
				10	func Estimate(b []byte) float64 {
				11	if len(b) < 16 {
				12	return 0
				13	}
				14
				15	// Correctly predicted order 1
				16	hits := 0
				17	lastMatch := false
				18	var o1 [256]byte
				19	var hist [256]int
				20	c1 := byte(0)
				21	for _, c := range b {
				22	if c == o1[c1] {
				23	// We only count a hit if there was two correct predictions in a row.
				24	if lastMatch {
				25	hits++
				26	}
				27	lastMatch = true
				28	} else {
				29	lastMatch = false
				30	}
				31	o1[c1] = c
				32	c1 = c
				33	hist[c]++
				34	}
				35
				36	// Use x^0.6 to give better spread
				37	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
				38
				39	// Calculate histogram distribution
				40	variance := float64(0)
				41	avg := float64(len(b)) / 256
				42
				43	for _, v := range hist {
				44	Δ := float64(v) - avg
				45	variance += Δ * Δ
				46	}
				47
				48	stddev := math.Sqrt(float64(variance)) / float64(len(b))
				49	exp := math.Sqrt(1 / float64(len(b)))
				50
				51	// Subtract expected stddev
				52	stddev -= exp
				53	if stddev < 0 {
				54	stddev = 0
				55	}
				56	stddev *= 1 + exp
				57
				58	// Use x^0.4 to give better spread
				59	entropy := math.Pow(stddev, 0.4)
				60
				61	// 50/50 weight between prediction and histogram distribution
				62	return math.Pow((prediction+entropy)/2, 0.9)
				63	}
				64
				65	// ShannonEntropyBits returns the number of bits minimum required to represent
				66	// an entropy encoding of the input bytes.
				67	// https://en.wiktionary.org/wiki/Shannon_entropy
				68	func ShannonEntropyBits(b []byte) int {
				69	if len(b) == 0 {
				70	return 0
				71	}
				72	var hist [256]int
				73	for _, c := range b {
				74	hist[c]++
				75	}
				76	shannon := float64(0)
				77	invTotal := 1.0 / float64(len(b))
				78	for _, v := range hist[:] {
				79	if v > 0 {
				80	n := float64(v)
				81	shannon += math.Ceil(-math.Log2(ninvTotal) n)
				82	}
				83	}
				84	return int(math.Ceil(shannon))
				85	}