Scott Baker | 2d89798 | 2019-09-24 11:50:08 -0700 | [diff] [blame] | 1 | // Copyright 2016 The Snappy-Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build !amd64 appengine !gc noasm |
| 6 | |
| 7 | package snappy |
| 8 | |
| 9 | func load32(b []byte, i int) uint32 { |
| 10 | b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. |
| 11 | return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
| 12 | } |
| 13 | |
| 14 | func load64(b []byte, i int) uint64 { |
| 15 | b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. |
| 16 | return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | |
| 17 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 |
| 18 | } |
| 19 | |
| 20 | // emitLiteral writes a literal chunk and returns the number of bytes written. |
| 21 | // |
| 22 | // It assumes that: |
| 23 | // dst is long enough to hold the encoded bytes |
| 24 | // 1 <= len(lit) && len(lit) <= 65536 |
| 25 | func emitLiteral(dst, lit []byte) int { |
| 26 | i, n := 0, uint(len(lit)-1) |
| 27 | switch { |
| 28 | case n < 60: |
| 29 | dst[0] = uint8(n)<<2 | tagLiteral |
| 30 | i = 1 |
| 31 | case n < 1<<8: |
| 32 | dst[0] = 60<<2 | tagLiteral |
| 33 | dst[1] = uint8(n) |
| 34 | i = 2 |
| 35 | default: |
| 36 | dst[0] = 61<<2 | tagLiteral |
| 37 | dst[1] = uint8(n) |
| 38 | dst[2] = uint8(n >> 8) |
| 39 | i = 3 |
| 40 | } |
| 41 | return i + copy(dst[i:], lit) |
| 42 | } |
| 43 | |
| 44 | // emitCopy writes a copy chunk and returns the number of bytes written. |
| 45 | // |
| 46 | // It assumes that: |
| 47 | // dst is long enough to hold the encoded bytes |
| 48 | // 1 <= offset && offset <= 65535 |
| 49 | // 4 <= length && length <= 65535 |
| 50 | func emitCopy(dst []byte, offset, length int) int { |
| 51 | i := 0 |
| 52 | // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The |
| 53 | // threshold for this loop is a little higher (at 68 = 64 + 4), and the |
| 54 | // length emitted down below is is a little lower (at 60 = 64 - 4), because |
| 55 | // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed |
| 56 | // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as |
| 57 | // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as |
| 58 | // 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a |
| 59 | // tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an |
| 60 | // encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1. |
| 61 | for length >= 68 { |
| 62 | // Emit a length 64 copy, encoded as 3 bytes. |
| 63 | dst[i+0] = 63<<2 | tagCopy2 |
| 64 | dst[i+1] = uint8(offset) |
| 65 | dst[i+2] = uint8(offset >> 8) |
| 66 | i += 3 |
| 67 | length -= 64 |
| 68 | } |
| 69 | if length > 64 { |
| 70 | // Emit a length 60 copy, encoded as 3 bytes. |
| 71 | dst[i+0] = 59<<2 | tagCopy2 |
| 72 | dst[i+1] = uint8(offset) |
| 73 | dst[i+2] = uint8(offset >> 8) |
| 74 | i += 3 |
| 75 | length -= 60 |
| 76 | } |
| 77 | if length >= 12 || offset >= 2048 { |
| 78 | // Emit the remaining copy, encoded as 3 bytes. |
| 79 | dst[i+0] = uint8(length-1)<<2 | tagCopy2 |
| 80 | dst[i+1] = uint8(offset) |
| 81 | dst[i+2] = uint8(offset >> 8) |
| 82 | return i + 3 |
| 83 | } |
| 84 | // Emit the remaining copy, encoded as 2 bytes. |
| 85 | dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 |
| 86 | dst[i+1] = uint8(offset) |
| 87 | return i + 2 |
| 88 | } |
| 89 | |
| 90 | // extendMatch returns the largest k such that k <= len(src) and that |
| 91 | // src[i:i+k-j] and src[j:k] have the same contents. |
| 92 | // |
| 93 | // It assumes that: |
| 94 | // 0 <= i && i < j && j <= len(src) |
| 95 | func extendMatch(src []byte, i, j int) int { |
| 96 | for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { |
| 97 | } |
| 98 | return j |
| 99 | } |
| 100 | |
| 101 | func hash(u, shift uint32) uint32 { |
| 102 | return (u * 0x1e35a7bd) >> shift |
| 103 | } |
| 104 | |
| 105 | // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It |
| 106 | // assumes that the varint-encoded length of the decompressed bytes has already |
| 107 | // been written. |
| 108 | // |
| 109 | // It also assumes that: |
| 110 | // len(dst) >= MaxEncodedLen(len(src)) && |
| 111 | // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize |
| 112 | func encodeBlock(dst, src []byte) (d int) { |
| 113 | // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. |
| 114 | // The table element type is uint16, as s < sLimit and sLimit < len(src) |
| 115 | // and len(src) <= maxBlockSize and maxBlockSize == 65536. |
| 116 | const ( |
| 117 | maxTableSize = 1 << 14 |
| 118 | // tableMask is redundant, but helps the compiler eliminate bounds |
| 119 | // checks. |
| 120 | tableMask = maxTableSize - 1 |
| 121 | ) |
| 122 | shift := uint32(32 - 8) |
| 123 | for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { |
| 124 | shift-- |
| 125 | } |
| 126 | // In Go, all array elements are zero-initialized, so there is no advantage |
| 127 | // to a smaller tableSize per se. However, it matches the C++ algorithm, |
| 128 | // and in the asm versions of this code, we can get away with zeroing only |
| 129 | // the first tableSize elements. |
| 130 | var table [maxTableSize]uint16 |
| 131 | |
| 132 | // sLimit is when to stop looking for offset/length copies. The inputMargin |
| 133 | // lets us use a fast path for emitLiteral in the main loop, while we are |
| 134 | // looking for copies. |
| 135 | sLimit := len(src) - inputMargin |
| 136 | |
| 137 | // nextEmit is where in src the next emitLiteral should start from. |
| 138 | nextEmit := 0 |
| 139 | |
| 140 | // The encoded form must start with a literal, as there are no previous |
| 141 | // bytes to copy, so we start looking for hash matches at s == 1. |
| 142 | s := 1 |
| 143 | nextHash := hash(load32(src, s), shift) |
| 144 | |
| 145 | for { |
| 146 | // Copied from the C++ snappy implementation: |
| 147 | // |
| 148 | // Heuristic match skipping: If 32 bytes are scanned with no matches |
| 149 | // found, start looking only at every other byte. If 32 more bytes are |
| 150 | // scanned (or skipped), look at every third byte, etc.. When a match |
| 151 | // is found, immediately go back to looking at every byte. This is a |
| 152 | // small loss (~5% performance, ~0.1% density) for compressible data |
| 153 | // due to more bookkeeping, but for non-compressible data (such as |
| 154 | // JPEG) it's a huge win since the compressor quickly "realizes" the |
| 155 | // data is incompressible and doesn't bother looking for matches |
| 156 | // everywhere. |
| 157 | // |
| 158 | // The "skip" variable keeps track of how many bytes there are since |
| 159 | // the last match; dividing it by 32 (ie. right-shifting by five) gives |
| 160 | // the number of bytes to move ahead for each iteration. |
| 161 | skip := 32 |
| 162 | |
| 163 | nextS := s |
| 164 | candidate := 0 |
| 165 | for { |
| 166 | s = nextS |
| 167 | bytesBetweenHashLookups := skip >> 5 |
| 168 | nextS = s + bytesBetweenHashLookups |
| 169 | skip += bytesBetweenHashLookups |
| 170 | if nextS > sLimit { |
| 171 | goto emitRemainder |
| 172 | } |
| 173 | candidate = int(table[nextHash&tableMask]) |
| 174 | table[nextHash&tableMask] = uint16(s) |
| 175 | nextHash = hash(load32(src, nextS), shift) |
| 176 | if load32(src, s) == load32(src, candidate) { |
| 177 | break |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | // A 4-byte match has been found. We'll later see if more than 4 bytes |
| 182 | // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit |
| 183 | // them as literal bytes. |
| 184 | d += emitLiteral(dst[d:], src[nextEmit:s]) |
| 185 | |
| 186 | // Call emitCopy, and then see if another emitCopy could be our next |
| 187 | // move. Repeat until we find no match for the input immediately after |
| 188 | // what was consumed by the last emitCopy call. |
| 189 | // |
| 190 | // If we exit this loop normally then we need to call emitLiteral next, |
| 191 | // though we don't yet know how big the literal will be. We handle that |
| 192 | // by proceeding to the next iteration of the main loop. We also can |
| 193 | // exit this loop via goto if we get close to exhausting the input. |
| 194 | for { |
| 195 | // Invariant: we have a 4-byte match at s, and no need to emit any |
| 196 | // literal bytes prior to s. |
| 197 | base := s |
| 198 | |
| 199 | // Extend the 4-byte match as long as possible. |
| 200 | // |
| 201 | // This is an inlined version of: |
| 202 | // s = extendMatch(src, candidate+4, s+4) |
| 203 | s += 4 |
| 204 | for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 { |
| 205 | } |
| 206 | |
| 207 | d += emitCopy(dst[d:], base-candidate, s-base) |
| 208 | nextEmit = s |
| 209 | if s >= sLimit { |
| 210 | goto emitRemainder |
| 211 | } |
| 212 | |
| 213 | // We could immediately start working at s now, but to improve |
| 214 | // compression we first update the hash table at s-1 and at s. If |
| 215 | // another emitCopy is not our next move, also calculate nextHash |
| 216 | // at s+1. At least on GOARCH=amd64, these three hash calculations |
| 217 | // are faster as one load64 call (with some shifts) instead of |
| 218 | // three load32 calls. |
| 219 | x := load64(src, s-1) |
| 220 | prevHash := hash(uint32(x>>0), shift) |
| 221 | table[prevHash&tableMask] = uint16(s - 1) |
| 222 | currHash := hash(uint32(x>>8), shift) |
| 223 | candidate = int(table[currHash&tableMask]) |
| 224 | table[currHash&tableMask] = uint16(s) |
| 225 | if uint32(x>>8) != load32(src, candidate) { |
| 226 | nextHash = hash(uint32(x>>16), shift) |
| 227 | s++ |
| 228 | break |
| 229 | } |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | emitRemainder: |
| 234 | if nextEmit < len(src) { |
| 235 | d += emitLiteral(dst[d:], src[nextEmit:]) |
| 236 | } |
| 237 | return d |
| 238 | } |