Dinesh Belwalkar | e63f7f9 | 2019-11-22 23:11:16 +0000 | [diff] [blame^] | 1 | // Copyright 2011 The Snappy-Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package snappy |
| 6 | |
| 7 | import ( |
| 8 | "encoding/binary" |
| 9 | "errors" |
| 10 | "io" |
| 11 | ) |
| 12 | |
| 13 | // Encode returns the encoded form of src. The returned slice may be a sub- |
| 14 | // slice of dst if dst was large enough to hold the entire encoded block. |
| 15 | // Otherwise, a newly allocated slice will be returned. |
| 16 | // |
| 17 | // The dst and src must not overlap. It is valid to pass a nil dst. |
| 18 | func Encode(dst, src []byte) []byte { |
| 19 | if n := MaxEncodedLen(len(src)); n < 0 { |
| 20 | panic(ErrTooLarge) |
| 21 | } else if len(dst) < n { |
| 22 | dst = make([]byte, n) |
| 23 | } |
| 24 | |
| 25 | // The block starts with the varint-encoded length of the decompressed bytes. |
| 26 | d := binary.PutUvarint(dst, uint64(len(src))) |
| 27 | |
| 28 | for len(src) > 0 { |
| 29 | p := src |
| 30 | src = nil |
| 31 | if len(p) > maxBlockSize { |
| 32 | p, src = p[:maxBlockSize], p[maxBlockSize:] |
| 33 | } |
| 34 | if len(p) < minNonLiteralBlockSize { |
| 35 | d += emitLiteral(dst[d:], p) |
| 36 | } else { |
| 37 | d += encodeBlock(dst[d:], p) |
| 38 | } |
| 39 | } |
| 40 | return dst[:d] |
| 41 | } |
| 42 | |
| 43 | // inputMargin is the minimum number of extra input bytes to keep, inside |
| 44 | // encodeBlock's inner loop. On some architectures, this margin lets us |
| 45 | // implement a fast path for emitLiteral, where the copy of short (<= 16 byte) |
| 46 | // literals can be implemented as a single load to and store from a 16-byte |
| 47 | // register. That literal's actual length can be as short as 1 byte, so this |
| 48 | // can copy up to 15 bytes too much, but that's OK as subsequent iterations of |
| 49 | // the encoding loop will fix up the copy overrun, and this inputMargin ensures |
| 50 | // that we don't overrun the dst and src buffers. |
| 51 | const inputMargin = 16 - 1 |
| 52 | |
| 53 | // minNonLiteralBlockSize is the minimum size of the input to encodeBlock that |
| 54 | // could be encoded with a copy tag. This is the minimum with respect to the |
| 55 | // algorithm used by encodeBlock, not a minimum enforced by the file format. |
| 56 | // |
| 57 | // The encoded output must start with at least a 1 byte literal, as there are |
| 58 | // no previous bytes to copy. A minimal (1 byte) copy after that, generated |
| 59 | // from an emitCopy call in encodeBlock's main loop, would require at least |
| 60 | // another inputMargin bytes, for the reason above: we want any emitLiteral |
| 61 | // calls inside encodeBlock's main loop to use the fast path if possible, which |
| 62 | // requires being able to overrun by inputMargin bytes. Thus, |
| 63 | // minNonLiteralBlockSize equals 1 + 1 + inputMargin. |
| 64 | // |
| 65 | // The C++ code doesn't use this exact threshold, but it could, as discussed at |
| 66 | // https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion |
| 67 | // The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an |
| 68 | // optimization. It should not affect the encoded form. This is tested by |
| 69 | // TestSameEncodingAsCppShortCopies. |
| 70 | const minNonLiteralBlockSize = 1 + 1 + inputMargin |
| 71 | |
| 72 | // MaxEncodedLen returns the maximum length of a snappy block, given its |
| 73 | // uncompressed length. |
| 74 | // |
| 75 | // It will return a negative value if srcLen is too large to encode. |
| 76 | func MaxEncodedLen(srcLen int) int { |
| 77 | n := uint64(srcLen) |
| 78 | if n > 0xffffffff { |
| 79 | return -1 |
| 80 | } |
| 81 | // Compressed data can be defined as: |
| 82 | // compressed := item* literal* |
| 83 | // item := literal* copy |
| 84 | // |
| 85 | // The trailing literal sequence has a space blowup of at most 62/60 |
| 86 | // since a literal of length 60 needs one tag byte + one extra byte |
| 87 | // for length information. |
| 88 | // |
| 89 | // Item blowup is trickier to measure. Suppose the "copy" op copies |
| 90 | // 4 bytes of data. Because of a special check in the encoding code, |
| 91 | // we produce a 4-byte copy only if the offset is < 65536. Therefore |
| 92 | // the copy op takes 3 bytes to encode, and this type of item leads |
| 93 | // to at most the 62/60 blowup for representing literals. |
| 94 | // |
| 95 | // Suppose the "copy" op copies 5 bytes of data. If the offset is big |
| 96 | // enough, it will take 5 bytes to encode the copy op. Therefore the |
| 97 | // worst case here is a one-byte literal followed by a five-byte copy. |
| 98 | // That is, 6 bytes of input turn into 7 bytes of "compressed" data. |
| 99 | // |
| 100 | // This last factor dominates the blowup, so the final estimate is: |
| 101 | n = 32 + n + n/6 |
| 102 | if n > 0xffffffff { |
| 103 | return -1 |
| 104 | } |
| 105 | return int(n) |
| 106 | } |
| 107 | |
| 108 | var errClosed = errors.New("snappy: Writer is closed") |
| 109 | |
| 110 | // NewWriter returns a new Writer that compresses to w. |
| 111 | // |
| 112 | // The Writer returned does not buffer writes. There is no need to Flush or |
| 113 | // Close such a Writer. |
| 114 | // |
| 115 | // Deprecated: the Writer returned is not suitable for many small writes, only |
| 116 | // for few large writes. Use NewBufferedWriter instead, which is efficient |
| 117 | // regardless of the frequency and shape of the writes, and remember to Close |
| 118 | // that Writer when done. |
| 119 | func NewWriter(w io.Writer) *Writer { |
| 120 | return &Writer{ |
| 121 | w: w, |
| 122 | obuf: make([]byte, obufLen), |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | // NewBufferedWriter returns a new Writer that compresses to w, using the |
| 127 | // framing format described at |
| 128 | // https://github.com/google/snappy/blob/master/framing_format.txt |
| 129 | // |
| 130 | // The Writer returned buffers writes. Users must call Close to guarantee all |
| 131 | // data has been forwarded to the underlying io.Writer. They may also call |
| 132 | // Flush zero or more times before calling Close. |
| 133 | func NewBufferedWriter(w io.Writer) *Writer { |
| 134 | return &Writer{ |
| 135 | w: w, |
| 136 | ibuf: make([]byte, 0, maxBlockSize), |
| 137 | obuf: make([]byte, obufLen), |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | // Writer is an io.Writer that can write Snappy-compressed bytes. |
| 142 | type Writer struct { |
| 143 | w io.Writer |
| 144 | err error |
| 145 | |
| 146 | // ibuf is a buffer for the incoming (uncompressed) bytes. |
| 147 | // |
| 148 | // Its use is optional. For backwards compatibility, Writers created by the |
| 149 | // NewWriter function have ibuf == nil, do not buffer incoming bytes, and |
| 150 | // therefore do not need to be Flush'ed or Close'd. |
| 151 | ibuf []byte |
| 152 | |
| 153 | // obuf is a buffer for the outgoing (compressed) bytes. |
| 154 | obuf []byte |
| 155 | |
| 156 | // wroteStreamHeader is whether we have written the stream header. |
| 157 | wroteStreamHeader bool |
| 158 | } |
| 159 | |
| 160 | // Reset discards the writer's state and switches the Snappy writer to write to |
| 161 | // w. This permits reusing a Writer rather than allocating a new one. |
| 162 | func (w *Writer) Reset(writer io.Writer) { |
| 163 | w.w = writer |
| 164 | w.err = nil |
| 165 | if w.ibuf != nil { |
| 166 | w.ibuf = w.ibuf[:0] |
| 167 | } |
| 168 | w.wroteStreamHeader = false |
| 169 | } |
| 170 | |
| 171 | // Write satisfies the io.Writer interface. |
| 172 | func (w *Writer) Write(p []byte) (nRet int, errRet error) { |
| 173 | if w.ibuf == nil { |
| 174 | // Do not buffer incoming bytes. This does not perform or compress well |
| 175 | // if the caller of Writer.Write writes many small slices. This |
| 176 | // behavior is therefore deprecated, but still supported for backwards |
| 177 | // compatibility with code that doesn't explicitly Flush or Close. |
| 178 | return w.write(p) |
| 179 | } |
| 180 | |
| 181 | // The remainder of this method is based on bufio.Writer.Write from the |
| 182 | // standard library. |
| 183 | |
| 184 | for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil { |
| 185 | var n int |
| 186 | if len(w.ibuf) == 0 { |
| 187 | // Large write, empty buffer. |
| 188 | // Write directly from p to avoid copy. |
| 189 | n, _ = w.write(p) |
| 190 | } else { |
| 191 | n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) |
| 192 | w.ibuf = w.ibuf[:len(w.ibuf)+n] |
| 193 | w.Flush() |
| 194 | } |
| 195 | nRet += n |
| 196 | p = p[n:] |
| 197 | } |
| 198 | if w.err != nil { |
| 199 | return nRet, w.err |
| 200 | } |
| 201 | n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) |
| 202 | w.ibuf = w.ibuf[:len(w.ibuf)+n] |
| 203 | nRet += n |
| 204 | return nRet, nil |
| 205 | } |
| 206 | |
| 207 | func (w *Writer) write(p []byte) (nRet int, errRet error) { |
| 208 | if w.err != nil { |
| 209 | return 0, w.err |
| 210 | } |
| 211 | for len(p) > 0 { |
| 212 | obufStart := len(magicChunk) |
| 213 | if !w.wroteStreamHeader { |
| 214 | w.wroteStreamHeader = true |
| 215 | copy(w.obuf, magicChunk) |
| 216 | obufStart = 0 |
| 217 | } |
| 218 | |
| 219 | var uncompressed []byte |
| 220 | if len(p) > maxBlockSize { |
| 221 | uncompressed, p = p[:maxBlockSize], p[maxBlockSize:] |
| 222 | } else { |
| 223 | uncompressed, p = p, nil |
| 224 | } |
| 225 | checksum := crc(uncompressed) |
| 226 | |
| 227 | // Compress the buffer, discarding the result if the improvement |
| 228 | // isn't at least 12.5%. |
| 229 | compressed := Encode(w.obuf[obufHeaderLen:], uncompressed) |
| 230 | chunkType := uint8(chunkTypeCompressedData) |
| 231 | chunkLen := 4 + len(compressed) |
| 232 | obufEnd := obufHeaderLen + len(compressed) |
| 233 | if len(compressed) >= len(uncompressed)-len(uncompressed)/8 { |
| 234 | chunkType = chunkTypeUncompressedData |
| 235 | chunkLen = 4 + len(uncompressed) |
| 236 | obufEnd = obufHeaderLen |
| 237 | } |
| 238 | |
| 239 | // Fill in the per-chunk header that comes before the body. |
| 240 | w.obuf[len(magicChunk)+0] = chunkType |
| 241 | w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0) |
| 242 | w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8) |
| 243 | w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16) |
| 244 | w.obuf[len(magicChunk)+4] = uint8(checksum >> 0) |
| 245 | w.obuf[len(magicChunk)+5] = uint8(checksum >> 8) |
| 246 | w.obuf[len(magicChunk)+6] = uint8(checksum >> 16) |
| 247 | w.obuf[len(magicChunk)+7] = uint8(checksum >> 24) |
| 248 | |
| 249 | if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil { |
| 250 | w.err = err |
| 251 | return nRet, err |
| 252 | } |
| 253 | if chunkType == chunkTypeUncompressedData { |
| 254 | if _, err := w.w.Write(uncompressed); err != nil { |
| 255 | w.err = err |
| 256 | return nRet, err |
| 257 | } |
| 258 | } |
| 259 | nRet += len(uncompressed) |
| 260 | } |
| 261 | return nRet, nil |
| 262 | } |
| 263 | |
| 264 | // Flush flushes the Writer to its underlying io.Writer. |
| 265 | func (w *Writer) Flush() error { |
| 266 | if w.err != nil { |
| 267 | return w.err |
| 268 | } |
| 269 | if len(w.ibuf) == 0 { |
| 270 | return nil |
| 271 | } |
| 272 | w.write(w.ibuf) |
| 273 | w.ibuf = w.ibuf[:0] |
| 274 | return w.err |
| 275 | } |
| 276 | |
| 277 | // Close calls Flush and then closes the Writer. |
| 278 | func (w *Writer) Close() error { |
| 279 | w.Flush() |
| 280 | ret := w.err |
| 281 | if w.err == nil { |
| 282 | w.err = errClosed |
| 283 | } |
| 284 | return ret |
| 285 | } |