// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
// Package snappy implements the Snappy compression format. It aims for very
// high speeds and reasonable compression.
//
// There are actually two Snappy formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as a Snappy stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// The canonical, C++ implementation is at https://github.com/google/snappy and
// it only implements the block format.
| 20 | package snappy // import "github.com/golang/snappy" |
| 21 | |
| 22 | import ( |
| 23 | "hash/crc32" |
| 24 | ) |
| 25 | |
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.

For literal tags:
  - If m < 60, the next 1 + m bytes are literal bytes.
  - Otherwise, let n be the little-endian unsigned integer denoted by the next
    m - 59 bytes. The next 1 + n bytes after that are literal bytes.

For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
    of the offset. The next byte is bits 0-7 of the offset.
  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
    The length is 1 + m. The offset is the little-endian unsigned integer
    denoted by the next 2 bytes.
  - For l == 3, this tag is a legacy format that is no longer issued by most
    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
    integer denoted by the next 4 bytes.
*/
const (
	tagLiteral = 0x00 // l == 0: literal bytes follow
	tagCopy1   = 0x01 // l == 1: copy, offset in [0, 1<<11)
	tagCopy2   = 0x02 // l == 2: copy, offset in [0, 1<<16)
	tagCopy4   = 0x03 // l == 3: copy, offset in [0, 1<<32) (legacy)
)
| 57 | |
const (
	// checksumSize is the size in bytes of the uint32 checksum computed by
	// crc.
	checksumSize = 4

	// chunkHeaderSize is the size in bytes of a stream-format chunk header.
	// Per the framing format this is a 1-byte chunk type followed by a 3-byte
	// little-endian body length, as can be seen in magicChunk below.
	chunkHeaderSize = 4

	// magicChunk is the complete stream identifier chunk: a chunk header with
	// type 0xff and body length 6, followed by magicBody.
	magicChunk = "\xff\x06\x00\x00" + magicBody

	// magicBody is the body of the stream identifier chunk.
	magicBody = "sNaPpY"

	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
	// part of the wire format per se, but some parts of the encoder assume
	// that an offset fits into a uint16.
	//
	// Also, for the framing format (Writer type instead of Encode function),
	// https://github.com/google/snappy/blob/master/framing_format.txt says
	// that "the uncompressed data in a chunk must be no longer than 65536
	// bytes".
	maxBlockSize = 65536

	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
	// hard coded to be a const instead of a variable, so that obufLen can also
	// be a const. Their equivalence is confirmed by
	// TestMaxEncodedLenOfMaxBlockSize.
	maxEncodedLenOfMaxBlockSize = 76490

	// obufHeaderLen is the combined length of the stream identifier chunk,
	// one checksum and one chunk header.
	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize

	// obufLen is obufHeaderLen plus the worst-case encoded size of one
	// maximum-size block.
	// NOTE(review): presumably this sizes the Writer's output buffer; confirm
	// against the encoder.
	obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize
)
| 83 | |
// Chunk type identifiers: the first byte of a stream-format chunk header.
const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff // matches the first byte of magicChunk
)
| 90 | |
// crcTable is the CRC-32 lookup table for the Castagnoli polynomial, built
// once at package initialization and used by crc.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// It computes the CRC-32 (Castagnoli polynomial) of b and returns it in
// masked form.
func crc(b []byte) uint32 {
	c := crc32.Update(0, crcTable, b)
	// Mask the raw CRC: rotate right by 15 bits, then add a constant. The
	// uint32 addition wraps around on overflow.
	return uint32(c>>15|c<<17) + 0xa282ead8
}