| //go:build amd64 && !appengine && !noasm && gc |
| // +build amd64,!appengine,!noasm,gc |
| |
| // This file contains the specialisation of Decoder.Decompress4X |
| // and Decoder.Decompress1X that use an asm implementation of their main loops. |
| package huff0 |
| |
| import ( |
| "errors" |
| "fmt" |
| |
| "github.com/klauspost/compress/internal/cpuinfo" |
| ) |
| |
| // decompress4x_main_loop_amd64 is an x86 assembler implementation |
| // of the Decompress4X main loop, used when tablelog > 8. |
| //go:noescape |
| func decompress4x_main_loop_amd64(ctx *decompress4xContext) |
| |
| // decompress4x_8b_main_loop_amd64 is an x86 assembler implementation |
| // of the Decompress4X main loop, used when tablelog <= 8; it decodes |
| // 4 entries per loop. |
| //go:noescape |
| func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) |
| |
| // fallback8BitSize is the dst capacity below which using the Go version is faster for 8-bit tables. |
| const fallback8BitSize = 800 |
| |
| // decompress4xContext carries the arguments and the result of the 4X assembler main loops. |
| // NOTE(review): the field order is presumably relied upon by the assembly; confirm before reordering. |
| type decompress4xContext struct { |
| // pbr points at the four bit readers, one per input stream. |
| pbr *[4]bitReaderShifted |
| // peekBits is set by Decompress4X to (64 - actualTableLog) & 63; see bitReaderShifted.peekBitsFast(). |
| peekBits uint8 |
| // out points at the first byte of the destination buffer. |
| out *byte |
| // dstEvery is the distance in bytes between the output of consecutive streams, (dstSize + 3) / 4. |
| dstEvery int |
| // tbl points at the first entry of the single-symbol decoding table. |
| tbl *dEntrySingle |
| // decoded is read back by Decompress4X after the loop returns; decoded/4 is treated as the per-stream progress. |
| decoded int |
| // limit points at out[dstEvery-4]; decoding stops when the first stream's output gets here to avoid writing OOB on the last. |
| limit *byte |
| } |
| |
| // Decompress4X will decompress a 4X encoded stream. |
| // The length of the supplied input must match the end of a block exactly. |
| // The *capacity* of the dst slice must match the destination size of |
| // the uncompressed data exactly. |
| func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { |
| if len(d.dt.single) == 0 { |
| return nil, errors.New("no table loaded") |
| } |
| if len(src) < 6+(4*1) { |
| return nil, errors.New("input too small") |
| } |
| |
| use8BitTables := d.actualTableLog <= 8 |
| if cap(dst) < fallback8BitSize && use8BitTables { |
| return d.decompress4X8bit(dst, src) |
| } |
| |
| var br [4]bitReaderShifted |
| // Decode "jump table" |
| start := 6 |
| for i := 0; i < 3; i++ { |
| length := int(src[i*2]) | (int(src[i*2+1]) << 8) |
| if start+length >= len(src) { |
| return nil, errors.New("truncated input (or invalid offset)") |
| } |
| err := br[i].init(src[start : start+length]) |
| if err != nil { |
| return nil, err |
| } |
| start += length |
| } |
| err := br[3].init(src[start:]) |
| if err != nil { |
| return nil, err |
| } |
| |
| // destination, offset to match first output |
| dstSize := cap(dst) |
| dst = dst[:dstSize] |
| out := dst |
| dstEvery := (dstSize + 3) / 4 |
| |
| const tlSize = 1 << tableLogMax |
| const tlMask = tlSize - 1 |
| single := d.dt.single[:tlSize] |
| |
| var decoded int |
| |
| if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { |
| ctx := decompress4xContext{ |
| pbr: &br, |
| peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() |
| out: &out[0], |
| dstEvery: dstEvery, |
| tbl: &single[0], |
| limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last. |
| } |
| if use8BitTables { |
| decompress4x_8b_main_loop_amd64(&ctx) |
| } else { |
| decompress4x_main_loop_amd64(&ctx) |
| } |
| |
| decoded = ctx.decoded |
| out = out[decoded/4:] |
| } |
| |
| // Decode remaining. |
| remainBytes := dstEvery - (decoded / 4) |
| for i := range br { |
| offset := dstEvery * i |
| endsAt := offset + remainBytes |
| if endsAt > len(out) { |
| endsAt = len(out) |
| } |
| br := &br[i] |
| bitsLeft := br.remaining() |
| for bitsLeft > 0 { |
| br.fill() |
| if offset >= endsAt { |
| return nil, errors.New("corruption detected: stream overrun 4") |
| } |
| |
| // Read value and increment offset. |
| val := br.peekBitsFast(d.actualTableLog) |
| v := single[val&tlMask].entry |
| nBits := uint8(v) |
| br.advance(nBits) |
| bitsLeft -= uint(nBits) |
| out[offset] = uint8(v >> 8) |
| offset++ |
| } |
| if offset != endsAt { |
| return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) |
| } |
| decoded += offset - dstEvery*i |
| err = br.close() |
| if err != nil { |
| return nil, err |
| } |
| } |
| if dstSize != decoded { |
| return nil, errors.New("corruption detected: short output block") |
| } |
| return dst, nil |
| } |
| |
| // decompress1x_main_loop_amd64 is an x86 assembler implementation |
| // of the Decompress1X main loop, used when cpuinfo.HasBMI2() reports false. |
| //go:noescape |
| func decompress1x_main_loop_amd64(ctx *decompress1xContext) |
| |
| // decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation |
| // of the Decompress1X main loop, used when cpuinfo.HasBMI2() reports true. |
| //go:noescape |
| func decompress1x_main_loop_bmi2(ctx *decompress1xContext) |
| |
| // decompress1xContext carries the arguments and the result of the 1X assembler main loops. |
| // NOTE(review): the field order is presumably relied upon by the assembly; confirm before reordering. |
| type decompress1xContext struct { |
| // pbr points at the bit reader of the single input stream. |
| pbr *bitReaderShifted |
| // peekBits is set by Decompress1X to (64 - actualTableLog) & 63; see bitReaderShifted.peekBitsFast(). |
| peekBits uint8 |
| // out points at the first byte of the destination buffer. |
| out *byte |
| // outCap is the capacity of the destination buffer, i.e. the maximum decoded size. |
| outCap int |
| // tbl points at the first entry of the single-symbol decoding table. |
| tbl *dEntrySingle |
| // decoded is read back by Decompress1X: the new length of dst, or error_max_decoded_size_exeeded. |
| decoded int |
| } |
| |
| // error_max_decoded_size_exeeded is reported by the asm implementations through decompress1xContext.decoded; Decompress1X maps it to ErrMaxDecodedSizeExceeded. |
| const error_max_decoded_size_exeeded = -1 |
| |
| // Decompress1X will decompress a 1X encoded stream. |
| // The cap of the output buffer will be the maximum decompressed size. |
| // The length of the supplied input must match the end of a block exactly. |
| func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { |
| if len(d.dt.single) == 0 { |
| return nil, errors.New("no table loaded") |
| } |
| var br bitReaderShifted |
| err := br.init(src) |
| if err != nil { |
| return dst, err |
| } |
| maxDecodedSize := cap(dst) |
| dst = dst[:maxDecodedSize] |
| |
| const tlSize = 1 << tableLogMax |
| const tlMask = tlSize - 1 |
| |
| if maxDecodedSize >= 4 { |
| ctx := decompress1xContext{ |
| pbr: &br, |
| out: &dst[0], |
| outCap: maxDecodedSize, |
| peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() |
| tbl: &d.dt.single[0], |
| } |
| |
| if cpuinfo.HasBMI2() { |
| decompress1x_main_loop_bmi2(&ctx) |
| } else { |
| decompress1x_main_loop_amd64(&ctx) |
| } |
| if ctx.decoded == error_max_decoded_size_exeeded { |
| return nil, ErrMaxDecodedSizeExceeded |
| } |
| |
| dst = dst[:ctx.decoded] |
| } |
| |
| // br < 8, so uint8 is fine |
| bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead |
| for bitsLeft > 0 { |
| br.fill() |
| if len(dst) >= maxDecodedSize { |
| br.close() |
| return nil, ErrMaxDecodedSizeExceeded |
| } |
| v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] |
| nBits := uint8(v.entry) |
| br.advance(nBits) |
| bitsLeft -= nBits |
| dst = append(dst, uint8(v.entry>>8)) |
| } |
| return dst, br.close() |
| } |