| // Copyright 2019+ Klaus Post. All rights reserved. |
| // License information can be found in the LICENSE file. |
| // Based on work by Yann Collet, released under BSD License. |
| |
| package zstd |
| |
| import ( |
| "crypto/rand" |
| "fmt" |
| "io" |
| rdebug "runtime/debug" |
| "sync" |
| |
| "github.com/klauspost/compress/zstd/internal/xxhash" |
| ) |
| |
| // Encoder provides encoding to Zstandard. |
| // An Encoder can be used for either compressing a stream via the |
| // io.WriteCloser interface supported by the Encoder or as multiple independent |
| // tasks via the EncodeAll function. |
| // Smaller encodes are encouraged to use the EncodeAll function. |
| // Use NewWriter to create a new instance. |
| type Encoder struct { |
| o encoderOptions |
| encoders chan encoder |
| state encoderState |
| init sync.Once |
| } |
| |
| type encoder interface { |
| Encode(blk *blockEnc, src []byte) |
| EncodeNoHist(blk *blockEnc, src []byte) |
| Block() *blockEnc |
| CRC() *xxhash.Digest |
| AppendCRC([]byte) []byte |
| WindowSize(size int) int32 |
| UseBlock(*blockEnc) |
| Reset(d *dict, singleBlock bool) |
| } |
| |
| type encoderState struct { |
| w io.Writer |
| filling []byte |
| current []byte |
| previous []byte |
| encoder encoder |
| writing *blockEnc |
| err error |
| writeErr error |
| nWritten int64 |
| headerWritten bool |
| eofWritten bool |
| fullFrameWritten bool |
| |
| // This waitgroup indicates an encode is running. |
| wg sync.WaitGroup |
| // This waitgroup indicates we have a block encoding/writing. |
| wWg sync.WaitGroup |
| } |
| |
| // NewWriter will create a new Zstandard encoder. |
| // If the encoder will be used for encoding blocks a nil writer can be used. |
| func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) { |
| initPredefined() |
| var e Encoder |
| e.o.setDefault() |
| for _, o := range opts { |
| err := o(&e.o) |
| if err != nil { |
| return nil, err |
| } |
| } |
| if w != nil { |
| e.Reset(w) |
| } |
| return &e, nil |
| } |
| |
| func (e *Encoder) initialize() { |
| if e.o.concurrent == 0 { |
| e.o.setDefault() |
| } |
| e.encoders = make(chan encoder, e.o.concurrent) |
| for i := 0; i < e.o.concurrent; i++ { |
| enc := e.o.encoder() |
| e.encoders <- enc |
| } |
| } |
| |
| // Reset will re-initialize the writer and new writes will encode to the supplied writer |
| // as a new, independent stream. |
| func (e *Encoder) Reset(w io.Writer) { |
| s := &e.state |
| s.wg.Wait() |
| s.wWg.Wait() |
| if cap(s.filling) == 0 { |
| s.filling = make([]byte, 0, e.o.blockSize) |
| } |
| if cap(s.current) == 0 { |
| s.current = make([]byte, 0, e.o.blockSize) |
| } |
| if cap(s.previous) == 0 { |
| s.previous = make([]byte, 0, e.o.blockSize) |
| } |
| if s.encoder == nil { |
| s.encoder = e.o.encoder() |
| } |
| if s.writing == nil { |
| s.writing = &blockEnc{lowMem: e.o.lowMem} |
| s.writing.init() |
| } |
| s.writing.initNewEncode() |
| s.filling = s.filling[:0] |
| s.current = s.current[:0] |
| s.previous = s.previous[:0] |
| s.encoder.Reset(e.o.dict, false) |
| s.headerWritten = false |
| s.eofWritten = false |
| s.fullFrameWritten = false |
| s.w = w |
| s.err = nil |
| s.nWritten = 0 |
| s.writeErr = nil |
| } |
| |
| // Write data to the encoder. |
| // Input data will be buffered and as the buffer fills up |
| // content will be compressed and written to the output. |
| // When done writing, use Close to flush the remaining output |
| // and write CRC if requested. |
| func (e *Encoder) Write(p []byte) (n int, err error) { |
| s := &e.state |
| for len(p) > 0 { |
| if len(p)+len(s.filling) < e.o.blockSize { |
| if e.o.crc { |
| _, _ = s.encoder.CRC().Write(p) |
| } |
| s.filling = append(s.filling, p...) |
| return n + len(p), nil |
| } |
| add := p |
| if len(p)+len(s.filling) > e.o.blockSize { |
| add = add[:e.o.blockSize-len(s.filling)] |
| } |
| if e.o.crc { |
| _, _ = s.encoder.CRC().Write(add) |
| } |
| s.filling = append(s.filling, add...) |
| p = p[len(add):] |
| n += len(add) |
| if len(s.filling) < e.o.blockSize { |
| return n, nil |
| } |
| err := e.nextBlock(false) |
| if err != nil { |
| return n, err |
| } |
| if debugAsserts && len(s.filling) > 0 { |
| panic(len(s.filling)) |
| } |
| } |
| return n, nil |
| } |
| |
| // nextBlock will synchronize and start compressing input in e.state.filling. |
| // If an error has occurred during encoding it will be returned. |
| func (e *Encoder) nextBlock(final bool) error { |
| s := &e.state |
| // Wait for current block. |
| s.wg.Wait() |
| if s.err != nil { |
| return s.err |
| } |
| if len(s.filling) > e.o.blockSize { |
| return fmt.Errorf("block > maxStoreBlockSize") |
| } |
| if !s.headerWritten { |
| // If we have a single block encode, do a sync compression. |
| if final && len(s.filling) == 0 && !e.o.fullZero { |
| s.headerWritten = true |
| s.fullFrameWritten = true |
| s.eofWritten = true |
| return nil |
| } |
| if final && len(s.filling) > 0 { |
| s.current = e.EncodeAll(s.filling, s.current[:0]) |
| var n2 int |
| n2, s.err = s.w.Write(s.current) |
| if s.err != nil { |
| return s.err |
| } |
| s.nWritten += int64(n2) |
| s.current = s.current[:0] |
| s.filling = s.filling[:0] |
| s.headerWritten = true |
| s.fullFrameWritten = true |
| s.eofWritten = true |
| return nil |
| } |
| |
| var tmp [maxHeaderSize]byte |
| fh := frameHeader{ |
| ContentSize: 0, |
| WindowSize: uint32(s.encoder.WindowSize(0)), |
| SingleSegment: false, |
| Checksum: e.o.crc, |
| DictID: e.o.dict.ID(), |
| } |
| |
| dst, err := fh.appendTo(tmp[:0]) |
| if err != nil { |
| return err |
| } |
| s.headerWritten = true |
| s.wWg.Wait() |
| var n2 int |
| n2, s.err = s.w.Write(dst) |
| if s.err != nil { |
| return s.err |
| } |
| s.nWritten += int64(n2) |
| } |
| if s.eofWritten { |
| // Ensure we only write it once. |
| final = false |
| } |
| |
| if len(s.filling) == 0 { |
| // Final block, but no data. |
| if final { |
| enc := s.encoder |
| blk := enc.Block() |
| blk.reset(nil) |
| blk.last = true |
| blk.encodeRaw(nil) |
| s.wWg.Wait() |
| _, s.err = s.w.Write(blk.output) |
| s.nWritten += int64(len(blk.output)) |
| s.eofWritten = true |
| } |
| return s.err |
| } |
| |
| // Move blocks forward. |
| s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current |
| s.wg.Add(1) |
| go func(src []byte) { |
| if debug { |
| println("Adding block,", len(src), "bytes, final:", final) |
| } |
| defer func() { |
| if r := recover(); r != nil { |
| s.err = fmt.Errorf("panic while encoding: %v", r) |
| rdebug.PrintStack() |
| } |
| s.wg.Done() |
| }() |
| enc := s.encoder |
| blk := enc.Block() |
| enc.Encode(blk, src) |
| blk.last = final |
| if final { |
| s.eofWritten = true |
| } |
| // Wait for pending writes. |
| s.wWg.Wait() |
| if s.writeErr != nil { |
| s.err = s.writeErr |
| return |
| } |
| // Transfer encoders from previous write block. |
| blk.swapEncoders(s.writing) |
| // Transfer recent offsets to next. |
| enc.UseBlock(s.writing) |
| s.writing = blk |
| s.wWg.Add(1) |
| go func() { |
| defer func() { |
| if r := recover(); r != nil { |
| s.writeErr = fmt.Errorf("panic while encoding/writing: %v", r) |
| rdebug.PrintStack() |
| } |
| s.wWg.Done() |
| }() |
| err := errIncompressible |
| // If we got the exact same number of literals as input, |
| // assume the literals cannot be compressed. |
| if len(src) != len(blk.literals) || len(src) != e.o.blockSize { |
| err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) |
| } |
| switch err { |
| case errIncompressible: |
| if debug { |
| println("Storing incompressible block as raw") |
| } |
| blk.encodeRaw(src) |
| // In fast mode, we do not transfer offsets, so we don't have to deal with changing the. |
| case nil: |
| default: |
| s.writeErr = err |
| return |
| } |
| _, s.writeErr = s.w.Write(blk.output) |
| s.nWritten += int64(len(blk.output)) |
| }() |
| }(s.current) |
| return nil |
| } |
| |
| // ReadFrom reads data from r until EOF or error. |
| // The return value n is the number of bytes read. |
| // Any error except io.EOF encountered during the read is also returned. |
| // |
| // The Copy function uses ReaderFrom if available. |
| func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) { |
| if debug { |
| println("Using ReadFrom") |
| } |
| |
| // Flush any current writes. |
| if len(e.state.filling) > 0 { |
| if err := e.nextBlock(false); err != nil { |
| return 0, err |
| } |
| } |
| e.state.filling = e.state.filling[:e.o.blockSize] |
| src := e.state.filling |
| for { |
| n2, err := r.Read(src) |
| if e.o.crc { |
| _, _ = e.state.encoder.CRC().Write(src[:n2]) |
| } |
| // src is now the unfilled part... |
| src = src[n2:] |
| n += int64(n2) |
| switch err { |
| case io.EOF: |
| e.state.filling = e.state.filling[:len(e.state.filling)-len(src)] |
| if debug { |
| println("ReadFrom: got EOF final block:", len(e.state.filling)) |
| } |
| return n, nil |
| case nil: |
| default: |
| if debug { |
| println("ReadFrom: got error:", err) |
| } |
| e.state.err = err |
| return n, err |
| } |
| if len(src) > 0 { |
| if debug { |
| println("ReadFrom: got space left in source:", len(src)) |
| } |
| continue |
| } |
| err = e.nextBlock(false) |
| if err != nil { |
| return n, err |
| } |
| e.state.filling = e.state.filling[:e.o.blockSize] |
| src = e.state.filling |
| } |
| } |
| |
| // Flush will send the currently written data to output |
| // and block until everything has been written. |
| // This should only be used on rare occasions where pushing the currently queued data is critical. |
| func (e *Encoder) Flush() error { |
| s := &e.state |
| if len(s.filling) > 0 { |
| err := e.nextBlock(false) |
| if err != nil { |
| return err |
| } |
| } |
| s.wg.Wait() |
| s.wWg.Wait() |
| if s.err != nil { |
| return s.err |
| } |
| return s.writeErr |
| } |
| |
| // Close will flush the final output and close the stream. |
| // The function will block until everything has been written. |
| // The Encoder can still be re-used after calling this. |
| func (e *Encoder) Close() error { |
| s := &e.state |
| if s.encoder == nil { |
| return nil |
| } |
| err := e.nextBlock(true) |
| if err != nil { |
| return err |
| } |
| if e.state.fullFrameWritten { |
| return s.err |
| } |
| s.wg.Wait() |
| s.wWg.Wait() |
| |
| if s.err != nil { |
| return s.err |
| } |
| if s.writeErr != nil { |
| return s.writeErr |
| } |
| |
| // Write CRC |
| if e.o.crc && s.err == nil { |
| // heap alloc. |
| var tmp [4]byte |
| _, s.err = s.w.Write(s.encoder.AppendCRC(tmp[:0])) |
| s.nWritten += 4 |
| } |
| |
| // Add padding with content from crypto/rand.Reader |
| if s.err == nil && e.o.pad > 0 { |
| add := calcSkippableFrame(s.nWritten, int64(e.o.pad)) |
| frame, err := skippableFrame(s.filling[:0], add, rand.Reader) |
| if err != nil { |
| return err |
| } |
| _, s.err = s.w.Write(frame) |
| } |
| return s.err |
| } |
| |
| // EncodeAll will encode all input in src and append it to dst. |
| // This function can be called concurrently, but each call will only run on a single goroutine. |
| // If empty input is given, nothing is returned, unless WithZeroFrames is specified. |
| // Encoded blocks can be concatenated and the result will be the combined input stream. |
| // Data compressed with EncodeAll can be decoded with the Decoder, |
| // using either a stream or DecodeAll. |
| func (e *Encoder) EncodeAll(src, dst []byte) []byte { |
| if len(src) == 0 { |
| if e.o.fullZero { |
| // Add frame header. |
| fh := frameHeader{ |
| ContentSize: 0, |
| WindowSize: MinWindowSize, |
| SingleSegment: true, |
| // Adding a checksum would be a waste of space. |
| Checksum: false, |
| DictID: 0, |
| } |
| dst, _ = fh.appendTo(dst) |
| |
| // Write raw block as last one only. |
| var blk blockHeader |
| blk.setSize(0) |
| blk.setType(blockTypeRaw) |
| blk.setLast(true) |
| dst = blk.appendTo(dst) |
| } |
| return dst |
| } |
| e.init.Do(e.initialize) |
| enc := <-e.encoders |
| defer func() { |
| // Release encoder reference to last block. |
| // If a non-single block is needed the encoder will reset again. |
| e.encoders <- enc |
| }() |
| // Use single segments when above minimum window and below 1MB. |
| single := len(src) < 1<<20 && len(src) > MinWindowSize |
| if e.o.single != nil { |
| single = *e.o.single |
| } |
| fh := frameHeader{ |
| ContentSize: uint64(len(src)), |
| WindowSize: uint32(enc.WindowSize(len(src))), |
| SingleSegment: single, |
| Checksum: e.o.crc, |
| DictID: e.o.dict.ID(), |
| } |
| |
| // If less than 1MB, allocate a buffer up front. |
| if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem { |
| dst = make([]byte, 0, len(src)) |
| } |
| dst, err := fh.appendTo(dst) |
| if err != nil { |
| panic(err) |
| } |
| |
| // If we can do everything in one block, prefer that. |
| if len(src) <= maxCompressedBlockSize { |
| enc.Reset(e.o.dict, true) |
| // Slightly faster with no history and everything in one block. |
| if e.o.crc { |
| _, _ = enc.CRC().Write(src) |
| } |
| blk := enc.Block() |
| blk.last = true |
| if e.o.dict == nil { |
| enc.EncodeNoHist(blk, src) |
| } else { |
| enc.Encode(blk, src) |
| } |
| |
| // If we got the exact same number of literals as input, |
| // assume the literals cannot be compressed. |
| err := errIncompressible |
| oldout := blk.output |
| if len(blk.literals) != len(src) || len(src) != e.o.blockSize { |
| // Output directly to dst |
| blk.output = dst |
| err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) |
| } |
| |
| switch err { |
| case errIncompressible: |
| if debug { |
| println("Storing incompressible block as raw") |
| } |
| dst = blk.encodeRawTo(dst, src) |
| case nil: |
| dst = blk.output |
| default: |
| panic(err) |
| } |
| blk.output = oldout |
| } else { |
| enc.Reset(e.o.dict, false) |
| blk := enc.Block() |
| for len(src) > 0 { |
| todo := src |
| if len(todo) > e.o.blockSize { |
| todo = todo[:e.o.blockSize] |
| } |
| src = src[len(todo):] |
| if e.o.crc { |
| _, _ = enc.CRC().Write(todo) |
| } |
| blk.pushOffsets() |
| enc.Encode(blk, todo) |
| if len(src) == 0 { |
| blk.last = true |
| } |
| err := errIncompressible |
| // If we got the exact same number of literals as input, |
| // assume the literals cannot be compressed. |
| if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize { |
| err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy) |
| } |
| |
| switch err { |
| case errIncompressible: |
| if debug { |
| println("Storing incompressible block as raw") |
| } |
| dst = blk.encodeRawTo(dst, todo) |
| blk.popOffsets() |
| case nil: |
| dst = append(dst, blk.output...) |
| default: |
| panic(err) |
| } |
| blk.reset(nil) |
| } |
| } |
| if e.o.crc { |
| dst = enc.AppendCRC(dst) |
| } |
| // Add padding with content from crypto/rand.Reader |
| if e.o.pad > 0 { |
| add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad)) |
| dst, err = skippableFrame(dst, add, rand.Reader) |
| if err != nil { |
| panic(err) |
| } |
| } |
| return dst |
| } |