| #include <stdio.h> /* fprintf */ |
| #include <stdlib.h> /* malloc, free, qsort */ |
| #include <string.h> /* memset */ |
| #include <time.h> /* clock */ |
| #include "mem.h" /* read */ |
| #include "pool.h" |
| #include "threading.h" |
| #include "zstd_internal.h" /* includes zstd.h */ |
| #ifndef ZDICT_STATIC_LINKING_ONLY |
| #define ZDICT_STATIC_LINKING_ONLY |
| #endif |
| #include "zdict.h" |
| |
| /** |
| * COVER_best_t is used for two purposes: |
| * 1. Synchronizing threads. |
| * 2. Saving the best parameters and dictionary. |
| * |
| * All of the methods except COVER_best_init() are thread safe if zstd is |
| * compiled with multithreaded support. |
| * |
| * Usage, as described by the COVER_best_*() declarations in this header: |
| * COVER_best_init() once, COVER_best_start() before each job is launched, |
| * COVER_best_finish() when each job ends (on error or success), and |
| * COVER_best_wait() / COVER_best_destroy() to block until liveJobs == 0. |
| */ |
| typedef struct COVER_best_s { |
| ZSTD_pthread_mutex_t mutex; /* NOTE(review): presumably guards the fields below - confirm in the implementation */ |
| ZSTD_pthread_cond_t cond; /* NOTE(review): presumably signalled when liveJobs reaches 0 - confirm */ |
| size_t liveJobs; /* Jobs started via COVER_best_start() and not yet finished */ |
| void *dict; /* Best dictionary saved so far */ |
| size_t dictSize; /* Size of dict (presumably in bytes - confirm) */ |
| ZDICT_cover_params_t parameters; /* Parameters saved together with the best dictionary */ |
| size_t compressedSize; /* NOTE(review): presumably the compressed size achieved by the best dictionary - confirm */ |
| } COVER_best_t; |
| |
| /** |
| * A segment is a range in the source as well as the score of the segment. |
| */ |
| typedef struct { |
| U32 begin; /* Start of the range (NOTE(review): units and inclusivity not stated here) */ |
| U32 end; /* End of the range (NOTE(review): inclusive vs. exclusive not stated here) */ |
| U32 score; /* Score of the segment */ |
| } COVER_segment_t; |
| |
| /** |
| * Number of epochs and size of each epoch. |
| * This is the type returned by COVER_computeEpochs(). |
| */ |
| typedef struct { |
| U32 num; /* Number of epochs */ |
| U32 size; /* Size of each epoch (NOTE(review): units not stated here - confirm) */ |
| } COVER_epoch_info_t; |
| |
| /** |
| * Struct used for the dictionary selection function. |
| * |
| * Values of this type are produced by COVER_selectDict() and |
| * COVER_dictSelectionError(), tested with COVER_dictSelectionIsError(), |
| * and released with COVER_dictSelectionFree(). |
| */ |
| typedef struct COVER_dictSelection { |
| BYTE* dictContent; /* Content of the selected dictionary */ |
| size_t dictSize; /* Size of dictContent (presumably in bytes - confirm) */ |
| size_t totalCompressedSize; /* Total compressed size; holds a ZSTD error when the selection is an error */ |
| } COVER_dictSelection_t; |
| |
| /** |
| * Computes the number of epochs and the size of each epoch. |
| * We will make sure that each epoch gets at least 10 * k bytes. |
| * |
| * The COVER algorithms divide the data up into epochs of equal size and |
| * select one segment from each epoch. |
| * |
| * @param maxDictSize The maximum allowed dictionary size. |
| * @param nbDmers The number of dmers we are training on. |
| * @param k The parameter k (segment size). |
| * @param passes The target number of passes over the dmer corpus. |
| * More passes means a better dictionary. |
| * @return The number of epochs and the size of each epoch, packed in a |
| * COVER_epoch_info_t. |
| */ |
| COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, |
| U32 k, U32 passes); |
| |
| /** |
| * Warns the user when their corpus is too small. |
| * |
| * @param maxDictSize The maximum allowed dictionary size. |
| * @param nbDmers The number of dmers being trained on. |
| * @param displayLevel NOTE(review): presumably the verbosity level that |
| * decides whether the warning is printed - confirm. |
| */ |
| void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); |
| |
| /** |
| * Checks the total compressed size of a dictionary. |
| * |
| * The dictionary under test is passed in `dict`, and the samples are |
| * described by `samples`, `samplesSizes` and `offsets`. |
| * |
| * NOTE(review): the exact roles of `nbTrainSamples` vs. `nbSamples`, the |
| * layout expected in `offsets`, and whether the return value can be a ZSTD |
| * error code are not visible from this header - confirm against the |
| * implementation. |
| * |
| * @return The total compressed size. |
| */ |
| size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, |
| const size_t *samplesSizes, const BYTE *samples, |
| size_t *offsets, |
| size_t nbTrainSamples, size_t nbSamples, |
| BYTE *const dict, size_t dictBufferCapacity); |
| |
| /** |
| * Returns the sum of the sample sizes. |
| * |
| * @param samplesSizes The sample sizes to add up. |
| * @param nbSamples Presumably the number of entries of `samplesSizes` to |
| * sum - confirm against the implementation. |
| * @return The sum of the sample sizes. |
| */ |
| size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; |
| |
| /** |
| * Initialize the `COVER_best_t`. |
| * |
| * Unlike the other COVER_best_*() methods, this one is not thread safe. |
| * |
| * @param best The COVER_best_t to initialize. |
| */ |
| void COVER_best_init(COVER_best_t *best); |
| |
| /** |
| * Wait until liveJobs == 0. |
| * |
| * liveJobs is incremented by COVER_best_start() and decremented by |
| * COVER_best_finish(). |
| * |
| * @param best The COVER_best_t to wait on. |
| */ |
| void COVER_best_wait(COVER_best_t *best); |
| |
| /** |
| * Call COVER_best_wait() and then destroy the COVER_best_t. |
| * |
| * Because it waits first, this blocks until liveJobs == 0. |
| * |
| * @param best The COVER_best_t to destroy. |
| */ |
| void COVER_best_destroy(COVER_best_t *best); |
| |
| /** |
| * Called when a thread is about to be launched. |
| * Increments liveJobs. |
| * |
| * Each call is expected to be matched by one COVER_best_finish(), which |
| * decrements liveJobs again. |
| * |
| * @param best The COVER_best_t tracking the jobs. |
| */ |
| void COVER_best_start(COVER_best_t *best); |
| |
| /** |
| * Called when a thread finishes executing, on either error or success. |
| * Decrements liveJobs and signals any waiting threads if liveJobs == 0. |
| * If this dictionary is the best so far save it and its parameters. |
| * |
| * @param best The COVER_best_t shared by the jobs. |
| * @param parameters The parameters this job ran with. |
| * @param selection The dictionary selection produced by this job. |
| * NOTE(review): how "best" is measured is not visible |
| * here - confirm against the implementation. |
| */ |
| void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, |
| COVER_dictSelection_t selection); |
| /** |
| * Error function for COVER_selectDict function. Checks if the return |
| * value is an error. |
| * |
| * @param selection A value returned by COVER_selectDict(). |
| * @return Presumably non-zero when `selection` is an error and 0 |
| * otherwise - confirm against the implementation. |
| */ |
| unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); |
| |
| /** |
| * Error function for COVER_selectDict function. Returns a struct where |
| * return.totalCompressedSize is a ZSTD error. |
| * |
| * @param error The ZSTD error to report. |
| * @return A COVER_dictSelection_t whose totalCompressedSize is a ZSTD |
| * error. |
| */ |
| COVER_dictSelection_t COVER_dictSelectionError(size_t error); |
| |
| /** |
| * Always call after selectDict is called to free up used memory from |
| * newly created dictionary. |
| * |
| * @param selection The selection returned by COVER_selectDict(). |
| */ |
| void COVER_dictSelectionFree(COVER_dictSelection_t selection); |
| |
| /** |
| * Called to finalize the dictionary and select one based on whether or not |
| * the shrink-dict flag was enabled. If enabled the dictionary used is the |
| * smallest dictionary within a specified regression of the compressed size |
| * from the largest dictionary. |
| * |
| * NOTE(review): the precise meaning of each argument (for example |
| * `nbFinalizeSamples` vs. `nbCheckSamples` vs. `nbSamples`) is not visible |
| * from this header - confirm against the implementation. |
| * |
| * @return A COVER_dictSelection_t. Check it with |
| * COVER_dictSelectionIsError() and release it with |
| * COVER_dictSelectionFree(). |
| */ |
| COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, |
| size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, |
| size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); |