blob: d9e0636a65981c614397dd4e044e4fcfc949ac8b [file] [log] [blame]
Scott Baker2c1c4822019-10-16 11:02:41 -07001#include <stdio.h> /* fprintf */
2#include <stdlib.h> /* malloc, free, qsort */
3#include <string.h> /* memset */
4#include <time.h> /* clock */
5#include "mem.h" /* read */
6#include "pool.h"
7#include "threading.h"
8#include "zstd_internal.h" /* includes zstd.h */
9#ifndef ZDICT_STATIC_LINKING_ONLY
10#define ZDICT_STATIC_LINKING_ONLY
11#endif
12#include "zdict.h"
13
14/**
15 * COVER_best_t is used for two purposes:
16 * 1. Synchronizing threads.
17 * 2. Saving the best parameters and dictionary.
18 *
19 * All of the methods except COVER_best_init() are thread safe if zstd is
20 * compiled with multithreaded support.
21 */
22typedef struct COVER_best_s {
23 ZSTD_pthread_mutex_t mutex;
24 ZSTD_pthread_cond_t cond;
25 size_t liveJobs;
26 void *dict;
27 size_t dictSize;
28 ZDICT_cover_params_t parameters;
29 size_t compressedSize;
30} COVER_best_t;
31
32/**
33 * A segment is a range in the source as well as the score of the segment.
34 */
35typedef struct {
36 U32 begin;
37 U32 end;
38 U32 score;
39} COVER_segment_t;
40
41/**
42 *Number of epochs and size of each epoch.
43 */
44typedef struct {
45 U32 num;
46 U32 size;
47} COVER_epoch_info_t;
48
49/**
50 * Struct used for the dictionary selection function.
51 */
52typedef struct COVER_dictSelection {
53 BYTE* dictContent;
54 size_t dictSize;
55 size_t totalCompressedSize;
56} COVER_dictSelection_t;
57
58/**
59 * Computes the number of epochs and the size of each epoch.
60 * We will make sure that each epoch gets at least 10 * k bytes.
61 *
62 * The COVER algorithms divide the data up into epochs of equal size and
63 * select one segment from each epoch.
64 *
65 * @param maxDictSize The maximum allowed dictionary size.
66 * @param nbDmers The number of dmers we are training on.
67 * @param k The parameter k (segment size).
68 * @param passes The target number of passes over the dmer corpus.
69 * More passes means a better dictionary.
70 */
71COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
72 U32 k, U32 passes);
73
/**
 * Warns the user when their corpus is too small.
 *
 * @param maxDictSize  The maximum allowed dictionary size.
 * @param nbDmers      The number of dmers being trained on.
 * @param displayLevel NOTE(review): presumably the verbosity level that
 *                     controls whether the warning is printed - confirm.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
78
79/**
80 * Checks total compressed size of a dictionary
81 */
82size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
83 const size_t *samplesSizes, const BYTE *samples,
84 size_t *offsets,
85 size_t nbTrainSamples, size_t nbSamples,
86 BYTE *const dict, size_t dictBufferCapacity);
87
/**
 * Returns the sum of the sample sizes.
 *
 * @param samplesSizes Array of sample sizes.
 * @param nbSamples    NOTE(review): presumably the number of entries of
 *                     samplesSizes to sum - confirm.
 * @return             The sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples);
92
93/**
94 * Initialize the `COVER_best_t`.
95 */
96void COVER_best_init(COVER_best_t *best);
97
98/**
99 * Wait until liveJobs == 0.
100 */
101void COVER_best_wait(COVER_best_t *best);
102
103/**
104 * Call COVER_best_wait() and then destroy the COVER_best_t.
105 */
106void COVER_best_destroy(COVER_best_t *best);
107
108/**
109 * Called when a thread is about to be launched.
110 * Increments liveJobs.
111 */
112void COVER_best_start(COVER_best_t *best);
113
114/**
115 * Called when a thread finishes executing, both on error or success.
116 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
117 * If this dictionary is the best so far save it and its parameters.
118 */
119void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
120 COVER_dictSelection_t selection);
121/**
122 * Error function for COVER_selectDict function. Checks if the return
123 * value is an error.
124 */
125unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
126
127 /**
128 * Error function for COVER_selectDict function. Returns a struct where
129 * return.totalCompressedSize is a ZSTD error.
130 */
131COVER_dictSelection_t COVER_dictSelectionError(size_t error);
132
133/**
134 * Always call after selectDict is called to free up used memory from
135 * newly created dictionary.
136 */
137void COVER_dictSelectionFree(COVER_dictSelection_t selection);
138
139/**
140 * Called to finalize the dictionary and select one based on whether or not
141 * the shrink-dict flag was enabled. If enabled the dictionary used is the
142 * smallest dictionary within a specified regression of the compressed size
143 * from the largest dictionary.
144 */
145 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
146 size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
147 size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);