Blame - vendor/github.com/DataDog/zstd/cover.c - voltha-openolt-adapter

blob: 621996759b6aeb67ecc3d101a6ffa5fb13e28caf [file] [log] [blame]

William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
				3	* All rights reserved.
				4	*
				5	* This source code is licensed under both the BSD-style license (found in the
				6	* LICENSE file in the root directory of this source tree) and the GPLv2 (found
				7	* in the COPYING file in the root directory of this source tree).
				8	* You may select, at your option, one of the above-listed licenses.
				9	*/
				10
				11	/* *****************************************************************************
				12	* Constructs a dictionary using a heuristic based on the following paper:
				13	*
				14	* Liao, Petri, Moffat, Wirth
				15	* Effective Construction of Relative Lempel-Ziv Dictionaries
				16	* Published in WWW 2016.
				17	*
				18	* Adapted from code originally written by @ot (Giuseppe Ottaviano).
				19	******************************************************************************/
				20
				/*-*************************************
				*  Dependencies
				***************************************/
				24	#include <stdio.h> /* fprintf */
				25	#include <stdlib.h> /* malloc, free, qsort */
				26	#include <string.h> /* memset */
				27	#include <time.h> /* clock */
				28
				29	#include "mem.h" /* read */
				30	#include "pool.h"
				31	#include "threading.h"
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	32	#include "cover.h"
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	33	#include "zstd_internal.h" /* includes zstd.h */
				34	#ifndef ZDICT_STATIC_LINKING_ONLY
				35	#define ZDICT_STATIC_LINKING_ONLY
				36	#endif
				37	#include "zdict.h"
				38
				/*-*************************************
				*  Constants
				***************************************/
				/* Upper bound (exclusive) on the total size of the samples. Positions in the
				 * samples are stored as 32-bit values, so the limit is the largest unsigned
				 * value when size_t is 8 bytes wide and 1 GB otherwise. */
				#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
				/* Default fraction of the samples used for training (1.0 == all of them). */
				#define DEFAULT_SPLITPOINT 1.0
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	44
				/*-*************************************
				*  Console display
				***************************************/
				/* Verbosity used by DISPLAYLEVEL() / DISPLAYUPDATE().
				 * 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
				static int g_displayLevel = 2;

				/* Prints a printf-style message to stderr and flushes it immediately. */
				#define DISPLAY(...)                                                           \
				  {                                                                            \
				    fprintf(stderr, __VA_ARGS__);                                              \
				    fflush(stderr);                                                            \
				  }

				/* Prints the message only when `displayLevel` is at least `l`.
				 * NOTE: expands to a bare `if` statement, so do not follow it with `else`. */
				#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
				  if (displayLevel >= l) {                                                     \
				    DISPLAY(__VA_ARGS__);                                                      \
				  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
				#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

				/* Rate-limited variant for progress output: prints only if more than
				 * `refreshRate` clock ticks elapsed since the previous update, or always
				 * when `displayLevel` is 4 or more. */
				#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
				  if (displayLevel >= l) {                                                     \
				    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
				      g_time = clock();                                                        \
				      DISPLAY(__VA_ARGS__);                                                    \
				    }                                                                          \
				  }
				#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)

				/* Minimum number of clock ticks between two progress updates (0.15 s). */
				static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
				/* clock() value recorded at the last progress update. */
				static clock_t g_time = 0;
				70
				71	/-************************************
				72	* Hash table
				73	***************************************
				74	* A small specialized hash map for storing activeDmers.
				75	* The map does not resize, so if it becomes full it will loop forever.
				76	* Thus, the map must be large enough to store every value.
				77	* The map implements linear probing and keeps its load less than 0.5.
				78	*/
				79
				80	#define MAP_EMPTY_VALUE ((U32)-1)
				81	typedef struct COVER_map_pair_t_s {
				82	U32 key;
				83	U32 value;
				84	} COVER_map_pair_t;
				85
				86	typedef struct COVER_map_s {
				87	COVER_map_pair_t *data;
				88	U32 sizeLog;
				89	U32 size;
				90	U32 sizeMask;
				91	} COVER_map_t;
				92
				93	/**
				94	* Clear the map.
				95	*/
				96	static void COVER_map_clear(COVER_map_t *map) {
				97	memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
				98	}
				99
				100	/**
				101	* Initializes a map of the given size.
				102	* Returns 1 on success and 0 on failure.
				103	* The map must be destroyed with COVER_map_destroy().
				104	* The map is only guaranteed to be large enough to hold size elements.
				105	*/
				106	static int COVER_map_init(COVER_map_t *map, U32 size) {
				107	map->sizeLog = ZSTD_highbit32(size) + 2;
				108	map->size = (U32)1 << map->sizeLog;
				109	map->sizeMask = map->size - 1;
				110	map->data = (COVER_map_pair_t )malloc(map->size sizeof(COVER_map_pair_t));
				111	if (!map->data) {
				112	map->sizeLog = 0;
				113	map->size = 0;
				114	return 0;
				115	}
				116	COVER_map_clear(map);
				117	return 1;
				118	}
				119
				120	/**
				121	* Internal hash function
				122	*/
				123	static const U32 prime4bytes = 2654435761U;
				124	static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
				125	return (key * prime4bytes) >> (32 - map->sizeLog);
				126	}
				127
				128	/**
				129	* Helper function that returns the index that a key should be placed into.
				130	*/
				131	static U32 COVER_map_index(COVER_map_t *map, U32 key) {
				132	const U32 hash = COVER_map_hash(map, key);
				133	U32 i;
				134	for (i = hash;; i = (i + 1) & map->sizeMask) {
				135	COVER_map_pair_t *pos = &map->data[i];
				136	if (pos->value == MAP_EMPTY_VALUE) {
				137	return i;
				138	}
				139	if (pos->key == key) {
				140	return i;
				141	}
				142	}
				143	}
				144
				145	/**
				146	* Returns the pointer to the value for key.
				147	* If key is not in the map, it is inserted and the value is set to 0.
				148	* The map must not be full.
				149	*/
				150	static U32 COVER_map_at(COVER_map_t map, U32 key) {
				151	COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
				152	if (pos->value == MAP_EMPTY_VALUE) {
				153	pos->key = key;
				154	pos->value = 0;
				155	}
				156	return &pos->value;
				157	}
				158
				159	/**
				160	* Deletes key from the map if present.
				161	*/
				162	static void COVER_map_remove(COVER_map_t *map, U32 key) {
				163	U32 i = COVER_map_index(map, key);
				164	COVER_map_pair_t *del = &map->data[i];
				165	U32 shift = 1;
				166	if (del->value == MAP_EMPTY_VALUE) {
				167	return;
				168	}
				169	for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
				170	COVER_map_pair_t *const pos = &map->data[i];
				171	/* If the position is empty we are done */
				172	if (pos->value == MAP_EMPTY_VALUE) {
				173	del->value = MAP_EMPTY_VALUE;
				174	return;
				175	}
				176	/* If pos can be moved to del do so */
				177	if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
				178	del->key = pos->key;
				179	del->value = pos->value;
				180	del = pos;
				181	shift = 1;
				182	} else {
				183	++shift;
				184	}
				185	}
				186	}
				187
				188	/**
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	189	* Destroys a map that is inited with COVER_map_init().
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	190	*/
				191	static void COVER_map_destroy(COVER_map_t *map) {
				192	if (map->data) {
				193	free(map->data);
				194	}
				195	map->data = NULL;
				196	map->size = 0;
				197	}
				198
				199	/-************************************
				200	* Context
				201	***************************************/
				202
				203	typedef struct {
				204	const BYTE *samples;
				205	size_t *offsets;
				206	const size_t *samplesSizes;
				207	size_t nbSamples;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	208	size_t nbTrainSamples;
				209	size_t nbTestSamples;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	210	U32 *suffix;
				211	size_t suffixSize;
				212	U32 *freqs;
				213	U32 *dmerAt;
				214	unsigned d;
				215	} COVER_ctx_t;
				216
				217	/* We need a global context for qsort... */
				218	static COVER_ctx_t *g_ctx = NULL;
				219
				/*-*************************************
				*  Helper functions
				***************************************/

				/**
				 * Returns the sum of the first `nbSamples` entries of `samplesSizes`.
				 * Returns 0 when `nbSamples` is 0 (the array is not read in that case).
				 */
				size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
				  size_t sum = 0;
				  unsigned i;
				  for (i = 0; i < nbSamples; ++i) {
				    sum += samplesSizes[i];
				  }
				  return sum;
				}
				235
				236	/**
				237	* Returns -1 if the dmer at lp is less than the dmer at rp.
				238	* Return 0 if the dmers at lp and rp are equal.
				239	* Returns 1 if the dmer at lp is greater than the dmer at rp.
				240	*/
				241	static int COVER_cmp(COVER_ctx_t ctx, const void lp, const void *rp) {
				242	U32 const lhs = (U32 const )lp;
				243	U32 const rhs = (U32 const )rp;
				244	return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
				245	}
				246	/**
				247	* Faster version for d <= 8.
				248	*/
				249	static int COVER_cmp8(COVER_ctx_t ctx, const void lp, const void *rp) {
				250	U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
				251	U64 const lhs = MEM_readLE64(ctx->samples + (U32 const )lp) & mask;
				252	U64 const rhs = MEM_readLE64(ctx->samples + (U32 const )rp) & mask;
				253	if (lhs < rhs) {
				254	return -1;
				255	}
				256	return (lhs > rhs);
				257	}
				258
				259	/**
				260	* Same as COVER_cmp() except ties are broken by pointer value
				261	* NOTE: g_ctx must be set to call this function. A global is required because
				262	* qsort doesn't take an opaque pointer.
				263	*/
				264	static int COVER_strict_cmp(const void lp, const void rp) {
				265	int result = COVER_cmp(g_ctx, lp, rp);
				266	if (result == 0) {
				267	result = lp < rp ? -1 : 1;
				268	}
				269	return result;
				270	}
				271	/**
				272	* Faster version for d <= 8.
				273	*/
				274	static int COVER_strict_cmp8(const void lp, const void rp) {
				275	int result = COVER_cmp8(g_ctx, lp, rp);
				276	if (result == 0) {
				277	result = lp < rp ? -1 : 1;
				278	}
				279	return result;
				280	}
				281
				/**
				 * Returns the first pointer in [first, last) whose element does not compare
				 * less than value. If no such element exists it returns last.
				 * The range must be sorted in ascending order (binary search).
				 */
				static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
				                                       size_t value) {
				  size_t count = (size_t)(last - first);
				  while (count != 0) {
				    size_t step = count / 2;
				    const size_t *ptr = first;
				    ptr += step;
				    if (*ptr < value) {
				      /* Answer lies strictly after ptr: drop the lower half and ptr. */
				      first = ++ptr;
				      count -= step + 1;
				    } else {
				      /* ptr may be the answer: keep searching in [first, ptr). */
				      count = step;
				    }
				  }
				  return first;
				}
				302
				303	/**
				304	* Generic groupBy function.
				305	* Groups an array sorted by cmp into groups with equivalent values.
				306	* Calls grp for each group.
				307	*/
				308	static void
				309	COVER_groupBy(const void data, size_t count, size_t size, COVER_ctx_t ctx,
				310	int (cmp)(COVER_ctx_t , const void , const void ),
				311	void (grp)(COVER_ctx_t , const void , const void )) {
				312	const BYTE ptr = (const BYTE )data;
				313	size_t num = 0;
				314	while (num < count) {
				315	const BYTE *grpEnd = ptr + size;
				316	++num;
				317	while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
				318	grpEnd += size;
				319	++num;
				320	}
				321	grp(ctx, ptr, grpEnd);
				322	ptr = grpEnd;
				323	}
				324	}
				325
				326	/-************************************
				327	* Cover functions
				328	***************************************/
				329
				330	/**
				331	* Called on each group of positions with the same dmer.
				332	* Counts the frequency of each dmer and saves it in the suffix array.
				333	* Fills `ctx->dmerAt`.
				334	*/
				335	static void COVER_group(COVER_ctx_t ctx, const void group,
				336	const void *groupEnd) {
				337	/* The group consists of all the positions with the same first d bytes. */
				338	const U32 grpPtr = (const U32 )group;
				339	const U32 grpEnd = (const U32 )groupEnd;
				340	/* The dmerId is how we will reference this dmer.
				341	* This allows us to map the whole dmer space to a much smaller space, the
				342	* size of the suffix array.
				343	*/
				344	const U32 dmerId = (U32)(grpPtr - ctx->suffix);
				345	/* Count the number of samples this dmer shows up in */
				346	U32 freq = 0;
				347	/* Details */
				348	const size_t *curOffsetPtr = ctx->offsets;
				349	const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
				350	/* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
				351	* different sample than the last.
				352	*/
				353	size_t curSampleEnd = ctx->offsets[0];
				354	for (; grpPtr != grpEnd; ++grpPtr) {
				355	/* Save the dmerId for this position so we can get back to it. */
				356	ctx->dmerAt[*grpPtr] = dmerId;
				357	/* Dictionaries only help for the first reference to the dmer.
				358	* After that zstd can reference the match from the previous reference.
				359	* So only count each dmer once for each sample it is in.
				360	*/
				361	if (*grpPtr < curSampleEnd) {
				362	continue;
				363	}
				364	freq += 1;
				365	/* Binary search to find the end of the sample *grpPtr is in.
				366	* In the common case that grpPtr + 1 == grpEnd we can skip the binary
				367	* search because the loop is over.
				368	*/
				369	if (grpPtr + 1 != grpEnd) {
				370	const size_t *sampleEndPtr =
				371	COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
				372	curSampleEnd = *sampleEndPtr;
				373	curOffsetPtr = sampleEndPtr + 1;
				374	}
				375	}
				376	/* At this point we are never going to look at this segment of the suffix
				377	* array again. We take advantage of this fact to save memory.
				378	* We store the frequency of the dmer in the first position of the group,
				379	* which is dmerId.
				380	*/
				381	ctx->suffix[dmerId] = freq;
				382	}
				383
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	384
				385	/**
				386	* Selects the best segment in an epoch.
				387	* Segments of are scored according to the function:
				388	*
				389	* Let F(d) be the frequency of dmer d.
				390	* Let S_i be the dmer at position i of segment S which has length k.
				391	*
				392	* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
				393	*
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	394	* Once the dmer d is in the dictionary we set F(d) = 0.
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	395	*/
				396	static COVER_segment_t COVER_selectSegment(const COVER_ctx_t ctx, U32 freqs,
				397	COVER_map_t *activeDmers, U32 begin,
				398	U32 end,
				399	ZDICT_cover_params_t parameters) {
				400	/* Constants */
				401	const U32 k = parameters.k;
				402	const U32 d = parameters.d;
				403	const U32 dmersInK = k - d + 1;
				404	/* Try each segment (activeSegment) and save the best (bestSegment) */
				405	COVER_segment_t bestSegment = {0, 0, 0};
				406	COVER_segment_t activeSegment;
				407	/* Reset the activeDmers in the segment */
				408	COVER_map_clear(activeDmers);
				409	/* The activeSegment starts at the beginning of the epoch. */
				410	activeSegment.begin = begin;
				411	activeSegment.end = begin;
				412	activeSegment.score = 0;
				413	/* Slide the activeSegment through the whole epoch.
				414	* Save the best segment in bestSegment.
				415	*/
				416	while (activeSegment.end < end) {
				417	/* The dmerId for the dmer at the next position */
				418	U32 newDmer = ctx->dmerAt[activeSegment.end];
				419	/* The entry in activeDmers for this dmerId */
				420	U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
				421	/* If the dmer isn't already present in the segment add its score. */
				422	if (*newDmerOcc == 0) {
				423	/* The paper suggest using the L-0.5 norm, but experiments show that it
				424	* doesn't help.
				425	*/
				426	activeSegment.score += freqs[newDmer];
				427	}
				428	/* Add the dmer to the segment */
				429	activeSegment.end += 1;
				430	*newDmerOcc += 1;
				431
				432	/* If the window is now too large, drop the first position */
				433	if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
				434	U32 delDmer = ctx->dmerAt[activeSegment.begin];
				435	U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
				436	activeSegment.begin += 1;
				437	*delDmerOcc -= 1;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	438	/* If this is the last occurrence of the dmer, subtract its score */
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	439	if (*delDmerOcc == 0) {
				440	COVER_map_remove(activeDmers, delDmer);
				441	activeSegment.score -= freqs[delDmer];
				442	}
				443	}
				444
				445	/* If this segment is the best so far save it */
				446	if (activeSegment.score > bestSegment.score) {
				447	bestSegment = activeSegment;
				448	}
				449	}
				450	{
				451	/* Trim off the zero frequency head and tail from the segment. */
				452	U32 newBegin = bestSegment.end;
				453	U32 newEnd = bestSegment.begin;
				454	U32 pos;
				455	for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
				456	U32 freq = freqs[ctx->dmerAt[pos]];
				457	if (freq != 0) {
				458	newBegin = MIN(newBegin, pos);
				459	newEnd = pos + 1;
				460	}
				461	}
				462	bestSegment.begin = newBegin;
				463	bestSegment.end = newEnd;
				464	}
				465	{
				466	/* Zero out the frequency of each dmer covered by the chosen segment. */
				467	U32 pos;
				468	for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
				469	freqs[ctx->dmerAt[pos]] = 0;
				470	}
				471	}
				472	return bestSegment;
				473	}
				474
				475	/**
				476	* Check the validity of the parameters.
				477	* Returns non-zero if the parameters are valid and 0 otherwise.
				478	*/
				479	static int COVER_checkParameters(ZDICT_cover_params_t parameters,
				480	size_t maxDictSize) {
				481	/* k and d are required parameters */
				482	if (parameters.d == 0 \|\| parameters.k == 0) {
				483	return 0;
				484	}
				485	/* k <= maxDictSize */
				486	if (parameters.k > maxDictSize) {
				487	return 0;
				488	}
				489	/* d <= k */
				490	if (parameters.d > parameters.k) {
				491	return 0;
				492	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	493	/* 0 < splitPoint <= 1 */
				494	if (parameters.splitPoint <= 0 \|\| parameters.splitPoint > 1){
				495	return 0;
				496	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	497	return 1;
				498	}
				499
				500	/**
				501	* Clean up a context initialized with `COVER_ctx_init()`.
				502	*/
				503	static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
				504	if (!ctx) {
				505	return;
				506	}
				507	if (ctx->suffix) {
				508	free(ctx->suffix);
				509	ctx->suffix = NULL;
				510	}
				511	if (ctx->freqs) {
				512	free(ctx->freqs);
				513	ctx->freqs = NULL;
				514	}
				515	if (ctx->dmerAt) {
				516	free(ctx->dmerAt);
				517	ctx->dmerAt = NULL;
				518	}
				519	if (ctx->offsets) {
				520	free(ctx->offsets);
				521	ctx->offsets = NULL;
				522	}
				523	}
				524
				525	/**
				526	* Prepare a context for dictionary building.
				527	* The context is only dependent on the parameter `d` and can used multiple
				528	* times.
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	529	* Returns 0 on success or error code on error.
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	530	* The context must be destroyed with `COVER_ctx_destroy()`.
				531	*/
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	532	static size_t COVER_ctx_init(COVER_ctx_t ctx, const void samplesBuffer,
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	533	const size_t *samplesSizes, unsigned nbSamples,
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	534	unsigned d, double splitPoint) {
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	535	const BYTE const samples = (const BYTE )samplesBuffer;
				536	const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	537	/* Split samples into testing and training sets */
				538	const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
				539	const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
				540	const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
				541	const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	542	/* Checks */
				543	if (totalSamplesSize < MAX(d, sizeof(U64)) \|\|
				544	totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
				545	DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	546	(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	547	return ERROR(srcSize_wrong);
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	548	}
				549	/* Check if there are at least 5 training samples */
				550	if (nbTrainSamples < 5) {
				551	DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	552	return ERROR(srcSize_wrong);
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	553	}
				554	/* Check if there's testing sample */
				555	if (nbTestSamples < 1) {
				556	DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	557	return ERROR(srcSize_wrong);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	558	}
				559	/* Zero the context */
				560	memset(ctx, 0, sizeof(*ctx));
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	561	DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
				562	(unsigned)trainingSamplesSize);
				563	DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
				564	(unsigned)testSamplesSize);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	565	ctx->samples = samples;
				566	ctx->samplesSizes = samplesSizes;
				567	ctx->nbSamples = nbSamples;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	568	ctx->nbTrainSamples = nbTrainSamples;
				569	ctx->nbTestSamples = nbTestSamples;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	570	/* Partial suffix array */
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	571	ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	572	ctx->suffix = (U32 )malloc(ctx->suffixSize sizeof(U32));
				573	/* Maps index to the dmerID */
				574	ctx->dmerAt = (U32 )malloc(ctx->suffixSize sizeof(U32));
				575	/* The offsets of each file */
				576	ctx->offsets = (size_t )malloc((nbSamples + 1) sizeof(size_t));
				577	if (!ctx->suffix \|\| !ctx->dmerAt \|\| !ctx->offsets) {
				578	DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
				579	COVER_ctx_destroy(ctx);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	580	return ERROR(memory_allocation);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	581	}
				582	ctx->freqs = NULL;
				583	ctx->d = d;
				584
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	585	/* Fill offsets from the samplesSizes */
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	586	{
				587	U32 i;
				588	ctx->offsets[0] = 0;
				589	for (i = 1; i <= nbSamples; ++i) {
				590	ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
				591	}
				592	}
				593	DISPLAYLEVEL(2, "Constructing partial suffix array\n");
				594	{
				595	/* suffix is a partial suffix array.
				596	* It only sorts suffixes by their first parameters.d bytes.
				597	* The sort is stable, so each dmer group is sorted by position in input.
				598	*/
				599	U32 i;
				600	for (i = 0; i < ctx->suffixSize; ++i) {
				601	ctx->suffix[i] = i;
				602	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	603	/* qsort doesn't take an opaque pointer, so pass as a global.
				604	* On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
				605	*/
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	606	g_ctx = ctx;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	607	#if defined(__OpenBSD__)
				608	mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
				609	(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
				610	#else
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	611	qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
				612	(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	613	#endif
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	614	}
				615	DISPLAYLEVEL(2, "Computing frequencies\n");
				616	/* For each dmer group (group of positions with the same first d bytes):
				617	* 1. For each position we set dmerAt[position] = dmerID. The dmerID is
				618	* (groupBeginPtr - suffix). This allows us to go from position to
				619	* dmerID so we can look up values in freq.
				620	* 2. We calculate how many samples the dmer occurs in and save it in
				621	* freqs[dmerId].
				622	*/
				623	COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
				624	(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
				625	ctx->freqs = ctx->suffix;
				626	ctx->suffix = NULL;
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	627	return 0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	628	}
				629
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	630	void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
				631	{
				632	const double ratio = (double)nbDmers / maxDictSize;
				633	if (ratio >= 10) {
				634	return;
				635	}
				636	LOCALDISPLAYLEVEL(displayLevel, 1,
				637	"WARNING: The maximum dictionary size %u is too large "
				638	"compared to the source size %u! "
				639	"size(source)/size(dictionary) = %f, but it should be >= "
				640	"10! This may lead to a subpar dictionary! We recommend "
				641	"training on sources at least 10x, and up to 100x the "
				642	"size of the dictionary!\n", (U32)maxDictSize,
				643	(U32)nbDmers, ratio);
				644	}
				645
				646	COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
				647	U32 nbDmers, U32 k, U32 passes)
				648	{
				649	const U32 minEpochSize = k * 10;
				650	COVER_epoch_info_t epochs;
				651	epochs.num = MAX(1, maxDictSize / k / passes);
				652	epochs.size = nbDmers / epochs.num;
				653	if (epochs.size >= minEpochSize) {
				654	assert(epochs.size * epochs.num <= nbDmers);
				655	return epochs;
				656	}
				657	epochs.size = MIN(minEpochSize, nbDmers);
				658	epochs.num = nbDmers / epochs.size;
				659	assert(epochs.size * epochs.num <= nbDmers);
				660	return epochs;
				661	}
				662
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	663	/**
				664	* Given the prepared context build the dictionary.
				665	*/
				666	static size_t COVER_buildDictionary(const COVER_ctx_t ctx, U32 freqs,
				667	COVER_map_t activeDmers, void dictBuffer,
				668	size_t dictBufferCapacity,
				669	ZDICT_cover_params_t parameters) {
				670	BYTE const dict = (BYTE )dictBuffer;
				671	size_t tail = dictBufferCapacity;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	672	/* Divide the data into epochs. We will select one segment from each epoch. */
				673	const COVER_epoch_info_t epochs = COVER_computeEpochs(
				674	(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
				675	const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
				676	size_t zeroScoreRun = 0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	677	size_t epoch;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	678	DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
				679	(U32)epochs.num, (U32)epochs.size);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	680	/* Loop through the epochs until there are no more segments or the dictionary
				681	* is full.
				682	*/
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	683	for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
				684	const U32 epochBegin = (U32)(epoch * epochs.size);
				685	const U32 epochEnd = epochBegin + epochs.size;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	686	size_t segmentSize;
				687	/* Select a segment */
				688	COVER_segment_t segment = COVER_selectSegment(
				689	ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	690	/* If the segment covers no dmers, then we are out of content.
				691	* There may be new content in other epochs, for continue for some time.
				692	*/
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	693	if (segment.score == 0) {
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	694	if (++zeroScoreRun >= maxZeroScoreRun) {
				695	break;
				696	}
				697	continue;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	698	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	699	zeroScoreRun = 0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	700	/* Trim the segment if necessary and if it is too small then we are done */
				701	segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
				702	if (segmentSize < parameters.d) {
				703	break;
				704	}
				705	/* We fill the dictionary from the back to allow the best segments to be
				706	* referenced with the smallest offsets.
				707	*/
				708	tail -= segmentSize;
				709	memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
				710	DISPLAYUPDATE(
				711	2, "\r%u%% ",
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	712	(unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	713	}
				714	DISPLAYLEVEL(2, "\r%79s\r", "");
				715	return tail;
				716	}
				717
				718	ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
				719	void *dictBuffer, size_t dictBufferCapacity,
				720	const void samplesBuffer, const size_t samplesSizes, unsigned nbSamples,
				721	ZDICT_cover_params_t parameters)
				722	{
				723	BYTE* const dict = (BYTE*)dictBuffer;
				724	COVER_ctx_t ctx;
				725	COVER_map_t activeDmers;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	726	parameters.splitPoint = 1.0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	727	/* Initialize global data */
				728	g_displayLevel = parameters.zParams.notificationLevel;
				729	/* Checks */
				730	if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
				731	DISPLAYLEVEL(1, "Cover parameters incorrect\n");
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	732	return ERROR(parameter_outOfBound);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	733	}
				734	if (nbSamples == 0) {
				735	DISPLAYLEVEL(1, "Cover must have at least one input file\n");
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	736	return ERROR(srcSize_wrong);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	737	}
				738	if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
				739	DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
				740	ZDICT_DICTSIZE_MIN);
				741	return ERROR(dstSize_tooSmall);
				742	}
				743	/* Initialize context and activeDmers */
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	744	{
				745	size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
				746	parameters.d, parameters.splitPoint);
				747	if (ZSTD_isError(initVal)) {
				748	return initVal;
				749	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	750	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	751	COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	752	if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
				753	DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
				754	COVER_ctx_destroy(&ctx);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	755	return ERROR(memory_allocation);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	756	}
				757
				758	DISPLAYLEVEL(2, "Building dictionary\n");
				759	{
				760	const size_t tail =
				761	COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
				762	dictBufferCapacity, parameters);
				763	const size_t dictionarySize = ZDICT_finalizeDictionary(
				764	dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
				765	samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
				766	if (!ZSTD_isError(dictionarySize)) {
				767	DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	768	(unsigned)dictionarySize);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	769	}
				770	COVER_ctx_destroy(&ctx);
				771	COVER_map_destroy(&activeDmers);
				772	return dictionarySize;
				773	}
				774	}
				775
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	776
				777
				778	size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
				779	const size_t samplesSizes, const BYTE samples,
				780	size_t *offsets,
				781	size_t nbTrainSamples, size_t nbSamples,
				782	BYTE *const dict, size_t dictBufferCapacity) {
				783	size_t totalCompressedSize = ERROR(GENERIC);
				784	/* Pointers */
				785	ZSTD_CCtx *cctx;
				786	ZSTD_CDict *cdict;
				787	void *dst;
				788	/* Local variables */
				789	size_t dstCapacity;
				790	size_t i;
				791	/* Allocate dst with enough space to compress the maximum sized sample */
				792	{
				793	size_t maxSampleSize = 0;
				794	i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
				795	for (; i < nbSamples; ++i) {
				796	maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
				797	}
				798	dstCapacity = ZSTD_compressBound(maxSampleSize);
				799	dst = malloc(dstCapacity);
				800	}
				801	/* Create the cctx and cdict */
				802	cctx = ZSTD_createCCtx();
				803	cdict = ZSTD_createCDict(dict, dictBufferCapacity,
				804	parameters.zParams.compressionLevel);
				805	if (!dst \|\| !cctx \|\| !cdict) {
				806	goto _compressCleanup;
				807	}
				808	/* Compress each sample and sum their sizes (or error) */
				809	totalCompressedSize = dictBufferCapacity;
				810	i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
				811	for (; i < nbSamples; ++i) {
				812	const size_t size = ZSTD_compress_usingCDict(
				813	cctx, dst, dstCapacity, samples + offsets[i],
				814	samplesSizes[i], cdict);
				815	if (ZSTD_isError(size)) {
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	816	totalCompressedSize = size;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	817	goto _compressCleanup;
				818	}
				819	totalCompressedSize += size;
				820	}
				821	_compressCleanup:
				822	ZSTD_freeCCtx(cctx);
				823	ZSTD_freeCDict(cdict);
				824	if (dst) {
				825	free(dst);
				826	}
				827	return totalCompressedSize;
				828	}
				829
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	830
				831	/**
				832	* Initialize the `COVER_best_t`.
				833	*/
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	834	void COVER_best_init(COVER_best_t *best) {
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	835	if (best==NULL) return; /* compatible with init on NULL */
				836	(void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
				837	(void)ZSTD_pthread_cond_init(&best->cond, NULL);
				838	best->liveJobs = 0;
				839	best->dict = NULL;
				840	best->dictSize = 0;
				841	best->compressedSize = (size_t)-1;
				842	memset(&best->parameters, 0, sizeof(best->parameters));
				843	}
				844
				845	/**
				846	* Wait until liveJobs == 0.
				847	*/
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	848	void COVER_best_wait(COVER_best_t *best) {
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	849	if (!best) {
				850	return;
				851	}
				852	ZSTD_pthread_mutex_lock(&best->mutex);
				853	while (best->liveJobs != 0) {
				854	ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
				855	}
				856	ZSTD_pthread_mutex_unlock(&best->mutex);
				857	}
				858
				859	/**
				860	* Call COVER_best_wait() and then destroy the COVER_best_t.
				861	*/
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	862	void COVER_best_destroy(COVER_best_t *best) {
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	863	if (!best) {
				864	return;
				865	}
				866	COVER_best_wait(best);
				867	if (best->dict) {
				868	free(best->dict);
				869	}
				870	ZSTD_pthread_mutex_destroy(&best->mutex);
				871	ZSTD_pthread_cond_destroy(&best->cond);
				872	}
				873
				874	/**
				875	* Called when a thread is about to be launched.
				876	* Increments liveJobs.
				877	*/
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	878	void COVER_best_start(COVER_best_t *best) {
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	879	if (!best) {
				880	return;
				881	}
				882	ZSTD_pthread_mutex_lock(&best->mutex);
				883	++best->liveJobs;
				884	ZSTD_pthread_mutex_unlock(&best->mutex);
				885	}
				886
				887	/**
				888	* Called when a thread finishes executing, both on error or success.
				889	* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
				890	* If this dictionary is the best so far save it and its parameters.
				891	*/
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	892	void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
				893	COVER_dictSelection_t selection) {
				894	void* dict = selection.dictContent;
				895	size_t compressedSize = selection.totalCompressedSize;
				896	size_t dictSize = selection.dictSize;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	897	if (!best) {
				898	return;
				899	}
				900	{
				901	size_t liveJobs;
				902	ZSTD_pthread_mutex_lock(&best->mutex);
				903	--best->liveJobs;
				904	liveJobs = best->liveJobs;
				905	/* If the new dictionary is better */
				906	if (compressedSize < best->compressedSize) {
				907	/* Allocate space if necessary */
				908	if (!best->dict \|\| best->dictSize < dictSize) {
				909	if (best->dict) {
				910	free(best->dict);
				911	}
				912	best->dict = malloc(dictSize);
				913	if (!best->dict) {
				914	best->compressedSize = ERROR(GENERIC);
				915	best->dictSize = 0;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	916	ZSTD_pthread_cond_signal(&best->cond);
				917	ZSTD_pthread_mutex_unlock(&best->mutex);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	918	return;
				919	}
				920	}
				921	/* Save the dictionary, parameters, and size */
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	922	if (!dict) {
				923	return;
				924	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	925	memcpy(best->dict, dict, dictSize);
				926	best->dictSize = dictSize;
				927	best->parameters = parameters;
				928	best->compressedSize = compressedSize;
				929	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	930	if (liveJobs == 0) {
				931	ZSTD_pthread_cond_broadcast(&best->cond);
				932	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	933	ZSTD_pthread_mutex_unlock(&best->mutex);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	934	}
				935	}
				936
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	937	COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
				938	COVER_dictSelection_t selection = { NULL, 0, error };
				939	return selection;
				940	}
				941
				942	unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
				943	return (ZSTD_isError(selection.totalCompressedSize) \|\| !selection.dictContent);
				944	}
				945
				946	void COVER_dictSelectionFree(COVER_dictSelection_t selection){
				947	free(selection.dictContent);
				948	}
				949
				950	COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
				951	size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
				952	size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
				953
				954	size_t largestDict = 0;
				955	size_t largestCompressed = 0;
				956	BYTE* customDictContentEnd = customDictContent + dictContentSize;
				957
				958	BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
				959	BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
				960	double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
				961
				962	if (!largestDictbuffer \|\| !candidateDictBuffer) {
				963	free(largestDictbuffer);
				964	free(candidateDictBuffer);
				965	return COVER_dictSelectionError(dictContentSize);
				966	}
				967
				968	/* Initial dictionary size and compressed size */
				969	memcpy(largestDictbuffer, customDictContent, dictContentSize);
				970	dictContentSize = ZDICT_finalizeDictionary(
				971	largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
				972	samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
				973
				974	if (ZDICT_isError(dictContentSize)) {
				975	free(largestDictbuffer);
				976	free(candidateDictBuffer);
				977	return COVER_dictSelectionError(dictContentSize);
				978	}
				979
				980	totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
				981	samplesBuffer, offsets,
				982	nbCheckSamples, nbSamples,
				983	largestDictbuffer, dictContentSize);
				984
				985	if (ZSTD_isError(totalCompressedSize)) {
				986	free(largestDictbuffer);
				987	free(candidateDictBuffer);
				988	return COVER_dictSelectionError(totalCompressedSize);
				989	}
				990
				991	if (params.shrinkDict == 0) {
				992	COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
				993	free(candidateDictBuffer);
				994	return selection;
				995	}
				996
				997	largestDict = dictContentSize;
				998	largestCompressed = totalCompressedSize;
				999	dictContentSize = ZDICT_DICTSIZE_MIN;
				1000
				1001	/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
				1002	while (dictContentSize < largestDict) {
				1003	memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
				1004	dictContentSize = ZDICT_finalizeDictionary(
				1005	candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
				1006	samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
				1007
				1008	if (ZDICT_isError(dictContentSize)) {
				1009	free(largestDictbuffer);
				1010	free(candidateDictBuffer);
				1011	return COVER_dictSelectionError(dictContentSize);
				1012
				1013	}
				1014
				1015	totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
				1016	samplesBuffer, offsets,
				1017	nbCheckSamples, nbSamples,
				1018	candidateDictBuffer, dictContentSize);
				1019
				1020	if (ZSTD_isError(totalCompressedSize)) {
				1021	free(largestDictbuffer);
				1022	free(candidateDictBuffer);
				1023	return COVER_dictSelectionError(totalCompressedSize);
				1024	}
				1025
				1026	if (totalCompressedSize <= largestCompressed * regressionTolerance) {
				1027	COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
				1028	free(largestDictbuffer);
				1029	return selection;
				1030	}
				1031	dictContentSize *= 2;
				1032	}
				1033	dictContentSize = largestDict;
				1034	totalCompressedSize = largestCompressed;
				1035	{
				1036	COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
				1037	free(candidateDictBuffer);
				1038	return selection;
				1039	}
				1040	}
				1041
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1042	/**
				1043	* Parameters for COVER_tryParameters().
				1044	*/
				1045	typedef struct COVER_tryParameters_data_s {
				1046	const COVER_ctx_t *ctx;
				1047	COVER_best_t *best;
				1048	size_t dictBufferCapacity;
				1049	ZDICT_cover_params_t parameters;
				1050	} COVER_tryParameters_data_t;
				1051
				1052	/**
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1053	* Tries a set of parameters and updates the COVER_best_t with the results.
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1054	* This function is thread safe if zstd is compiled with multithreaded support.
				1055	* It takes its parameters as an OWNING opaque pointer to support threading.
				1056	*/
				1057	static void COVER_tryParameters(void *opaque) {
				1058	/* Save parameters as local variables */
				1059	COVER_tryParameters_data_t const data = (COVER_tryParameters_data_t )opaque;
				1060	const COVER_ctx_t *const ctx = data->ctx;
				1061	const ZDICT_cover_params_t parameters = data->parameters;
				1062	size_t dictBufferCapacity = data->dictBufferCapacity;
				1063	size_t totalCompressedSize = ERROR(GENERIC);
				1064	/* Allocate space for hash table, dict, and freqs */
				1065	COVER_map_t activeDmers;
				1066	BYTE const dict = (BYTE const)malloc(dictBufferCapacity);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1067	COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1068	U32 freqs = (U32 )malloc(ctx->suffixSize * sizeof(U32));
				1069	if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
				1070	DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
				1071	goto _cleanup;
				1072	}
				1073	if (!dict \|\| !freqs) {
				1074	DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
				1075	goto _cleanup;
				1076	}
				1077	/* Copy the frequencies because we need to modify them */
				1078	memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
				1079	/* Build the dictionary */
				1080	{
				1081	const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
				1082	dictBufferCapacity, parameters);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1083	selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
				1084	ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
				1085	totalCompressedSize);
				1086
				1087	if (COVER_dictSelectionIsError(selection)) {
				1088	DISPLAYLEVEL(1, "Failed to select dictionary\n");
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1089	goto _cleanup;
				1090	}
				1091	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1092	_cleanup:
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1093	free(dict);
				1094	COVER_best_finish(data->best, parameters, selection);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1095	free(data);
				1096	COVER_map_destroy(&activeDmers);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1097	COVER_dictSelectionFree(selection);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1098	if (freqs) {
				1099	free(freqs);
				1100	}
				1101	}
				1102
				1103	ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
				1104	void dictBuffer, size_t dictBufferCapacity, const void samplesBuffer,
				1105	const size_t *samplesSizes, unsigned nbSamples,
				1106	ZDICT_cover_params_t *parameters) {
				1107	/* constants */
				1108	const unsigned nbThreads = parameters->nbThreads;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1109	const double splitPoint =
				1110	parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1111	const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
				1112	const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
				1113	const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
				1114	const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
				1115	const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
				1116	const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
				1117	const unsigned kIterations =
				1118	(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1119	const unsigned shrinkDict = 0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1120	/* Local variables */
				1121	const int displayLevel = parameters->zParams.notificationLevel;
				1122	unsigned iteration = 1;
				1123	unsigned d;
				1124	unsigned k;
				1125	COVER_best_t best;
				1126	POOL_ctx *pool = NULL;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1127	int warned = 0;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1128
				1129	/* Checks */
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1130	if (splitPoint <= 0 \|\| splitPoint > 1) {
				1131	LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1132	return ERROR(parameter_outOfBound);
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1133	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1134	if (kMinK < kMaxD \|\| kMaxK < kMinK) {
				1135	LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1136	return ERROR(parameter_outOfBound);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1137	}
				1138	if (nbSamples == 0) {
				1139	DISPLAYLEVEL(1, "Cover must have at least one input file\n");
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1140	return ERROR(srcSize_wrong);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1141	}
				1142	if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
				1143	DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
				1144	ZDICT_DICTSIZE_MIN);
				1145	return ERROR(dstSize_tooSmall);
				1146	}
				1147	if (nbThreads > 1) {
				1148	pool = POOL_create(nbThreads, 1);
				1149	if (!pool) {
				1150	return ERROR(memory_allocation);
				1151	}
				1152	}
				1153	/* Initialization */
				1154	COVER_best_init(&best);
				1155	/* Turn down global display level to clean up display at level 2 and below */
				1156	g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
				1157	/* Loop through d first because each new value needs a new context */
				1158	LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
				1159	kIterations);
				1160	for (d = kMinD; d <= kMaxD; d += 2) {
				1161	/* Initialize the context for this value of d */
				1162	COVER_ctx_t ctx;
				1163	LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1164	{
				1165	const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
				1166	if (ZSTD_isError(initVal)) {
				1167	LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
				1168	COVER_best_destroy(&best);
				1169	POOL_free(pool);
				1170	return initVal;
				1171	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1172	}
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1173	if (!warned) {
				1174	COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
				1175	warned = 1;
				1176	}
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1177	/* Loop through k reusing the same context */
				1178	for (k = kMinK; k <= kMaxK; k += kStepSize) {
				1179	/* Prepare the arguments */
				1180	COVER_tryParameters_data_t data = (COVER_tryParameters_data_t )malloc(
				1181	sizeof(COVER_tryParameters_data_t));
				1182	LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
				1183	if (!data) {
				1184	LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
				1185	COVER_best_destroy(&best);
				1186	COVER_ctx_destroy(&ctx);
				1187	POOL_free(pool);
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1188	return ERROR(memory_allocation);
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1189	}
				1190	data->ctx = &ctx;
				1191	data->best = &best;
				1192	data->dictBufferCapacity = dictBufferCapacity;
				1193	data->parameters = *parameters;
				1194	data->parameters.k = k;
				1195	data->parameters.d = d;
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1196	data->parameters.splitPoint = splitPoint;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1197	data->parameters.steps = kSteps;
David Bainbridge	788e520	2019-10-21 18:49:40 +0000	[diff] [blame]	1198	data->parameters.shrinkDict = shrinkDict;
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1199	data->parameters.zParams.notificationLevel = g_displayLevel;
				1200	/* Check the parameters */
				1201	if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
				1202	DISPLAYLEVEL(1, "Cover parameters incorrect\n");
				1203	free(data);
				1204	continue;
				1205	}
				1206	/* Call the function and pass ownership of data to it */
				1207	COVER_best_start(&best);
				1208	if (pool) {
				1209	POOL_add(pool, &COVER_tryParameters, data);
				1210	} else {
				1211	COVER_tryParameters(data);
				1212	}
				1213	/* Print status */
				1214	LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
Abhilash S.L	3b49463	2019-07-16 15:51:09 +0530	[diff] [blame]	1215	(unsigned)((iteration * 100) / kIterations));
William Kurkian	ea86948	2019-04-09 15:16:11 -0400	[diff] [blame]	1216	++iteration;
				1217	}
				1218	COVER_best_wait(&best);
				1219	COVER_ctx_destroy(&ctx);
				1220	}
				1221	LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
				1222	/* Fill the output buffer and parameters with output of the best parameters */
				1223	{
				1224	const size_t dictSize = best.dictSize;
				1225	if (ZSTD_isError(best.compressedSize)) {
				1226	const size_t compressedSize = best.compressedSize;
				1227	COVER_best_destroy(&best);
				1228	POOL_free(pool);
				1229	return compressedSize;
				1230	}
				1231	*parameters = best.parameters;
				1232	memcpy(dictBuffer, best.dict, dictSize);
				1233	COVER_best_destroy(&best);
				1234	POOL_free(pool);
				1235	return dictSize;
				1236	}
				1237	}