Blame - vendor/golang.org/x/text/internal/triegen/triegen.go - ofagent-go

blob: 51d218a30f027eac2ece424ff8558f6303edbcab [file] [log] [blame]

Don Newton	98fd881	2019-09-23 15:15:02 -0400	[diff] [blame^]	1	// Copyright 2014 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Package triegen implements a code generator for a trie for associating
				6	// unsigned integer values with UTF-8 encoded runes.
				7	//
				8	// Many of the go.text packages use tries for storing per-rune information. A
				9	// trie is especially useful if many of the runes have the same value. If this
				10	// is the case, many blocks can be expected to be shared allowing for
				11	// information on many runes to be stored in little space.
				12	//
				13	// As most of the lookups are done directly on []byte slices, the tries use the
				14	// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
				15	// runes and contributes a little bit to better performance. It also naturally
				16	// provides a fast path for ASCII.
				17	//
				18	// Space is also an issue. There are many code points defined in Unicode and as
				19	// a result tables can get quite large. So every byte counts. The triegen
				20	// package automatically chooses the smallest integer values to represent the
				21	// tables. Compacters allow further compression of the trie by allowing for
				22	// alternative representations of individual trie blocks.
				23	//
				24	// triegen allows generating multiple tries as a single structure. This is
				25	// useful when, for example, one wants to generate tries for several languages
				26	// that have a lot of values in common. Some existing libraries for
				27	// internationalization store all per-language data as a dynamically loadable
				28	// chunk. The go.text packages are designed with the assumption that the user
				29	// typically wants to compile in support for all supported languages, in line
				30	// with the approach common to Go to create a single standalone binary. The
				31	// multi-root trie approach can give significant storage savings in this
				32	// scenario.
				33	//
				34	// triegen generates both tables and code. The code is optimized to use the
				35	// automatically chosen data types. The following code is generated for a Trie
				36	// or multiple Tries named "foo":
				37	// - type fooTrie
				38	// The trie type.
				39	//
				40	// - func newFooTrie(x int) *fooTrie
				41	// Trie constructor, where x is the index of the trie passed to Gen.
				42	//
				43	// - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
				44	// The lookup method, where uintX is automatically chosen.
				45	//
				46	// - func lookupString, lookupUnsafe and lookupStringUnsafe
				47	// Variants of the above.
				48	//
				49	// - var fooValues and fooIndex and any tables generated by Compacters.
				50	// The core trie data.
				51	//
				52	// - var fooTrieHandles
				53	// Indexes of starter blocks in case of multiple trie roots.
				54	//
				55	// It is recommended that users test the generated trie by checking the returned
				56	// value for every rune. Such exhaustive tests are possible as the number of
				57	// runes in Unicode is limited.
				58	package triegen // import "golang.org/x/text/internal/triegen"
				59
				60	// TODO: Arguably, the internally optimized data types would not have to be
				61	// exposed in the generated API. We could also investigate not generating the
				62	// code, but using it through a package. We would have to investigate the impact
				63	// on performance of making such change, though. For packages like unicode/norm,
				64	// small changes like this could tank performance.
				65
				66	import (
				67	"encoding/binary"
				68	"fmt"
				69	"hash/crc64"
				70	"io"
				71	"log"
				72	"unicode/utf8"
				73	)
				74
				75	// builder builds a set of tries for associating values with runes. The set of
				76	// tries can share common index and value blocks.
				77	type builder struct {
				78	Name string
				79
				80	// ValueType is the type of the trie values looked up.
				81	ValueType string
				82
				83	// ValueSize is the byte size of the ValueType.
				84	ValueSize int
				85
				86	// IndexType is the type of trie index values used for all UTF-8 bytes of
				87	// a rune except the last one.
				88	IndexType string
				89
				90	// IndexSize is the byte size of the IndexType.
				91	IndexSize int
				92
				93	// SourceType is used when generating the lookup functions. If the user
				94	// requests StringSupport, all lookup functions will be generated for
				95	// string input as well.
				96	SourceType string
				97
				98	Trie []*Trie
				99
				100	IndexBlocks []*node
				101	ValueBlocks [][]uint64
				102	Compactions []compaction
				103	Checksum uint64
				104
				105	ASCIIBlock string
				106	StarterBlock string
				107
				108	indexBlockIdx map[uint64]int
				109	valueBlockIdx map[uint64]nodeIndex
				110	asciiBlockIdx map[uint64]int
				111
				112	// Stats are used to fill out the template.
				113	Stats struct {
				114	NValueEntries int
				115	NValueBytes int
				116	NIndexEntries int
				117	NIndexBytes int
				118	NHandleBytes int
				119	}
				120
				121	err error
				122	}
				123
				124	// A nodeIndex encodes the index of a node, which is defined by the compaction
				125	// which stores it and an index within the compaction. For internal nodes, the
				126	// compaction is always 0.
				127	type nodeIndex struct {
				128	compaction int
				129	index int
				130	}
				131
				132	// compaction keeps track of stats used for the compaction.
				133	type compaction struct {
				134	c Compacter
				135	blocks []*node
				136	maxHandle uint32
				137	totalSize int
				138
				139	// Used by template-based generator and thus exported.
				140	Cutoff uint32
				141	Offset uint32
				142	Handler string
				143	}
				144
				145	func (b *builder) setError(err error) {
				146	if b.err == nil {
				147	b.err = err
				148	}
				149	}
				150
				151	// An Option can be passed to Gen.
				152	type Option func(b *builder) error
				153
				154	// Compact configures the trie generator to use the given Compacter.
				155	func Compact(c Compacter) Option {
				156	return func(b *builder) error {
				157	b.Compactions = append(b.Compactions, compaction{
				158	c: c,
				159	Handler: c.Handler() + "(n, b)"})
				160	return nil
				161	}
				162	}
				163
				164	// Gen writes Go code for a shared trie lookup structure to w for the given
				165	// Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
				166	// return the *nameTrie for tries[x]. A value can be looked up by using one of
				167	// the various lookup methods defined on nameTrie. It returns the table size of
				168	// the generated trie.
				169	func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
				170	// The index contains two dummy blocks, followed by the zero block. The zero
				171	// block is at offset 0x80, so that the offset for the zero block for
				172	// continuation bytes is 0.
				173	b := &builder{
				174	Name: name,
				175	Trie: tries,
				176	IndexBlocks: []*node{{}, {}, {}},
				177	Compactions: []compaction{{
				178	Handler: name + "Values[n<<6+uint32(b)]",
				179	}},
				180	// The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
				181	// block.
				182	indexBlockIdx: map[uint64]int{0: 0},
				183	valueBlockIdx: map[uint64]nodeIndex{0: {}},
				184	asciiBlockIdx: map[uint64]int{},
				185	}
				186	b.Compactions[0].c = (*simpleCompacter)(b)
				187
				188	for _, f := range opts {
				189	if err := f(b); err != nil {
				190	return 0, err
				191	}
				192	}
				193	b.build()
				194	if b.err != nil {
				195	return 0, b.err
				196	}
				197	if err = b.print(w); err != nil {
				198	return 0, err
				199	}
				200	return b.Size(), nil
				201	}
				202
				203	// A Trie represents a single root node of a trie. A builder may build several
				204	// overlapping tries at once.
				205	type Trie struct {
				206	root *node
				207
				208	hiddenTrie
				209	}
				210
				211	// hiddenTrie contains values we want to be visible to the template generator,
				212	// but hidden from the API documentation.
				213	type hiddenTrie struct {
				214	Name string
				215	Checksum uint64
				216	ASCIIIndex int
				217	StarterIndex int
				218	}
				219
				220	// NewTrie returns a new trie root.
				221	func NewTrie(name string) *Trie {
				222	return &Trie{
				223	&node{
				224	children: make([]*node, blockSize),
				225	values: make([]uint64, utf8.RuneSelf),
				226	},
				227	hiddenTrie{Name: name},
				228	}
				229	}
				230
				231	// Gen is a convenience wrapper around the Gen func passing t as the only trie
				232	// and uses the name passed to NewTrie. It returns the size of the generated
				233	// tables.
				234	func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
				235	return Gen(w, t.Name, []*Trie{t}, opts...)
				236	}
				237
				238	// node is a node of the intermediate trie structure.
				239	type node struct {
				240	// children holds this node's children. It is always of length 64.
				241	// A child node may be nil.
				242	children []*node
				243
				244	// values contains the values of this node. If it is non-nil, this node is
				245	// either a root or leaf node:
				246	// For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
				247	// For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF].
				248	values []uint64
				249
				250	index nodeIndex
				251	}
				252
				253	// Insert associates value with the given rune. Insert will panic if a non-zero
				254	// value is passed for an invalid rune.
				255	func (t *Trie) Insert(r rune, value uint64) {
				256	if value == 0 {
				257	return
				258	}
				259	s := string(r)
				260	if []rune(s)[0] != r && value != 0 {
				261	// Note: The UCD tables will always assign what amounts to a zero value
				262	// to a surrogate. Allowing a zero value for an illegal rune allows
				263	// users to iterate over [0..MaxRune] without having to explicitly
				264	// exclude surrogates, which would be tedious.
				265	panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
				266	}
				267	if len(s) == 1 {
				268	// It is a root node value (ASCII).
				269	t.root.values[s[0]] = value
				270	return
				271	}
				272
				273	n := t.root
				274	for ; len(s) > 1; s = s[1:] {
				275	if n.children == nil {
				276	n.children = make([]*node, blockSize)
				277	}
				278	p := s[0] % blockSize
				279	c := n.children[p]
				280	if c == nil {
				281	c = &node{}
				282	n.children[p] = c
				283	}
				284	if len(s) > 2 && c.values != nil {
				285	log.Fatalf("triegen: insert(%U): found internal node with values", r)
				286	}
				287	n = c
				288	}
				289	if n.values == nil {
				290	n.values = make([]uint64, blockSize)
				291	}
				292	if n.children != nil {
				293	log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
				294	}
				295	n.values[s[0]-0x80] = value
				296	}
				297
				298	// Size returns the number of bytes the generated trie will take to store. It
				299	// needs to be exported as it is used in the templates.
				300	func (b *builder) Size() int {
				301	// Index blocks.
				302	sz := len(b.IndexBlocks) * blockSize * b.IndexSize
				303
				304	// Skip the first compaction, which represents the normal value blocks, as
				305	// its totalSize does not account for the ASCII blocks, which are managed
				306	// separately.
				307	sz += len(b.ValueBlocks) * blockSize * b.ValueSize
				308	for _, c := range b.Compactions[1:] {
				309	sz += c.totalSize
				310	}
				311
				312	// TODO: this computation does not account for the fixed overhead of a using
				313	// a compaction, either code or data. As for data, though, the typical
				314	// overhead of data is in the order of bytes (2 bytes for cases). Further,
				315	// the savings of using a compaction should anyway be substantial for it to
				316	// be worth it.
				317
				318	// For multi-root tries, we also need to account for the handles.
				319	if len(b.Trie) > 1 {
				320	sz += 2 * b.IndexSize * len(b.Trie)
				321	}
				322	return sz
				323	}
				324
				325	func (b *builder) build() {
				326	// Compute the sizes of the values.
				327	var vmax uint64
				328	for _, t := range b.Trie {
				329	vmax = maxValue(t.root, vmax)
				330	}
				331	b.ValueType, b.ValueSize = getIntType(vmax)
				332
				333	// Compute all block allocations.
				334	// TODO: first compute the ASCII blocks for all tries and then the other
				335	// nodes. ASCII blocks are more restricted in placement, as they require two
				336	// blocks to be placed consecutively. Processing them first may improve
				337	// sharing (at least one zero block can be expected to be saved.)
				338	for _, t := range b.Trie {
				339	b.Checksum += b.buildTrie(t)
				340	}
				341
				342	// Compute the offsets for all the Compacters.
				343	offset := uint32(0)
				344	for i := range b.Compactions {
				345	c := &b.Compactions[i]
				346	c.Offset = offset
				347	offset += c.maxHandle + 1
				348	c.Cutoff = offset
				349	}
				350
				351	// Compute the sizes of indexes.
				352	// TODO: different byte positions could have different sizes. So far we have
				353	// not found a case where this is beneficial.
				354	imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
				355	for _, ib := range b.IndexBlocks {
				356	if x := uint64(ib.index.index); x > imax {
				357	imax = x
				358	}
				359	}
				360	b.IndexType, b.IndexSize = getIntType(imax)
				361	}
				362
				363	func maxValue(n *node, max uint64) uint64 {
				364	if n == nil {
				365	return max
				366	}
				367	for _, c := range n.children {
				368	max = maxValue(c, max)
				369	}
				370	for _, v := range n.values {
				371	if max < v {
				372	max = v
				373	}
				374	}
				375	return max
				376	}
				377
				378	func getIntType(v uint64) (string, int) {
				379	switch {
				380	case v < 1<<8:
				381	return "uint8", 1
				382	case v < 1<<16:
				383	return "uint16", 2
				384	case v < 1<<32:
				385	return "uint32", 4
				386	}
				387	return "uint64", 8
				388	}
				389
				390	const (
				391	blockSize = 64
				392
				393	// Subtract two blocks to offset 0x80, the first continuation byte.
				394	blockOffset = 2
				395
				396	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
				397	rootBlockOffset = 3
				398	)
				399
				400	var crcTable = crc64.MakeTable(crc64.ISO)
				401
				402	func (b builder) buildTrie(t Trie) uint64 {
				403	n := t.root
				404
				405	// Get the ASCII offset. For the first trie, the ASCII block will be at
				406	// position 0.
				407	hasher := crc64.New(crcTable)
				408	binary.Write(hasher, binary.BigEndian, n.values)
				409	hash := hasher.Sum64()
				410
				411	v, ok := b.asciiBlockIdx[hash]
				412	if !ok {
				413	v = len(b.ValueBlocks)
				414	b.asciiBlockIdx[hash] = v
				415
				416	b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
				417	if v == 0 {
				418	// Add the zero block at position 2 so that it will be assigned a
				419	// zero reference in the lookup blocks.
				420	// TODO: always do this? This would allow us to remove a check from
				421	// the trie lookup, but at the expense of extra space. Analyze
				422	// performance for unicode/norm.
				423	b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
				424	}
				425	}
				426	t.ASCIIIndex = v
				427
				428	// Compute remaining offsets.
				429	t.Checksum = b.computeOffsets(n, true)
				430	// We already subtracted the normal blockOffset from the index. Subtract the
				431	// difference for starter bytes.
				432	t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
				433	return t.Checksum
				434	}
				435
				436	func (b builder) computeOffsets(n node, root bool) uint64 {
				437	// For the first trie, the root lookup block will be at position 3, which is
				438	// the offset for UTF-8 non-ASCII starter bytes.
				439	first := len(b.IndexBlocks) == rootBlockOffset
				440	if first {
				441	b.IndexBlocks = append(b.IndexBlocks, n)
				442	}
				443
				444	// We special-case the cases where all values recursively are 0. This allows
				445	// for the use of a zero block to which all such values can be directed.
				446	hash := uint64(0)
				447	if n.children != nil \|\| n.values != nil {
				448	hasher := crc64.New(crcTable)
				449	for _, c := range n.children {
				450	var v uint64
				451	if c != nil {
				452	v = b.computeOffsets(c, false)
				453	}
				454	binary.Write(hasher, binary.BigEndian, v)
				455	}
				456	binary.Write(hasher, binary.BigEndian, n.values)
				457	hash = hasher.Sum64()
				458	}
				459
				460	if first {
				461	b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
				462	}
				463
				464	// Compacters don't apply to internal nodes.
				465	if n.children != nil {
				466	v, ok := b.indexBlockIdx[hash]
				467	if !ok {
				468	v = len(b.IndexBlocks) - blockOffset
				469	b.IndexBlocks = append(b.IndexBlocks, n)
				470	b.indexBlockIdx[hash] = v
				471	}
				472	n.index = nodeIndex{0, v}
				473	} else {
				474	h, ok := b.valueBlockIdx[hash]
				475	if !ok {
				476	bestI, bestSize := 0, blockSize*b.ValueSize
				477	for i, c := range b.Compactions[1:] {
				478	if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
				479	bestI, bestSize = i+1, sz
				480	}
				481	}
				482	c := &b.Compactions[bestI]
				483	c.totalSize += bestSize
				484	v := c.c.Store(n.values)
				485	if c.maxHandle < v {
				486	c.maxHandle = v
				487	}
				488	h = nodeIndex{bestI, int(v)}
				489	b.valueBlockIdx[hash] = h
				490	}
				491	n.index = h
				492	}
				493	return hash
				494	}