// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
// Package triegen implements a code generator for a trie for associating
// unsigned integer values with UTF-8 encoded runes.
//
// Many of the go.text packages use tries for storing per-rune information. A
// trie is especially useful if many of the runes have the same value. If this
// is the case, many blocks can be expected to be shared allowing for
// information on many runes to be stored in little space.
//
// As most of the lookups are done directly on []byte slices, the tries use the
// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
// runes and contributes a little bit to better performance. It also naturally
// provides a fast path for ASCII.
//
// Space is also an issue. There are many code points defined in Unicode and as
// a result tables can get quite large. So every byte counts. The triegen
// package automatically chooses the smallest integer values to represent the
// tables. Compacters allow further compression of the trie by allowing for
// alternative representations of individual trie blocks.
//
// triegen allows generating multiple tries as a single structure. This is
// useful when, for example, one wants to generate tries for several languages
// that have a lot of values in common. Some existing libraries for
// internationalization store all per-language data as a dynamically loadable
// chunk. The go.text packages are designed with the assumption that the user
// typically wants to compile in support for all supported languages, in line
// with the approach common to Go to create a single standalone binary. The
// multi-root trie approach can give significant storage savings in this
// scenario.
//
// triegen generates both tables and code. The code is optimized to use the
// automatically chosen data types. The following code is generated for a Trie
// or multiple Tries named "foo":
//   - type fooTrie
//     The trie type.
//
//   - func newFooTrie(x int) *fooTrie
//     Trie constructor, where x is the index of the trie passed to Gen.
//
//   - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
//     The lookup method, where uintX is automatically chosen.
//
//   - func lookupString, lookupUnsafe and lookupStringUnsafe
//     Variants of the above.
//
//   - var fooValues and fooIndex and any tables generated by Compacters.
//     The core trie data.
//
//   - var fooTrieHandles
//     Indexes of starter blocks in case of multiple trie roots.
//
// It is recommended that users test the generated trie by checking the returned
// value for every rune. Such exhaustive tests are possible as the number of
// runes in Unicode is limited.
58package triegen // import "golang.org/x/text/internal/triegen"
59
// TODO: Arguably, the internally optimized data types would not have to be
// exposed in the generated API. We could also investigate not generating the
// code, but using it through a package. We would have to investigate the impact
// on performance of making such a change, though. For packages like
// unicode/norm, small changes like this could tank performance.
65
66import (
67 "encoding/binary"
68 "fmt"
69 "hash/crc64"
70 "io"
71 "log"
72 "unicode/utf8"
73)
74
75// builder builds a set of tries for associating values with runes. The set of
76// tries can share common index and value blocks.
77type builder struct {
78 Name string
79
80 // ValueType is the type of the trie values looked up.
81 ValueType string
82
83 // ValueSize is the byte size of the ValueType.
84 ValueSize int
85
86 // IndexType is the type of trie index values used for all UTF-8 bytes of
87 // a rune except the last one.
88 IndexType string
89
90 // IndexSize is the byte size of the IndexType.
91 IndexSize int
92
93 // SourceType is used when generating the lookup functions. If the user
94 // requests StringSupport, all lookup functions will be generated for
95 // string input as well.
96 SourceType string
97
98 Trie []*Trie
99
100 IndexBlocks []*node
101 ValueBlocks [][]uint64
102 Compactions []compaction
103 Checksum uint64
104
105 ASCIIBlock string
106 StarterBlock string
107
108 indexBlockIdx map[uint64]int
109 valueBlockIdx map[uint64]nodeIndex
110 asciiBlockIdx map[uint64]int
111
112 // Stats are used to fill out the template.
113 Stats struct {
114 NValueEntries int
115 NValueBytes int
116 NIndexEntries int
117 NIndexBytes int
118 NHandleBytes int
119 }
120
121 err error
122}
123
// A nodeIndex encodes the index of a node, which is defined by the compaction
// which stores it and an index within the compaction. For internal nodes, the
// compaction is always 0.
//
// The field order matters: nodeIndex values are built with positional
// composite literals of the form nodeIndex{compaction, index}.
type nodeIndex struct {
	compaction int
	index      int
}
131
132// compaction keeps track of stats used for the compaction.
133type compaction struct {
134 c Compacter
135 blocks []*node
136 maxHandle uint32
137 totalSize int
138
139 // Used by template-based generator and thus exported.
140 Cutoff uint32
141 Offset uint32
142 Handler string
143}
144
145func (b *builder) setError(err error) {
146 if b.err == nil {
147 b.err = err
148 }
149}
150
151// An Option can be passed to Gen.
152type Option func(b *builder) error
153
154// Compact configures the trie generator to use the given Compacter.
155func Compact(c Compacter) Option {
156 return func(b *builder) error {
157 b.Compactions = append(b.Compactions, compaction{
158 c: c,
159 Handler: c.Handler() + "(n, b)"})
160 return nil
161 }
162}
163
164// Gen writes Go code for a shared trie lookup structure to w for the given
165// Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
166// return the *nameTrie for tries[x]. A value can be looked up by using one of
167// the various lookup methods defined on nameTrie. It returns the table size of
168// the generated trie.
169func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
170 // The index contains two dummy blocks, followed by the zero block. The zero
171 // block is at offset 0x80, so that the offset for the zero block for
172 // continuation bytes is 0.
173 b := &builder{
174 Name: name,
175 Trie: tries,
176 IndexBlocks: []*node{{}, {}, {}},
177 Compactions: []compaction{{
178 Handler: name + "Values[n<<6+uint32(b)]",
179 }},
180 // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
181 // block.
182 indexBlockIdx: map[uint64]int{0: 0},
183 valueBlockIdx: map[uint64]nodeIndex{0: {}},
184 asciiBlockIdx: map[uint64]int{},
185 }
186 b.Compactions[0].c = (*simpleCompacter)(b)
187
188 for _, f := range opts {
189 if err := f(b); err != nil {
190 return 0, err
191 }
192 }
193 b.build()
194 if b.err != nil {
195 return 0, b.err
196 }
197 if err = b.print(w); err != nil {
198 return 0, err
199 }
200 return b.Size(), nil
201}
202
203// A Trie represents a single root node of a trie. A builder may build several
204// overlapping tries at once.
205type Trie struct {
206 root *node
207
208 hiddenTrie
209}
210
// hiddenTrie contains values we want to be visible to the template generator,
// but hidden from the API documentation.
type hiddenTrie struct {
	// Name is the name passed to NewTrie.
	Name string
	// Checksum is the hash computed over the trie's root when it is built.
	Checksum uint64
	// ASCIIIndex is the position in the value blocks of the first of the two
	// blocks holding this trie's ASCII values.
	ASCIIIndex int
	// StarterIndex is the index of the root lookup block, adjusted for
	// non-ASCII starter bytes.
	StarterIndex int
}
219
220// NewTrie returns a new trie root.
221func NewTrie(name string) *Trie {
222 return &Trie{
223 &node{
224 children: make([]*node, blockSize),
225 values: make([]uint64, utf8.RuneSelf),
226 },
227 hiddenTrie{Name: name},
228 }
229}
230
231// Gen is a convenience wrapper around the Gen func passing t as the only trie
232// and uses the name passed to NewTrie. It returns the size of the generated
233// tables.
234func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
235 return Gen(w, t.Name, []*Trie{t}, opts...)
236}
237
238// node is a node of the intermediate trie structure.
239type node struct {
240 // children holds this node's children. It is always of length 64.
241 // A child node may be nil.
242 children []*node
243
244 // values contains the values of this node. If it is non-nil, this node is
245 // either a root or leaf node:
246 // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
247 // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF].
248 values []uint64
249
250 index nodeIndex
251}
252
253// Insert associates value with the given rune. Insert will panic if a non-zero
254// value is passed for an invalid rune.
255func (t *Trie) Insert(r rune, value uint64) {
256 if value == 0 {
257 return
258 }
259 s := string(r)
260 if []rune(s)[0] != r && value != 0 {
261 // Note: The UCD tables will always assign what amounts to a zero value
262 // to a surrogate. Allowing a zero value for an illegal rune allows
263 // users to iterate over [0..MaxRune] without having to explicitly
264 // exclude surrogates, which would be tedious.
265 panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
266 }
267 if len(s) == 1 {
268 // It is a root node value (ASCII).
269 t.root.values[s[0]] = value
270 return
271 }
272
273 n := t.root
274 for ; len(s) > 1; s = s[1:] {
275 if n.children == nil {
276 n.children = make([]*node, blockSize)
277 }
278 p := s[0] % blockSize
279 c := n.children[p]
280 if c == nil {
281 c = &node{}
282 n.children[p] = c
283 }
284 if len(s) > 2 && c.values != nil {
285 log.Fatalf("triegen: insert(%U): found internal node with values", r)
286 }
287 n = c
288 }
289 if n.values == nil {
290 n.values = make([]uint64, blockSize)
291 }
292 if n.children != nil {
293 log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
294 }
295 n.values[s[0]-0x80] = value
296}
297
298// Size returns the number of bytes the generated trie will take to store. It
299// needs to be exported as it is used in the templates.
300func (b *builder) Size() int {
301 // Index blocks.
302 sz := len(b.IndexBlocks) * blockSize * b.IndexSize
303
304 // Skip the first compaction, which represents the normal value blocks, as
305 // its totalSize does not account for the ASCII blocks, which are managed
306 // separately.
307 sz += len(b.ValueBlocks) * blockSize * b.ValueSize
308 for _, c := range b.Compactions[1:] {
309 sz += c.totalSize
310 }
311
312 // TODO: this computation does not account for the fixed overhead of a using
313 // a compaction, either code or data. As for data, though, the typical
314 // overhead of data is in the order of bytes (2 bytes for cases). Further,
315 // the savings of using a compaction should anyway be substantial for it to
316 // be worth it.
317
318 // For multi-root tries, we also need to account for the handles.
319 if len(b.Trie) > 1 {
320 sz += 2 * b.IndexSize * len(b.Trie)
321 }
322 return sz
323}
324
325func (b *builder) build() {
326 // Compute the sizes of the values.
327 var vmax uint64
328 for _, t := range b.Trie {
329 vmax = maxValue(t.root, vmax)
330 }
331 b.ValueType, b.ValueSize = getIntType(vmax)
332
333 // Compute all block allocations.
334 // TODO: first compute the ASCII blocks for all tries and then the other
335 // nodes. ASCII blocks are more restricted in placement, as they require two
336 // blocks to be placed consecutively. Processing them first may improve
337 // sharing (at least one zero block can be expected to be saved.)
338 for _, t := range b.Trie {
339 b.Checksum += b.buildTrie(t)
340 }
341
342 // Compute the offsets for all the Compacters.
343 offset := uint32(0)
344 for i := range b.Compactions {
345 c := &b.Compactions[i]
346 c.Offset = offset
347 offset += c.maxHandle + 1
348 c.Cutoff = offset
349 }
350
351 // Compute the sizes of indexes.
352 // TODO: different byte positions could have different sizes. So far we have
353 // not found a case where this is beneficial.
354 imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
355 for _, ib := range b.IndexBlocks {
356 if x := uint64(ib.index.index); x > imax {
357 imax = x
358 }
359 }
360 b.IndexType, b.IndexSize = getIntType(imax)
361}
362
363func maxValue(n *node, max uint64) uint64 {
364 if n == nil {
365 return max
366 }
367 for _, c := range n.children {
368 max = maxValue(c, max)
369 }
370 for _, v := range n.values {
371 if max < v {
372 max = v
373 }
374 }
375 return max
376}
377
// getIntType returns the name and the byte size of the smallest unsigned
// integer type that can represent v.
func getIntType(v uint64) (string, int) {
	switch {
	case v < 1<<8:
		return "uint8", 1
	case v < 1<<16:
		return "uint16", 2
	case v < 1<<32:
		return "uint32", 4
	}
	return "uint64", 8
}
389
const (
	// blockSize is the number of entries in an index or value block.
	blockSize = 64

	// Subtract two blocks to offset 0x80, the first continuation byte.
	blockOffset = 2

	// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
	rootBlockOffset = 3
)
399
// crcTable is the CRC-64 table (ISO polynomial) used to hash block contents
// when looking for blocks that can be shared.
var crcTable = crc64.MakeTable(crc64.ISO)
401
402func (b *builder) buildTrie(t *Trie) uint64 {
403 n := t.root
404
405 // Get the ASCII offset. For the first trie, the ASCII block will be at
406 // position 0.
407 hasher := crc64.New(crcTable)
408 binary.Write(hasher, binary.BigEndian, n.values)
409 hash := hasher.Sum64()
410
411 v, ok := b.asciiBlockIdx[hash]
412 if !ok {
413 v = len(b.ValueBlocks)
414 b.asciiBlockIdx[hash] = v
415
416 b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
417 if v == 0 {
418 // Add the zero block at position 2 so that it will be assigned a
419 // zero reference in the lookup blocks.
420 // TODO: always do this? This would allow us to remove a check from
421 // the trie lookup, but at the expense of extra space. Analyze
422 // performance for unicode/norm.
423 b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
424 }
425 }
426 t.ASCIIIndex = v
427
428 // Compute remaining offsets.
429 t.Checksum = b.computeOffsets(n, true)
430 // We already subtracted the normal blockOffset from the index. Subtract the
431 // difference for starter bytes.
432 t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
433 return t.Checksum
434}
435
436func (b *builder) computeOffsets(n *node, root bool) uint64 {
437 // For the first trie, the root lookup block will be at position 3, which is
438 // the offset for UTF-8 non-ASCII starter bytes.
439 first := len(b.IndexBlocks) == rootBlockOffset
440 if first {
441 b.IndexBlocks = append(b.IndexBlocks, n)
442 }
443
444 // We special-case the cases where all values recursively are 0. This allows
445 // for the use of a zero block to which all such values can be directed.
446 hash := uint64(0)
447 if n.children != nil || n.values != nil {
448 hasher := crc64.New(crcTable)
449 for _, c := range n.children {
450 var v uint64
451 if c != nil {
452 v = b.computeOffsets(c, false)
453 }
454 binary.Write(hasher, binary.BigEndian, v)
455 }
456 binary.Write(hasher, binary.BigEndian, n.values)
457 hash = hasher.Sum64()
458 }
459
460 if first {
461 b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
462 }
463
464 // Compacters don't apply to internal nodes.
465 if n.children != nil {
466 v, ok := b.indexBlockIdx[hash]
467 if !ok {
468 v = len(b.IndexBlocks) - blockOffset
469 b.IndexBlocks = append(b.IndexBlocks, n)
470 b.indexBlockIdx[hash] = v
471 }
472 n.index = nodeIndex{0, v}
473 } else {
474 h, ok := b.valueBlockIdx[hash]
475 if !ok {
476 bestI, bestSize := 0, blockSize*b.ValueSize
477 for i, c := range b.Compactions[1:] {
478 if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
479 bestI, bestSize = i+1, sz
480 }
481 }
482 c := &b.Compactions[bestI]
483 c.totalSize += bestSize
484 v := c.c.Store(n.values)
485 if c.maxHandle < v {
486 c.maxHandle = v
487 }
488 h = nodeIndex{bestI, int(v)}
489 b.valueBlockIdx[hash] = h
490 }
491 n.index = h
492 }
493 return hash
494}