William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 1 | // Copyright 2014 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Package triegen implements a code generator for a trie for associating |
| 6 | // unsigned integer values with UTF-8 encoded runes. |
| 7 | // |
| 8 | // Many of the go.text packages use tries for storing per-rune information. A |
| 9 | // trie is especially useful if many of the runes have the same value. If this |
| 10 | // is the case, many blocks can be expected to be shared allowing for |
| 11 | // information on many runes to be stored in little space. |
| 12 | // |
| 13 | // As most of the lookups are done directly on []byte slices, the tries use the |
| 14 | // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to |
| 15 | // runes and contributes a little bit to better performance. It also naturally |
| 16 | // provides a fast path for ASCII. |
| 17 | // |
| 18 | // Space is also an issue. There are many code points defined in Unicode and as |
| 19 | // a result tables can get quite large. So every byte counts. The triegen |
| 20 | // package automatically chooses the smallest integer values to represent the |
| 21 | // tables. Compacters allow further compression of the trie by allowing for |
| 22 | // alternative representations of individual trie blocks. |
| 23 | // |
| 24 | // triegen allows generating multiple tries as a single structure. This is |
| 25 | // useful when, for example, one wants to generate tries for several languages |
| 26 | // that have a lot of values in common. Some existing libraries for |
| 27 | // internationalization store all per-language data as a dynamically loadable |
| 28 | // chunk. The go.text packages are designed with the assumption that the user |
| 29 | // typically wants to compile in support for all supported languages, in line |
| 30 | // with the approach common to Go to create a single standalone binary. The |
| 31 | // multi-root trie approach can give significant storage savings in this |
| 32 | // scenario. |
| 33 | // |
| 34 | // triegen generates both tables and code. The code is optimized to use the |
| 35 | // automatically chosen data types. The following code is generated for a Trie |
| 36 | // or multiple Tries named "foo": |
| 37 | // - type fooTrie |
| 38 | // The trie type. |
| 39 | // |
| 40 | // - func newFooTrie(x int) *fooTrie |
| 41 | // Trie constructor, where x is the index of the trie passed to Gen. |
| 42 | // |
| 43 | // - func (t *fooTrie) lookup(s []byte) (v uintX, sz int) |
| 44 | // The lookup method, where uintX is automatically chosen. |
| 45 | // |
| 46 | // - func lookupString, lookupUnsafe and lookupStringUnsafe |
| 47 | // Variants of the above. |
| 48 | // |
| 49 | // - var fooValues and fooIndex and any tables generated by Compacters. |
| 50 | // The core trie data. |
| 51 | // |
| 52 | // - var fooTrieHandles |
| 53 | // Indexes of starter blocks in case of multiple trie roots. |
| 54 | // |
| 55 | // It is recommended that users test the generated trie by checking the returned |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame^] | 56 | // value for every rune. Such exhaustive tests are possible as the number of |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 57 | // runes in Unicode is limited. |
| 58 | package triegen // import "golang.org/x/text/internal/triegen" |
| 59 | |
| 60 | // TODO: Arguably, the internally optimized data types would not have to be |
| 61 | // exposed in the generated API. We could also investigate not generating the |
| 62 | // code, but using it through a package. We would have to investigate the impact |
| 63 | // on performance of making such change, though. For packages like unicode/norm, |
| 64 | // small changes like this could tank performance. |
| 65 | |
| 66 | import ( |
| 67 | "encoding/binary" |
| 68 | "fmt" |
| 69 | "hash/crc64" |
| 70 | "io" |
| 71 | "log" |
| 72 | "unicode/utf8" |
| 73 | ) |
| 74 | |
| 75 | // builder builds a set of tries for associating values with runes. The set of |
| 76 | // tries can share common index and value blocks. |
| 77 | type builder struct { |
| 78 | Name string |
| 79 | |
| 80 | // ValueType is the type of the trie values looked up. |
| 81 | ValueType string |
| 82 | |
| 83 | // ValueSize is the byte size of the ValueType. |
| 84 | ValueSize int |
| 85 | |
| 86 | // IndexType is the type of trie index values used for all UTF-8 bytes of |
| 87 | // a rune except the last one. |
| 88 | IndexType string |
| 89 | |
| 90 | // IndexSize is the byte size of the IndexType. |
| 91 | IndexSize int |
| 92 | |
| 93 | // SourceType is used when generating the lookup functions. If the user |
| 94 | // requests StringSupport, all lookup functions will be generated for |
| 95 | // string input as well. |
| 96 | SourceType string |
| 97 | |
| 98 | Trie []*Trie |
| 99 | |
| 100 | IndexBlocks []*node |
| 101 | ValueBlocks [][]uint64 |
| 102 | Compactions []compaction |
| 103 | Checksum uint64 |
| 104 | |
| 105 | ASCIIBlock string |
| 106 | StarterBlock string |
| 107 | |
| 108 | indexBlockIdx map[uint64]int |
| 109 | valueBlockIdx map[uint64]nodeIndex |
| 110 | asciiBlockIdx map[uint64]int |
| 111 | |
| 112 | // Stats are used to fill out the template. |
| 113 | Stats struct { |
| 114 | NValueEntries int |
| 115 | NValueBytes int |
| 116 | NIndexEntries int |
| 117 | NIndexBytes int |
| 118 | NHandleBytes int |
| 119 | } |
| 120 | |
| 121 | err error |
| 122 | } |
| 123 | |
| 124 | // A nodeIndex encodes the index of a node, which is defined by the compaction |
| 125 | // which stores it and an index within the compaction. For internal nodes, the |
| 126 | // compaction is always 0. |
| 127 | type nodeIndex struct { |
| 128 | compaction int |
| 129 | index int |
| 130 | } |
| 131 | |
| 132 | // compaction keeps track of stats used for the compaction. |
| 133 | type compaction struct { |
| 134 | c Compacter |
| 135 | blocks []*node |
| 136 | maxHandle uint32 |
| 137 | totalSize int |
| 138 | |
| 139 | // Used by template-based generator and thus exported. |
| 140 | Cutoff uint32 |
| 141 | Offset uint32 |
| 142 | Handler string |
| 143 | } |
| 144 | |
| 145 | func (b *builder) setError(err error) { |
| 146 | if b.err == nil { |
| 147 | b.err = err |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | // An Option can be passed to Gen. |
| 152 | type Option func(b *builder) error |
| 153 | |
| 154 | // Compact configures the trie generator to use the given Compacter. |
| 155 | func Compact(c Compacter) Option { |
| 156 | return func(b *builder) error { |
| 157 | b.Compactions = append(b.Compactions, compaction{ |
| 158 | c: c, |
| 159 | Handler: c.Handler() + "(n, b)"}) |
| 160 | return nil |
| 161 | } |
| 162 | } |
| 163 | |
| 164 | // Gen writes Go code for a shared trie lookup structure to w for the given |
| 165 | // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will |
| 166 | // return the *nameTrie for tries[x]. A value can be looked up by using one of |
| 167 | // the various lookup methods defined on nameTrie. It returns the table size of |
| 168 | // the generated trie. |
| 169 | func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { |
| 170 | // The index contains two dummy blocks, followed by the zero block. The zero |
| 171 | // block is at offset 0x80, so that the offset for the zero block for |
| 172 | // continuation bytes is 0. |
| 173 | b := &builder{ |
| 174 | Name: name, |
| 175 | Trie: tries, |
| 176 | IndexBlocks: []*node{{}, {}, {}}, |
| 177 | Compactions: []compaction{{ |
| 178 | Handler: name + "Values[n<<6+uint32(b)]", |
| 179 | }}, |
| 180 | // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero |
| 181 | // block. |
| 182 | indexBlockIdx: map[uint64]int{0: 0}, |
| 183 | valueBlockIdx: map[uint64]nodeIndex{0: {}}, |
| 184 | asciiBlockIdx: map[uint64]int{}, |
| 185 | } |
| 186 | b.Compactions[0].c = (*simpleCompacter)(b) |
| 187 | |
| 188 | for _, f := range opts { |
| 189 | if err := f(b); err != nil { |
| 190 | return 0, err |
| 191 | } |
| 192 | } |
| 193 | b.build() |
| 194 | if b.err != nil { |
| 195 | return 0, b.err |
| 196 | } |
| 197 | if err = b.print(w); err != nil { |
| 198 | return 0, err |
| 199 | } |
| 200 | return b.Size(), nil |
| 201 | } |
| 202 | |
| 203 | // A Trie represents a single root node of a trie. A builder may build several |
| 204 | // overlapping tries at once. |
| 205 | type Trie struct { |
| 206 | root *node |
| 207 | |
| 208 | hiddenTrie |
| 209 | } |
| 210 | |
| 211 | // hiddenTrie contains values we want to be visible to the template generator, |
| 212 | // but hidden from the API documentation. |
| 213 | type hiddenTrie struct { |
| 214 | Name string |
| 215 | Checksum uint64 |
| 216 | ASCIIIndex int |
| 217 | StarterIndex int |
| 218 | } |
| 219 | |
| 220 | // NewTrie returns a new trie root. |
| 221 | func NewTrie(name string) *Trie { |
| 222 | return &Trie{ |
| 223 | &node{ |
| 224 | children: make([]*node, blockSize), |
| 225 | values: make([]uint64, utf8.RuneSelf), |
| 226 | }, |
| 227 | hiddenTrie{Name: name}, |
| 228 | } |
| 229 | } |
| 230 | |
| 231 | // Gen is a convenience wrapper around the Gen func passing t as the only trie |
| 232 | // and uses the name passed to NewTrie. It returns the size of the generated |
| 233 | // tables. |
| 234 | func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { |
| 235 | return Gen(w, t.Name, []*Trie{t}, opts...) |
| 236 | } |
| 237 | |
| 238 | // node is a node of the intermediate trie structure. |
| 239 | type node struct { |
| 240 | // children holds this node's children. It is always of length 64. |
| 241 | // A child node may be nil. |
| 242 | children []*node |
| 243 | |
| 244 | // values contains the values of this node. If it is non-nil, this node is |
| 245 | // either a root or leaf node: |
| 246 | // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. |
| 247 | // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. |
| 248 | values []uint64 |
| 249 | |
| 250 | index nodeIndex |
| 251 | } |
| 252 | |
| 253 | // Insert associates value with the given rune. Insert will panic if a non-zero |
| 254 | // value is passed for an invalid rune. |
| 255 | func (t *Trie) Insert(r rune, value uint64) { |
| 256 | if value == 0 { |
| 257 | return |
| 258 | } |
| 259 | s := string(r) |
| 260 | if []rune(s)[0] != r && value != 0 { |
| 261 | // Note: The UCD tables will always assign what amounts to a zero value |
| 262 | // to a surrogate. Allowing a zero value for an illegal rune allows |
| 263 | // users to iterate over [0..MaxRune] without having to explicitly |
| 264 | // exclude surrogates, which would be tedious. |
| 265 | panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) |
| 266 | } |
| 267 | if len(s) == 1 { |
| 268 | // It is a root node value (ASCII). |
| 269 | t.root.values[s[0]] = value |
| 270 | return |
| 271 | } |
| 272 | |
| 273 | n := t.root |
| 274 | for ; len(s) > 1; s = s[1:] { |
| 275 | if n.children == nil { |
| 276 | n.children = make([]*node, blockSize) |
| 277 | } |
| 278 | p := s[0] % blockSize |
| 279 | c := n.children[p] |
| 280 | if c == nil { |
| 281 | c = &node{} |
| 282 | n.children[p] = c |
| 283 | } |
| 284 | if len(s) > 2 && c.values != nil { |
| 285 | log.Fatalf("triegen: insert(%U): found internal node with values", r) |
| 286 | } |
| 287 | n = c |
| 288 | } |
| 289 | if n.values == nil { |
| 290 | n.values = make([]uint64, blockSize) |
| 291 | } |
| 292 | if n.children != nil { |
| 293 | log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r) |
| 294 | } |
| 295 | n.values[s[0]-0x80] = value |
| 296 | } |
| 297 | |
| 298 | // Size returns the number of bytes the generated trie will take to store. It |
| 299 | // needs to be exported as it is used in the templates. |
| 300 | func (b *builder) Size() int { |
| 301 | // Index blocks. |
| 302 | sz := len(b.IndexBlocks) * blockSize * b.IndexSize |
| 303 | |
| 304 | // Skip the first compaction, which represents the normal value blocks, as |
| 305 | // its totalSize does not account for the ASCII blocks, which are managed |
| 306 | // separately. |
| 307 | sz += len(b.ValueBlocks) * blockSize * b.ValueSize |
| 308 | for _, c := range b.Compactions[1:] { |
| 309 | sz += c.totalSize |
| 310 | } |
| 311 | |
| 312 | // TODO: this computation does not account for the fixed overhead of a using |
| 313 | // a compaction, either code or data. As for data, though, the typical |
| 314 | // overhead of data is in the order of bytes (2 bytes for cases). Further, |
| 315 | // the savings of using a compaction should anyway be substantial for it to |
| 316 | // be worth it. |
| 317 | |
| 318 | // For multi-root tries, we also need to account for the handles. |
| 319 | if len(b.Trie) > 1 { |
| 320 | sz += 2 * b.IndexSize * len(b.Trie) |
| 321 | } |
| 322 | return sz |
| 323 | } |
| 324 | |
| 325 | func (b *builder) build() { |
| 326 | // Compute the sizes of the values. |
| 327 | var vmax uint64 |
| 328 | for _, t := range b.Trie { |
| 329 | vmax = maxValue(t.root, vmax) |
| 330 | } |
| 331 | b.ValueType, b.ValueSize = getIntType(vmax) |
| 332 | |
| 333 | // Compute all block allocations. |
| 334 | // TODO: first compute the ASCII blocks for all tries and then the other |
| 335 | // nodes. ASCII blocks are more restricted in placement, as they require two |
| 336 | // blocks to be placed consecutively. Processing them first may improve |
| 337 | // sharing (at least one zero block can be expected to be saved.) |
| 338 | for _, t := range b.Trie { |
| 339 | b.Checksum += b.buildTrie(t) |
| 340 | } |
| 341 | |
| 342 | // Compute the offsets for all the Compacters. |
| 343 | offset := uint32(0) |
| 344 | for i := range b.Compactions { |
| 345 | c := &b.Compactions[i] |
| 346 | c.Offset = offset |
| 347 | offset += c.maxHandle + 1 |
| 348 | c.Cutoff = offset |
| 349 | } |
| 350 | |
| 351 | // Compute the sizes of indexes. |
| 352 | // TODO: different byte positions could have different sizes. So far we have |
| 353 | // not found a case where this is beneficial. |
| 354 | imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff) |
| 355 | for _, ib := range b.IndexBlocks { |
| 356 | if x := uint64(ib.index.index); x > imax { |
| 357 | imax = x |
| 358 | } |
| 359 | } |
| 360 | b.IndexType, b.IndexSize = getIntType(imax) |
| 361 | } |
| 362 | |
| 363 | func maxValue(n *node, max uint64) uint64 { |
| 364 | if n == nil { |
| 365 | return max |
| 366 | } |
| 367 | for _, c := range n.children { |
| 368 | max = maxValue(c, max) |
| 369 | } |
| 370 | for _, v := range n.values { |
| 371 | if max < v { |
| 372 | max = v |
| 373 | } |
| 374 | } |
| 375 | return max |
| 376 | } |
| 377 | |
| 378 | func getIntType(v uint64) (string, int) { |
| 379 | switch { |
| 380 | case v < 1<<8: |
| 381 | return "uint8", 1 |
| 382 | case v < 1<<16: |
| 383 | return "uint16", 2 |
| 384 | case v < 1<<32: |
| 385 | return "uint32", 4 |
| 386 | } |
| 387 | return "uint64", 8 |
| 388 | } |
| 389 | |
| 390 | const ( |
| 391 | blockSize = 64 |
| 392 | |
| 393 | // Subtract two blocks to offset 0x80, the first continuation byte. |
| 394 | blockOffset = 2 |
| 395 | |
| 396 | // Subtract three blocks to offset 0xC0, the first non-ASCII starter. |
| 397 | rootBlockOffset = 3 |
| 398 | ) |
| 399 | |
| 400 | var crcTable = crc64.MakeTable(crc64.ISO) |
| 401 | |
| 402 | func (b *builder) buildTrie(t *Trie) uint64 { |
| 403 | n := t.root |
| 404 | |
| 405 | // Get the ASCII offset. For the first trie, the ASCII block will be at |
| 406 | // position 0. |
| 407 | hasher := crc64.New(crcTable) |
| 408 | binary.Write(hasher, binary.BigEndian, n.values) |
| 409 | hash := hasher.Sum64() |
| 410 | |
| 411 | v, ok := b.asciiBlockIdx[hash] |
| 412 | if !ok { |
| 413 | v = len(b.ValueBlocks) |
| 414 | b.asciiBlockIdx[hash] = v |
| 415 | |
| 416 | b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:]) |
| 417 | if v == 0 { |
| 418 | // Add the zero block at position 2 so that it will be assigned a |
| 419 | // zero reference in the lookup blocks. |
| 420 | // TODO: always do this? This would allow us to remove a check from |
| 421 | // the trie lookup, but at the expense of extra space. Analyze |
| 422 | // performance for unicode/norm. |
| 423 | b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize)) |
| 424 | } |
| 425 | } |
| 426 | t.ASCIIIndex = v |
| 427 | |
| 428 | // Compute remaining offsets. |
| 429 | t.Checksum = b.computeOffsets(n, true) |
| 430 | // We already subtracted the normal blockOffset from the index. Subtract the |
| 431 | // difference for starter bytes. |
| 432 | t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset) |
| 433 | return t.Checksum |
| 434 | } |
| 435 | |
| 436 | func (b *builder) computeOffsets(n *node, root bool) uint64 { |
| 437 | // For the first trie, the root lookup block will be at position 3, which is |
| 438 | // the offset for UTF-8 non-ASCII starter bytes. |
| 439 | first := len(b.IndexBlocks) == rootBlockOffset |
| 440 | if first { |
| 441 | b.IndexBlocks = append(b.IndexBlocks, n) |
| 442 | } |
| 443 | |
| 444 | // We special-case the cases where all values recursively are 0. This allows |
| 445 | // for the use of a zero block to which all such values can be directed. |
| 446 | hash := uint64(0) |
| 447 | if n.children != nil || n.values != nil { |
| 448 | hasher := crc64.New(crcTable) |
| 449 | for _, c := range n.children { |
| 450 | var v uint64 |
| 451 | if c != nil { |
| 452 | v = b.computeOffsets(c, false) |
| 453 | } |
| 454 | binary.Write(hasher, binary.BigEndian, v) |
| 455 | } |
| 456 | binary.Write(hasher, binary.BigEndian, n.values) |
| 457 | hash = hasher.Sum64() |
| 458 | } |
| 459 | |
| 460 | if first { |
| 461 | b.indexBlockIdx[hash] = rootBlockOffset - blockOffset |
| 462 | } |
| 463 | |
| 464 | // Compacters don't apply to internal nodes. |
| 465 | if n.children != nil { |
| 466 | v, ok := b.indexBlockIdx[hash] |
| 467 | if !ok { |
| 468 | v = len(b.IndexBlocks) - blockOffset |
| 469 | b.IndexBlocks = append(b.IndexBlocks, n) |
| 470 | b.indexBlockIdx[hash] = v |
| 471 | } |
| 472 | n.index = nodeIndex{0, v} |
| 473 | } else { |
| 474 | h, ok := b.valueBlockIdx[hash] |
| 475 | if !ok { |
| 476 | bestI, bestSize := 0, blockSize*b.ValueSize |
| 477 | for i, c := range b.Compactions[1:] { |
| 478 | if sz, ok := c.c.Size(n.values); ok && bestSize > sz { |
| 479 | bestI, bestSize = i+1, sz |
| 480 | } |
| 481 | } |
| 482 | c := &b.Compactions[bestI] |
| 483 | c.totalSize += bestSize |
| 484 | v := c.c.Store(n.values) |
| 485 | if c.maxHandle < v { |
| 486 | c.maxHandle = v |
| 487 | } |
| 488 | h = nodeIndex{bestI, int(v)} |
| 489 | b.valueBlockIdx[hash] = h |
| 490 | } |
| 491 | n.index = h |
| 492 | } |
| 493 | return hash |
| 494 | } |