blob: 30a3aa93343d22d5cb928b2c7cb54b25e41b95c3 [file] [log] [blame]
Kent Hagerman6379e092020-02-18 18:05:57 -05001// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Normalization table generator.
8// Data read from the web.
9// See forminfo.go for a description of the trie values associated with each rune.
10
11package main
12
13import (
14 "bytes"
15 "encoding/binary"
16 "flag"
17 "fmt"
18 "io"
19 "log"
20 "sort"
21 "strconv"
22 "strings"
23
24 "golang.org/x/text/internal/gen"
25 "golang.org/x/text/internal/triegen"
26 "golang.org/x/text/internal/ucd"
27)
28
29func main() {
30 gen.Init()
31 loadUnicodeData()
32 compactCCC()
33 loadCompositionExclusions()
34 completeCharFields(FCanonical)
35 completeCharFields(FCompatibility)
36 computeNonStarterCounts()
37 verifyComputed()
38 printChars()
39 testDerived()
40 printTestdata()
41 makeTables()
42}
43
44var (
45 tablelist = flag.String("tables",
46 "all",
47 "comma-separated list of which tables to generate; "+
48 "can be 'decomp', 'recomp', 'info' and 'all'")
49 test = flag.Bool("test",
50 false,
51 "test existing tables against DerivedNormalizationProps and generate test data for regression testing")
52 verbose = flag.Bool("verbose",
53 false,
54 "write data to stdout as it is parsed")
55)
56
57const MaxChar = 0x10FFFF // anything above this shouldn't exist
58
59// Quick Check properties of runes allow us to quickly
60// determine whether a rune may occur in a normal form.
61// For a given normal form, a rune may be guaranteed to occur
62// verbatim (QC=Yes), may or may not combine with another
63// rune (QC=Maybe), or may not occur (QC=No).
64type QCResult int
65
66const (
67 QCUnknown QCResult = iota
68 QCYes
69 QCNo
70 QCMaybe
71)
72
73func (r QCResult) String() string {
74 switch r {
75 case QCYes:
76 return "Yes"
77 case QCNo:
78 return "No"
79 case QCMaybe:
80 return "Maybe"
81 }
82 return "***UNKNOWN***"
83}
84
85const (
86 FCanonical = iota // NFC or NFD
87 FCompatibility // NFKC or NFKD
88 FNumberOfFormTypes
89)
90
91const (
92 MComposed = iota // NFC or NFKC
93 MDecomposed // NFD or NFKD
94 MNumberOfModes
95)
96
97// This contains only the properties we're interested in.
98type Char struct {
99 name string
100 codePoint rune // if zero, this index is not a valid code point.
101 ccc uint8 // canonical combining class
102 origCCC uint8
103 excludeInComp bool // from CompositionExclusions.txt
104 compatDecomp bool // it has a compatibility expansion
105
106 nTrailingNonStarters uint8
107 nLeadingNonStarters uint8 // must be equal to trailing if non-zero
108
109 forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
110
111 state State
112}
113
114var chars = make([]Char, MaxChar+1)
115var cccMap = make(map[uint8]uint8)
116
117func (c Char) String() string {
118 buf := new(bytes.Buffer)
119
120 fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
121 fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
122 fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
123 fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
124 fmt.Fprintf(buf, " state: %v\n", c.state)
125 fmt.Fprintf(buf, " NFC:\n")
126 fmt.Fprint(buf, c.forms[FCanonical])
127 fmt.Fprintf(buf, " NFKC:\n")
128 fmt.Fprint(buf, c.forms[FCompatibility])
129
130 return buf.String()
131}
132
133// In UnicodeData.txt, some ranges are marked like this:
134// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
135// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
136// parseCharacter keeps a state variable indicating the weirdness.
137type State int
138
139const (
140 SNormal State = iota // known to be zero for the type
141 SFirst
142 SLast
143 SMissing
144)
145
146var lastChar = rune('\u0000')
147
148func (c Char) isValid() bool {
149 return c.codePoint != 0 && c.state != SMissing
150}
151
152type FormInfo struct {
153 quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
154 verified [MNumberOfModes]bool // index: MComposed or MDecomposed
155
156 combinesForward bool // May combine with rune on the right
157 combinesBackward bool // May combine with rune on the left
158 isOneWay bool // Never appears in result
159 inDecomp bool // Some decompositions result in this char.
160 decomp Decomposition
161 expandedDecomp Decomposition
162}
163
164func (f FormInfo) String() string {
165 buf := bytes.NewBuffer(make([]byte, 0))
166
167 fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
168 fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
169 fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
170 fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
171 fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
172 fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
173 fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
174 fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
175
176 return buf.String()
177}
178
179type Decomposition []rune
180
181func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
182 decomp := strings.Split(s, " ")
183 if len(decomp) > 0 && skipfirst {
184 decomp = decomp[1:]
185 }
186 for _, d := range decomp {
187 point, err := strconv.ParseUint(d, 16, 64)
188 if err != nil {
189 return a, err
190 }
191 a = append(a, rune(point))
192 }
193 return a, nil
194}
195
196func loadUnicodeData() {
197 f := gen.OpenUCDFile("UnicodeData.txt")
198 defer f.Close()
199 p := ucd.New(f)
200 for p.Next() {
201 r := p.Rune(ucd.CodePoint)
202 char := &chars[r]
203
204 char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
205 decmap := p.String(ucd.DecompMapping)
206
207 exp, err := parseDecomposition(decmap, false)
208 isCompat := false
209 if err != nil {
210 if len(decmap) > 0 {
211 exp, err = parseDecomposition(decmap, true)
212 if err != nil {
213 log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
214 }
215 isCompat = true
216 }
217 }
218
219 char.name = p.String(ucd.Name)
220 char.codePoint = r
221 char.forms[FCompatibility].decomp = exp
222 if !isCompat {
223 char.forms[FCanonical].decomp = exp
224 } else {
225 char.compatDecomp = true
226 }
227 if len(decmap) > 0 {
228 char.forms[FCompatibility].decomp = exp
229 }
230 }
231 if err := p.Err(); err != nil {
232 log.Fatal(err)
233 }
234}
235
236// compactCCC converts the sparse set of CCC values to a continguous one,
237// reducing the number of bits needed from 8 to 6.
238func compactCCC() {
239 m := make(map[uint8]uint8)
240 for i := range chars {
241 c := &chars[i]
242 m[c.ccc] = 0
243 }
244 cccs := []int{}
245 for v, _ := range m {
246 cccs = append(cccs, int(v))
247 }
248 sort.Ints(cccs)
249 for i, c := range cccs {
250 cccMap[uint8(i)] = uint8(c)
251 m[uint8(c)] = uint8(i)
252 }
253 for i := range chars {
254 c := &chars[i]
255 c.origCCC = c.ccc
256 c.ccc = m[c.ccc]
257 }
258 if len(m) >= 1<<6 {
259 log.Fatalf("too many difference CCC values: %d >= 64", len(m))
260 }
261}
262
263// CompositionExclusions.txt has form:
264// 0958 # ...
265// See https://unicode.org/reports/tr44/ for full explanation
266func loadCompositionExclusions() {
267 f := gen.OpenUCDFile("CompositionExclusions.txt")
268 defer f.Close()
269 p := ucd.New(f)
270 for p.Next() {
271 c := &chars[p.Rune(0)]
272 if c.excludeInComp {
273 log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
274 }
275 c.excludeInComp = true
276 }
277 if e := p.Err(); e != nil {
278 log.Fatal(e)
279 }
280}
281
282// hasCompatDecomp returns true if any of the recursive
283// decompositions contains a compatibility expansion.
284// In this case, the character may not occur in NFK*.
285func hasCompatDecomp(r rune) bool {
286 c := &chars[r]
287 if c.compatDecomp {
288 return true
289 }
290 for _, d := range c.forms[FCompatibility].decomp {
291 if hasCompatDecomp(d) {
292 return true
293 }
294 }
295 return false
296}
297
298// Hangul related constants.
299const (
300 HangulBase = 0xAC00
301 HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
302
303 JamoLBase = 0x1100
304 JamoLEnd = 0x1113
305 JamoVBase = 0x1161
306 JamoVEnd = 0x1176
307 JamoTBase = 0x11A8
308 JamoTEnd = 0x11C3
309
310 JamoLVTCount = 19 * 21 * 28
311 JamoTCount = 28
312)
313
314func isHangul(r rune) bool {
315 return HangulBase <= r && r < HangulEnd
316}
317
318func isHangulWithoutJamoT(r rune) bool {
319 if !isHangul(r) {
320 return false
321 }
322 r -= HangulBase
323 return r < JamoLVTCount && r%JamoTCount == 0
324}
325
326func ccc(r rune) uint8 {
327 return chars[r].ccc
328}
329
330// Insert a rune in a buffer, ordered by Canonical Combining Class.
331func insertOrdered(b Decomposition, r rune) Decomposition {
332 n := len(b)
333 b = append(b, 0)
334 cc := ccc(r)
335 if cc > 0 {
336 // Use bubble sort.
337 for ; n > 0; n-- {
338 if ccc(b[n-1]) <= cc {
339 break
340 }
341 b[n] = b[n-1]
342 }
343 }
344 b[n] = r
345 return b
346}
347
348// Recursively decompose.
349func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
350 dcomp := chars[r].forms[form].decomp
351 if len(dcomp) == 0 {
352 return insertOrdered(d, r)
353 }
354 for _, c := range dcomp {
355 d = decomposeRecursive(form, c, d)
356 }
357 return d
358}
359
360func completeCharFields(form int) {
361 // Phase 0: pre-expand decomposition.
362 for i := range chars {
363 f := &chars[i].forms[form]
364 if len(f.decomp) == 0 {
365 continue
366 }
367 exp := make(Decomposition, 0)
368 for _, c := range f.decomp {
369 exp = decomposeRecursive(form, c, exp)
370 }
371 f.expandedDecomp = exp
372 }
373
374 // Phase 1: composition exclusion, mark decomposition.
375 for i := range chars {
376 c := &chars[i]
377 f := &c.forms[form]
378
379 // Marks script-specific exclusions and version restricted.
380 f.isOneWay = c.excludeInComp
381
382 // Singletons
383 f.isOneWay = f.isOneWay || len(f.decomp) == 1
384
385 // Non-starter decompositions
386 if len(f.decomp) > 1 {
387 chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
388 f.isOneWay = f.isOneWay || chk
389 }
390
391 // Runes that decompose into more than two runes.
392 f.isOneWay = f.isOneWay || len(f.decomp) > 2
393
394 if form == FCompatibility {
395 f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
396 }
397
398 for _, r := range f.decomp {
399 chars[r].forms[form].inDecomp = true
400 }
401 }
402
403 // Phase 2: forward and backward combining.
404 for i := range chars {
405 c := &chars[i]
406 f := &c.forms[form]
407
408 if !f.isOneWay && len(f.decomp) == 2 {
409 f0 := &chars[f.decomp[0]].forms[form]
410 f1 := &chars[f.decomp[1]].forms[form]
411 if !f0.isOneWay {
412 f0.combinesForward = true
413 }
414 if !f1.isOneWay {
415 f1.combinesBackward = true
416 }
417 }
418 if isHangulWithoutJamoT(rune(i)) {
419 f.combinesForward = true
420 }
421 }
422
423 // Phase 3: quick check values.
424 for i := range chars {
425 c := &chars[i]
426 f := &c.forms[form]
427
428 switch {
429 case len(f.decomp) > 0:
430 f.quickCheck[MDecomposed] = QCNo
431 case isHangul(rune(i)):
432 f.quickCheck[MDecomposed] = QCNo
433 default:
434 f.quickCheck[MDecomposed] = QCYes
435 }
436 switch {
437 case f.isOneWay:
438 f.quickCheck[MComposed] = QCNo
439 case (i & 0xffff00) == JamoLBase:
440 f.quickCheck[MComposed] = QCYes
441 if JamoLBase <= i && i < JamoLEnd {
442 f.combinesForward = true
443 }
444 if JamoVBase <= i && i < JamoVEnd {
445 f.quickCheck[MComposed] = QCMaybe
446 f.combinesBackward = true
447 f.combinesForward = true
448 }
449 if JamoTBase <= i && i < JamoTEnd {
450 f.quickCheck[MComposed] = QCMaybe
451 f.combinesBackward = true
452 }
453 case !f.combinesBackward:
454 f.quickCheck[MComposed] = QCYes
455 default:
456 f.quickCheck[MComposed] = QCMaybe
457 }
458 }
459}
460
461func computeNonStarterCounts() {
462 // Phase 4: leading and trailing non-starter count
463 for i := range chars {
464 c := &chars[i]
465
466 runes := []rune{rune(i)}
467 // We always use FCompatibility so that the CGJ insertion points do not
468 // change for repeated normalizations with different forms.
469 if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
470 runes = exp
471 }
472 // We consider runes that combine backwards to be non-starters for the
473 // purpose of Stream-Safe Text Processing.
474 for _, r := range runes {
475 if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
476 break
477 }
478 c.nLeadingNonStarters++
479 }
480 for i := len(runes) - 1; i >= 0; i-- {
481 if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
482 break
483 }
484 c.nTrailingNonStarters++
485 }
486 if c.nTrailingNonStarters > 3 {
487 log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
488 }
489
490 if isHangul(rune(i)) {
491 c.nTrailingNonStarters = 2
492 if isHangulWithoutJamoT(rune(i)) {
493 c.nTrailingNonStarters = 1
494 }
495 }
496
497 if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
498 log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
499 }
500 if t := c.nTrailingNonStarters; t > 3 {
501 log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
502 }
503 }
504}
505
506func printBytes(w io.Writer, b []byte, name string) {
507 fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
508 fmt.Fprintf(w, "var %s = [...]byte {", name)
509 for i, c := range b {
510 switch {
511 case i%64 == 0:
512 fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
513 case i%8 == 0:
514 fmt.Fprintf(w, "\n")
515 }
516 fmt.Fprintf(w, "0x%.2X, ", c)
517 }
518 fmt.Fprint(w, "\n}\n\n")
519}
520
521// See forminfo.go for format.
522func makeEntry(f *FormInfo, c *Char) uint16 {
523 e := uint16(0)
524 if r := c.codePoint; HangulBase <= r && r < HangulEnd {
525 e |= 0x40
526 }
527 if f.combinesForward {
528 e |= 0x20
529 }
530 if f.quickCheck[MDecomposed] == QCNo {
531 e |= 0x4
532 }
533 switch f.quickCheck[MComposed] {
534 case QCYes:
535 case QCNo:
536 e |= 0x10
537 case QCMaybe:
538 e |= 0x18
539 default:
540 log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
541 }
542 e |= uint16(c.nTrailingNonStarters)
543 return e
544}
545
546// decompSet keeps track of unique decompositions, grouped by whether
547// the decomposition is followed by a trailing and/or leading CCC.
548type decompSet [7]map[string]bool
549
550const (
551 normalDecomp = iota
552 firstMulti
553 firstCCC
554 endMulti
555 firstLeadingCCC
556 firstCCCZeroExcept
557 firstStarterWithNLead
558 lastDecomp
559)
560
561var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
562
563func makeDecompSet() decompSet {
564 m := decompSet{}
565 for i := range m {
566 m[i] = make(map[string]bool)
567 }
568 return m
569}
570func (m *decompSet) insert(key int, s string) {
571 m[key][s] = true
572}
573
574func printCharInfoTables(w io.Writer) int {
575 mkstr := func(r rune, f *FormInfo) (int, string) {
576 d := f.expandedDecomp
577 s := string([]rune(d))
578 if max := 1 << 6; len(s) >= max {
579 const msg = "%U: too many bytes in decomposition: %d >= %d"
580 log.Fatalf(msg, r, len(s), max)
581 }
582 head := uint8(len(s))
583 if f.quickCheck[MComposed] != QCYes {
584 head |= 0x40
585 }
586 if f.combinesForward {
587 head |= 0x80
588 }
589 s = string([]byte{head}) + s
590
591 lccc := ccc(d[0])
592 tccc := ccc(d[len(d)-1])
593 cc := ccc(r)
594 if cc != 0 && lccc == 0 && tccc == 0 {
595 log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
596 }
597 if tccc < lccc && lccc != 0 {
598 const msg = "%U: lccc (%d) must be <= tcc (%d)"
599 log.Fatalf(msg, r, lccc, tccc)
600 }
601 index := normalDecomp
602 nTrail := chars[r].nTrailingNonStarters
603 nLead := chars[r].nLeadingNonStarters
604 if tccc > 0 || lccc > 0 || nTrail > 0 {
605 tccc <<= 2
606 tccc |= nTrail
607 s += string([]byte{tccc})
608 index = endMulti
609 for _, r := range d[1:] {
610 if ccc(r) == 0 {
611 index = firstCCC
612 }
613 }
614 if lccc > 0 || nLead > 0 {
615 s += string([]byte{lccc})
616 if index == firstCCC {
617 log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
618 }
619 index = firstLeadingCCC
620 }
621 if cc != lccc {
622 if cc != 0 {
623 log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
624 }
625 index = firstCCCZeroExcept
626 }
627 } else if len(d) > 1 {
628 index = firstMulti
629 }
630 return index, s
631 }
632
633 decompSet := makeDecompSet()
634 const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
635 decompSet.insert(firstStarterWithNLead, nLeadStr)
636
637 // Store the uniqued decompositions in a byte buffer,
638 // preceded by their byte length.
639 for _, c := range chars {
640 for _, f := range c.forms {
641 if len(f.expandedDecomp) == 0 {
642 continue
643 }
644 if f.combinesBackward {
645 log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
646 }
647 index, s := mkstr(c.codePoint, &f)
648 decompSet.insert(index, s)
649 }
650 }
651
652 decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
653 size := 0
654 positionMap := make(map[string]uint16)
655 decompositions.WriteString("\000")
656 fmt.Fprintln(w, "const (")
657 for i, m := range decompSet {
658 sa := []string{}
659 for s := range m {
660 sa = append(sa, s)
661 }
662 sort.Strings(sa)
663 for _, s := range sa {
664 p := decompositions.Len()
665 decompositions.WriteString(s)
666 positionMap[s] = uint16(p)
667 }
668 if cname[i] != "" {
669 fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
670 }
671 }
672 fmt.Fprintln(w, "maxDecomp = 0x8000")
673 fmt.Fprintln(w, ")")
674 b := decompositions.Bytes()
675 printBytes(w, b, "decomps")
676 size += len(b)
677
678 varnames := []string{"nfc", "nfkc"}
679 for i := 0; i < FNumberOfFormTypes; i++ {
680 trie := triegen.NewTrie(varnames[i])
681
682 for r, c := range chars {
683 f := c.forms[i]
684 d := f.expandedDecomp
685 if len(d) != 0 {
686 _, key := mkstr(c.codePoint, &f)
687 trie.Insert(rune(r), uint64(positionMap[key]))
688 if c.ccc != ccc(d[0]) {
689 // We assume the lead ccc of a decomposition !=0 in this case.
690 if ccc(d[0]) == 0 {
691 log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
692 }
693 }
694 } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
695 // Handle cases where it can't be detected that the nLead should be equal
696 // to nTrail.
697 trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
698 } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
699 trie.Insert(c.codePoint, uint64(0x8000|v))
700 }
701 }
702 sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
703 if err != nil {
704 log.Fatal(err)
705 }
706 size += sz
707 }
708 return size
709}
710
711func contains(sa []string, s string) bool {
712 for _, a := range sa {
713 if a == s {
714 return true
715 }
716 }
717 return false
718}
719
720func makeTables() {
721 w := &bytes.Buffer{}
722
723 size := 0
724 if *tablelist == "" {
725 return
726 }
727 list := strings.Split(*tablelist, ",")
728 if *tablelist == "all" {
729 list = []string{"recomp", "info"}
730 }
731
732 // Compute maximum decomposition size.
733 max := 0
734 for _, c := range chars {
735 if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
736 max = n
737 }
738 }
739 fmt.Fprintln(w, `import "sync"`)
740 fmt.Fprintln(w)
741
742 fmt.Fprintln(w, "const (")
743 fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
744 fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
745 fmt.Fprintln(w)
746 fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
747 fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
748 fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
749 fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
750 fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
751 fmt.Fprintln(w, ")\n")
752
753 // Print the CCC remap table.
754 size += len(cccMap)
755 fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
756 for i := 0; i < len(cccMap); i++ {
757 if i%8 == 0 {
758 fmt.Fprintln(w)
759 }
760 fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
761 }
762 fmt.Fprintln(w, "\n}\n")
763
764 if contains(list, "info") {
765 size += printCharInfoTables(w)
766 }
767
768 if contains(list, "recomp") {
769 // Note that we use 32 bit keys, instead of 64 bit.
770 // This clips the bits of three entries, but we know
771 // this won't cause a collision. The compiler will catch
772 // any changes made to UnicodeData.txt that introduces
773 // a collision.
774 // Note that the recomposition map for NFC and NFKC
775 // are identical.
776
777 // Recomposition map
778 nrentries := 0
779 for _, c := range chars {
780 f := c.forms[FCanonical]
781 if !f.isOneWay && len(f.decomp) > 0 {
782 nrentries++
783 }
784 }
785 sz := nrentries * 8
786 size += sz
787 fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
788 fmt.Fprintln(w, "var recompMap map[uint32]rune")
789 fmt.Fprintln(w, "var recompMapOnce sync.Once\n")
790 fmt.Fprintln(w, `const recompMapPacked = "" +`)
791 var buf [8]byte
792 for i, c := range chars {
793 f := c.forms[FCanonical]
794 d := f.decomp
795 if !f.isOneWay && len(d) > 0 {
796 key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
797 binary.BigEndian.PutUint32(buf[:4], key)
798 binary.BigEndian.PutUint32(buf[4:], uint32(i))
799 fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i))
800 }
801 }
802 // hack so we don't have to special case the trailing plus sign
803 fmt.Fprintf(w, ` ""`)
804 fmt.Fprintln(w)
805 }
806
807 fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
808 gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes())
809}
810
811func printChars() {
812 if *verbose {
813 for _, c := range chars {
814 if !c.isValid() || c.state == SMissing {
815 continue
816 }
817 fmt.Println(c)
818 }
819 }
820}
821
822// verifyComputed does various consistency tests.
823func verifyComputed() {
824 for i, c := range chars {
825 for _, f := range c.forms {
826 isNo := (f.quickCheck[MDecomposed] == QCNo)
827 if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
828 log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
829 }
830
831 isMaybe := f.quickCheck[MComposed] == QCMaybe
832 if f.combinesBackward != isMaybe {
833 log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
834 }
835 if len(f.decomp) > 0 && f.combinesForward && isMaybe {
836 log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
837 }
838
839 if len(f.expandedDecomp) != 0 {
840 continue
841 }
842 if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
843 // We accept these runes to be treated differently (it only affects
844 // segment breaking in iteration, most likely on improper use), but
845 // reconsider if more characters are added.
846 // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
847 // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
848 // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
849 // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
850 // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
851 // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
852 if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
853 log.Fatalf("%U: nLead was %v; want %v", i, a, b)
854 }
855 }
856 }
857 nfc := c.forms[FCanonical]
858 nfkc := c.forms[FCompatibility]
859 if nfc.combinesBackward != nfkc.combinesBackward {
860 log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
861 }
862 }
863}
864
865// Use values in DerivedNormalizationProps.txt to compare against the
866// values we computed.
867// DerivedNormalizationProps.txt has form:
868// 00C0..00C5 ; NFD_QC; N # ...
869// 0374 ; NFD_QC; N # ...
870// See https://unicode.org/reports/tr44/ for full explanation
871func testDerived() {
872 f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
873 defer f.Close()
874 p := ucd.New(f)
875 for p.Next() {
876 r := p.Rune(0)
877 c := &chars[r]
878
879 var ftype, mode int
880 qt := p.String(1)
881 switch qt {
882 case "NFC_QC":
883 ftype, mode = FCanonical, MComposed
884 case "NFD_QC":
885 ftype, mode = FCanonical, MDecomposed
886 case "NFKC_QC":
887 ftype, mode = FCompatibility, MComposed
888 case "NFKD_QC":
889 ftype, mode = FCompatibility, MDecomposed
890 default:
891 continue
892 }
893 var qr QCResult
894 switch p.String(2) {
895 case "Y":
896 qr = QCYes
897 case "N":
898 qr = QCNo
899 case "M":
900 qr = QCMaybe
901 default:
902 log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
903 }
904 if got := c.forms[ftype].quickCheck[mode]; got != qr {
905 log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
906 }
907 c.forms[ftype].verified[mode] = true
908 }
909 if err := p.Err(); err != nil {
910 log.Fatal(err)
911 }
912 // Any unspecified value must be QCYes. Verify this.
913 for i, c := range chars {
914 for j, fd := range c.forms {
915 for k, qr := range fd.quickCheck {
916 if !fd.verified[k] && qr != QCYes {
917 m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
918 log.Printf(m, i, j, k, qr, c.name)
919 }
920 }
921 }
922 }
923}
924
925var testHeader = `const (
926 Yes = iota
927 No
928 Maybe
929)
930
931type formData struct {
932 qc uint8
933 combinesForward bool
934 decomposition string
935}
936
937type runeData struct {
938 r rune
939 ccc uint8
940 nLead uint8
941 nTrail uint8
942 f [2]formData // 0: canonical; 1: compatibility
943}
944
945func f(qc uint8, cf bool, dec string) [2]formData {
946 return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
947}
948
949func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
950 return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
951}
952
953var testData = []runeData{
954`
955
956func printTestdata() {
957 type lastInfo struct {
958 ccc uint8
959 nLead uint8
960 nTrail uint8
961 f string
962 }
963
964 last := lastInfo{}
965 w := &bytes.Buffer{}
966 fmt.Fprintf(w, testHeader)
967 for r, c := range chars {
968 f := c.forms[FCanonical]
969 qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
970 f = c.forms[FCompatibility]
971 qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
972 s := ""
973 if d == dk && qc == qck && cf == cfk {
974 s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
975 } else {
976 s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
977 }
978 current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
979 if last != current {
980 fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
981 last = current
982 }
983 }
984 fmt.Fprintln(w, "}")
985 gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes())
986}