blob: 092a4b506174bd188d43d72b89e859ca0a291f68 [file] [log] [blame]
khenaidooac637102019-01-14 15:44:34 -05001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package build // import "golang.org/x/text/collate/build"
6
7import (
8 "fmt"
9 "io"
10 "log"
11 "sort"
12 "strings"
13 "unicode/utf8"
14
15 "golang.org/x/text/internal/colltab"
16 "golang.org/x/text/language"
17 "golang.org/x/text/unicode/norm"
18)
19
20// TODO: optimizations:
21// - expandElem is currently 20K. By putting unique colElems in a separate
22// table and having a byte array of indexes into this table, we can reduce
23// the total size to about 7K. By also factoring out the length bytes, we
24// can reduce this to about 6K.
25// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
26// and many consecutive values with the same stride. This can be further
27// compacted.
28// - Compress secondary weights into 8 bits.
29// - Some LDML specs specify a context element. Currently we simply concatenate
30// those. Context can be implemented using the contraction trie. If Builder
31// could analyze and detect when using a context makes sense, there is no
32// need to expose this construct in the API.
33
34// A Builder builds a root collation table. The user must specify the
35// collation elements for each entry. A common use will be to base the weights
36// on those specified in the allkeys* file as provided by the UCA or CLDR.
37type Builder struct {
38 index *trieBuilder
39 root ordering
40 locale []*Tailoring
41 t *table
42 err error
43 built bool
44
45 minNonVar int // lowest primary recorded for a variable
46 varTop int // highest primary recorded for a non-variable
47
48 // indexes used for reusing expansions and contractions
49 expIndex map[string]int // positions of expansions keyed by their string representation
50 ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
51 ctElem map[string]int // contraction elements keyed by their string representation
52}
53
54// A Tailoring builds a collation table based on another collation table.
55// The table is defined by specifying tailorings to the underlying table.
Scott Bakerbeb3cfa2019-10-01 14:44:30 -070056// See https://unicode.org/reports/tr35/ for an overview of tailoring
khenaidooac637102019-01-14 15:44:34 -050057// collation tables. The CLDR contains pre-defined tailorings for a variety
Scott Bakerbeb3cfa2019-10-01 14:44:30 -070058// of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
khenaidooac637102019-01-14 15:44:34 -050059type Tailoring struct {
60 id string
61 builder *Builder
62 index *ordering
63
64 anchor *entry
65 before bool
66}
67
68// NewBuilder returns a new Builder.
69func NewBuilder() *Builder {
70 return &Builder{
71 index: newTrieBuilder(),
72 root: makeRootOrdering(),
73 expIndex: make(map[string]int),
74 ctHandle: make(map[string]ctHandle),
75 ctElem: make(map[string]int),
76 }
77}
78
79// Tailoring returns a Tailoring for the given locale. One should
80// have completed all calls to Add before calling Tailoring.
81func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
82 t := &Tailoring{
83 id: loc.String(),
84 builder: b,
85 index: b.root.clone(),
86 }
87 t.index.id = t.id
88 b.locale = append(b.locale, t)
89 return t
90}
91
92// Add adds an entry to the collation element table, mapping
93// a slice of runes to a sequence of collation elements.
94// A collation element is specified as list of weights: []int{primary, secondary, ...}.
95// The entries are typically obtained from a collation element table
Scott Bakerbeb3cfa2019-10-01 14:44:30 -070096// as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
khenaidooac637102019-01-14 15:44:34 -050097// Note that the collation elements specified by colelems are only used
98// as a guide. The actual weights generated by Builder may differ.
99// The argument variables is a list of indices into colelems that should contain
100// a value for each colelem that is a variable. (See the reference above.)
101func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
102 str := string(runes)
103 elems := make([]rawCE, len(colelems))
104 for i, ce := range colelems {
105 if len(ce) == 0 {
106 break
107 }
108 elems[i] = makeRawCE(ce, 0)
109 if len(ce) == 1 {
110 elems[i].w[1] = defaultSecondary
111 }
112 if len(ce) <= 2 {
113 elems[i].w[2] = defaultTertiary
114 }
115 if len(ce) <= 3 {
116 elems[i].w[3] = ce[0]
117 }
118 }
119 for i, ce := range elems {
120 p := ce.w[0]
121 isvar := false
122 for _, j := range variables {
123 if i == j {
124 isvar = true
125 }
126 }
127 if isvar {
128 if p >= b.minNonVar && b.minNonVar > 0 {
129 return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
130 }
131 if p > b.varTop {
132 b.varTop = p
133 }
134 } else if p > 1 { // 1 is a special primary value reserved for FFFE
135 if p <= b.varTop {
136 return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
137 }
138 if b.minNonVar == 0 || p < b.minNonVar {
139 b.minNonVar = p
140 }
141 }
142 }
143 elems, err := convertLargeWeights(elems)
144 if err != nil {
145 return err
146 }
147 cccs := []uint8{}
148 nfd := norm.NFD.String(str)
149 for i := range nfd {
150 cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
151 }
152 if len(cccs) < len(elems) {
153 if len(cccs) > 2 {
154 return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
155 }
156 p := len(elems) - 1
157 for ; p > 0 && elems[p].w[0] == 0; p-- {
158 elems[p].ccc = cccs[len(cccs)-1]
159 }
160 for ; p >= 0; p-- {
161 elems[p].ccc = cccs[0]
162 }
163 } else {
164 for i := range elems {
165 elems[i].ccc = cccs[i]
166 }
167 }
168 // doNorm in collate.go assumes that the following conditions hold.
169 if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
170 return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
171 }
172 b.root.newEntry(str, elems)
173 return nil
174}
175
176func (t *Tailoring) setAnchor(anchor string) error {
177 anchor = norm.NFC.String(anchor)
178 a := t.index.find(anchor)
179 if a == nil {
180 a = t.index.newEntry(anchor, nil)
181 a.implicit = true
182 a.modified = true
183 for _, r := range []rune(anchor) {
184 e := t.index.find(string(r))
185 e.lock = true
186 }
187 }
188 t.anchor = a
189 return nil
190}
191
192// SetAnchor sets the point after which elements passed in subsequent calls to
193// Insert will be inserted. It is equivalent to the reset directive in an LDML
194// specification. See Insert for an example.
195// SetAnchor supports the following logical reset positions:
196// <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
197// and <last_non_ignorable/>.
198func (t *Tailoring) SetAnchor(anchor string) error {
199 if err := t.setAnchor(anchor); err != nil {
200 return err
201 }
202 t.before = false
203 return nil
204}
205
206// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
207// Insert will insert entries before the anchor.
208func (t *Tailoring) SetAnchorBefore(anchor string) error {
209 if err := t.setAnchor(anchor); err != nil {
210 return err
211 }
212 t.before = true
213 return nil
214}
215
216// Insert sets the ordering of str relative to the entry set by the previous
217// call to SetAnchor or Insert. The argument extend corresponds
218// to the extend elements as defined in LDML. A non-empty value for extend
219// will cause the collation elements corresponding to extend to be appended
220// to the collation elements generated for the entry added by Insert.
221// This has the same net effect as sorting str after the string anchor+extend.
Scott Bakerbeb3cfa2019-10-01 14:44:30 -0700222// See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
223// on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
khenaidooac637102019-01-14 15:44:34 -0500224// for full details on LDML.
225//
226// Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
227// at the primary sorting level:
228// t := b.Tailoring("se")
229// t.SetAnchor("z")
230// t.Insert(colltab.Primary, "ä", "")
231// Order "ü" after "ue" at the secondary sorting level:
232// t.SetAnchor("ue")
233// t.Insert(colltab.Secondary, "ü","")
234// or
235// t.SetAnchor("u")
236// t.Insert(colltab.Secondary, "ü", "e")
237// Order "q" afer "ab" at the secondary level and "Q" after "q"
238// at the tertiary level:
239// t.SetAnchor("ab")
240// t.Insert(colltab.Secondary, "q", "")
241// t.Insert(colltab.Tertiary, "Q", "")
242// Order "b" before "a":
243// t.SetAnchorBefore("a")
244// t.Insert(colltab.Primary, "b", "")
245// Order "0" after the last primary ignorable:
246// t.SetAnchor("<last_primary_ignorable/>")
247// t.Insert(colltab.Primary, "0", "")
248func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
249 if t.anchor == nil {
250 return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
251 }
252 str = norm.NFC.String(str)
253 e := t.index.find(str)
254 if e == nil {
255 e = t.index.newEntry(str, nil)
256 } else if e.logical != noAnchor {
257 return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
258 }
259 if e.lock {
260 return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
261 }
262 a := t.anchor
263 // Find the first element after the anchor which differs at a level smaller or
264 // equal to the given level. Then insert at this position.
Scott Bakerbeb3cfa2019-10-01 14:44:30 -0700265 // See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
khenaidooac637102019-01-14 15:44:34 -0500266 e.before = t.before
267 if t.before {
268 t.before = false
269 if a.prev == nil {
270 a.insertBefore(e)
271 } else {
272 for a = a.prev; a.level > level; a = a.prev {
273 }
274 a.insertAfter(e)
275 }
276 e.level = level
277 } else {
278 for ; a.level > level; a = a.next {
279 }
280 e.level = a.level
281 if a != e {
282 a.insertAfter(e)
283 a.level = level
284 } else {
285 // We don't set a to prev itself. This has the effect of the entry
286 // getting new collation elements that are an increment of itself.
287 // This is intentional.
288 a.prev.level = level
289 }
290 }
291 e.extend = norm.NFD.String(extend)
292 e.exclude = false
293 e.modified = true
294 e.elems = nil
295 t.anchor = e
296 return nil
297}
298
299func (o *ordering) getWeight(e *entry) []rawCE {
300 if len(e.elems) == 0 && e.logical == noAnchor {
301 if e.implicit {
302 for _, r := range e.runes {
303 e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
304 }
305 } else if e.before {
306 count := [colltab.Identity + 1]int{}
307 a := e
308 for ; a.elems == nil && !a.implicit; a = a.next {
309 count[a.level]++
310 }
311 e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
312 for i := colltab.Primary; i < colltab.Quaternary; i++ {
313 if count[i] != 0 {
314 e.elems[0].w[i] -= count[i]
315 break
316 }
317 }
318 if e.prev != nil {
319 o.verifyWeights(e.prev, e, e.prev.level)
320 }
321 } else {
322 prev := e.prev
323 e.elems = nextWeight(prev.level, o.getWeight(prev))
324 o.verifyWeights(e, e.next, e.level)
325 }
326 }
327 return e.elems
328}
329
330func (o *ordering) addExtension(e *entry) {
331 if ex := o.find(e.extend); ex != nil {
332 e.elems = append(e.elems, ex.elems...)
333 } else {
334 for _, r := range []rune(e.extend) {
335 e.elems = append(e.elems, o.find(string(r)).elems...)
336 }
337 }
338 e.extend = ""
339}
340
341func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
342 if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
343 return nil
344 }
345 for i := colltab.Primary; i < level; i++ {
346 if a.elems[0].w[i] < b.elems[0].w[i] {
347 return nil
348 }
349 }
350 if a.elems[0].w[level] >= b.elems[0].w[level] {
351 err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
352 log.Println(err)
353 // TODO: return the error instead, or better, fix the conflicting entry by making room.
354 }
355 return nil
356}
357
358func (b *Builder) error(e error) {
359 if e != nil {
360 b.err = e
361 }
362}
363
364func (b *Builder) errorID(locale string, e error) {
365 if e != nil {
366 b.err = fmt.Errorf("%s:%v", locale, e)
367 }
368}
369
370// patchNorm ensures that NFC and NFD counterparts are consistent.
371func (o *ordering) patchNorm() {
372 // Insert the NFD counterparts, if necessary.
373 for _, e := range o.ordered {
374 nfd := norm.NFD.String(e.str)
375 if nfd != e.str {
376 if e0 := o.find(nfd); e0 != nil && !e0.modified {
377 e0.elems = e.elems
378 } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
379 e := o.newEntry(nfd, e.elems)
380 e.modified = true
381 }
382 }
383 }
384 // Update unchanged composed forms if one of their parts changed.
385 for _, e := range o.ordered {
386 nfd := norm.NFD.String(e.str)
387 if e.modified || nfd == e.str {
388 continue
389 }
390 if e0 := o.find(nfd); e0 != nil {
391 e.elems = e0.elems
392 } else {
393 e.elems = o.genColElems(nfd)
394 if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
395 r := []rune(nfd)
396 head := string(r[0])
397 tail := ""
398 for i := 1; i < len(r); i++ {
399 s := norm.NFC.String(head + string(r[i]))
400 if e0 := o.find(s); e0 != nil && e0.modified {
401 head = s
402 } else {
403 tail += string(r[i])
404 }
405 }
406 e.elems = append(o.genColElems(head), o.genColElems(tail)...)
407 }
408 }
409 }
410 // Exclude entries for which the individual runes generate the same collation elements.
411 for _, e := range o.ordered {
412 if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
413 e.exclude = true
414 }
415 }
416}
417
418func (b *Builder) buildOrdering(o *ordering) {
419 for _, e := range o.ordered {
420 o.getWeight(e)
421 }
422 for _, e := range o.ordered {
423 o.addExtension(e)
424 }
425 o.patchNorm()
426 o.sort()
427 simplify(o)
428 b.processExpansions(o) // requires simplify
429 b.processContractions(o) // requires simplify
430
431 t := newNode()
432 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
433 if !e.skip() {
434 ce, err := e.encode()
435 b.errorID(o.id, err)
436 t.insert(e.runes[0], ce)
437 }
438 }
439 o.handle = b.index.addTrie(t)
440}
441
442func (b *Builder) build() (*table, error) {
443 if b.built {
444 return b.t, b.err
445 }
446 b.built = true
447 b.t = &table{
448 Table: colltab.Table{
449 MaxContractLen: utf8.UTFMax,
450 VariableTop: uint32(b.varTop),
451 },
452 }
453
454 b.buildOrdering(&b.root)
455 b.t.root = b.root.handle
456 for _, t := range b.locale {
457 b.buildOrdering(t.index)
458 if b.err != nil {
459 break
460 }
461 }
462 i, err := b.index.generate()
463 b.t.trie = *i
464 b.t.Index = colltab.Trie{
465 Index: i.index,
466 Values: i.values,
467 Index0: i.index[blockSize*b.t.root.lookupStart:],
468 Values0: i.values[blockSize*b.t.root.valueStart:],
469 }
470 b.error(err)
471 return b.t, b.err
472}
473
474// Build builds the root Collator.
475func (b *Builder) Build() (colltab.Weighter, error) {
476 table, err := b.build()
477 if err != nil {
478 return nil, err
479 }
480 return table, nil
481}
482
483// Build builds a Collator for Tailoring t.
484func (t *Tailoring) Build() (colltab.Weighter, error) {
485 // TODO: implement.
486 return nil, nil
487}
488
489// Print prints the tables for b and all its Tailorings as a Go file
490// that can be included in the Collate package.
491func (b *Builder) Print(w io.Writer) (n int, err error) {
492 p := func(nn int, e error) {
493 n += nn
494 if err == nil {
495 err = e
496 }
497 }
498 t, err := b.build()
499 if err != nil {
500 return 0, err
501 }
502 p(fmt.Fprintf(w, `var availableLocales = "und`))
503 for _, loc := range b.locale {
504 if loc.id != "und" {
505 p(fmt.Fprintf(w, ",%s", loc.id))
506 }
507 }
508 p(fmt.Fprint(w, "\"\n\n"))
509 p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
510 p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
511 for _, loc := range b.locale {
512 if loc.id == "und" {
513 p(t.fprintIndex(w, loc.index.handle, loc.id))
514 }
515 }
516 for _, loc := range b.locale {
517 if loc.id != "und" {
518 p(t.fprintIndex(w, loc.index.handle, loc.id))
519 }
520 }
521 p(fmt.Fprint(w, "}\n\n"))
522 n, _, err = t.fprint(w, "main")
523 return
524}
525
526// reproducibleFromNFKD checks whether the given expansion could be generated
527// from an NFKD expansion.
528func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
529 // Length must be equal.
530 if len(exp) != len(nfkd) {
531 return false
532 }
533 for i, ce := range exp {
534 // Primary and secondary values should be equal.
535 if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
536 return false
537 }
538 // Tertiary values should be equal to maxTertiary for third element onwards.
539 // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
540 // simply be dropped. Try this out by dropping the following code.
541 if i >= 2 && ce.w[2] != maxTertiary {
542 return false
543 }
544 if _, err := makeCE(ce); err != nil {
545 // Simply return false. The error will be caught elsewhere.
546 return false
547 }
548 }
549 return true
550}
551
552func simplify(o *ordering) {
553 // Runes that are a starter of a contraction should not be removed.
554 // (To date, there is only Kannada character 0CCA.)
555 keep := make(map[rune]bool)
556 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
557 if len(e.runes) > 1 {
558 keep[e.runes[0]] = true
559 }
560 }
561 // Tag entries for which the runes NFKD decompose to identical values.
562 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
563 s := e.str
564 nfkd := norm.NFKD.String(s)
565 nfd := norm.NFD.String(s)
566 if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
567 continue
568 }
569 if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
570 e.decompose = true
571 }
572 }
573}
574
575// appendExpansion converts the given collation sequence to
576// collation elements and adds them to the expansion table.
577// It returns an index to the expansion table.
578func (b *Builder) appendExpansion(e *entry) int {
579 t := b.t
580 i := len(t.ExpandElem)
581 ce := uint32(len(e.elems))
582 t.ExpandElem = append(t.ExpandElem, ce)
583 for _, w := range e.elems {
584 ce, err := makeCE(w)
585 if err != nil {
586 b.error(err)
587 return -1
588 }
589 t.ExpandElem = append(t.ExpandElem, ce)
590 }
591 return i
592}
593
594// processExpansions extracts data necessary to generate
595// the extraction tables.
596func (b *Builder) processExpansions(o *ordering) {
597 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
598 if !e.expansion() {
599 continue
600 }
601 key := fmt.Sprintf("%v", e.elems)
602 i, ok := b.expIndex[key]
603 if !ok {
604 i = b.appendExpansion(e)
605 b.expIndex[key] = i
606 }
607 e.expansionIndex = i
608 }
609}
610
611func (b *Builder) processContractions(o *ordering) {
612 // Collate contractions per starter rune.
613 starters := []rune{}
614 cm := make(map[rune][]*entry)
615 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
616 if e.contraction() {
617 if len(e.str) > b.t.MaxContractLen {
618 b.t.MaxContractLen = len(e.str)
619 }
620 r := e.runes[0]
621 if _, ok := cm[r]; !ok {
622 starters = append(starters, r)
623 }
624 cm[r] = append(cm[r], e)
625 }
626 }
627 // Add entries of single runes that are at a start of a contraction.
628 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
629 if !e.contraction() {
630 r := e.runes[0]
631 if _, ok := cm[r]; ok {
632 cm[r] = append(cm[r], e)
633 }
634 }
635 }
636 // Build the tries for the contractions.
637 t := b.t
638 for _, r := range starters {
639 l := cm[r]
640 // Compute suffix strings. There are 31 different contraction suffix
641 // sets for 715 contractions and 82 contraction starter runes as of
642 // version 6.0.0.
643 sufx := []string{}
644 hasSingle := false
645 for _, e := range l {
646 if len(e.runes) > 1 {
647 sufx = append(sufx, string(e.runes[1:]))
648 } else {
649 hasSingle = true
650 }
651 }
652 if !hasSingle {
653 b.error(fmt.Errorf("no single entry for starter rune %U found", r))
654 continue
655 }
656 // Unique the suffix set.
657 sort.Strings(sufx)
658 key := strings.Join(sufx, "\n")
659 handle, ok := b.ctHandle[key]
660 if !ok {
661 var err error
662 handle, err = appendTrie(&t.ContractTries, sufx)
663 if err != nil {
664 b.error(err)
665 }
666 b.ctHandle[key] = handle
667 }
668 // Bucket sort entries in index order.
669 es := make([]*entry, len(l))
670 for _, e := range l {
671 var p, sn int
672 if len(e.runes) > 1 {
673 str := []byte(string(e.runes[1:]))
674 p, sn = lookup(&t.ContractTries, handle, str)
675 if sn != len(str) {
676 log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
677 }
678 }
679 if es[p] != nil {
680 log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
681 }
682 es[p] = e
683 }
684 // Create collation elements for contractions.
685 elems := []uint32{}
686 for _, e := range es {
687 ce, err := e.encodeBase()
688 b.errorID(o.id, err)
689 elems = append(elems, ce)
690 }
691 key = fmt.Sprintf("%v", elems)
692 i, ok := b.ctElem[key]
693 if !ok {
694 i = len(t.ContractElem)
695 b.ctElem[key] = i
696 t.ContractElem = append(t.ContractElem, elems...)
697 }
698 // Store info in entry for starter rune.
699 es[0].contractionIndex = i
700 es[0].contractionHandle = handle
701 }
702}