Don Newton | 379ae25 | 2019-04-01 12:17:06 -0400 | [diff] [blame^] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package build // import "golang.org/x/text/collate/build" |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "io" |
| 10 | "log" |
| 11 | "sort" |
| 12 | "strings" |
| 13 | "unicode/utf8" |
| 14 | |
| 15 | "golang.org/x/text/internal/colltab" |
| 16 | "golang.org/x/text/language" |
| 17 | "golang.org/x/text/unicode/norm" |
| 18 | ) |
| 19 | |
| 20 | // TODO: optimizations: |
| 21 | // - expandElem is currently 20K. By putting unique colElems in a separate |
| 22 | // table and having a byte array of indexes into this table, we can reduce |
| 23 | // the total size to about 7K. By also factoring out the length bytes, we |
| 24 | // can reduce this to about 6K. |
| 25 | // - trie valueBlocks are currently 100K. There are a lot of sparse blocks |
| 26 | // and many consecutive values with the same stride. This can be further |
| 27 | // compacted. |
| 28 | // - Compress secondary weights into 8 bits. |
| 29 | // - Some LDML specs specify a context element. Currently we simply concatenate |
| 30 | // those. Context can be implemented using the contraction trie. If Builder |
| 31 | // could analyze and detect when using a context makes sense, there is no |
| 32 | // need to expose this construct in the API. |
| 33 | |
| 34 | // A Builder builds a root collation table. The user must specify the |
| 35 | // collation elements for each entry. A common use will be to base the weights |
| 36 | // on those specified in the allkeys* file as provided by the UCA or CLDR. |
| 37 | type Builder struct { |
| 38 | index *trieBuilder |
| 39 | root ordering |
| 40 | locale []*Tailoring |
| 41 | t *table |
| 42 | err error |
| 43 | built bool |
| 44 | |
| 45 | minNonVar int // lowest primary recorded for a variable |
| 46 | varTop int // highest primary recorded for a non-variable |
| 47 | |
| 48 | // indexes used for reusing expansions and contractions |
| 49 | expIndex map[string]int // positions of expansions keyed by their string representation |
| 50 | ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes |
| 51 | ctElem map[string]int // contraction elements keyed by their string representation |
| 52 | } |
| 53 | |
| 54 | // A Tailoring builds a collation table based on another collation table. |
| 55 | // The table is defined by specifying tailorings to the underlying table. |
| 56 | // See http://unicode.org/reports/tr35/ for an overview of tailoring |
| 57 | // collation tables. The CLDR contains pre-defined tailorings for a variety |
| 58 | // of languages (See http://www.unicode.org/Public/cldr/<version>/core.zip.) |
| 59 | type Tailoring struct { |
| 60 | id string |
| 61 | builder *Builder |
| 62 | index *ordering |
| 63 | |
| 64 | anchor *entry |
| 65 | before bool |
| 66 | } |
| 67 | |
| 68 | // NewBuilder returns a new Builder. |
| 69 | func NewBuilder() *Builder { |
| 70 | return &Builder{ |
| 71 | index: newTrieBuilder(), |
| 72 | root: makeRootOrdering(), |
| 73 | expIndex: make(map[string]int), |
| 74 | ctHandle: make(map[string]ctHandle), |
| 75 | ctElem: make(map[string]int), |
| 76 | } |
| 77 | } |
| 78 | |
| 79 | // Tailoring returns a Tailoring for the given locale. One should |
| 80 | // have completed all calls to Add before calling Tailoring. |
| 81 | func (b *Builder) Tailoring(loc language.Tag) *Tailoring { |
| 82 | t := &Tailoring{ |
| 83 | id: loc.String(), |
| 84 | builder: b, |
| 85 | index: b.root.clone(), |
| 86 | } |
| 87 | t.index.id = t.id |
| 88 | b.locale = append(b.locale, t) |
| 89 | return t |
| 90 | } |
| 91 | |
| 92 | // Add adds an entry to the collation element table, mapping |
| 93 | // a slice of runes to a sequence of collation elements. |
| 94 | // A collation element is specified as list of weights: []int{primary, secondary, ...}. |
| 95 | // The entries are typically obtained from a collation element table |
| 96 | // as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format. |
| 97 | // Note that the collation elements specified by colelems are only used |
| 98 | // as a guide. The actual weights generated by Builder may differ. |
| 99 | // The argument variables is a list of indices into colelems that should contain |
| 100 | // a value for each colelem that is a variable. (See the reference above.) |
| 101 | func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error { |
| 102 | str := string(runes) |
| 103 | elems := make([]rawCE, len(colelems)) |
| 104 | for i, ce := range colelems { |
| 105 | if len(ce) == 0 { |
| 106 | break |
| 107 | } |
| 108 | elems[i] = makeRawCE(ce, 0) |
| 109 | if len(ce) == 1 { |
| 110 | elems[i].w[1] = defaultSecondary |
| 111 | } |
| 112 | if len(ce) <= 2 { |
| 113 | elems[i].w[2] = defaultTertiary |
| 114 | } |
| 115 | if len(ce) <= 3 { |
| 116 | elems[i].w[3] = ce[0] |
| 117 | } |
| 118 | } |
| 119 | for i, ce := range elems { |
| 120 | p := ce.w[0] |
| 121 | isvar := false |
| 122 | for _, j := range variables { |
| 123 | if i == j { |
| 124 | isvar = true |
| 125 | } |
| 126 | } |
| 127 | if isvar { |
| 128 | if p >= b.minNonVar && b.minNonVar > 0 { |
| 129 | return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar) |
| 130 | } |
| 131 | if p > b.varTop { |
| 132 | b.varTop = p |
| 133 | } |
| 134 | } else if p > 1 { // 1 is a special primary value reserved for FFFE |
| 135 | if p <= b.varTop { |
| 136 | return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop) |
| 137 | } |
| 138 | if b.minNonVar == 0 || p < b.minNonVar { |
| 139 | b.minNonVar = p |
| 140 | } |
| 141 | } |
| 142 | } |
| 143 | elems, err := convertLargeWeights(elems) |
| 144 | if err != nil { |
| 145 | return err |
| 146 | } |
| 147 | cccs := []uint8{} |
| 148 | nfd := norm.NFD.String(str) |
| 149 | for i := range nfd { |
| 150 | cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC()) |
| 151 | } |
| 152 | if len(cccs) < len(elems) { |
| 153 | if len(cccs) > 2 { |
| 154 | return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems)) |
| 155 | } |
| 156 | p := len(elems) - 1 |
| 157 | for ; p > 0 && elems[p].w[0] == 0; p-- { |
| 158 | elems[p].ccc = cccs[len(cccs)-1] |
| 159 | } |
| 160 | for ; p >= 0; p-- { |
| 161 | elems[p].ccc = cccs[0] |
| 162 | } |
| 163 | } else { |
| 164 | for i := range elems { |
| 165 | elems[i].ccc = cccs[i] |
| 166 | } |
| 167 | } |
| 168 | // doNorm in collate.go assumes that the following conditions hold. |
| 169 | if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] { |
| 170 | return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs) |
| 171 | } |
| 172 | b.root.newEntry(str, elems) |
| 173 | return nil |
| 174 | } |
| 175 | |
| 176 | func (t *Tailoring) setAnchor(anchor string) error { |
| 177 | anchor = norm.NFC.String(anchor) |
| 178 | a := t.index.find(anchor) |
| 179 | if a == nil { |
| 180 | a = t.index.newEntry(anchor, nil) |
| 181 | a.implicit = true |
| 182 | a.modified = true |
| 183 | for _, r := range []rune(anchor) { |
| 184 | e := t.index.find(string(r)) |
| 185 | e.lock = true |
| 186 | } |
| 187 | } |
| 188 | t.anchor = a |
| 189 | return nil |
| 190 | } |
| 191 | |
| 192 | // SetAnchor sets the point after which elements passed in subsequent calls to |
| 193 | // Insert will be inserted. It is equivalent to the reset directive in an LDML |
| 194 | // specification. See Insert for an example. |
| 195 | // SetAnchor supports the following logical reset positions: |
| 196 | // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>, |
| 197 | // and <last_non_ignorable/>. |
| 198 | func (t *Tailoring) SetAnchor(anchor string) error { |
| 199 | if err := t.setAnchor(anchor); err != nil { |
| 200 | return err |
| 201 | } |
| 202 | t.before = false |
| 203 | return nil |
| 204 | } |
| 205 | |
| 206 | // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to |
| 207 | // Insert will insert entries before the anchor. |
| 208 | func (t *Tailoring) SetAnchorBefore(anchor string) error { |
| 209 | if err := t.setAnchor(anchor); err != nil { |
| 210 | return err |
| 211 | } |
| 212 | t.before = true |
| 213 | return nil |
| 214 | } |
| 215 | |
| 216 | // Insert sets the ordering of str relative to the entry set by the previous |
| 217 | // call to SetAnchor or Insert. The argument extend corresponds |
| 218 | // to the extend elements as defined in LDML. A non-empty value for extend |
| 219 | // will cause the collation elements corresponding to extend to be appended |
| 220 | // to the collation elements generated for the entry added by Insert. |
| 221 | // This has the same net effect as sorting str after the string anchor+extend. |
| 222 | // See http://www.unicode.org/reports/tr10/#Tailoring_Example for details |
| 223 | // on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements |
| 224 | // for full details on LDML. |
| 225 | // |
| 226 | // Examples: create a tailoring for Swedish, where "ä" is ordered after "z" |
| 227 | // at the primary sorting level: |
| 228 | // t := b.Tailoring("se") |
| 229 | // t.SetAnchor("z") |
| 230 | // t.Insert(colltab.Primary, "ä", "") |
| 231 | // Order "ü" after "ue" at the secondary sorting level: |
| 232 | // t.SetAnchor("ue") |
| 233 | // t.Insert(colltab.Secondary, "ü","") |
| 234 | // or |
| 235 | // t.SetAnchor("u") |
| 236 | // t.Insert(colltab.Secondary, "ü", "e") |
| 237 | // Order "q" afer "ab" at the secondary level and "Q" after "q" |
| 238 | // at the tertiary level: |
| 239 | // t.SetAnchor("ab") |
| 240 | // t.Insert(colltab.Secondary, "q", "") |
| 241 | // t.Insert(colltab.Tertiary, "Q", "") |
| 242 | // Order "b" before "a": |
| 243 | // t.SetAnchorBefore("a") |
| 244 | // t.Insert(colltab.Primary, "b", "") |
| 245 | // Order "0" after the last primary ignorable: |
| 246 | // t.SetAnchor("<last_primary_ignorable/>") |
| 247 | // t.Insert(colltab.Primary, "0", "") |
| 248 | func (t *Tailoring) Insert(level colltab.Level, str, extend string) error { |
| 249 | if t.anchor == nil { |
| 250 | return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str) |
| 251 | } |
| 252 | str = norm.NFC.String(str) |
| 253 | e := t.index.find(str) |
| 254 | if e == nil { |
| 255 | e = t.index.newEntry(str, nil) |
| 256 | } else if e.logical != noAnchor { |
| 257 | return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str) |
| 258 | } |
| 259 | if e.lock { |
| 260 | return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str) |
| 261 | } |
| 262 | a := t.anchor |
| 263 | // Find the first element after the anchor which differs at a level smaller or |
| 264 | // equal to the given level. Then insert at this position. |
| 265 | // See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details. |
| 266 | e.before = t.before |
| 267 | if t.before { |
| 268 | t.before = false |
| 269 | if a.prev == nil { |
| 270 | a.insertBefore(e) |
| 271 | } else { |
| 272 | for a = a.prev; a.level > level; a = a.prev { |
| 273 | } |
| 274 | a.insertAfter(e) |
| 275 | } |
| 276 | e.level = level |
| 277 | } else { |
| 278 | for ; a.level > level; a = a.next { |
| 279 | } |
| 280 | e.level = a.level |
| 281 | if a != e { |
| 282 | a.insertAfter(e) |
| 283 | a.level = level |
| 284 | } else { |
| 285 | // We don't set a to prev itself. This has the effect of the entry |
| 286 | // getting new collation elements that are an increment of itself. |
| 287 | // This is intentional. |
| 288 | a.prev.level = level |
| 289 | } |
| 290 | } |
| 291 | e.extend = norm.NFD.String(extend) |
| 292 | e.exclude = false |
| 293 | e.modified = true |
| 294 | e.elems = nil |
| 295 | t.anchor = e |
| 296 | return nil |
| 297 | } |
| 298 | |
| 299 | func (o *ordering) getWeight(e *entry) []rawCE { |
| 300 | if len(e.elems) == 0 && e.logical == noAnchor { |
| 301 | if e.implicit { |
| 302 | for _, r := range e.runes { |
| 303 | e.elems = append(e.elems, o.getWeight(o.find(string(r)))...) |
| 304 | } |
| 305 | } else if e.before { |
| 306 | count := [colltab.Identity + 1]int{} |
| 307 | a := e |
| 308 | for ; a.elems == nil && !a.implicit; a = a.next { |
| 309 | count[a.level]++ |
| 310 | } |
| 311 | e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)} |
| 312 | for i := colltab.Primary; i < colltab.Quaternary; i++ { |
| 313 | if count[i] != 0 { |
| 314 | e.elems[0].w[i] -= count[i] |
| 315 | break |
| 316 | } |
| 317 | } |
| 318 | if e.prev != nil { |
| 319 | o.verifyWeights(e.prev, e, e.prev.level) |
| 320 | } |
| 321 | } else { |
| 322 | prev := e.prev |
| 323 | e.elems = nextWeight(prev.level, o.getWeight(prev)) |
| 324 | o.verifyWeights(e, e.next, e.level) |
| 325 | } |
| 326 | } |
| 327 | return e.elems |
| 328 | } |
| 329 | |
| 330 | func (o *ordering) addExtension(e *entry) { |
| 331 | if ex := o.find(e.extend); ex != nil { |
| 332 | e.elems = append(e.elems, ex.elems...) |
| 333 | } else { |
| 334 | for _, r := range []rune(e.extend) { |
| 335 | e.elems = append(e.elems, o.find(string(r)).elems...) |
| 336 | } |
| 337 | } |
| 338 | e.extend = "" |
| 339 | } |
| 340 | |
| 341 | func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error { |
| 342 | if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil { |
| 343 | return nil |
| 344 | } |
| 345 | for i := colltab.Primary; i < level; i++ { |
| 346 | if a.elems[0].w[i] < b.elems[0].w[i] { |
| 347 | return nil |
| 348 | } |
| 349 | } |
| 350 | if a.elems[0].w[level] >= b.elems[0].w[level] { |
| 351 | err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems) |
| 352 | log.Println(err) |
| 353 | // TODO: return the error instead, or better, fix the conflicting entry by making room. |
| 354 | } |
| 355 | return nil |
| 356 | } |
| 357 | |
| 358 | func (b *Builder) error(e error) { |
| 359 | if e != nil { |
| 360 | b.err = e |
| 361 | } |
| 362 | } |
| 363 | |
| 364 | func (b *Builder) errorID(locale string, e error) { |
| 365 | if e != nil { |
| 366 | b.err = fmt.Errorf("%s:%v", locale, e) |
| 367 | } |
| 368 | } |
| 369 | |
| 370 | // patchNorm ensures that NFC and NFD counterparts are consistent. |
| 371 | func (o *ordering) patchNorm() { |
| 372 | // Insert the NFD counterparts, if necessary. |
| 373 | for _, e := range o.ordered { |
| 374 | nfd := norm.NFD.String(e.str) |
| 375 | if nfd != e.str { |
| 376 | if e0 := o.find(nfd); e0 != nil && !e0.modified { |
| 377 | e0.elems = e.elems |
| 378 | } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) { |
| 379 | e := o.newEntry(nfd, e.elems) |
| 380 | e.modified = true |
| 381 | } |
| 382 | } |
| 383 | } |
| 384 | // Update unchanged composed forms if one of their parts changed. |
| 385 | for _, e := range o.ordered { |
| 386 | nfd := norm.NFD.String(e.str) |
| 387 | if e.modified || nfd == e.str { |
| 388 | continue |
| 389 | } |
| 390 | if e0 := o.find(nfd); e0 != nil { |
| 391 | e.elems = e0.elems |
| 392 | } else { |
| 393 | e.elems = o.genColElems(nfd) |
| 394 | if norm.NFD.LastBoundary([]byte(nfd)) == 0 { |
| 395 | r := []rune(nfd) |
| 396 | head := string(r[0]) |
| 397 | tail := "" |
| 398 | for i := 1; i < len(r); i++ { |
| 399 | s := norm.NFC.String(head + string(r[i])) |
| 400 | if e0 := o.find(s); e0 != nil && e0.modified { |
| 401 | head = s |
| 402 | } else { |
| 403 | tail += string(r[i]) |
| 404 | } |
| 405 | } |
| 406 | e.elems = append(o.genColElems(head), o.genColElems(tail)...) |
| 407 | } |
| 408 | } |
| 409 | } |
| 410 | // Exclude entries for which the individual runes generate the same collation elements. |
| 411 | for _, e := range o.ordered { |
| 412 | if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) { |
| 413 | e.exclude = true |
| 414 | } |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | func (b *Builder) buildOrdering(o *ordering) { |
| 419 | for _, e := range o.ordered { |
| 420 | o.getWeight(e) |
| 421 | } |
| 422 | for _, e := range o.ordered { |
| 423 | o.addExtension(e) |
| 424 | } |
| 425 | o.patchNorm() |
| 426 | o.sort() |
| 427 | simplify(o) |
| 428 | b.processExpansions(o) // requires simplify |
| 429 | b.processContractions(o) // requires simplify |
| 430 | |
| 431 | t := newNode() |
| 432 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 433 | if !e.skip() { |
| 434 | ce, err := e.encode() |
| 435 | b.errorID(o.id, err) |
| 436 | t.insert(e.runes[0], ce) |
| 437 | } |
| 438 | } |
| 439 | o.handle = b.index.addTrie(t) |
| 440 | } |
| 441 | |
| 442 | func (b *Builder) build() (*table, error) { |
| 443 | if b.built { |
| 444 | return b.t, b.err |
| 445 | } |
| 446 | b.built = true |
| 447 | b.t = &table{ |
| 448 | Table: colltab.Table{ |
| 449 | MaxContractLen: utf8.UTFMax, |
| 450 | VariableTop: uint32(b.varTop), |
| 451 | }, |
| 452 | } |
| 453 | |
| 454 | b.buildOrdering(&b.root) |
| 455 | b.t.root = b.root.handle |
| 456 | for _, t := range b.locale { |
| 457 | b.buildOrdering(t.index) |
| 458 | if b.err != nil { |
| 459 | break |
| 460 | } |
| 461 | } |
| 462 | i, err := b.index.generate() |
| 463 | b.t.trie = *i |
| 464 | b.t.Index = colltab.Trie{ |
| 465 | Index: i.index, |
| 466 | Values: i.values, |
| 467 | Index0: i.index[blockSize*b.t.root.lookupStart:], |
| 468 | Values0: i.values[blockSize*b.t.root.valueStart:], |
| 469 | } |
| 470 | b.error(err) |
| 471 | return b.t, b.err |
| 472 | } |
| 473 | |
| 474 | // Build builds the root Collator. |
| 475 | func (b *Builder) Build() (colltab.Weighter, error) { |
| 476 | table, err := b.build() |
| 477 | if err != nil { |
| 478 | return nil, err |
| 479 | } |
| 480 | return table, nil |
| 481 | } |
| 482 | |
| 483 | // Build builds a Collator for Tailoring t. |
| 484 | func (t *Tailoring) Build() (colltab.Weighter, error) { |
| 485 | // TODO: implement. |
| 486 | return nil, nil |
| 487 | } |
| 488 | |
| 489 | // Print prints the tables for b and all its Tailorings as a Go file |
| 490 | // that can be included in the Collate package. |
| 491 | func (b *Builder) Print(w io.Writer) (n int, err error) { |
| 492 | p := func(nn int, e error) { |
| 493 | n += nn |
| 494 | if err == nil { |
| 495 | err = e |
| 496 | } |
| 497 | } |
| 498 | t, err := b.build() |
| 499 | if err != nil { |
| 500 | return 0, err |
| 501 | } |
| 502 | p(fmt.Fprintf(w, `var availableLocales = "und`)) |
| 503 | for _, loc := range b.locale { |
| 504 | if loc.id != "und" { |
| 505 | p(fmt.Fprintf(w, ",%s", loc.id)) |
| 506 | } |
| 507 | } |
| 508 | p(fmt.Fprint(w, "\"\n\n")) |
| 509 | p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop)) |
| 510 | p(fmt.Fprintln(w, "var locales = [...]tableIndex{")) |
| 511 | for _, loc := range b.locale { |
| 512 | if loc.id == "und" { |
| 513 | p(t.fprintIndex(w, loc.index.handle, loc.id)) |
| 514 | } |
| 515 | } |
| 516 | for _, loc := range b.locale { |
| 517 | if loc.id != "und" { |
| 518 | p(t.fprintIndex(w, loc.index.handle, loc.id)) |
| 519 | } |
| 520 | } |
| 521 | p(fmt.Fprint(w, "}\n\n")) |
| 522 | n, _, err = t.fprint(w, "main") |
| 523 | return |
| 524 | } |
| 525 | |
| 526 | // reproducibleFromNFKD checks whether the given expansion could be generated |
| 527 | // from an NFKD expansion. |
| 528 | func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool { |
| 529 | // Length must be equal. |
| 530 | if len(exp) != len(nfkd) { |
| 531 | return false |
| 532 | } |
| 533 | for i, ce := range exp { |
| 534 | // Primary and secondary values should be equal. |
| 535 | if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] { |
| 536 | return false |
| 537 | } |
| 538 | // Tertiary values should be equal to maxTertiary for third element onwards. |
| 539 | // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can |
| 540 | // simply be dropped. Try this out by dropping the following code. |
| 541 | if i >= 2 && ce.w[2] != maxTertiary { |
| 542 | return false |
| 543 | } |
| 544 | if _, err := makeCE(ce); err != nil { |
| 545 | // Simply return false. The error will be caught elsewhere. |
| 546 | return false |
| 547 | } |
| 548 | } |
| 549 | return true |
| 550 | } |
| 551 | |
| 552 | func simplify(o *ordering) { |
| 553 | // Runes that are a starter of a contraction should not be removed. |
| 554 | // (To date, there is only Kannada character 0CCA.) |
| 555 | keep := make(map[rune]bool) |
| 556 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 557 | if len(e.runes) > 1 { |
| 558 | keep[e.runes[0]] = true |
| 559 | } |
| 560 | } |
| 561 | // Tag entries for which the runes NFKD decompose to identical values. |
| 562 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 563 | s := e.str |
| 564 | nfkd := norm.NFKD.String(s) |
| 565 | nfd := norm.NFD.String(s) |
| 566 | if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd { |
| 567 | continue |
| 568 | } |
| 569 | if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) { |
| 570 | e.decompose = true |
| 571 | } |
| 572 | } |
| 573 | } |
| 574 | |
| 575 | // appendExpansion converts the given collation sequence to |
| 576 | // collation elements and adds them to the expansion table. |
| 577 | // It returns an index to the expansion table. |
| 578 | func (b *Builder) appendExpansion(e *entry) int { |
| 579 | t := b.t |
| 580 | i := len(t.ExpandElem) |
| 581 | ce := uint32(len(e.elems)) |
| 582 | t.ExpandElem = append(t.ExpandElem, ce) |
| 583 | for _, w := range e.elems { |
| 584 | ce, err := makeCE(w) |
| 585 | if err != nil { |
| 586 | b.error(err) |
| 587 | return -1 |
| 588 | } |
| 589 | t.ExpandElem = append(t.ExpandElem, ce) |
| 590 | } |
| 591 | return i |
| 592 | } |
| 593 | |
| 594 | // processExpansions extracts data necessary to generate |
| 595 | // the extraction tables. |
| 596 | func (b *Builder) processExpansions(o *ordering) { |
| 597 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 598 | if !e.expansion() { |
| 599 | continue |
| 600 | } |
| 601 | key := fmt.Sprintf("%v", e.elems) |
| 602 | i, ok := b.expIndex[key] |
| 603 | if !ok { |
| 604 | i = b.appendExpansion(e) |
| 605 | b.expIndex[key] = i |
| 606 | } |
| 607 | e.expansionIndex = i |
| 608 | } |
| 609 | } |
| 610 | |
| 611 | func (b *Builder) processContractions(o *ordering) { |
| 612 | // Collate contractions per starter rune. |
| 613 | starters := []rune{} |
| 614 | cm := make(map[rune][]*entry) |
| 615 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 616 | if e.contraction() { |
| 617 | if len(e.str) > b.t.MaxContractLen { |
| 618 | b.t.MaxContractLen = len(e.str) |
| 619 | } |
| 620 | r := e.runes[0] |
| 621 | if _, ok := cm[r]; !ok { |
| 622 | starters = append(starters, r) |
| 623 | } |
| 624 | cm[r] = append(cm[r], e) |
| 625 | } |
| 626 | } |
| 627 | // Add entries of single runes that are at a start of a contraction. |
| 628 | for e := o.front(); e != nil; e, _ = e.nextIndexed() { |
| 629 | if !e.contraction() { |
| 630 | r := e.runes[0] |
| 631 | if _, ok := cm[r]; ok { |
| 632 | cm[r] = append(cm[r], e) |
| 633 | } |
| 634 | } |
| 635 | } |
| 636 | // Build the tries for the contractions. |
| 637 | t := b.t |
| 638 | for _, r := range starters { |
| 639 | l := cm[r] |
| 640 | // Compute suffix strings. There are 31 different contraction suffix |
| 641 | // sets for 715 contractions and 82 contraction starter runes as of |
| 642 | // version 6.0.0. |
| 643 | sufx := []string{} |
| 644 | hasSingle := false |
| 645 | for _, e := range l { |
| 646 | if len(e.runes) > 1 { |
| 647 | sufx = append(sufx, string(e.runes[1:])) |
| 648 | } else { |
| 649 | hasSingle = true |
| 650 | } |
| 651 | } |
| 652 | if !hasSingle { |
| 653 | b.error(fmt.Errorf("no single entry for starter rune %U found", r)) |
| 654 | continue |
| 655 | } |
| 656 | // Unique the suffix set. |
| 657 | sort.Strings(sufx) |
| 658 | key := strings.Join(sufx, "\n") |
| 659 | handle, ok := b.ctHandle[key] |
| 660 | if !ok { |
| 661 | var err error |
| 662 | handle, err = appendTrie(&t.ContractTries, sufx) |
| 663 | if err != nil { |
| 664 | b.error(err) |
| 665 | } |
| 666 | b.ctHandle[key] = handle |
| 667 | } |
| 668 | // Bucket sort entries in index order. |
| 669 | es := make([]*entry, len(l)) |
| 670 | for _, e := range l { |
| 671 | var p, sn int |
| 672 | if len(e.runes) > 1 { |
| 673 | str := []byte(string(e.runes[1:])) |
| 674 | p, sn = lookup(&t.ContractTries, handle, str) |
| 675 | if sn != len(str) { |
| 676 | log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str)) |
| 677 | } |
| 678 | } |
| 679 | if es[p] != nil { |
| 680 | log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0]) |
| 681 | } |
| 682 | es[p] = e |
| 683 | } |
| 684 | // Create collation elements for contractions. |
| 685 | elems := []uint32{} |
| 686 | for _, e := range es { |
| 687 | ce, err := e.encodeBase() |
| 688 | b.errorID(o.id, err) |
| 689 | elems = append(elems, ce) |
| 690 | } |
| 691 | key = fmt.Sprintf("%v", elems) |
| 692 | i, ok := b.ctElem[key] |
| 693 | if !ok { |
| 694 | i = len(t.ContractElem) |
| 695 | b.ctElem[key] = i |
| 696 | t.ContractElem = append(t.ContractElem, elems...) |
| 697 | } |
| 698 | // Store info in entry for starter rune. |
| 699 | es[0].contractionIndex = i |
| 700 | es[0].contractionHandle = handle |
| 701 | } |
| 702 | } |