Scott Baker | eee8dd8 | 2019-09-24 12:52:34 -0700 | [diff] [blame^] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | // Language tag table generator. |
| 8 | // Data read from the web. |
| 9 | |
| 10 | package main |
| 11 | |
| 12 | import ( |
| 13 | "bufio" |
| 14 | "flag" |
| 15 | "fmt" |
| 16 | "io" |
| 17 | "io/ioutil" |
| 18 | "log" |
| 19 | "math" |
| 20 | "reflect" |
| 21 | "regexp" |
| 22 | "sort" |
| 23 | "strconv" |
| 24 | "strings" |
| 25 | |
| 26 | "golang.org/x/text/internal/gen" |
| 27 | "golang.org/x/text/internal/tag" |
| 28 | "golang.org/x/text/unicode/cldr" |
| 29 | ) |
| 30 | |
// test is the -test flag. Per its usage text it requests testing of the
// existing tables rather than relying on freshly generated ones.
var test = flag.Bool(
	"test",
	false,
	"test existing tables; can be used to compare web data with package data.",
)

// outputFile is the -output flag: the name of the file that receives the
// generated tables. It defaults to "tables.go".
var outputFile = flag.String(
	"output",
	"tables.go",
	"output file for generated tables",
)
| 39 | |
// comment holds the documentation that is emitted above the generated
// tables. Every entry starts with a newline followed by the name of the
// identifier it documents; init uses that leading word as the lookup key
// when it builds commentIndex.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
- if the first letter of the 2- and 3-letter ISO codes are the same:
the second and third letter of the 3-letter ISO code.
- otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
langAliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter code is followed by two bytes with the following meaning:
- [A-Z]{2}: the first letter of the 2-letter code plus these two
letters form the 3-letter ISO code.
- 0, n: index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
fromM49[m49Index[msb3(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists. If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`,
	`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings. These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
| 137 | |
| 138 | // TODO: consider changing some of these structures to tries. This can reduce |
| 139 | // memory, but may increase the need for memory allocations. This could be |
| 140 | // mitigated if we can piggyback on language tags for common cases. |
| 141 | |
// failOnError logs e and panics (via log.Panic) when e is non-nil.
// A nil error is ignored.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
| 147 | |
// setType describes how a stringSet is being used, so that conflicting uses
// of the same set can be detected.
type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

// stringSet is a set of strings kept in a slice. Elements may be added in
// any order; compact sorts the slice and removes duplicates on demand.
type stringSet struct {
	s []string
	// sorted is maintained by add, replace and compact, which all set it to
	// the value of frozen. frozen is set by freeze and makes add and remove
	// panic.
	sorted, frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string
	typ    setType // used for checking.
}

// clone returns a copy of ss that has its own element slice. The update map
// is not duplicated: the copy refers to the same map as ss.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType records that ss is used as a set of type t. It panics if ss was
// already used as a different type.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
	// Remember the type; without this the check above can never trigger.
	ss.typ = t
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
}

// assertChangeable panics if ss has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. Duplicates are only removed by the next compact.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze compacts the set and marks it as frozen, after which add and remove
// panic.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts the elements and removes duplicates. It is a no-op when the
// set is already marked as sorted. An empty set is left empty.
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	sort.Strings(a)
	// Guard against the empty set: a[:k+1] below would be out of range.
	if len(a) > 0 {
		k := 0
		for i := 1; i < len(a); i++ {
			if a[k] != a[i] {
				a[k+1] = a[i]
				k++
			}
		}
		ss.s = a[:k+1]
	}
	ss.sorted = ss.frozen
}

// funcSorter sorts a string slice using a caller-supplied less function.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

// Less reports whether element i sorts before element j according to fn.
func (s funcSorter) Less(i, j int) bool {
	return s.fn(s.StringSlice[i], s.StringSlice[j])
}

// sortFunc compacts the set and then reorders its elements according to f.
func (ss *stringSet) sortFunc(f func(a, b string) bool) {
	ss.compact()
	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
}

// remove deletes s from the set if it is present. It panics if the set is
// frozen.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace substitutes nu for the element ol. It panics (through index) if ol
// is not in the set.
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the compacted set and panics if s is
// not present. It marks the set as Indexed.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)

	}
	return i
}

// find compacts the set and performs a binary search for s. It returns the
// position at which s is, or would be inserted, and whether s is present.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the compacted elements. The returned slice is the set's own
// storage, not a copy.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records key as the value associated with v in the update map,
// creating the map on first use.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the string and ensures that all entries are of the same length.
// Before joining it appends a sentinel entry consisting of 0xff bytes of the
// common length; the sentinel stays in ss.s afterwards. The set must not be
// empty.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panicf("join: not all entries are of the same length: %q", s)
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}
| 291 | |
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// parseRegistry fills each field from the registry field named next to it.
type ianaEntry struct {
	typ            string   // value of "Type:"
	description    []string // one element per "Description:" field
	scope          string   // value of "Scope:"
	added          string   // value of "Added:"
	preferred      string   // value of "Preferred-Value:"
	deprecated     string   // value of "Deprecated:"
	suppressScript string   // value of "Suppress-Script:"
	macro          string   // value of "Macrolanguage:"
	prefix         []string // one element per "Prefix:" field
}
| 307 | |
// builder holds the decoded input data, the indices derived from it and the
// writers used to emit the generated tables.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData // set to data.Supplemental() by newBuilder

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	// registry maps a subtag or tag to its IANA registry record; it is
	// populated by parseRegistry.
	registry map[string]*ianaEntry
}
| 328 | |
// index is the type of the group IDs stored in builder.groups.
type index uint
| 330 | |
| 331 | func newBuilder(w *gen.CodeWriter) *builder { |
| 332 | r := gen.OpenCLDRCoreZip() |
| 333 | defer r.Close() |
| 334 | d := &cldr.Decoder{} |
| 335 | data, err := d.DecodeZip(r) |
| 336 | failOnError(err) |
| 337 | b := builder{ |
| 338 | w: w, |
| 339 | hw: io.MultiWriter(w, w.Hash), |
| 340 | data: data, |
| 341 | supp: data.Supplemental(), |
| 342 | } |
| 343 | b.parseRegistry() |
| 344 | return &b |
| 345 | } |
| 346 | |
| 347 | func (b *builder) parseRegistry() { |
| 348 | r := gen.OpenIANAFile("assignments/language-subtag-registry") |
| 349 | defer r.Close() |
| 350 | b.registry = make(map[string]*ianaEntry) |
| 351 | |
| 352 | scan := bufio.NewScanner(r) |
| 353 | scan.Split(bufio.ScanWords) |
| 354 | var record *ianaEntry |
| 355 | for more := scan.Scan(); more; { |
| 356 | key := scan.Text() |
| 357 | more = scan.Scan() |
| 358 | value := scan.Text() |
| 359 | switch key { |
| 360 | case "Type:": |
| 361 | record = &ianaEntry{typ: value} |
| 362 | case "Subtag:", "Tag:": |
| 363 | if s := strings.SplitN(value, "..", 2); len(s) > 1 { |
| 364 | for a := s[0]; a <= s[1]; a = inc(a) { |
| 365 | b.addToRegistry(a, record) |
| 366 | } |
| 367 | } else { |
| 368 | b.addToRegistry(value, record) |
| 369 | } |
| 370 | case "Suppress-Script:": |
| 371 | record.suppressScript = value |
| 372 | case "Added:": |
| 373 | record.added = value |
| 374 | case "Deprecated:": |
| 375 | record.deprecated = value |
| 376 | case "Macrolanguage:": |
| 377 | record.macro = value |
| 378 | case "Preferred-Value:": |
| 379 | record.preferred = value |
| 380 | case "Prefix:": |
| 381 | record.prefix = append(record.prefix, value) |
| 382 | case "Scope:": |
| 383 | record.scope = value |
| 384 | case "Description:": |
| 385 | buf := []byte(value) |
| 386 | for more = scan.Scan(); more; more = scan.Scan() { |
| 387 | b := scan.Bytes() |
| 388 | if b[0] == '%' || b[len(b)-1] == ':' { |
| 389 | break |
| 390 | } |
| 391 | buf = append(buf, ' ') |
| 392 | buf = append(buf, b...) |
| 393 | } |
| 394 | record.description = append(record.description, string(buf)) |
| 395 | continue |
| 396 | default: |
| 397 | continue |
| 398 | } |
| 399 | more = scan.Scan() |
| 400 | } |
| 401 | if scan.Err() != nil { |
| 402 | log.Panic(scan.Err()) |
| 403 | } |
| 404 | } |
| 405 | |
| 406 | func (b *builder) addToRegistry(key string, entry *ianaEntry) { |
| 407 | if info, ok := b.registry[key]; ok { |
| 408 | if info.typ != "language" || entry.typ != "extlang" { |
| 409 | log.Fatalf("parseRegistry: tag %q already exists", key) |
| 410 | } |
| 411 | } else { |
| 412 | b.registry[key] = entry |
| 413 | } |
| 414 | } |
| 415 | |
| 416 | var commentIndex = make(map[string]string) |
| 417 | |
| 418 | func init() { |
| 419 | for _, s := range comment { |
| 420 | key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) |
| 421 | commentIndex[key] = s |
| 422 | } |
| 423 | } |
| 424 | |
| 425 | func (b *builder) comment(name string) { |
| 426 | if s := commentIndex[name]; len(s) > 0 { |
| 427 | b.w.WriteComment(s) |
| 428 | } else { |
| 429 | fmt.Fprintln(b.w) |
| 430 | } |
| 431 | } |
| 432 | |
| 433 | func (b *builder) pf(f string, x ...interface{}) { |
| 434 | fmt.Fprintf(b.hw, f, x...) |
| 435 | fmt.Fprint(b.hw, "\n") |
| 436 | } |
| 437 | |
| 438 | func (b *builder) p(x ...interface{}) { |
| 439 | fmt.Fprintln(b.hw, x...) |
| 440 | } |
| 441 | |
| 442 | func (b *builder) addSize(s int) { |
| 443 | b.w.Size += s |
| 444 | b.pf("// Size: %d bytes", s) |
| 445 | } |
| 446 | |
| 447 | func (b *builder) writeConst(name string, x interface{}) { |
| 448 | b.comment(name) |
| 449 | b.w.WriteConst(name, x) |
| 450 | } |
| 451 | |
| 452 | // writeConsts computes f(v) for all v in values and writes the results |
| 453 | // as constants named _v to a single constant block. |
| 454 | func (b *builder) writeConsts(f func(string) int, values ...string) { |
| 455 | b.pf("const (") |
| 456 | for _, v := range values { |
| 457 | b.pf("\t_%s = %v", v, f(v)) |
| 458 | } |
| 459 | b.pf(")") |
| 460 | } |
| 461 | |
| 462 | // writeType writes the type of the given value, which must be a struct. |
| 463 | func (b *builder) writeType(value interface{}) { |
| 464 | b.comment(reflect.TypeOf(value).Name()) |
| 465 | b.w.WriteType(value) |
| 466 | } |
| 467 | |
| 468 | func (b *builder) writeSlice(name string, ss interface{}) { |
| 469 | b.writeSliceAddSize(name, 0, ss) |
| 470 | } |
| 471 | |
| 472 | func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { |
| 473 | b.comment(name) |
| 474 | b.w.Size += extraSize |
| 475 | v := reflect.ValueOf(ss) |
| 476 | t := v.Type().Elem() |
| 477 | b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) |
| 478 | |
| 479 | fmt.Fprintf(b.w, "var %s = ", name) |
| 480 | b.w.WriteArray(ss) |
| 481 | b.p() |
| 482 | } |
| 483 | |
// fromTo is one entry of a table written by writeSortedMap: the index of an
// element and the index of the value recorded for it.
type fromTo struct {
	from uint16
	to   uint16
}
| 487 | |
| 488 | func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { |
| 489 | ss.sortFunc(func(a, b string) bool { |
| 490 | return index(a) < index(b) |
| 491 | }) |
| 492 | m := []fromTo{} |
| 493 | for _, s := range ss.s { |
| 494 | m = append(m, fromTo{index(s), index(ss.update[s])}) |
| 495 | } |
| 496 | b.writeSlice(name, m) |
| 497 | } |
| 498 | |
// base is the number of letters from 'a' through 'z'. It is the radix used
// by strToInt and intToStr.
const base = 'z' - 'a' + 1
| 500 | |
| 501 | func strToInt(s string) uint { |
| 502 | v := uint(0) |
| 503 | for i := 0; i < len(s); i++ { |
| 504 | v *= base |
| 505 | v += uint(s[i] - 'a') |
| 506 | } |
| 507 | return v |
| 508 | } |
| 509 | |
| 510 | // converts the given integer to the original ASCII string passed to strToInt. |
| 511 | // len(s) must match the number of characters obtained. |
| 512 | func intToStr(v uint, s []byte) { |
| 513 | for i := len(s) - 1; i >= 0; i-- { |
| 514 | s[i] = byte(v%base) + 'a' |
| 515 | v /= base |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | func (b *builder) writeBitVector(name string, ss []string) { |
| 520 | vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) |
| 521 | for _, s := range ss { |
| 522 | v := strToInt(s) |
| 523 | vec[v/8] |= 1 << (v % 8) |
| 524 | } |
| 525 | b.writeSlice(name, vec) |
| 526 | } |
| 527 | |
| 528 | // TODO: convert this type into a list or two-stage trie. |
| 529 | func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { |
| 530 | b.comment(name) |
| 531 | v := reflect.ValueOf(m) |
| 532 | sz := v.Len() * (2 + int(v.Type().Key().Size())) |
| 533 | for _, k := range m { |
| 534 | sz += len(k) |
| 535 | } |
| 536 | b.addSize(sz) |
| 537 | keys := []string{} |
| 538 | b.pf(`var %s = map[string]uint16{`, name) |
| 539 | for k := range m { |
| 540 | keys = append(keys, k) |
| 541 | } |
| 542 | sort.Strings(keys) |
| 543 | for _, k := range keys { |
| 544 | b.pf("\t%q: %v,", k, f(m[k])) |
| 545 | } |
| 546 | b.p("}") |
| 547 | } |
| 548 | |
| 549 | func (b *builder) writeMap(name string, m interface{}) { |
| 550 | b.comment(name) |
| 551 | v := reflect.ValueOf(m) |
| 552 | sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) |
| 553 | b.addSize(sz) |
| 554 | f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { |
| 555 | return strings.IndexRune("{}, ", r) != -1 |
| 556 | }) |
| 557 | sort.Strings(f[1:]) |
| 558 | b.pf(`var %s = %s{`, name, f[0]) |
| 559 | for _, kv := range f[1:] { |
| 560 | b.pf("\t%s,", kv) |
| 561 | } |
| 562 | b.p("}") |
| 563 | } |
| 564 | |
| 565 | func (b *builder) langIndex(s string) uint16 { |
| 566 | if s == "und" { |
| 567 | return 0 |
| 568 | } |
| 569 | if i, ok := b.lang.find(s); ok { |
| 570 | return uint16(i) |
| 571 | } |
| 572 | return uint16(strToInt(s)) + uint16(len(b.lang.s)) |
| 573 | } |
| 574 | |
| 575 | // inc advances the string to its lexicographical successor. |
| 576 | func inc(s string) string { |
| 577 | const maxTagLength = 4 |
| 578 | var buf [maxTagLength]byte |
| 579 | intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) |
| 580 | for i := 0; i < len(s); i++ { |
| 581 | if s[i] <= 'Z' { |
| 582 | buf[i] -= 'a' - 'A' |
| 583 | } |
| 584 | } |
| 585 | return string(buf[:len(s)]) |
| 586 | } |
| 587 | |
| 588 | func (b *builder) parseIndices() { |
| 589 | meta := b.supp.Metadata |
| 590 | |
| 591 | for k, v := range b.registry { |
| 592 | var ss *stringSet |
| 593 | switch v.typ { |
| 594 | case "language": |
| 595 | if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { |
| 596 | b.lang.add(k) |
| 597 | continue |
| 598 | } else { |
| 599 | ss = &b.langNoIndex |
| 600 | } |
| 601 | case "region": |
| 602 | ss = &b.region |
| 603 | case "script": |
| 604 | ss = &b.script |
| 605 | case "variant": |
| 606 | ss = &b.variant |
| 607 | default: |
| 608 | continue |
| 609 | } |
| 610 | ss.add(k) |
| 611 | } |
| 612 | // Include any language for which there is data. |
| 613 | for _, lang := range b.data.Locales() { |
| 614 | if x := b.data.RawLDML(lang); false || |
| 615 | x.LocaleDisplayNames != nil || |
| 616 | x.Characters != nil || |
| 617 | x.Delimiters != nil || |
| 618 | x.Measurement != nil || |
| 619 | x.Dates != nil || |
| 620 | x.Numbers != nil || |
| 621 | x.Units != nil || |
| 622 | x.ListPatterns != nil || |
| 623 | x.Collations != nil || |
| 624 | x.Segmentations != nil || |
| 625 | x.Rbnf != nil || |
| 626 | x.Annotations != nil || |
| 627 | x.Metadata != nil { |
| 628 | |
| 629 | from := strings.Split(lang, "_") |
| 630 | if lang := from[0]; lang != "root" { |
| 631 | b.lang.add(lang) |
| 632 | } |
| 633 | } |
| 634 | } |
| 635 | // Include locales for plural rules, which uses a different structure. |
| 636 | for _, plurals := range b.data.Supplemental().Plurals { |
| 637 | for _, rules := range plurals.PluralRules { |
| 638 | for _, lang := range strings.Split(rules.Locales, " ") { |
| 639 | if lang = strings.Split(lang, "_")[0]; lang != "root" { |
| 640 | b.lang.add(lang) |
| 641 | } |
| 642 | } |
| 643 | } |
| 644 | } |
| 645 | // Include languages in likely subtags. |
| 646 | for _, m := range b.supp.LikelySubtags.LikelySubtag { |
| 647 | from := strings.Split(m.From, "_") |
| 648 | b.lang.add(from[0]) |
| 649 | } |
| 650 | // Include ISO-639 alpha-3 bibliographic entries. |
| 651 | for _, a := range meta.Alias.LanguageAlias { |
| 652 | if a.Reason == "bibliographic" { |
| 653 | b.langNoIndex.add(a.Type) |
| 654 | } |
| 655 | } |
| 656 | // Include regions in territoryAlias (not all are in the IANA registry!) |
| 657 | for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { |
| 658 | if len(reg.Type) == 2 { |
| 659 | b.region.add(reg.Type) |
| 660 | } |
| 661 | } |
| 662 | |
| 663 | for _, s := range b.lang.s { |
| 664 | if len(s) == 3 { |
| 665 | b.langNoIndex.remove(s) |
| 666 | } |
| 667 | } |
| 668 | b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) |
| 669 | b.writeConst("numScripts", len(b.script.slice())) |
| 670 | b.writeConst("numRegions", len(b.region.slice())) |
| 671 | |
| 672 | // Add dummy codes at the start of each list to represent "unspecified". |
| 673 | b.lang.add("---") |
| 674 | b.script.add("----") |
| 675 | b.region.add("---") |
| 676 | |
| 677 | // common locales |
| 678 | b.locale.parse(meta.DefaultContent.Locales) |
| 679 | } |
| 680 | |
// TODO: region inclusion data will probably not be used in future matchers.
| 682 | |
| 683 | func (b *builder) computeRegionGroups() { |
| 684 | b.groups = make(map[int]index) |
| 685 | |
| 686 | // Create group indices. |
| 687 | for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. |
| 688 | b.groups[i] = index(len(b.groups)) |
| 689 | } |
| 690 | for _, g := range b.supp.TerritoryContainment.Group { |
| 691 | // Skip UN and EURO zone as they are flattening the containment |
| 692 | // relationship. |
| 693 | if g.Type == "EZ" || g.Type == "UN" { |
| 694 | continue |
| 695 | } |
| 696 | group := b.region.index(g.Type) |
| 697 | if _, ok := b.groups[group]; !ok { |
| 698 | b.groups[group] = index(len(b.groups)) |
| 699 | } |
| 700 | } |
| 701 | if len(b.groups) > 64 { |
| 702 | log.Fatalf("only 64 groups supported, found %d", len(b.groups)) |
| 703 | } |
| 704 | b.writeConst("nRegionGroups", len(b.groups)) |
| 705 | } |
| 706 | |
// langConsts lists the languages for which writeLanguage emits a constant
// named _<code>, in the order in which the constants are written.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de",
	"el", "en", "es", "et", "fa", "fi", "fil", "fr", "gu", "he",
	"hi", "hr", "hu", "hy", "id", "is", "it", "ja", "ka", "kk",
	"km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", "mn",
	"mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa",
	"pl", "pt", "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr",
	"sv", "sw", "ta", "te", "th", "tl", "tn", "tr", "uk", "ur",
	"uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao",
	"tay", "tsu", "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
| 719 | |
// writeLanguage generates all tables needed for language canonicalization.
//
// In order, it writes: nonCanonicalUnd, the constants for langConsts,
// langPrivateStart and langPrivateEnd, the lang index, langNoIndexOffset,
// the langNoIndex bit vector, altLangISO3, altLangIndex, langAliasMap and
// langAliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy so that the padding applied to the entries further down
	// does not alter b.lang.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		// An alias without a replacement is mapped to "und".
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		// Only the part of the replacement before the first '_' is used.
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// Record the 3-letter form of a 2-letter code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				// An unknown reason means this code needs to be updated.
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	// The first element of lang.s is skipped.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// Fall back to the long form recorded for the alias replacement
			// of v, if there is one.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A long form with a different first letter cannot be derived from
		// the short code and is therefore listed in altLangISO3.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: append the rest of the long form.
				add = s[1:]
			} else {
				// Otherwise append a 0 byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Append the number of altLangIndex entries collected so far. For
		// every element after the dummy this is the position at which its
		// language index is stored in altLangIndex.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	// writeSortedMap leaves langAliasMap.s ordered by language index, so
	// types lines up with the entries of the table just written.
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}
| 854 | |
// scriptConsts lists the scripts for which writeScript emits a constant
// named _<code>, in the order in which the constants are written.
var scriptConsts = []string{
	"Latn",
	"Hani", "Hans", "Hant",
	"Qaaa", "Qaai", "Qabx",
	"Zinh", "Zyyy", "Zzzz",
}
| 859 | |
| 860 | func (b *builder) writeScript() { |
| 861 | b.writeConsts(b.script.index, scriptConsts...) |
| 862 | b.writeConst("script", tag.Index(b.script.join())) |
| 863 | |
| 864 | supp := make([]uint8, len(b.lang.slice())) |
| 865 | for i, v := range b.lang.slice()[1:] { |
| 866 | if sc := b.registry[v].suppressScript; sc != "" { |
| 867 | supp[i+1] = uint8(b.script.index(sc)) |
| 868 | } |
| 869 | } |
| 870 | b.writeSlice("suppressScript", supp) |
| 871 | |
| 872 | // There is only one deprecated script in CLDR. This value is hard-coded. |
| 873 | // We check here if the code must be updated. |
| 874 | for _, a := range b.supp.Metadata.Alias.ScriptAlias { |
| 875 | if a.Type != "Qaai" { |
| 876 | log.Panicf("unexpected deprecated stript %q", a.Type) |
| 877 | } |
| 878 | } |
| 879 | } |
| 880 | |
| 881 | func parseM49(s string) int16 { |
| 882 | if len(s) == 0 { |
| 883 | return 0 |
| 884 | } |
| 885 | v, err := strconv.ParseUint(s, 10, 10) |
| 886 | failOnError(err) |
| 887 | return int16(v) |
| 888 | } |
| 889 | |
// regionConsts lists the regions for which writeRegion emits a constant
// named _<code>, in the order in which the constants are written.
var regionConsts = []string{
	"001", "419",
	"BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC",
	"XK", // Unofficial tag for Kosovo.
}
| 894 | |
| 895 | func (b *builder) writeRegion() { |
| 896 | b.writeConsts(b.region.index, regionConsts...) |
| 897 | |
| 898 | isoOffset := b.region.index("AA") |
| 899 | m49map := make([]int16, len(b.region.slice())) |
| 900 | fromM49map := make(map[int16]int) |
| 901 | altRegionISO3 := "" |
| 902 | altRegionIDs := []uint16{} |
| 903 | |
| 904 | b.writeConst("isoRegionOffset", isoOffset) |
| 905 | |
| 906 | // 2-letter region lookup and mapping to numeric codes. |
| 907 | regionISO := b.region.clone() |
| 908 | regionISO.s = regionISO.s[isoOffset:] |
| 909 | regionISO.sorted = false |
| 910 | |
| 911 | regionTypes := make([]byte, len(b.region.s)) |
| 912 | |
| 913 | // Is the region valid BCP 47? |
| 914 | for s, e := range b.registry { |
| 915 | if len(s) == 2 && s == strings.ToUpper(s) { |
| 916 | i := b.region.index(s) |
| 917 | for _, d := range e.description { |
| 918 | if strings.Contains(d, "Private use") { |
| 919 | regionTypes[i] = iso3166UserAssigned |
| 920 | } |
| 921 | } |
| 922 | regionTypes[i] |= bcp47Region |
| 923 | } |
| 924 | } |
| 925 | |
| 926 | // Is the region a valid ccTLD? |
| 927 | r := gen.OpenIANAFile("domains/root/db") |
| 928 | defer r.Close() |
| 929 | |
| 930 | buf, err := ioutil.ReadAll(r) |
| 931 | failOnError(err) |
| 932 | re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) |
| 933 | for _, m := range re.FindAllSubmatch(buf, -1) { |
| 934 | i := b.region.index(strings.ToUpper(string(m[1]))) |
| 935 | regionTypes[i] |= ccTLD |
| 936 | } |
| 937 | |
| 938 | b.writeSlice("regionTypes", regionTypes) |
| 939 | |
| 940 | iso3Set := make(map[string]int) |
| 941 | update := func(iso2, iso3 string) { |
| 942 | i := regionISO.index(iso2) |
| 943 | if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { |
| 944 | regionISO.s[i] += iso3[1:] |
| 945 | iso3Set[iso3] = -1 |
| 946 | } else { |
| 947 | if ok && j >= 0 { |
| 948 | regionISO.s[i] += string([]byte{0, byte(j)}) |
| 949 | } else { |
| 950 | iso3Set[iso3] = len(altRegionISO3) |
| 951 | regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) |
| 952 | altRegionISO3 += iso3 |
| 953 | altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) |
| 954 | } |
| 955 | } |
| 956 | } |
| 957 | for _, tc := range b.supp.CodeMappings.TerritoryCodes { |
| 958 | i := regionISO.index(tc.Type) + isoOffset |
| 959 | if d := m49map[i]; d != 0 { |
| 960 | log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) |
| 961 | } |
| 962 | m49 := parseM49(tc.Numeric) |
| 963 | m49map[i] = m49 |
| 964 | if r := fromM49map[m49]; r == 0 { |
| 965 | fromM49map[m49] = i |
| 966 | } else if r != i { |
| 967 | dep := b.registry[regionISO.s[r-isoOffset]].deprecated |
| 968 | if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { |
| 969 | fromM49map[m49] = i |
| 970 | } |
| 971 | } |
| 972 | } |
| 973 | for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { |
| 974 | if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { |
| 975 | from := parseM49(ta.Type) |
| 976 | if r := fromM49map[from]; r == 0 { |
| 977 | fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset |
| 978 | } |
| 979 | } |
| 980 | } |
| 981 | for _, tc := range b.supp.CodeMappings.TerritoryCodes { |
| 982 | if len(tc.Alpha3) == 3 { |
| 983 | update(tc.Type, tc.Alpha3) |
| 984 | } |
| 985 | } |
| 986 | // This entries are not included in territoryCodes. Mostly 3-letter variants |
| 987 | // of deleted codes and an entry for QU. |
| 988 | for _, m := range []struct{ iso2, iso3 string }{ |
| 989 | {"CT", "CTE"}, |
| 990 | {"DY", "DHY"}, |
| 991 | {"HV", "HVO"}, |
| 992 | {"JT", "JTN"}, |
| 993 | {"MI", "MID"}, |
| 994 | {"NH", "NHB"}, |
| 995 | {"NQ", "ATN"}, |
| 996 | {"PC", "PCI"}, |
| 997 | {"PU", "PUS"}, |
| 998 | {"PZ", "PCZ"}, |
| 999 | {"RH", "RHO"}, |
| 1000 | {"VD", "VDR"}, |
| 1001 | {"WK", "WAK"}, |
| 1002 | // These three-letter codes are used for others as well. |
| 1003 | {"FQ", "ATF"}, |
| 1004 | } { |
| 1005 | update(m.iso2, m.iso3) |
| 1006 | } |
| 1007 | for i, s := range regionISO.s { |
| 1008 | if len(s) != 4 { |
| 1009 | regionISO.s[i] = s + " " |
| 1010 | } |
| 1011 | } |
| 1012 | b.writeConst("regionISO", tag.Index(regionISO.join())) |
| 1013 | b.writeConst("altRegionISO3", altRegionISO3) |
| 1014 | b.writeSlice("altRegionIDs", altRegionIDs) |
| 1015 | |
| 1016 | // Create list of deprecated regions. |
| 1017 | // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only |
| 1018 | // Transitionally-reserved mapping not included. |
| 1019 | regionOldMap := stringSet{} |
| 1020 | // Include regions in territoryAlias (not all are in the IANA registry!) |
| 1021 | for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { |
| 1022 | if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { |
| 1023 | regionOldMap.add(reg.Type) |
| 1024 | regionOldMap.updateLater(reg.Type, reg.Replacement) |
| 1025 | i, _ := regionISO.find(reg.Type) |
| 1026 | j, _ := regionISO.find(reg.Replacement) |
| 1027 | if k := m49map[i+isoOffset]; k == 0 { |
| 1028 | m49map[i+isoOffset] = m49map[j+isoOffset] |
| 1029 | } |
| 1030 | } |
| 1031 | } |
| 1032 | b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { |
| 1033 | return uint16(b.region.index(s)) |
| 1034 | }) |
| 1035 | // 3-digit region lookup, groupings. |
| 1036 | for i := 1; i < isoOffset; i++ { |
| 1037 | m := parseM49(b.region.s[i]) |
| 1038 | m49map[i] = m |
| 1039 | fromM49map[m] = i |
| 1040 | } |
| 1041 | b.writeSlice("m49", m49map) |
| 1042 | |
| 1043 | const ( |
| 1044 | searchBits = 7 |
| 1045 | regionBits = 9 |
| 1046 | ) |
| 1047 | if len(m49map) >= 1<<regionBits { |
| 1048 | log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) |
| 1049 | } |
| 1050 | m49Index := [9]int16{} |
| 1051 | fromM49 := []uint16{} |
| 1052 | m49 := []int{} |
| 1053 | for k, _ := range fromM49map { |
| 1054 | m49 = append(m49, int(k)) |
| 1055 | } |
| 1056 | sort.Ints(m49) |
| 1057 | for _, k := range m49[1:] { |
| 1058 | val := (k & (1<<searchBits - 1)) << regionBits |
| 1059 | fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) |
| 1060 | m49Index[1:][k>>searchBits] = int16(len(fromM49)) |
| 1061 | } |
| 1062 | b.writeSlice("m49Index", m49Index) |
| 1063 | b.writeSlice("fromM49", fromM49) |
| 1064 | } |
| 1065 | |
// TODO: put these lists in regionTypes as user data? Could be used for
// various optimizations and refinements and could be exposed in the API.

// iso3166Except is a space-separated list of 2-letter region codes.
const iso3166Except = "AC CP DG EA EU FX IC SU TA UK"

// iso3166Trans is a space-separated list of 2-letter region codes.
// SF is not in our set of Regions.
const iso3166Trans = "AN BU CS NT TP YU ZR"

// iso3166DelCLDR is a space-separated list of 2-letter region codes.
// DY and RH are actually not deleted, but indeterminately reserved.
const iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
| 1074 | |
// Bit flags combined per region in the regionTypes table.
const (
	// iso3166UserAssigned is set by writeRegion for registry entries whose
	// description contains "Private use".
	iso3166UserAssigned = 1 << 0
	// ccTLD is set by writeRegion for codes matched in the IANA root
	// database listing.
	ccTLD = 1 << 1
	// bcp47Region is set by writeRegion for 2-letter upper-case registry
	// entries.
	bcp47Region = 1 << 2
)
| 1080 | |
// find returns the position of the first element of list that equals s, or
// -1 if list contains no such element.
func find(list []string, s string) int {
	for i := 0; i < len(list); i++ {
		if list[i] == s {
			return i
		}
	}
	return -1
}
| 1089 | |
| 1090 | // writeVariants generates per-variant information and creates a map from variant |
| 1091 | // name to index value. We assign index values such that sorting multiple |
| 1092 | // variants by index value will result in the correct order. |
| 1093 | // There are two types of variants: specialized and general. Specialized variants |
| 1094 | // are only applicable to certain language or language-script pairs. Generalized |
| 1095 | // variants apply to any language. Generalized variants always sort after |
| 1096 | // specialized variants. We will therefore always assign a higher index value |
| 1097 | // to a generalized variant than any other variant. Generalized variants are |
| 1098 | // sorted alphabetically among themselves. |
| 1099 | // Specialized variants may also sort after other specialized variants. Such |
| 1100 | // variants will be ordered after any of the variants they may follow. |
| 1101 | // We assume that if a variant x is followed by a variant y, then for any prefix |
| 1102 | // p of x, p-x is a prefix of y. This allows us to order tags based on the |
| 1103 | // maximum of the length of any of its prefixes. |
| 1104 | // TODO: it is possible to define a set of Prefix values on variants such that |
| 1105 | // a total order cannot be defined to the point that this algorithm breaks. |
| 1106 | // In other words, we cannot guarantee the same order of variants for the |
| 1107 | // future using the same algorithm or for non-compliant combinations of |
| 1108 | // variants. For this reason, consider using simple alphabetic sorting |
| 1109 | // of variants and ignore Prefix restrictions altogether. |
| 1110 | func (b *builder) writeVariant() { |
| 1111 | generalized := stringSet{} |
| 1112 | specialized := stringSet{} |
| 1113 | specializedExtend := stringSet{} |
| 1114 | // Collate the variants by type and check assumptions. |
| 1115 | for _, v := range b.variant.slice() { |
| 1116 | e := b.registry[v] |
| 1117 | if len(e.prefix) == 0 { |
| 1118 | generalized.add(v) |
| 1119 | continue |
| 1120 | } |
| 1121 | c := strings.Split(e.prefix[0], "-") |
| 1122 | hasScriptOrRegion := false |
| 1123 | if len(c) > 1 { |
| 1124 | _, hasScriptOrRegion = b.script.find(c[1]) |
| 1125 | if !hasScriptOrRegion { |
| 1126 | _, hasScriptOrRegion = b.region.find(c[1]) |
| 1127 | |
| 1128 | } |
| 1129 | } |
| 1130 | if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { |
| 1131 | // Variant is preceded by a language. |
| 1132 | specialized.add(v) |
| 1133 | continue |
| 1134 | } |
| 1135 | // Variant is preceded by another variant. |
| 1136 | specializedExtend.add(v) |
| 1137 | prefix := c[0] + "-" |
| 1138 | if hasScriptOrRegion { |
| 1139 | prefix += c[1] |
| 1140 | } |
| 1141 | for _, p := range e.prefix { |
| 1142 | // Verify that the prefix minus the last element is a prefix of the |
| 1143 | // predecessor element. |
| 1144 | i := strings.LastIndex(p, "-") |
| 1145 | pred := b.registry[p[i+1:]] |
| 1146 | if find(pred.prefix, p[:i]) < 0 { |
| 1147 | log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) |
| 1148 | } |
| 1149 | // The sorting used below does not work in the general case. It works |
| 1150 | // if we assume that variants that may be followed by others only have |
| 1151 | // prefixes of the same length. Verify this. |
| 1152 | count := strings.Count(p[:i], "-") |
| 1153 | for _, q := range pred.prefix { |
| 1154 | if c := strings.Count(q, "-"); c != count { |
| 1155 | log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) |
| 1156 | } |
| 1157 | } |
| 1158 | if !strings.HasPrefix(p, prefix) { |
| 1159 | log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) |
| 1160 | } |
| 1161 | } |
| 1162 | } |
| 1163 | |
| 1164 | // Sort extended variants. |
| 1165 | a := specializedExtend.s |
| 1166 | less := func(v, w string) bool { |
| 1167 | // Sort by the maximum number of elements. |
| 1168 | maxCount := func(s string) (max int) { |
| 1169 | for _, p := range b.registry[s].prefix { |
| 1170 | if c := strings.Count(p, "-"); c > max { |
| 1171 | max = c |
| 1172 | } |
| 1173 | } |
| 1174 | return |
| 1175 | } |
| 1176 | if cv, cw := maxCount(v), maxCount(w); cv != cw { |
| 1177 | return cv < cw |
| 1178 | } |
| 1179 | // Sort by name as tie breaker. |
| 1180 | return v < w |
| 1181 | } |
| 1182 | sort.Sort(funcSorter{less, sort.StringSlice(a)}) |
| 1183 | specializedExtend.frozen = true |
| 1184 | |
| 1185 | // Create index from variant name to index. |
| 1186 | variantIndex := make(map[string]uint8) |
| 1187 | add := func(s []string) { |
| 1188 | for _, v := range s { |
| 1189 | variantIndex[v] = uint8(len(variantIndex)) |
| 1190 | } |
| 1191 | } |
| 1192 | add(specialized.slice()) |
| 1193 | add(specializedExtend.s) |
| 1194 | numSpecialized := len(variantIndex) |
| 1195 | add(generalized.slice()) |
| 1196 | if n := len(variantIndex); n > 255 { |
| 1197 | log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) |
| 1198 | } |
| 1199 | b.writeMap("variantIndex", variantIndex) |
| 1200 | b.writeConst("variantNumSpecialized", numSpecialized) |
| 1201 | } |
| 1202 | |
| 1203 | func (b *builder) writeLanguageInfo() { |
| 1204 | } |
| 1205 | |
| 1206 | // writeLikelyData writes tables that are used both for finding parent relations and for |
| 1207 | // language matching. Each entry contains additional bits to indicate the status of the |
| 1208 | // data to know when it cannot be used for parent relations. |
| 1209 | func (b *builder) writeLikelyData() { |
| 1210 | const ( |
| 1211 | isList = 1 << iota |
| 1212 | scriptInFrom |
| 1213 | regionInFrom |
| 1214 | ) |
| 1215 | type ( // generated types |
| 1216 | likelyScriptRegion struct { |
| 1217 | region uint16 |
| 1218 | script uint8 |
| 1219 | flags uint8 |
| 1220 | } |
| 1221 | likelyLangScript struct { |
| 1222 | lang uint16 |
| 1223 | script uint8 |
| 1224 | flags uint8 |
| 1225 | } |
| 1226 | likelyLangRegion struct { |
| 1227 | lang uint16 |
| 1228 | region uint16 |
| 1229 | } |
| 1230 | // likelyTag is used for getting likely tags for group regions, where |
| 1231 | // the likely region might be a region contained in the group. |
| 1232 | likelyTag struct { |
| 1233 | lang uint16 |
| 1234 | region uint16 |
| 1235 | script uint8 |
| 1236 | } |
| 1237 | ) |
| 1238 | var ( // generated variables |
| 1239 | likelyRegionGroup = make([]likelyTag, len(b.groups)) |
| 1240 | likelyLang = make([]likelyScriptRegion, len(b.lang.s)) |
| 1241 | likelyRegion = make([]likelyLangScript, len(b.region.s)) |
| 1242 | likelyScript = make([]likelyLangRegion, len(b.script.s)) |
| 1243 | likelyLangList = []likelyScriptRegion{} |
| 1244 | likelyRegionList = []likelyLangScript{} |
| 1245 | ) |
| 1246 | type fromTo struct { |
| 1247 | from, to []string |
| 1248 | } |
| 1249 | langToOther := map[int][]fromTo{} |
| 1250 | regionToOther := map[int][]fromTo{} |
| 1251 | for _, m := range b.supp.LikelySubtags.LikelySubtag { |
| 1252 | from := strings.Split(m.From, "_") |
| 1253 | to := strings.Split(m.To, "_") |
| 1254 | if len(to) != 3 { |
| 1255 | log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) |
| 1256 | } |
| 1257 | if len(from) > 3 { |
| 1258 | log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) |
| 1259 | } |
| 1260 | if from[0] != to[0] && from[0] != "und" { |
| 1261 | log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) |
| 1262 | } |
| 1263 | if len(from) == 3 { |
| 1264 | if from[2] != to[2] { |
| 1265 | log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) |
| 1266 | } |
| 1267 | if from[0] != "und" { |
| 1268 | log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) |
| 1269 | } |
| 1270 | } |
| 1271 | if len(from) == 1 || from[0] != "und" { |
| 1272 | id := 0 |
| 1273 | if from[0] != "und" { |
| 1274 | id = b.lang.index(from[0]) |
| 1275 | } |
| 1276 | langToOther[id] = append(langToOther[id], fromTo{from, to}) |
| 1277 | } else if len(from) == 2 && len(from[1]) == 4 { |
| 1278 | sid := b.script.index(from[1]) |
| 1279 | likelyScript[sid].lang = uint16(b.langIndex(to[0])) |
| 1280 | likelyScript[sid].region = uint16(b.region.index(to[2])) |
| 1281 | } else { |
| 1282 | r := b.region.index(from[len(from)-1]) |
| 1283 | if id, ok := b.groups[r]; ok { |
| 1284 | if from[0] != "und" { |
| 1285 | log.Fatalf("region changed unexpectedly: %s -> %s", from, to) |
| 1286 | } |
| 1287 | likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) |
| 1288 | likelyRegionGroup[id].script = uint8(b.script.index(to[1])) |
| 1289 | likelyRegionGroup[id].region = uint16(b.region.index(to[2])) |
| 1290 | } else { |
| 1291 | regionToOther[r] = append(regionToOther[r], fromTo{from, to}) |
| 1292 | } |
| 1293 | } |
| 1294 | } |
| 1295 | b.writeType(likelyLangRegion{}) |
| 1296 | b.writeSlice("likelyScript", likelyScript) |
| 1297 | |
| 1298 | for id := range b.lang.s { |
| 1299 | list := langToOther[id] |
| 1300 | if len(list) == 1 { |
| 1301 | likelyLang[id].region = uint16(b.region.index(list[0].to[2])) |
| 1302 | likelyLang[id].script = uint8(b.script.index(list[0].to[1])) |
| 1303 | } else if len(list) > 1 { |
| 1304 | likelyLang[id].flags = isList |
| 1305 | likelyLang[id].region = uint16(len(likelyLangList)) |
| 1306 | likelyLang[id].script = uint8(len(list)) |
| 1307 | for _, x := range list { |
| 1308 | flags := uint8(0) |
| 1309 | if len(x.from) > 1 { |
| 1310 | if x.from[1] == x.to[2] { |
| 1311 | flags = regionInFrom |
| 1312 | } else { |
| 1313 | flags = scriptInFrom |
| 1314 | } |
| 1315 | } |
| 1316 | likelyLangList = append(likelyLangList, likelyScriptRegion{ |
| 1317 | region: uint16(b.region.index(x.to[2])), |
| 1318 | script: uint8(b.script.index(x.to[1])), |
| 1319 | flags: flags, |
| 1320 | }) |
| 1321 | } |
| 1322 | } |
| 1323 | } |
| 1324 | // TODO: merge suppressScript data with this table. |
| 1325 | b.writeType(likelyScriptRegion{}) |
| 1326 | b.writeSlice("likelyLang", likelyLang) |
| 1327 | b.writeSlice("likelyLangList", likelyLangList) |
| 1328 | |
| 1329 | for id := range b.region.s { |
| 1330 | list := regionToOther[id] |
| 1331 | if len(list) == 1 { |
| 1332 | likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) |
| 1333 | likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) |
| 1334 | if len(list[0].from) > 2 { |
| 1335 | likelyRegion[id].flags = scriptInFrom |
| 1336 | } |
| 1337 | } else if len(list) > 1 { |
| 1338 | likelyRegion[id].flags = isList |
| 1339 | likelyRegion[id].lang = uint16(len(likelyRegionList)) |
| 1340 | likelyRegion[id].script = uint8(len(list)) |
| 1341 | for i, x := range list { |
| 1342 | if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { |
| 1343 | log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) |
| 1344 | } |
| 1345 | x := likelyLangScript{ |
| 1346 | lang: uint16(b.langIndex(x.to[0])), |
| 1347 | script: uint8(b.script.index(x.to[1])), |
| 1348 | } |
| 1349 | if len(list[0].from) > 2 { |
| 1350 | x.flags = scriptInFrom |
| 1351 | } |
| 1352 | likelyRegionList = append(likelyRegionList, x) |
| 1353 | } |
| 1354 | } |
| 1355 | } |
| 1356 | b.writeType(likelyLangScript{}) |
| 1357 | b.writeSlice("likelyRegion", likelyRegion) |
| 1358 | b.writeSlice("likelyRegionList", likelyRegionList) |
| 1359 | |
| 1360 | b.writeType(likelyTag{}) |
| 1361 | b.writeSlice("likelyRegionGroup", likelyRegionGroup) |
| 1362 | } |
| 1363 | |
// mutualIntelligibility is the element type of the generated matchLang
// table: one language-to-language match with its distance.
type mutualIntelligibility struct {
	// want is the language index of the desired language.
	want uint16
	// have is the language index of the supported language.
	have uint16
	// distance is the parsed distance of the match.
	distance uint8
	// oneway is true when the match only applies from want to have.
	oneway bool
}
| 1369 | |
// scriptIntelligibility is the element type of the generated matchScript
// table: one language-script to language-script match with its distance.
// Entries are always oneway.
type scriptIntelligibility struct {
	wantLang   uint16
	haveLang   uint16
	wantScript uint8
	haveScript uint8
	distance   uint8
}
| 1376 | |
// regionIntelligibility is the element type of the generated matchRegion
// table. Entries are always twoway.
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script id; 0 means any.
	script uint8
	// group is the region group; 0 means any. If bit 7 is set it means
	// inverse.
	group uint8
	// distance is the parsed distance of the match.
	distance uint8
}
| 1384 | |
| 1385 | // writeMatchData writes tables with languages and scripts for which there is |
| 1386 | // mutual intelligibility. The data is based on CLDR's languageMatching data. |
| 1387 | // Note that we use a different algorithm than the one defined by CLDR and that |
| 1388 | // we slightly modify the data. For example, we convert scores to confidence levels. |
| 1389 | // We also drop all region-related data as we use a different algorithm to |
| 1390 | // determine region equivalence. |
| 1391 | func (b *builder) writeMatchData() { |
| 1392 | lm := b.supp.LanguageMatching.LanguageMatches |
| 1393 | cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") |
| 1394 | |
| 1395 | regionHierarchy := map[string][]string{} |
| 1396 | for _, g := range b.supp.TerritoryContainment.Group { |
| 1397 | regions := strings.Split(g.Contains, " ") |
| 1398 | regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) |
| 1399 | } |
| 1400 | regionToGroups := make([]uint8, len(b.region.s)) |
| 1401 | |
| 1402 | idToIndex := map[string]uint8{} |
| 1403 | for i, mv := range lm[0].MatchVariable { |
| 1404 | if i > 6 { |
| 1405 | log.Fatalf("Too many groups: %d", i) |
| 1406 | } |
| 1407 | idToIndex[mv.Id] = uint8(i + 1) |
| 1408 | // TODO: also handle '-' |
| 1409 | for _, r := range strings.Split(mv.Value, "+") { |
| 1410 | todo := []string{r} |
| 1411 | for k := 0; k < len(todo); k++ { |
| 1412 | r := todo[k] |
| 1413 | regionToGroups[b.region.index(r)] |= 1 << uint8(i) |
| 1414 | todo = append(todo, regionHierarchy[r]...) |
| 1415 | } |
| 1416 | } |
| 1417 | } |
| 1418 | b.writeSlice("regionToGroups", regionToGroups) |
| 1419 | |
| 1420 | // maps language id to in- and out-of-group region. |
| 1421 | paradigmLocales := [][3]uint16{} |
| 1422 | locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") |
| 1423 | for i := 0; i < len(locales); i += 2 { |
| 1424 | x := [3]uint16{} |
| 1425 | for j := 0; j < 2; j++ { |
| 1426 | pc := strings.SplitN(locales[i+j], "-", 2) |
| 1427 | x[0] = b.langIndex(pc[0]) |
| 1428 | if len(pc) == 2 { |
| 1429 | x[1+j] = uint16(b.region.index(pc[1])) |
| 1430 | } |
| 1431 | } |
| 1432 | paradigmLocales = append(paradigmLocales, x) |
| 1433 | } |
| 1434 | b.writeSlice("paradigmLocales", paradigmLocales) |
| 1435 | |
| 1436 | b.writeType(mutualIntelligibility{}) |
| 1437 | b.writeType(scriptIntelligibility{}) |
| 1438 | b.writeType(regionIntelligibility{}) |
| 1439 | |
| 1440 | matchLang := []mutualIntelligibility{} |
| 1441 | matchScript := []scriptIntelligibility{} |
| 1442 | matchRegion := []regionIntelligibility{} |
| 1443 | // Convert the languageMatch entries in lists keyed by desired language. |
| 1444 | for _, m := range lm[0].LanguageMatch { |
| 1445 | // Different versions of CLDR use different separators. |
| 1446 | desired := strings.Replace(m.Desired, "-", "_", -1) |
| 1447 | supported := strings.Replace(m.Supported, "-", "_", -1) |
| 1448 | d := strings.Split(desired, "_") |
| 1449 | s := strings.Split(supported, "_") |
| 1450 | if len(d) != len(s) { |
| 1451 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 1452 | continue |
| 1453 | } |
| 1454 | distance, _ := strconv.ParseInt(m.Distance, 10, 8) |
| 1455 | switch len(d) { |
| 1456 | case 2: |
| 1457 | if desired == supported && desired == "*_*" { |
| 1458 | continue |
| 1459 | } |
| 1460 | // language-script pair. |
| 1461 | matchScript = append(matchScript, scriptIntelligibility{ |
| 1462 | wantLang: uint16(b.langIndex(d[0])), |
| 1463 | haveLang: uint16(b.langIndex(s[0])), |
| 1464 | wantScript: uint8(b.script.index(d[1])), |
| 1465 | haveScript: uint8(b.script.index(s[1])), |
| 1466 | distance: uint8(distance), |
| 1467 | }) |
| 1468 | if m.Oneway != "true" { |
| 1469 | matchScript = append(matchScript, scriptIntelligibility{ |
| 1470 | wantLang: uint16(b.langIndex(s[0])), |
| 1471 | haveLang: uint16(b.langIndex(d[0])), |
| 1472 | wantScript: uint8(b.script.index(s[1])), |
| 1473 | haveScript: uint8(b.script.index(d[1])), |
| 1474 | distance: uint8(distance), |
| 1475 | }) |
| 1476 | } |
| 1477 | case 1: |
| 1478 | if desired == supported && desired == "*" { |
| 1479 | continue |
| 1480 | } |
| 1481 | if distance == 1 { |
| 1482 | // nb == no is already handled by macro mapping. Check there |
| 1483 | // really is only this case. |
| 1484 | if d[0] != "no" || s[0] != "nb" { |
| 1485 | log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) |
| 1486 | } |
| 1487 | continue |
| 1488 | } |
| 1489 | // TODO: consider dropping oneway field and just doubling the entry. |
| 1490 | matchLang = append(matchLang, mutualIntelligibility{ |
| 1491 | want: uint16(b.langIndex(d[0])), |
| 1492 | have: uint16(b.langIndex(s[0])), |
| 1493 | distance: uint8(distance), |
| 1494 | oneway: m.Oneway == "true", |
| 1495 | }) |
| 1496 | case 3: |
| 1497 | if desired == supported && desired == "*_*_*" { |
| 1498 | continue |
| 1499 | } |
| 1500 | if desired != supported { |
| 1501 | // This is now supported by CLDR, but only one case, which |
| 1502 | // should already be covered by paradigm locales. For instance, |
| 1503 | // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in |
| 1504 | // testdata/CLDRLocaleMatcherTest.txt tests this. |
| 1505 | if supported != "en_*_GB" { |
| 1506 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 1507 | } |
| 1508 | continue |
| 1509 | } |
| 1510 | ri := regionIntelligibility{ |
| 1511 | lang: b.langIndex(d[0]), |
| 1512 | distance: uint8(distance), |
| 1513 | } |
| 1514 | if d[1] != "*" { |
| 1515 | ri.script = uint8(b.script.index(d[1])) |
| 1516 | } |
| 1517 | switch { |
| 1518 | case d[2] == "*": |
| 1519 | ri.group = 0x80 // not contained in anything |
| 1520 | case strings.HasPrefix(d[2], "$!"): |
| 1521 | ri.group = 0x80 |
| 1522 | d[2] = "$" + d[2][len("$!"):] |
| 1523 | fallthrough |
| 1524 | case strings.HasPrefix(d[2], "$"): |
| 1525 | ri.group |= idToIndex[d[2]] |
| 1526 | } |
| 1527 | matchRegion = append(matchRegion, ri) |
| 1528 | default: |
| 1529 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 1530 | } |
| 1531 | } |
| 1532 | sort.SliceStable(matchLang, func(i, j int) bool { |
| 1533 | return matchLang[i].distance < matchLang[j].distance |
| 1534 | }) |
| 1535 | b.writeSlice("matchLang", matchLang) |
| 1536 | |
| 1537 | sort.SliceStable(matchScript, func(i, j int) bool { |
| 1538 | return matchScript[i].distance < matchScript[j].distance |
| 1539 | }) |
| 1540 | b.writeSlice("matchScript", matchScript) |
| 1541 | |
| 1542 | sort.SliceStable(matchRegion, func(i, j int) bool { |
| 1543 | return matchRegion[i].distance < matchRegion[j].distance |
| 1544 | }) |
| 1545 | b.writeSlice("matchRegion", matchRegion) |
| 1546 | } |
| 1547 | |
| 1548 | func (b *builder) writeRegionInclusionData() { |
| 1549 | var ( |
| 1550 | // mm holds for each group the set of groups with a distance of 1. |
| 1551 | mm = make(map[int][]index) |
| 1552 | |
| 1553 | // containment holds for each group the transitive closure of |
| 1554 | // containment of other groups. |
| 1555 | containment = make(map[index][]index) |
| 1556 | ) |
| 1557 | for _, g := range b.supp.TerritoryContainment.Group { |
| 1558 | // Skip UN and EURO zone as they are flattening the containment |
| 1559 | // relationship. |
| 1560 | if g.Type == "EZ" || g.Type == "UN" { |
| 1561 | continue |
| 1562 | } |
| 1563 | group := b.region.index(g.Type) |
| 1564 | groupIdx := b.groups[group] |
| 1565 | for _, mem := range strings.Split(g.Contains, " ") { |
| 1566 | r := b.region.index(mem) |
| 1567 | mm[r] = append(mm[r], groupIdx) |
| 1568 | if g, ok := b.groups[r]; ok { |
| 1569 | mm[group] = append(mm[group], g) |
| 1570 | containment[groupIdx] = append(containment[groupIdx], g) |
| 1571 | } |
| 1572 | } |
| 1573 | } |
| 1574 | |
| 1575 | regionContainment := make([]uint64, len(b.groups)) |
| 1576 | for _, g := range b.groups { |
| 1577 | l := containment[g] |
| 1578 | |
| 1579 | // Compute the transitive closure of containment. |
| 1580 | for i := 0; i < len(l); i++ { |
| 1581 | l = append(l, containment[l[i]]...) |
| 1582 | } |
| 1583 | |
| 1584 | // Compute the bitmask. |
| 1585 | regionContainment[g] = 1 << g |
| 1586 | for _, v := range l { |
| 1587 | regionContainment[g] |= 1 << v |
| 1588 | } |
| 1589 | } |
| 1590 | b.writeSlice("regionContainment", regionContainment) |
| 1591 | |
| 1592 | regionInclusion := make([]uint8, len(b.region.s)) |
| 1593 | bvs := make(map[uint64]index) |
| 1594 | // Make the first bitvector positions correspond with the groups. |
| 1595 | for r, i := range b.groups { |
| 1596 | bv := uint64(1 << i) |
| 1597 | for _, g := range mm[r] { |
| 1598 | bv |= 1 << g |
| 1599 | } |
| 1600 | bvs[bv] = i |
| 1601 | regionInclusion[r] = uint8(bvs[bv]) |
| 1602 | } |
| 1603 | for r := 1; r < len(b.region.s); r++ { |
| 1604 | if _, ok := b.groups[r]; !ok { |
| 1605 | bv := uint64(0) |
| 1606 | for _, g := range mm[r] { |
| 1607 | bv |= 1 << g |
| 1608 | } |
| 1609 | if bv == 0 { |
| 1610 | // Pick the world for unspecified regions. |
| 1611 | bv = 1 << b.groups[b.region.index("001")] |
| 1612 | } |
| 1613 | if _, ok := bvs[bv]; !ok { |
| 1614 | bvs[bv] = index(len(bvs)) |
| 1615 | } |
| 1616 | regionInclusion[r] = uint8(bvs[bv]) |
| 1617 | } |
| 1618 | } |
| 1619 | b.writeSlice("regionInclusion", regionInclusion) |
| 1620 | regionInclusionBits := make([]uint64, len(bvs)) |
| 1621 | for k, v := range bvs { |
| 1622 | regionInclusionBits[v] = uint64(k) |
| 1623 | } |
| 1624 | // Add bit vectors for increasingly large distances until a fixed point is reached. |
| 1625 | regionInclusionNext := []uint8{} |
| 1626 | for i := 0; i < len(regionInclusionBits); i++ { |
| 1627 | bits := regionInclusionBits[i] |
| 1628 | next := bits |
| 1629 | for i := uint(0); i < uint(len(b.groups)); i++ { |
| 1630 | if bits&(1<<i) != 0 { |
| 1631 | next |= regionInclusionBits[i] |
| 1632 | } |
| 1633 | } |
| 1634 | if _, ok := bvs[next]; !ok { |
| 1635 | bvs[next] = index(len(bvs)) |
| 1636 | regionInclusionBits = append(regionInclusionBits, next) |
| 1637 | } |
| 1638 | regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) |
| 1639 | } |
| 1640 | b.writeSlice("regionInclusionBits", regionInclusionBits) |
| 1641 | b.writeSlice("regionInclusionNext", regionInclusionNext) |
| 1642 | } |
| 1643 | |
// parentRel is the element type of the generated parents table. Each value
// describes one parent override: a parent locale and the regions it applies
// to.
type parentRel struct {
	// lang is the language index of the parent.
	lang uint16
	// script is the script index of the parent when the parent names a
	// script, and 0 otherwise.
	script uint8
	// maxScript equals script when the parent names a script, and is the
	// index of "Latn" otherwise.
	maxScript uint8
	// toRegion is the region index of the parent.
	toRegion uint16
	// fromRegion holds the region indices of the locales the parent
	// applies to.
	fromRegion []uint16
}
| 1651 | |
| 1652 | func (b *builder) writeParents() { |
| 1653 | b.writeType(parentRel{}) |
| 1654 | |
| 1655 | parents := []parentRel{} |
| 1656 | |
| 1657 | // Construct parent overrides. |
| 1658 | n := 0 |
| 1659 | for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { |
| 1660 | // Skipping non-standard scripts to root is implemented using addTags. |
| 1661 | if p.Parent == "root" { |
| 1662 | continue |
| 1663 | } |
| 1664 | |
| 1665 | sub := strings.Split(p.Parent, "_") |
| 1666 | parent := parentRel{lang: b.langIndex(sub[0])} |
| 1667 | if len(sub) == 2 { |
| 1668 | // TODO: check that all undefined scripts are indeed Latn in these |
| 1669 | // cases. |
| 1670 | parent.maxScript = uint8(b.script.index("Latn")) |
| 1671 | parent.toRegion = uint16(b.region.index(sub[1])) |
| 1672 | } else { |
| 1673 | parent.script = uint8(b.script.index(sub[1])) |
| 1674 | parent.maxScript = parent.script |
| 1675 | parent.toRegion = uint16(b.region.index(sub[2])) |
| 1676 | } |
| 1677 | for _, c := range strings.Split(p.Locales, " ") { |
| 1678 | region := b.region.index(c[strings.LastIndex(c, "_")+1:]) |
| 1679 | parent.fromRegion = append(parent.fromRegion, uint16(region)) |
| 1680 | } |
| 1681 | parents = append(parents, parent) |
| 1682 | n += len(parent.fromRegion) |
| 1683 | } |
| 1684 | b.writeSliceAddSize("parents", n*2, parents) |
| 1685 | } |
| 1686 | |
| 1687 | func main() { |
| 1688 | gen.Init() |
| 1689 | |
| 1690 | gen.Repackage("gen_common.go", "common.go", "language") |
| 1691 | |
| 1692 | w := gen.NewCodeWriter() |
| 1693 | defer w.WriteGoFile("tables.go", "language") |
| 1694 | |
| 1695 | fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) |
| 1696 | |
| 1697 | b := newBuilder(w) |
| 1698 | gen.WriteCLDRVersion(w) |
| 1699 | |
| 1700 | b.parseIndices() |
| 1701 | b.writeType(fromTo{}) |
| 1702 | b.writeLanguage() |
| 1703 | b.writeScript() |
| 1704 | b.writeRegion() |
| 1705 | b.writeVariant() |
| 1706 | // TODO: b.writeLocale() |
| 1707 | b.computeRegionGroups() |
| 1708 | b.writeLikelyData() |
| 1709 | b.writeMatchData() |
| 1710 | b.writeRegionInclusionData() |
| 1711 | b.writeParents() |
| 1712 | } |