Blame - vendor/golang.org/x/text/language/gen.go - voltha-simonu-adapter

blob: 302f1940aaf487f4d8a87913ceed8194716365cc [file] [log] [blame]

Scott Baker	eee8dd8	2019-09-24 12:52:34 -0700	[diff] [blame]	1	// Copyright 2013 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// +build ignore
				6
				7	// Language tag table generator.
				8	// Data read from the web.
				9
				10	package main
				11
				12	import (
				13	"bufio"
				14	"flag"
				15	"fmt"
				16	"io"
				17	"io/ioutil"
				18	"log"
				19	"math"
				20	"reflect"
				21	"regexp"
				22	"sort"
				23	"strconv"
				24	"strings"
				25
				26	"golang.org/x/text/internal/gen"
				27	"golang.org/x/text/internal/tag"
				28	"golang.org/x/text/unicode/cldr"
				29	)
				30
				31	var (
				32	test = flag.Bool("test",
				33	false,
				34	"test existing tables; can be used to compare web data with package data.")
				35	outputFile = flag.String("output",
				36	"tables.go",
				37	"output file for generated tables")
				38	)
				39
				40	var comment = []string{
				41	`
				42	lang holds an alphabetically sorted list of ISO-639 language identifiers.
				43	All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
				44	For 2-byte language identifiers, the two successive bytes have the following meaning:
				45	- if the first letter of the 2- and 3-letter ISO codes are the same:
				46	the second and third letter of the 3-letter ISO code.
				47	- otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
				48	For 3-byte language identifiers the 4th byte is 0.`,
				49	`
				50	langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
				51	in lookup tables. The language ids for these language codes are derived directly
				52	from the letters and are not consecutive.`,
				53	`
				54	altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
				55	to 2-letter language codes that cannot be derived using the method described above.
				56	Each 3-letter code is followed by its 1-byte langID.`,
				57	`
				58	altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
				59	`
				60	langAliasMap maps langIDs to their suggested replacements.`,
				61	`
				62	script is an alphabetically sorted list of ISO 15924 codes. The index
				63	of the script in the string, divided by 4, is the internal scriptID.`,
				64	`
				65	isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
				66	for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
				67	the UN.M49 codes used for groups.)`,
				68	`
				69	regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
				70	Each 2-letter codes is followed by two bytes with the following meaning:
				71	- [A-Z}{2}: the first letter of the 2-letter code plus these two
				72	letters form the 3-letter ISO code.
				73	- 0, n: index into altRegionISO3.`,
				74	`
				75	regionTypes defines the status of a region for various standards.`,
				76	`
				77	m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
				78	codes indicating collections of regions.`,
				79	`
				80	m49Index gives indexes into fromM49 based on the three most significant bits
				81	of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
				82	fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
				83	for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
				84	The region code is stored in the 9 lsb of the indexed value.`,
				85	`
				86	fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
				87	`
				88	altRegionISO3 holds a list of 3-letter region codes that cannot be
				89	mapped to 2-letter codes using the default algorithm. This is a short list.`,
				90	`
				91	altRegionIDs holds a list of regionIDs the positions of which match those
				92	of the 3-letter ISO codes in altRegionISO3.`,
				93	`
				94	variantNumSpecialized is the number of specialized variants in variants.`,
				95	`
				96	suppressScript is an index from langID to the dominant script for that language,
				97	if it exists. If a script is given, it should be suppressed from the language tag.`,
				98	`
				99	likelyLang is a lookup table, indexed by langID, for the most likely
				100	scripts and regions given incomplete information. If more entries exist for a
				101	given language, region and script are the index and size respectively
				102	of the list in likelyLangList.`,
				103	`
				104	likelyLangList holds lists info associated with likelyLang.`,
				105	`
				106	likelyRegion is a lookup table, indexed by regionID, for the most likely
				107	languages and scripts given incomplete information. If more entries exist
				108	for a given regionID, lang and script are the index and size respectively
				109	of the list in likelyRegionList.
				110	TODO: exclude containers and user-definable regions from the list.`,
				111	`
				112	likelyRegionList holds lists info associated with likelyRegion.`,
				113	`
				114	likelyScript is a lookup table, indexed by scriptID, for the most likely
				115	languages and regions given a script.`,
				116	`
				117	matchLang holds pairs of langIDs of base languages that are typically
				118	mutually intelligible. Each pair is associated with a confidence and
				119	whether the intelligibility goes one or both ways.`,
				120	`
				121	matchScript holds pairs of scriptIDs where readers of one script
				122	can typically also read the other. Each is associated with a confidence.`,
				123	`
				124	nRegionGroups is the number of region groups.`,
				125	`
				126	regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
				127	where each set holds all groupings that are directly connected in a region
				128	containment graph.`,
				129	`
				130	regionInclusionBits is an array of bit vectors where every vector represents
				131	a set of region groupings. These sets are used to compute the distance
				132	between two regions for the purpose of language matching.`,
				133	`
				134	regionInclusionNext marks, for each entry in regionInclusionBits, the set of
				135	all groups that are reachable from the groups set in the respective entry.`,
				136	}
				137
				138	// TODO: consider changing some of these structures to tries. This can reduce
				139	// memory, but may increase the need for memory allocations. This could be
				140	// mitigated if we can piggyback on language tags for common cases.
				141
				142	func failOnError(e error) {
				143	if e != nil {
				144	log.Panic(e)
				145	}
				146	}
				147
				148	type setType int
				149
				150	const (
				151	Indexed setType = 1 + iota // all elements must be of same size
				152	Linear
				153	)
				154
				155	type stringSet struct {
				156	s []string
				157	sorted, frozen bool
				158
				159	// We often need to update values after the creation of an index is completed.
				160	// We include a convenience map for keeping track of this.
				161	update map[string]string
				162	typ setType // used for checking.
				163	}
				164
				165	func (ss *stringSet) clone() stringSet {
				166	c := *ss
				167	c.s = append([]string(nil), c.s...)
				168	return c
				169	}
				170
				171	func (ss *stringSet) setType(t setType) {
				172	if ss.typ != t && ss.typ != 0 {
				173	log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
				174	}
				175	}
				176
				177	// parse parses a whitespace-separated string and initializes ss with its
				178	// components.
				179	func (ss *stringSet) parse(s string) {
				180	scan := bufio.NewScanner(strings.NewReader(s))
				181	scan.Split(bufio.ScanWords)
				182	for scan.Scan() {
				183	ss.add(scan.Text())
				184	}
				185	}
				186
				187	func (ss *stringSet) assertChangeable() {
				188	if ss.frozen {
				189	log.Panic("attempt to modify a frozen stringSet")
				190	}
				191	}
				192
				193	func (ss *stringSet) add(s string) {
				194	ss.assertChangeable()
				195	ss.s = append(ss.s, s)
				196	ss.sorted = ss.frozen
				197	}
				198
				199	func (ss *stringSet) freeze() {
				200	ss.compact()
				201	ss.frozen = true
				202	}
				203
				204	func (ss *stringSet) compact() {
				205	if ss.sorted {
				206	return
				207	}
				208	a := ss.s
				209	sort.Strings(a)
				210	k := 0
				211	for i := 1; i < len(a); i++ {
				212	if a[k] != a[i] {
				213	a[k+1] = a[i]
				214	k++
				215	}
				216	}
				217	ss.s = a[:k+1]
				218	ss.sorted = ss.frozen
				219	}
				220
				221	type funcSorter struct {
				222	fn func(a, b string) bool
				223	sort.StringSlice
				224	}
				225
				226	func (s funcSorter) Less(i, j int) bool {
				227	return s.fn(s.StringSlice[i], s.StringSlice[j])
				228	}
				229
				230	func (ss *stringSet) sortFunc(f func(a, b string) bool) {
				231	ss.compact()
				232	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
				233	}
				234
				235	func (ss *stringSet) remove(s string) {
				236	ss.assertChangeable()
				237	if i, ok := ss.find(s); ok {
				238	copy(ss.s[i:], ss.s[i+1:])
				239	ss.s = ss.s[:len(ss.s)-1]
				240	}
				241	}
				242
				243	func (ss *stringSet) replace(ol, nu string) {
				244	ss.s[ss.index(ol)] = nu
				245	ss.sorted = ss.frozen
				246	}
				247
				248	func (ss *stringSet) index(s string) int {
				249	ss.setType(Indexed)
				250	i, ok := ss.find(s)
				251	if !ok {
				252	if i < len(ss.s) {
				253	log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
				254	}
				255	log.Panicf("find: item %q is not in list", s)
				256
				257	}
				258	return i
				259	}
				260
				261	func (ss *stringSet) find(s string) (int, bool) {
				262	ss.compact()
				263	i := sort.SearchStrings(ss.s, s)
				264	return i, i != len(ss.s) && ss.s[i] == s
				265	}
				266
				267	func (ss *stringSet) slice() []string {
				268	ss.compact()
				269	return ss.s
				270	}
				271
				272	func (ss *stringSet) updateLater(v, key string) {
				273	if ss.update == nil {
				274	ss.update = map[string]string{}
				275	}
				276	ss.update[v] = key
				277	}
				278
				279	// join joins the string and ensures that all entries are of the same length.
				280	func (ss *stringSet) join() string {
				281	ss.setType(Indexed)
				282	n := len(ss.s[0])
				283	for _, s := range ss.s {
				284	if len(s) != n {
				285	log.Panicf("join: not all entries are of the same length: %q", s)
				286	}
				287	}
				288	ss.s = append(ss.s, strings.Repeat("\xff", n))
				289	return strings.Join(ss.s, "")
				290	}
				291
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
type ianaEntry struct {
	typ            string   // value of the "Type:" field
	description    []string // one element per "Description:" field
	scope          string   // value of the "Scope:" field
	added          string   // value of the "Added:" field
	preferred      string   // value of the "Preferred-Value:" field
	deprecated     string   // value of the "Deprecated:" field
	suppressScript string   // value of the "Suppress-Script:" field
	macro          string   // value of the "Macrolanguage:" field
	prefix         []string // one element per "Prefix:" field
}
				307
// builder carries the decoded CLDR data, the parsed IANA registry and the
// indices derived from them, together with the writers that receive the
// generated tables.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	registry map[string]*ianaEntry
}

// index is the group ID type stored in builder.groups.
type index uint
				330
				331	func newBuilder(w gen.CodeWriter) builder {
				332	r := gen.OpenCLDRCoreZip()
				333	defer r.Close()
				334	d := &cldr.Decoder{}
				335	data, err := d.DecodeZip(r)
				336	failOnError(err)
				337	b := builder{
				338	w: w,
				339	hw: io.MultiWriter(w, w.Hash),
				340	data: data,
				341	supp: data.Supplemental(),
				342	}
				343	b.parseRegistry()
				344	return &b
				345	}
				346
				347	func (b *builder) parseRegistry() {
				348	r := gen.OpenIANAFile("assignments/language-subtag-registry")
				349	defer r.Close()
				350	b.registry = make(map[string]*ianaEntry)
				351
				352	scan := bufio.NewScanner(r)
				353	scan.Split(bufio.ScanWords)
				354	var record *ianaEntry
				355	for more := scan.Scan(); more; {
				356	key := scan.Text()
				357	more = scan.Scan()
				358	value := scan.Text()
				359	switch key {
				360	case "Type:":
				361	record = &ianaEntry{typ: value}
				362	case "Subtag:", "Tag:":
				363	if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				364	for a := s[0]; a <= s[1]; a = inc(a) {
				365	b.addToRegistry(a, record)
				366	}
				367	} else {
				368	b.addToRegistry(value, record)
				369	}
				370	case "Suppress-Script:":
				371	record.suppressScript = value
				372	case "Added:":
				373	record.added = value
				374	case "Deprecated:":
				375	record.deprecated = value
				376	case "Macrolanguage:":
				377	record.macro = value
				378	case "Preferred-Value:":
				379	record.preferred = value
				380	case "Prefix:":
				381	record.prefix = append(record.prefix, value)
				382	case "Scope:":
				383	record.scope = value
				384	case "Description:":
				385	buf := []byte(value)
				386	for more = scan.Scan(); more; more = scan.Scan() {
				387	b := scan.Bytes()
				388	if b[0] == '%' \|\| b[len(b)-1] == ':' {
				389	break
				390	}
				391	buf = append(buf, ' ')
				392	buf = append(buf, b...)
				393	}
				394	record.description = append(record.description, string(buf))
				395	continue
				396	default:
				397	continue
				398	}
				399	more = scan.Scan()
				400	}
				401	if scan.Err() != nil {
				402	log.Panic(scan.Err())
				403	}
				404	}
				405
				406	func (b builder) addToRegistry(key string, entry ianaEntry) {
				407	if info, ok := b.registry[key]; ok {
				408	if info.typ != "language" \|\| entry.typ != "extlang" {
				409	log.Fatalf("parseRegistry: tag %q already exists", key)
				410	}
				411	} else {
				412	b.registry[key] = entry
				413	}
				414	}
				415
				416	var commentIndex = make(map[string]string)
				417
				418	func init() {
				419	for _, s := range comment {
				420	key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
				421	commentIndex[key] = s
				422	}
				423	}
				424
				425	func (b *builder) comment(name string) {
				426	if s := commentIndex[name]; len(s) > 0 {
				427	b.w.WriteComment(s)
				428	} else {
				429	fmt.Fprintln(b.w)
				430	}
				431	}
				432
				433	func (b *builder) pf(f string, x ...interface{}) {
				434	fmt.Fprintf(b.hw, f, x...)
				435	fmt.Fprint(b.hw, "\n")
				436	}
				437
				438	func (b *builder) p(x ...interface{}) {
				439	fmt.Fprintln(b.hw, x...)
				440	}
				441
				442	func (b *builder) addSize(s int) {
				443	b.w.Size += s
				444	b.pf("// Size: %d bytes", s)
				445	}
				446
				447	func (b *builder) writeConst(name string, x interface{}) {
				448	b.comment(name)
				449	b.w.WriteConst(name, x)
				450	}
				451
				452	// writeConsts computes f(v) for all v in values and writes the results
				453	// as constants named _v to a single constant block.
				454	func (b *builder) writeConsts(f func(string) int, values ...string) {
				455	b.pf("const (")
				456	for _, v := range values {
				457	b.pf("\t_%s = %v", v, f(v))
				458	}
				459	b.pf(")")
				460	}
				461
				462	// writeType writes the type of the given value, which must be a struct.
				463	func (b *builder) writeType(value interface{}) {
				464	b.comment(reflect.TypeOf(value).Name())
				465	b.w.WriteType(value)
				466	}
				467
				468	func (b *builder) writeSlice(name string, ss interface{}) {
				469	b.writeSliceAddSize(name, 0, ss)
				470	}
				471
				472	func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
				473	b.comment(name)
				474	b.w.Size += extraSize
				475	v := reflect.ValueOf(ss)
				476	t := v.Type().Elem()
				477	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
				478
				479	fmt.Fprintf(b.w, "var %s = ", name)
				480	b.w.WriteArray(ss)
				481	b.p()
				482	}
				483
				484	type fromTo struct {
				485	from, to uint16
				486	}
				487
				488	func (b builder) writeSortedMap(name string, ss stringSet, index func(s string) uint16) {
				489	ss.sortFunc(func(a, b string) bool {
				490	return index(a) < index(b)
				491	})
				492	m := []fromTo{}
				493	for _, s := range ss.s {
				494	m = append(m, fromTo{index(s), index(ss.update[s])})
				495	}
				496	b.writeSlice(name, m)
				497	}
				498
				499	const base = 'z' - 'a' + 1
				500
				501	func strToInt(s string) uint {
				502	v := uint(0)
				503	for i := 0; i < len(s); i++ {
				504	v *= base
				505	v += uint(s[i] - 'a')
				506	}
				507	return v
				508	}
				509
				510	// converts the given integer to the original ASCII string passed to strToInt.
				511	// len(s) must match the number of characters obtained.
				512	func intToStr(v uint, s []byte) {
				513	for i := len(s) - 1; i >= 0; i-- {
				514	s[i] = byte(v%base) + 'a'
				515	v /= base
				516	}
				517	}
				518
				519	func (b *builder) writeBitVector(name string, ss []string) {
				520	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
				521	for _, s := range ss {
				522	v := strToInt(s)
				523	vec[v/8] \|= 1 << (v % 8)
				524	}
				525	b.writeSlice(name, vec)
				526	}
				527
				528	// TODO: convert this type into a list or two-stage trie.
				529	func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
				530	b.comment(name)
				531	v := reflect.ValueOf(m)
				532	sz := v.Len() * (2 + int(v.Type().Key().Size()))
				533	for _, k := range m {
				534	sz += len(k)
				535	}
				536	b.addSize(sz)
				537	keys := []string{}
				538	b.pf(`var %s = map[string]uint16{`, name)
				539	for k := range m {
				540	keys = append(keys, k)
				541	}
				542	sort.Strings(keys)
				543	for _, k := range keys {
				544	b.pf("\t%q: %v,", k, f(m[k]))
				545	}
				546	b.p("}")
				547	}
				548
				549	func (b *builder) writeMap(name string, m interface{}) {
				550	b.comment(name)
				551	v := reflect.ValueOf(m)
				552	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
				553	b.addSize(sz)
				554	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
				555	return strings.IndexRune("{}, ", r) != -1
				556	})
				557	sort.Strings(f[1:])
				558	b.pf(`var %s = %s{`, name, f[0])
				559	for _, kv := range f[1:] {
				560	b.pf("\t%s,", kv)
				561	}
				562	b.p("}")
				563	}
				564
				565	func (b *builder) langIndex(s string) uint16 {
				566	if s == "und" {
				567	return 0
				568	}
				569	if i, ok := b.lang.find(s); ok {
				570	return uint16(i)
				571	}
				572	return uint16(strToInt(s)) + uint16(len(b.lang.s))
				573	}
				574
				575	// inc advances the string to its lexicographical successor.
				576	func inc(s string) string {
				577	const maxTagLength = 4
				578	var buf [maxTagLength]byte
				579	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
				580	for i := 0; i < len(s); i++ {
				581	if s[i] <= 'Z' {
				582	buf[i] -= 'a' - 'A'
				583	}
				584	}
				585	return string(buf[:len(s)])
				586	}
				587
				588	func (b *builder) parseIndices() {
				589	meta := b.supp.Metadata
				590
				591	for k, v := range b.registry {
				592	var ss *stringSet
				593	switch v.typ {
				594	case "language":
				595	if len(k) == 2 \|\| v.suppressScript != "" \|\| v.scope == "special" {
				596	b.lang.add(k)
				597	continue
				598	} else {
				599	ss = &b.langNoIndex
				600	}
				601	case "region":
				602	ss = &b.region
				603	case "script":
				604	ss = &b.script
				605	case "variant":
				606	ss = &b.variant
				607	default:
				608	continue
				609	}
				610	ss.add(k)
				611	}
				612	// Include any language for which there is data.
				613	for _, lang := range b.data.Locales() {
				614	if x := b.data.RawLDML(lang); false \|\|
				615	x.LocaleDisplayNames != nil \|\|
				616	x.Characters != nil \|\|
				617	x.Delimiters != nil \|\|
				618	x.Measurement != nil \|\|
				619	x.Dates != nil \|\|
				620	x.Numbers != nil \|\|
				621	x.Units != nil \|\|
				622	x.ListPatterns != nil \|\|
				623	x.Collations != nil \|\|
				624	x.Segmentations != nil \|\|
				625	x.Rbnf != nil \|\|
				626	x.Annotations != nil \|\|
				627	x.Metadata != nil {
				628
				629	from := strings.Split(lang, "_")
				630	if lang := from[0]; lang != "root" {
				631	b.lang.add(lang)
				632	}
				633	}
				634	}
				635	// Include locales for plural rules, which uses a different structure.
				636	for _, plurals := range b.data.Supplemental().Plurals {
				637	for _, rules := range plurals.PluralRules {
				638	for _, lang := range strings.Split(rules.Locales, " ") {
				639	if lang = strings.Split(lang, "_")[0]; lang != "root" {
				640	b.lang.add(lang)
				641	}
				642	}
				643	}
				644	}
				645	// Include languages in likely subtags.
				646	for _, m := range b.supp.LikelySubtags.LikelySubtag {
				647	from := strings.Split(m.From, "_")
				648	b.lang.add(from[0])
				649	}
				650	// Include ISO-639 alpha-3 bibliographic entries.
				651	for _, a := range meta.Alias.LanguageAlias {
				652	if a.Reason == "bibliographic" {
				653	b.langNoIndex.add(a.Type)
				654	}
				655	}
				656	// Include regions in territoryAlias (not all are in the IANA registry!)
				657	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
				658	if len(reg.Type) == 2 {
				659	b.region.add(reg.Type)
				660	}
				661	}
				662
				663	for _, s := range b.lang.s {
				664	if len(s) == 3 {
				665	b.langNoIndex.remove(s)
				666	}
				667	}
				668	b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
				669	b.writeConst("numScripts", len(b.script.slice()))
				670	b.writeConst("numRegions", len(b.region.slice()))
				671
				672	// Add dummy codes at the start of each list to represent "unspecified".
				673	b.lang.add("---")
				674	b.script.add("----")
				675	b.region.add("---")
				676
				677	// common locales
				678	b.locale.parse(meta.DefaultContent.Locales)
				679	}
				680
				681	// TODO: region inclusion data will probably not be used in future matchers.
				682
				683	func (b *builder) computeRegionGroups() {
				684	b.groups = make(map[int]index)
				685
				686	// Create group indices.
				687	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
				688	b.groups[i] = index(len(b.groups))
				689	}
				690	for _, g := range b.supp.TerritoryContainment.Group {
				691	// Skip UN and EURO zone as they are flattening the containment
				692	// relationship.
				693	if g.Type == "EZ" \|\| g.Type == "UN" {
				694	continue
				695	}
				696	group := b.region.index(g.Type)
				697	if _, ok := b.groups[group]; !ok {
				698	b.groups[group] = index(len(b.groups))
				699	}
				700	}
				701	if len(b.groups) > 64 {
				702	log.Fatalf("only 64 groups supported, found %d", len(b.groups))
				703	}
				704	b.writeConst("nRegionGroups", len(b.groups))
				705	}
				706
				707	var langConsts = []string{
				708	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
				709	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
				710	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
				711	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
				712	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
				713	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",
				714
				715	// constants for grandfathered tags (if not already defined)
				716	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
				717	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
				718	}
				719
// writeLanguage generates all tables needed for language canonicalization.
//
// It writes, in order: the language constants, the "lang" index string,
// langNoIndexOffset, the langNoIndex bit vector, altLangISO3, altLangIndex,
// langAliasMap and langAliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy so that the suffix bytes appended below do not end up
	// in b.lang.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// Remember the 3-letter form of a 2-letter code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// Fall back to the 3-letter code of the alias replacement.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A 3-letter code with a different first letter cannot be derived
		// from the 2-letter code and needs an altLangISO3 entry.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				add = s[1:]
			} else {
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Append the entry's position in altLangIndex as a fourth byte.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	// langAliasMap.s is read in the order left by writeSortedMap so that the
	// types line up with the entries of the emitted map.
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}
				854
				855	var scriptConsts = []string{
				856	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
				857	"Zzzz",
				858	}
				859
				860	func (b *builder) writeScript() {
				861	b.writeConsts(b.script.index, scriptConsts...)
				862	b.writeConst("script", tag.Index(b.script.join()))
				863
				864	supp := make([]uint8, len(b.lang.slice()))
				865	for i, v := range b.lang.slice()[1:] {
				866	if sc := b.registry[v].suppressScript; sc != "" {
				867	supp[i+1] = uint8(b.script.index(sc))
				868	}
				869	}
				870	b.writeSlice("suppressScript", supp)
				871
				872	// There is only one deprecated script in CLDR. This value is hard-coded.
				873	// We check here if the code must be updated.
				874	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
				875	if a.Type != "Qaai" {
				876	log.Panicf("unexpected deprecated stript %q", a.Type)
				877	}
				878	}
				879	}
				880
				881	func parseM49(s string) int16 {
				882	if len(s) == 0 {
				883	return 0
				884	}
				885	v, err := strconv.ParseUint(s, 10, 10)
				886	failOnError(err)
				887	return int16(v)
				888	}
				889
				890	var regionConsts = []string{
				891	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
				892	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
				893	}
				894
				// writeRegion emits the region constants and the region lookup tables:
				// isoRegionOffset, regionTypes, regionISO, altRegionISO3, altRegionIDs,
				// regionOldMap, m49, m49Index and fromM49. It draws on the registry entries
				// in b.registry, the IANA file "domains/root/db" and the CLDR supplemental
				// data in b.supp.
				895	func (b *builder) writeRegion() {
				896	b.writeConsts(b.region.index, regionConsts...)
				897
				// isoOffset is the index of "AA" in b.region. regionISO (below) covers the
				// entries from this index onwards; the entries before it are parsed as
				// numeric codes near the end of this function.
				898	isoOffset := b.region.index("AA")
				// m49map maps a region index to its numeric code; fromM49map is the reverse
				// mapping from numeric code to region index.
				899	m49map := make([]int16, len(b.region.slice()))
				900	fromM49map := make(map[int16]int)
				// altRegionISO3 concatenates the 3-letter codes that are not stored inline
				// in regionISO; altRegionIDs records the region index for each code added.
				901	altRegionISO3 := ""
				902	altRegionIDs := []uint16{}
				903
				904	b.writeConst("isoRegionOffset", isoOffset)
				905
				906	// 2-letter region lookup and mapping to numeric codes.
				907	regionISO := b.region.clone()
				908	regionISO.s = regionISO.s[isoOffset:]
				909	regionISO.sorted = false
				910
				// regionTypes holds one flag byte per region, built from
				// iso3166UserAssigned, ccTLD and bcp47Region.
				911	regionTypes := make([]byte, len(b.region.s))
				912
				913	// Is the region valid BCP 47?
				914	for s, e := range b.registry {
				// Only 2-character keys that are entirely upper case are handled here.
				915	if len(s) == 2 && s == strings.ToUpper(s) {
				916	i := b.region.index(s)
				917	for _, d := range e.description {
				// A description containing "Private use" marks the region as user-assigned.
				918	if strings.Contains(d, "Private use") {
				919	regionTypes[i] = iso3166UserAssigned
				920	}
				921	}
				922	regionTypes[i] \|= bcp47Region
				923	}
				924	}
				925
				926	// Is the region a valid ccTLD?
				927	r := gen.OpenIANAFile("domains/root/db")
				928	defer r.Close()
				929
				930	buf, err := ioutil.ReadAll(r)
				931	failOnError(err)
				// Each match captures a lower-case 2-letter name from a
				// "/domains/root/db/xx.html" reference; it is upper-cased for the lookup.
				932	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
				933	for _, m := range re.FindAllSubmatch(buf, -1) {
				934	i := b.region.index(strings.ToUpper(string(m[1])))
				935	regionTypes[i] \|= ccTLD
				936	}
				937
				938	b.writeSlice("regionTypes", regionTypes)
				939
				// iso3Set records every 3-letter code seen so far: -1 when its tail was
				// stored inline in regionISO, otherwise its byte offset in altRegionISO3.
				940	iso3Set := make(map[string]int)
				// update extends the regionISO entry of iso2 with its 3-letter code. A code
				// not seen before that shares its first letter with iso2 is stored inline
				// as its last two letters. Otherwise a 0 byte followed by a byte offset into
				// altRegionISO3 is stored; the code is appended to altRegionISO3 (and the
				// region to altRegionIDs) unless it already has an offset there.
				941	update := func(iso2, iso3 string) {
				942	i := regionISO.index(iso2)
				943	if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
				944	regionISO.s[i] += iso3[1:]
				945	iso3Set[iso3] = -1
				946	} else {
				947	if ok && j >= 0 {
				948	regionISO.s[i] += string([]byte{0, byte(j)})
				949	} else {
				950	iso3Set[iso3] = len(altRegionISO3)
				951	regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				952	altRegionISO3 += iso3
				953	altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
				954	}
				955	}
				956	}
				// Record the numeric code of every territory and build the reverse map.
				957	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
				958	i := regionISO.index(tc.Type) + isoOffset
				// A region must not be assigned a numeric code twice.
				959	if d := m49map[i]; d != 0 {
				960	log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
				961	}
				962	m49 := parseM49(tc.Numeric)
				963	m49map[i] = m49
				// The first region seen for a numeric code claims the reverse mapping.
				964	if r := fromM49map[m49]; r == 0 {
				965	fromM49map[m49] = i
				966	} else if r != i {
				// On a conflict the current region takes over only if the existing one is
				// deprecated in the registry and the current one is registered and either
				// not deprecated or has a greater deprecated string.
				967	dep := b.registry[regionISO.s[r-isoOffset]].deprecated
				968	if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" \|\| t.deprecated > dep) {
				969	fromM49map[m49] = i
				970	}
				971	}
				972	}
				// Territory aliases whose type is 3 characters starting with a digit and
				// whose replacement is 2 characters map the numeric code to the replacement,
				// unless that code is already mapped.
				973	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
				974	if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
				975	from := parseM49(ta.Type)
				976	if r := fromM49map[from]; r == 0 {
				977	fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
				978	}
				979	}
				980	}
				// Attach the 3-letter codes listed in the CLDR territory codes.
				981	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
				982	if len(tc.Alpha3) == 3 {
				983	update(tc.Type, tc.Alpha3)
				984	}
				985	}
				986	// These entries are not included in territoryCodes. Mostly 3-letter variants
				987	// of deleted codes and an entry for QU.
				988	for _, m := range []struct{ iso2, iso3 string }{
				989	{"CT", "CTE"},
				990	{"DY", "DHY"},
				991	{"HV", "HVO"},
				992	{"JT", "JTN"},
				993	{"MI", "MID"},
				994	{"NH", "NHB"},
				995	{"NQ", "ATN"},
				996	{"PC", "PCI"},
				997	{"PU", "PUS"},
				998	{"PZ", "PCZ"},
				999	{"RH", "RHO"},
				1000	{"VD", "VDR"},
				1001	{"WK", "WAK"},
				1002	// These three-letter codes are used for others as well.
				1003	{"FQ", "ATF"},
				1004	} {
				1005	update(m.iso2, m.iso3)
				1006	}
				// Entries whose length is not 4 at this point (presumably those that never
				// received a 3-letter code) are extended with a blank suffix.
				1007	for i, s := range regionISO.s {
				1008	if len(s) != 4 {
				1009	regionISO.s[i] = s + " "
				1010	}
				1011	}
				1012	b.writeConst("regionISO", tag.Index(regionISO.join()))
				1013	b.writeConst("altRegionISO3", altRegionISO3)
				1014	b.writeSlice("altRegionIDs", altRegionIDs)
				1015
				1016	// Create list of deprecated regions.
				1017	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
				1018	// Transitionally-reserved mapping not included.
				1019	regionOldMap := stringSet{}
				1020	// Include regions in territoryAlias (not all are in the IANA registry!)
				1021	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
				1022	if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
				1023	regionOldMap.add(reg.Type)
				1024	regionOldMap.updateLater(reg.Type, reg.Replacement)
				// The second result of find is discarded for both lookups.
				1025	i, _ := regionISO.find(reg.Type)
				1026	j, _ := regionISO.find(reg.Replacement)
				// A deprecated region without a numeric code of its own takes the code of
				// its replacement.
				1027	if k := m49map[i+isoOffset]; k == 0 {
				1028	m49map[i+isoOffset] = m49map[j+isoOffset]
				1029	}
				1030	}
				1031	}
				1032	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
				1033	return uint16(b.region.index(s))
				1034	})
				1035	// 3-digit region lookup, groupings.
				// Entries 1 through isoOffset-1 of b.region are parsed as numeric codes and
				// map to themselves in both directions.
				1036	for i := 1; i < isoOffset; i++ {
				1037	m := parseM49(b.region.s[i])
				1038	m49map[i] = m
				1039	fromM49map[m] = i
				1040	}
				1041	b.writeSlice("m49", m49map)
				1042
				// A fromM49 entry packs the low searchBits bits of a numeric code, shifted
				// left by regionBits, together with the region index in the low bits.
				1043	const (
				1044	searchBits = 7
				1045	regionBits = 9
				1046	)
				1047	if len(m49map) >= 1<<regionBits {
				1048	log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
				1049	}
				1050	m49Index := [9]int16{}
				1051	fromM49 := []uint16{}
				// Collect and sort the numeric codes so fromM49 is written in ascending order.
				1052	m49 := []int{}
				1053	for k, _ := range fromM49map {
				1054	m49 = append(m49, int(k))
				1055	}
				1056	sort.Ints(m49)
				// The smallest key is skipped (presumably the 0 code - TODO confirm).
				1057	for _, k := range m49[1:] {
				1058	val := (k & (1<<searchBits - 1)) << regionBits
				1059	fromM49 = append(fromM49, uint16(val\|fromM49map[int16(k)]))
				// For bucket k>>searchBits, slot bucket+1 of m49Index records the length of
				// fromM49 after the most recent entry of that bucket.
				1060	m49Index[1:][k>>searchBits] = int16(len(fromM49))
				1061	}
				1062	b.writeSlice("m49Index", m49Index)
				1063	b.writeSlice("fromM49", fromM49)
				1064	}
				1065
				// Space-separated lists of 2-letter region codes, grouped by status as named
				// by each constant.
				1066	const (
				1067	// TODO: put these lists in regionTypes as user data? Could be used for
				1068	// various optimizations and refinements and could be exposed in the API.
				1069	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
				1070	iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
				1071	// DY and RH are actually not deleted, but indeterminately reserved.
				1072	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
				1073	)
				1074
				// Bit flags stored per region in the regionTypes table built by writeRegion.
				1075	const (
				// Set for registry regions with a description containing "Private use".
				1076	iso3166UserAssigned = 1 << iota
				// Set for regions whose 2-letter code appears in the IANA root zone file.
				1077	ccTLD
				// Set for 2-letter upper-case keys found in the registry.
				1078	bcp47Region
				1079	)
				1080
				// find reports the position of the first element of list that equals s.
				// It returns -1 when list holds no such element.
				func find(list []string, s string) int {
					for pos := 0; pos < len(list); pos++ {
						if list[pos] == s {
							return pos
						}
					}
					return -1
				}
				1089
				1090	// writeVariant generates per-variant information and creates a map from variant
				1091	// name to index value. We assign index values such that sorting multiple
				1092	// variants by index value will result in the correct order.
				1093	// There are two types of variants: specialized and general. Specialized variants
				1094	// are only applicable to certain language or language-script pairs. Generalized
				1095	// variants apply to any language. Generalized variants always sort after
				1096	// specialized variants. We will therefore always assign a higher index value
				1097	// to a generalized variant than any other variant. Generalized variants are
				1098	// sorted alphabetically among themselves.
				1099	// Specialized variants may also sort after other specialized variants. Such
				1100	// variants will be ordered after any of the variants they may follow.
				1101	// We assume that if a variant x is followed by a variant y, then for any prefix
				1102	// p of x, p-x is a prefix of y. This allows us to order tags based on the
				1103	// maximum of the length of any of its prefixes.
				1104	// TODO: it is possible to define a set of Prefix values on variants such that
				1105	// a total order cannot be defined to the point that this algorithm breaks.
				1106	// In other words, we cannot guarantee the same order of variants for the
				1107	// future using the same algorithm or for non-compliant combinations of
				1108	// variants. For this reason, consider using simple alphabetic sorting
				1109	// of variants and ignore Prefix restrictions altogether.
				1110	func (b *builder) writeVariant() {
				1111	generalized := stringSet{}
				1112	specialized := stringSet{}
				1113	specializedExtend := stringSet{}
				1114	// Collate the variants by type and check assumptions.
				1115	for _, v := range b.variant.slice() {
				1116	e := b.registry[v]
				// A variant without any registry prefix is generalized.
				1117	if len(e.prefix) == 0 {
				1118	generalized.add(v)
				1119	continue
				1120	}
				// Classification looks at the first prefix only.
				1121	c := strings.Split(e.prefix[0], "-")
				// The second subtag counts as a script or region when b.script or b.region
				// knows it.
				1122	hasScriptOrRegion := false
				1123	if len(c) > 1 {
				1124	_, hasScriptOrRegion = b.script.find(c[1])
				1125	if !hasScriptOrRegion {
				1126	_, hasScriptOrRegion = b.region.find(c[1])
				1127
				1128	}
				1129	}
				1130	if len(c) == 1 \|\| len(c) == 2 && hasScriptOrRegion {
				1131	// Variant is preceded by a language.
				1132	specialized.add(v)
				1133	continue
				1134	}
				1135	// Variant is preceded by another variant.
				1136	specializedExtend.add(v)
				// prefix is the leading part (first subtag, a "-", and the script or region
				// if one was found) that every prefix of this variant must start with.
				1137	prefix := c[0] + "-"
				1138	if hasScriptOrRegion {
				1139	prefix += c[1]
				1140	}
				1141	for _, p := range e.prefix {
				1142	// Verify that the prefix minus the last element is a prefix of the
				1143	// predecessor element.
				1144	i := strings.LastIndex(p, "-")
				// pred is the registry entry of the last subtag of p.
				1145	pred := b.registry[p[i+1:]]
				1146	if find(pred.prefix, p[:i]) < 0 {
				1147	log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
				1148	}
				1149	// The sorting used below does not work in the general case. It works
				1150	// if we assume that variants that may be followed by others only have
				1151	// prefixes of the same length. Verify this.
				1152	count := strings.Count(p[:i], "-")
				1153	for _, q := range pred.prefix {
				1154	if c := strings.Count(q, "-"); c != count {
				1155	log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
				1156	}
				1157	}
				1158	if !strings.HasPrefix(p, prefix) {
				1159	log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
				1160	}
				1161	}
				1162	}
				1163
				1164	// Sort extended variants.
				1165	a := specializedExtend.s
				1166	less := func(v, w string) bool {
				1167	// Sort by the maximum number of elements.
				// maxCount returns the largest number of "-" separators found in any
				// registry prefix of s.
				1168	maxCount := func(s string) (max int) {
				1169	for _, p := range b.registry[s].prefix {
				1170	if c := strings.Count(p, "-"); c > max {
				1171	max = c
				1172	}
				1173	}
				1174	return
				1175	}
				1176	if cv, cw := maxCount(v), maxCount(w); cv != cw {
				1177	return cv < cw
				1178	}
				1179	// Sort by name as tie breaker.
				1180	return v < w
				1181	}
				1182	sort.Sort(funcSorter{less, sort.StringSlice(a)})
				1183	specializedExtend.frozen = true
				1184
				1185	// Create index from variant name to index.
				1186	variantIndex := make(map[string]uint8)
				// add assigns consecutive index values to the given variants, continuing
				// from the current size of variantIndex.
				1187	add := func(s []string) {
				1188	for _, v := range s {
				1189	variantIndex[v] = uint8(len(variantIndex))
				1190	}
				1191	}
				// Index order: specialized, then extended specialized (in the order sorted
				// above), then generalized.
				1192	add(specialized.slice())
				1193	add(specializedExtend.s)
				1194	numSpecialized := len(variantIndex)
				1195	add(generalized.slice())
				1196	if n := len(variantIndex); n > 255 {
				1197	log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
				1198	}
				1199	b.writeMap("variantIndex", variantIndex)
				1200	b.writeConst("variantNumSpecialized", numSpecialized)
				1201	}
				1202
				// writeLanguageInfo has an empty body and writes nothing.
				1203	func (b *builder) writeLanguageInfo() {
				1204	}
				1205
				1206	// writeLikelyData writes tables that are used both for finding parent relations and for
				1207	// language matching. Each entry contains additional bits to indicate the status of the
				1208	// data to know when it cannot be used for parent relations.
				1209	func (b *builder) writeLikelyData() {
				// Bit flags stored in the flags field of the entries below.
				1210	const (
				1211	isList = 1 << iota
				1212	scriptInFrom
				1213	regionInFrom
				1214	)
				1215	type ( // generated types
				1216	likelyScriptRegion struct {
				1217	region uint16
				1218	script uint8
				1219	flags uint8
				1220	}
				1221	likelyLangScript struct {
				1222	lang uint16
				1223	script uint8
				1224	flags uint8
				1225	}
				1226	likelyLangRegion struct {
				1227	lang uint16
				1228	region uint16
				1229	}
				1230	// likelyTag is used for getting likely tags for group regions, where
				1231	// the likely region might be a region contained in the group.
				1232	likelyTag struct {
				1233	lang uint16
				1234	region uint16
				1235	script uint8
				1236	}
				1237	)
				1238	var ( // generated variables
				1239	likelyRegionGroup = make([]likelyTag, len(b.groups))
				1240	likelyLang = make([]likelyScriptRegion, len(b.lang.s))
				1241	likelyRegion = make([]likelyLangScript, len(b.region.s))
				1242	likelyScript = make([]likelyLangRegion, len(b.script.s))
				1243	likelyLangList = []likelyScriptRegion{}
				1244	likelyRegionList = []likelyLangScript{}
				1245	)
				// fromTo holds the "_"-separated subtags of one likely-subtag mapping.
				1246	type fromTo struct {
				1247	from, to []string
				1248	}
				// Mappings collected per language index and per region index; they are
				// turned into table entries after the loop below.
				1249	langToOther := map[int][]fromTo{}
				1250	regionToOther := map[int][]fromTo{}
				1251	for _, m := range b.supp.LikelySubtags.LikelySubtag {
				1252	from := strings.Split(m.From, "_")
				1253	to := strings.Split(m.To, "_")
				// Sanity checks: "to" must have exactly 3 subtags, "from" at most 3, and
				// the language may only change when "from" starts with "und".
				1254	if len(to) != 3 {
				1255	log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
				1256	}
				1257	if len(from) > 3 {
				1258	log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
				1259	}
				1260	if from[0] != to[0] && from[0] != "und" {
				1261	log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
				1262	}
				1263	if len(from) == 3 {
				1264	if from[2] != to[2] {
				1265	log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
				1266	}
				1267	if from[0] != "und" {
				1268	log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
				1269	}
				1270	}
				// A single-subtag "from", or one with a language other than "und", is keyed
				// by language; "und" uses id 0.
				1271	if len(from) == 1 \|\| from[0] != "und" {
				1272	id := 0
				1273	if from[0] != "und" {
				1274	id = b.lang.index(from[0])
				1275	}
				1276	langToOther[id] = append(langToOther[id], fromTo{from, to})
				// "und" followed by a 4-character subtag: the second subtag is looked up
				// as a script and gets the language and region of "to".
				1277	} else if len(from) == 2 && len(from[1]) == 4 {
				1278	sid := b.script.index(from[1])
				1279	likelyScript[sid].lang = uint16(b.langIndex(to[0]))
				1280	likelyScript[sid].region = uint16(b.region.index(to[2]))
				1281	} else {
				// Everything else is keyed by the last subtag of "from", looked up as a
				// region.
				1282	r := b.region.index(from[len(from)-1])
				// Regions present in b.groups are stored directly in likelyRegionGroup.
				1283	if id, ok := b.groups[r]; ok {
				1284	if from[0] != "und" {
				1285	log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				1286	}
				1287	likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				1288	likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				1289	likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
				1290	} else {
				1291	regionToOther[r] = append(regionToOther[r], fromTo{from, to})
				1292	}
				1293	}
				1294	}
				1295	b.writeType(likelyLangRegion{})
				1296	b.writeSlice("likelyScript", likelyScript)
				1297
				// A language with one mapping stores it inline. With several mappings the
				// entry is flagged isList, region holds the start offset in likelyLangList
				// and script holds the number of list entries.
				1298	for id := range b.lang.s {
				1299	list := langToOther[id]
				1300	if len(list) == 1 {
				1301	likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
				1302	likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
				1303	} else if len(list) > 1 {
				1304	likelyLang[id].flags = isList
				1305	likelyLang[id].region = uint16(len(likelyLangList))
				1306	likelyLang[id].script = uint8(len(list))
				1307	for _, x := range list {
				// With more than one "from" subtag, the second one is taken to be the
				// region if it equals the region of "to", and a script otherwise.
				1308	flags := uint8(0)
				1309	if len(x.from) > 1 {
				1310	if x.from[1] == x.to[2] {
				1311	flags = regionInFrom
				1312	} else {
				1313	flags = scriptInFrom
				1314	}
				1315	}
				1316	likelyLangList = append(likelyLangList, likelyScriptRegion{
				1317	region: uint16(b.region.index(x.to[2])),
				1318	script: uint8(b.script.index(x.to[1])),
				1319	flags: flags,
				1320	})
				1321	}
				1322	}
				1323	}
				1324	// TODO: merge suppressScript data with this table.
				1325	b.writeType(likelyScriptRegion{})
				1326	b.writeSlice("likelyLang", likelyLang)
				1327	b.writeSlice("likelyLangList", likelyLangList)
				1328
				// Regions follow the same inline-or-list scheme: for a list, lang holds the
				// start offset in likelyRegionList and script holds the number of entries.
				1329	for id := range b.region.s {
				1330	list := regionToOther[id]
				1331	if len(list) == 1 {
				1332	likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
				1333	likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
				1334	if len(list[0].from) > 2 {
				1335	likelyRegion[id].flags = scriptInFrom
				1336	}
				1337	} else if len(list) > 1 {
				1338	likelyRegion[id].flags = isList
				1339	likelyRegion[id].lang = uint16(len(likelyRegionList))
				1340	likelyRegion[id].script = uint8(len(list))
				1341	for i, x := range list {
				// Only the first list element may have a 2-subtag "from"; every later
				// element must have exactly 3 subtags.
				1342	if len(x.from) == 2 && i != 0 \|\| i > 0 && len(x.from) != 3 {
				1343	log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				1344	}
				// x is redeclared here as the table entry, shadowing the loop variable.
				1345	x := likelyLangScript{
				1346	lang: uint16(b.langIndex(x.to[0])),
				1347	script: uint8(b.script.index(x.to[1])),
				1348	}
				// NOTE(review): this tests list[0].from for every element rather than the
				// current element's "from" - confirm this is intended.
				1349	if len(list[0].from) > 2 {
				1350	x.flags = scriptInFrom
				1351	}
				1352	likelyRegionList = append(likelyRegionList, x)
				1353	}
				1354	}
				1355	}
				1356	b.writeType(likelyLangScript{})
				1357	b.writeSlice("likelyRegion", likelyRegion)
				1358	b.writeSlice("likelyRegionList", likelyRegionList)
				1359
				1360	b.writeType(likelyTag{})
				1361	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
				1362	}
				1363
				// mutualIntelligibility is the element type of the matchLang table built by
				// writeMatchData. want and have are the language indices of the desired and
				// supported language of a language match, distance is its parsed distance
				// and oneway records whether the match was marked oneway "true".
				1364	type mutualIntelligibility struct {
				1365	want, have uint16
				1366	distance uint8
				1367	oneway bool
				1368	}
				1369
				// scriptIntelligibility is the element type of the matchScript table built
				// by writeMatchData. It holds the language and script indices of the desired
				// (want) and supported (have) side of a language-script match together with
				// its distance.
				1370	type scriptIntelligibility struct {
				1371	wantLang, haveLang uint16
				1372	wantScript, haveScript uint8
				1373	distance uint8
				1374	// Always oneway
				1375	}
				1376
				// regionIntelligibility is the element type of the matchRegion table built
				// by writeMatchData from language matches with three subtags.
				1377	type regionIntelligibility struct {
				1378	lang uint16 // compact language id
				1379	script uint8 // 0 means any
				1380	group uint8 // 0 means any; if bit 7 is set it means inverse
				1381	distance uint8
				1382	// Always twoway.
				1383	}
				1384
				1385	// writeMatchData writes tables with languages and scripts for which there is
				1386	// mutual intelligibility. The data is based on CLDR's languageMatching data.
				1387	// Note that we use a different algorithm than the one defined by CLDR and that
				1388	// we slightly modify the data. For example, we convert scores to confidence levels.
				1389	// We also drop all region-related data as we use a different algorithm to
				1390	// determine region equivalence.
				1391	func (b *builder) writeMatchData() {
				1392	lm := b.supp.LanguageMatching.LanguageMatches
				// Presumably narrows lm in place to the elements whose type is
				// "written_new" - TODO confirm against the cldr package.
				1393	cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
				1394
				// regionHierarchy maps a containment group type to the regions listed in its
				// Contains attribute.
				1395	regionHierarchy := map[string][]string{}
				1396	for _, g := range b.supp.TerritoryContainment.Group {
				1397	regions := strings.Split(g.Contains, " ")
				1398	regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
				1399	}
				// regionToGroups has, per region, bit i set when the region belongs to
				// match variable i.
				1400	regionToGroups := make([]uint8, len(b.region.s))
				1401
				// idToIndex maps a match variable id to its position plus one.
				1402	idToIndex := map[string]uint8{}
				1403	for i, mv := range lm[0].MatchVariable {
				// At most 7 match variables (positions 0 through 6) are accepted.
				1404	if i > 6 {
				1405	log.Fatalf("Too many groups: %d", i)
				1406	}
				1407	idToIndex[mv.Id] = uint8(i + 1)
				1408	// TODO: also handle '-'
				1409	for _, r := range strings.Split(mv.Value, "+") {
				// Worklist walk: mark r and everything regionHierarchy lists beneath it.
				1410	todo := []string{r}
				1411	for k := 0; k < len(todo); k++ {
				1412	r := todo[k]
				1413	regionToGroups[b.region.index(r)] \|= 1 << uint8(i)
				1414	todo = append(todo, regionHierarchy[r]...)
				1415	}
				1416	}
				1417	}
				1418	b.writeSlice("regionToGroups", regionToGroups)
				1419
				1420	// maps language id to in- and out-of-group region.
				1421	paradigmLocales := [][3]uint16{}
				1422	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
				// Locales are consumed in pairs. x[0] is the language (set by both locales
				// of the pair, so the second one wins); x[1] and x[2] are the regions of the
				// first and second locale when they have one.
				1423	for i := 0; i < len(locales); i += 2 {
				1424	x := [3]uint16{}
				1425	for j := 0; j < 2; j++ {
				1426	pc := strings.SplitN(locales[i+j], "-", 2)
				1427	x[0] = b.langIndex(pc[0])
				1428	if len(pc) == 2 {
				1429	x[1+j] = uint16(b.region.index(pc[1]))
				1430	}
				1431	}
				1432	paradigmLocales = append(paradigmLocales, x)
				1433	}
				1434	b.writeSlice("paradigmLocales", paradigmLocales)
				1435
				1436	b.writeType(mutualIntelligibility{})
				1437	b.writeType(scriptIntelligibility{})
				1438	b.writeType(regionIntelligibility{})
				1439
				1440	matchLang := []mutualIntelligibility{}
				1441	matchScript := []scriptIntelligibility{}
				1442	matchRegion := []regionIntelligibility{}
				1443	// Convert the languageMatch entries in lists keyed by desired language.
				1444	for _, m := range lm[0].LanguageMatch {
				1445	// Different versions of CLDR use different separators.
				1446	desired := strings.Replace(m.Desired, "-", "_", -1)
				1447	supported := strings.Replace(m.Supported, "-", "_", -1)
				1448	d := strings.Split(desired, "_")
				1449	s := strings.Split(supported, "_")
				1450	if len(d) != len(s) {
				1451	log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
				// NOTE(review): not reached, since log.Fatalf exits the program.
				1452	continue
				1453	}
				// NOTE(review): the ParseInt error is discarded here.
				1454	distance, _ := strconv.ParseInt(m.Distance, 10, 8)
				// Dispatch on the number of subtags in the desired tag.
				1455	switch len(d) {
				1456	case 2:
				1457	if desired == supported && desired == "_" {
				1458	continue
				1459	}
				1460	// language-script pair.
				1461	matchScript = append(matchScript, scriptIntelligibility{
				1462	wantLang: uint16(b.langIndex(d[0])),
				1463	haveLang: uint16(b.langIndex(s[0])),
				1464	wantScript: uint8(b.script.index(d[1])),
				1465	haveScript: uint8(b.script.index(s[1])),
				1466	distance: uint8(distance),
				1467	})
				// Unless marked oneway, the reverse direction is added as a second entry.
				1468	if m.Oneway != "true" {
				1469	matchScript = append(matchScript, scriptIntelligibility{
				1470	wantLang: uint16(b.langIndex(s[0])),
				1471	haveLang: uint16(b.langIndex(d[0])),
				1472	wantScript: uint8(b.script.index(s[1])),
				1473	haveScript: uint8(b.script.index(d[1])),
				1474	distance: uint8(distance),
				1475	})
				1476	}
				1477	case 1:
				1478	if desired == supported && desired == "*" {
				1479	continue
				1480	}
				1481	if distance == 1 {
				1482	// nb == no is already handled by macro mapping. Check there
				1483	// really is only this case.
				1484	if d[0] != "no" \|\| s[0] != "nb" {
				1485	log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
				1486	}
				1487	continue
				1488	}
				1489	// TODO: consider dropping oneway field and just doubling the entry.
				1490	matchLang = append(matchLang, mutualIntelligibility{
				1491	want: uint16(b.langIndex(d[0])),
				1492	have: uint16(b.langIndex(s[0])),
				1493	distance: uint8(distance),
				1494	oneway: m.Oneway == "true",
				1495	})
				1496	case 3:
				1497	if desired == supported && desired == "__*" {
				1498	continue
				1499	}
				1500	if desired != supported {
				1501	// This is now supported by CLDR, but only one case, which
				1502	// should already be covered by paradigm locales. For instance,
				1503	// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
				1504	// testdata/CLDRLocaleMatcherTest.txt tests this.
				1505	if supported != "en_*_GB" {
				1506	log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
				1507	}
				1508	continue
				1509	}
				1510	ri := regionIntelligibility{
				1511	lang: b.langIndex(d[0]),
				1512	distance: uint8(distance),
				1513	}
				// A "*" script leaves ri.script at 0.
				1514	if d[1] != "*" {
				1515	ri.script = uint8(b.script.index(d[1]))
				1516	}
				// A "$!name" region sets bit 7 and is rewritten to "$name" before falling
				// through to the "$name" case, which ORs in the index from idToIndex.
				1517	switch {
				1518	case d[2] == "*":
				1519	ri.group = 0x80 // not contained in anything
				1520	case strings.HasPrefix(d[2], "$!"):
				1521	ri.group = 0x80
				1522	d[2] = "$" + d[2][len("$!"):]
				1523	fallthrough
				1524	case strings.HasPrefix(d[2], "$"):
				1525	ri.group \|= idToIndex[d[2]]
				1526	}
				1527	matchRegion = append(matchRegion, ri)
				1528	default:
				1529	log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
				1530	}
				1531	}
				// Each table is stably sorted by ascending distance before it is written.
				1532	sort.SliceStable(matchLang, func(i, j int) bool {
				1533	return matchLang[i].distance < matchLang[j].distance
				1534	})
				1535	b.writeSlice("matchLang", matchLang)
				1536
				1537	sort.SliceStable(matchScript, func(i, j int) bool {
				1538	return matchScript[i].distance < matchScript[j].distance
				1539	})
				1540	b.writeSlice("matchScript", matchScript)
				1541
				1542	sort.SliceStable(matchRegion, func(i, j int) bool {
				1543	return matchRegion[i].distance < matchRegion[j].distance
				1544	})
				1545	b.writeSlice("matchRegion", matchRegion)
				1546	}
				1547
				// writeRegionInclusionData writes the regionContainment, regionInclusion,
				// regionInclusionBits and regionInclusionNext tables. They are derived from
				// the CLDR territory containment groups, leaving out the "EZ" and "UN"
				// groups, and from the group indices in b.groups.
				1548	func (b *builder) writeRegionInclusionData() {
				1549	var (
				1550	// mm holds for each group the set of groups with a distance of 1.
				1551	mm = make(map[int][]index)
				1552
				1553	// containment holds for each group the transitive closure of
				1554	// containment of other groups.
				1555	containment = make(map[index][]index)
				1556	)
				1557	for _, g := range b.supp.TerritoryContainment.Group {
				1558	// Skip UN and EURO zone as they are flattening the containment
				1559	// relationship.
				1560	if g.Type == "EZ" \|\| g.Type == "UN" {
				1561	continue
				1562	}
				1563	group := b.region.index(g.Type)
				1564	groupIdx := b.groups[group]
				1565	for _, mem := range strings.Split(g.Contains, " ") {
				1566	r := b.region.index(mem)
				// Every member records the containing group.
				1567	mm[r] = append(mm[r], groupIdx)
				// A member that is itself a group is also recorded on the containing
				// group, both in mm and as a directly contained group.
				1568	if g, ok := b.groups[r]; ok {
				1569	mm[group] = append(mm[group], g)
				1570	containment[groupIdx] = append(containment[groupIdx], g)
				1571	}
				1572	}
				1573	}
				1574
				// regionContainment holds one bit mask per group index.
				1575	regionContainment := make([]uint64, len(b.groups))
				1576	for _, g := range b.groups {
				1577	l := containment[g]
				1578
				1579	// Compute the transitive closure of containment.
				// l grows while it is being scanned, so groups appended here are expanded
				// in later iterations.
				1580	for i := 0; i < len(l); i++ {
				1581	l = append(l, containment[l[i]]...)
				1582	}
				1583
				1584	// Compute the bitmask.
				// The mask has the group's own bit plus one bit per group in the closure.
				1585	regionContainment[g] = 1 << g
				1586	for _, v := range l {
				1587	regionContainment[g] \|= 1 << v
				1588	}
				1589	}
				1590	b.writeSlice("regionContainment", regionContainment)
				1591
				// regionInclusion maps a region index to an index into regionInclusionBits;
				// bvs maps each distinct bit vector to that index.
				1592	regionInclusion := make([]uint8, len(b.region.s))
				1593	bvs := make(map[uint64]index)
				1594	// Make the first bitvector positions correspond with the groups.
				1595	for r, i := range b.groups {
				1596	bv := uint64(1 << i)
				1597	for _, g := range mm[r] {
				1598	bv \|= 1 << g
				1599	}
				1600	bvs[bv] = i
				1601	regionInclusion[r] = uint8(bvs[bv])
				1602	}
				// Regions that are not groups get the vector of the groups recorded for
				// them in mm; new vectors receive the next free index.
				1603	for r := 1; r < len(b.region.s); r++ {
				1604	if _, ok := b.groups[r]; !ok {
				1605	bv := uint64(0)
				1606	for _, g := range mm[r] {
				1607	bv \|= 1 << g
				1608	}
				1609	if bv == 0 {
				1610	// Pick the world for unspecified regions.
				1611	bv = 1 << b.groups[b.region.index("001")]
				1612	}
				1613	if _, ok := bvs[bv]; !ok {
				1614	bvs[bv] = index(len(bvs))
				1615	}
				1616	regionInclusion[r] = uint8(bvs[bv])
				1617	}
				1618	}
				1619	b.writeSlice("regionInclusion", regionInclusion)
				// Invert bvs into a slice indexed by bit vector index.
				1620	regionInclusionBits := make([]uint64, len(bvs))
				1621	for k, v := range bvs {
				1622	regionInclusionBits[v] = uint64(k)
				1623	}
				1624	// Add bit vectors for increasingly large distances until a fixed point is reached.
				// regionInclusionNext[i] is the index of the vector obtained by ORing vector
				// i with the vectors of all groups whose bit is set in it. Vectors appended
				// here are visited by later iterations of this loop.
				1625	regionInclusionNext := []uint8{}
				1626	for i := 0; i < len(regionInclusionBits); i++ {
				1627	bits := regionInclusionBits[i]
				1628	next := bits
				// The inner i shadows the outer one and ranges over the group bit positions.
				1629	for i := uint(0); i < uint(len(b.groups)); i++ {
				1630	if bits&(1<<i) != 0 {
				1631	next \|= regionInclusionBits[i]
				1632	}
				1633	}
				1634	if _, ok := bvs[next]; !ok {
				1635	bvs[next] = index(len(bvs))
				1636	regionInclusionBits = append(regionInclusionBits, next)
				1637	}
				1638	regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
				1639	}
				1640	b.writeSlice("regionInclusionBits", regionInclusionBits)
				1641	b.writeSlice("regionInclusionNext", regionInclusionNext)
				1642	}
				1643
				// parentRel is the element type of the parents table built by writeParents.
				// lang is the language index of the parent locale and toRegion the index of
				// its region. script is set only when the parent does not have exactly two
				// subtags; maxScript then equals script, and is the index of "Latn"
				// otherwise. fromRegion lists the region indices taken from the last subtag
				// of each locale that has this parent.
				1644	type parentRel struct {
				1645	lang uint16
				1646	script uint8
				1647	maxScript uint8
				1648	toRegion uint16
				1649	fromRegion []uint16
				1650	}
				1651
				1652	func (b *builder) writeParents() {
				1653	b.writeType(parentRel{})
				1654
				1655	parents := []parentRel{}
				1656
				1657	// Construct parent overrides.
				1658	n := 0
				1659	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
				1660	// Skipping non-standard scripts to root is implemented using addTags.
				1661	if p.Parent == "root" {
				1662	continue
				1663	}
				1664
				1665	sub := strings.Split(p.Parent, "_")
				1666	parent := parentRel{lang: b.langIndex(sub[0])}
				1667	if len(sub) == 2 {
				1668	// TODO: check that all undefined scripts are indeed Latn in these
				1669	// cases.
				1670	parent.maxScript = uint8(b.script.index("Latn"))
				1671	parent.toRegion = uint16(b.region.index(sub[1]))
				1672	} else {
				1673	parent.script = uint8(b.script.index(sub[1]))
				1674	parent.maxScript = parent.script
				1675	parent.toRegion = uint16(b.region.index(sub[2]))
				1676	}
				1677	for _, c := range strings.Split(p.Locales, " ") {
				1678	region := b.region.index(c[strings.LastIndex(c, "_")+1:])
				1679	parent.fromRegion = append(parent.fromRegion, uint16(region))
				1680	}
				1681	parents = append(parents, parent)
				1682	n += len(parent.fromRegion)
				1683	}
				1684	b.writeSliceAddSize("parents", n*2, parents)
				1685	}
				1686
				// main drives the generator: it repackages gen_common.go, creates a code
				// writer whose contents are written to tables.go for package "language" when
				// main returns, and then runs the builder steps in a fixed order.
				1687	func main() {
				1688	gen.Init()
				1689
				1690	gen.Repackage("gen_common.go", "common.go", "language")
				1691
				1692	w := gen.NewCodeWriter()
				// NOTE(review): the output name is hard-coded here; the -output flag
				// (outputFile) is not consulted in this function.
				1693	defer w.WriteGoFile("tables.go", "language")
				1694
				1695	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
				1696
				1697	b := newBuilder(w)
				1698	gen.WriteCLDRVersion(w)
				1699
				1700	b.parseIndices()
				1701	b.writeType(fromTo{})
				1702	b.writeLanguage()
				1703	b.writeScript()
				1704	b.writeRegion()
				1705	b.writeVariant()
				1706	// TODO: b.writeLocale()
				1707	b.computeRegionGroups()
				1708	b.writeLikelyData()
				1709	b.writeMatchData()
				1710	b.writeRegionInclusionData()
				1711	b.writeParents()
				1712	}