khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | // Language tag table generator. |
| 8 | // Data read from the web. |
| 9 | |
| 10 | package main |
| 11 | |
| 12 | import ( |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 13 | "flag" |
| 14 | "fmt" |
| 15 | "io" |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 16 | "log" |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 17 | "sort" |
| 18 | "strconv" |
| 19 | "strings" |
| 20 | |
| 21 | "golang.org/x/text/internal/gen" |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 22 | "golang.org/x/text/internal/language" |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 23 | "golang.org/x/text/unicode/cldr" |
| 24 | ) |
| 25 | |
// Command-line flags understood by the generator.
var (
	// test is the -test flag (default false); its usage text describes it as
	// a way to compare web data with package data.
	test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data.")

	// outputFile is the -output flag naming the generated file; it defaults
	// to "tables.go".
	outputFile = flag.String("output", "tables.go", "output file for generated tables")
)
| 34 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 35 | func main() { |
| 36 | gen.Init() |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 37 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 38 | w := gen.NewCodeWriter() |
| 39 | defer w.WriteGoFile("tables.go", "language") |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 40 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 41 | b := newBuilder(w) |
| 42 | gen.WriteCLDRVersion(w) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 43 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 44 | b.writeConstants() |
| 45 | b.writeMatchData() |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 46 | } |
| 47 | |
// builder bundles the output writer and the decoded CLDR data that the
// table-writing methods operate on.
type builder struct {
	w    *gen.CodeWriter        // destination for all generated code
	hw   io.Writer              // MultiWriter for w and w.Hash
	data *cldr.CLDR             // CLDR data decoded by newBuilder
	supp *cldr.SupplementalData // result of data.Supplemental()
}
| 54 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 55 | func (b *builder) langIndex(s string) uint16 { |
| 56 | return uint16(language.MustParseBase(s)) |
| 57 | } |
| 58 | |
| 59 | func (b *builder) regionIndex(s string) int { |
| 60 | return int(language.MustParseRegion(s)) |
| 61 | } |
| 62 | |
| 63 | func (b *builder) scriptIndex(s string) int { |
| 64 | return int(language.MustParseScript(s)) |
| 65 | } |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 66 | |
| 67 | func newBuilder(w *gen.CodeWriter) *builder { |
| 68 | r := gen.OpenCLDRCoreZip() |
| 69 | defer r.Close() |
| 70 | d := &cldr.Decoder{} |
| 71 | data, err := d.DecodeZip(r) |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 72 | if err != nil { |
| 73 | log.Fatal(err) |
| 74 | } |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 75 | b := builder{ |
| 76 | w: w, |
| 77 | hw: io.MultiWriter(w, w.Hash), |
| 78 | data: data, |
| 79 | supp: data.Supplemental(), |
| 80 | } |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 81 | return &b |
| 82 | } |
| 83 | |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 84 | // writeConsts computes f(v) for all v in values and writes the results |
| 85 | // as constants named _v to a single constant block. |
| 86 | func (b *builder) writeConsts(f func(string) int, values ...string) { |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 87 | fmt.Fprintln(b.w, "const (") |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 88 | for _, v := range values { |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 89 | fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v)) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 90 | } |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 91 | fmt.Fprintln(b.w, ")") |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 92 | } |
| 93 | |
// TODO: region inclusion data will probably not be used in future matchers.
| 95 | |
// Identifiers for which writeConstants emits a named index constant.
var (
	// langConsts lists the base language tags.
	langConsts = []string{
		"de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
	}

	// scriptConsts lists the script tags.
	scriptConsts = []string{
		"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
		"Zzzz",
	}

	// regionConsts lists the region tags.
	regionConsts = []string{
		"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
		"ZZ", "XA", "XC",
		"XK", // Unofficial tag for Kosovo.
	}
)
| 109 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 110 | func (b *builder) writeConstants() { |
| 111 | b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) |
| 112 | b.writeConsts(b.regionIndex, regionConsts...) |
| 113 | b.writeConsts(b.scriptIndex, scriptConsts...) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 114 | } |
| 115 | |
// mutualIntelligibility is one entry of the matchLang table: a desired/
// supported base-language pair together with its distance.
type mutualIntelligibility struct {
	want     uint16 // index of the desired language
	have     uint16 // index of the supported language
	distance uint8
	oneway   bool // true if the entry only applies from want to have
}
| 121 | |
// scriptIntelligibility is one entry of the matchScript table: a desired and
// a supported language-script pair together with their distance.
type scriptIntelligibility struct {
	wantLang   uint16
	haveLang   uint16
	wantScript uint8
	haveScript uint8
	distance   uint8
	// Always oneway
}
| 128 | |
// regionIntelligibility is one entry of the matchRegion table: a distance
// that applies to a language, optionally narrowed to a script and to a
// region group (or its inverse).
type regionIntelligibility struct {
	lang     uint16 // compact language id
	script   uint8  // 0 means any
	group    uint8  // 0 means any; if bit 7 is set it means inverse
	distance uint8
	// Always twoway.
}
| 136 | |
| 137 | // writeMatchData writes tables with languages and scripts for which there is |
| 138 | // mutual intelligibility. The data is based on CLDR's languageMatching data. |
| 139 | // Note that we use a different algorithm than the one defined by CLDR and that |
| 140 | // we slightly modify the data. For example, we convert scores to confidence levels. |
| 141 | // We also drop all region-related data as we use a different algorithm to |
| 142 | // determine region equivalence. |
| 143 | func (b *builder) writeMatchData() { |
| 144 | lm := b.supp.LanguageMatching.LanguageMatches |
| 145 | cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") |
| 146 | |
| 147 | regionHierarchy := map[string][]string{} |
| 148 | for _, g := range b.supp.TerritoryContainment.Group { |
| 149 | regions := strings.Split(g.Contains, " ") |
| 150 | regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) |
| 151 | } |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 152 | regionToGroups := make([]uint8, language.NumRegions) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 153 | |
| 154 | idToIndex := map[string]uint8{} |
| 155 | for i, mv := range lm[0].MatchVariable { |
| 156 | if i > 6 { |
| 157 | log.Fatalf("Too many groups: %d", i) |
| 158 | } |
| 159 | idToIndex[mv.Id] = uint8(i + 1) |
| 160 | // TODO: also handle '-' |
| 161 | for _, r := range strings.Split(mv.Value, "+") { |
| 162 | todo := []string{r} |
| 163 | for k := 0; k < len(todo); k++ { |
| 164 | r := todo[k] |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 165 | regionToGroups[b.regionIndex(r)] |= 1 << uint8(i) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 166 | todo = append(todo, regionHierarchy[r]...) |
| 167 | } |
| 168 | } |
| 169 | } |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 170 | b.w.WriteVar("regionToGroups", regionToGroups) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 171 | |
| 172 | // maps language id to in- and out-of-group region. |
| 173 | paradigmLocales := [][3]uint16{} |
| 174 | locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") |
| 175 | for i := 0; i < len(locales); i += 2 { |
| 176 | x := [3]uint16{} |
| 177 | for j := 0; j < 2; j++ { |
| 178 | pc := strings.SplitN(locales[i+j], "-", 2) |
| 179 | x[0] = b.langIndex(pc[0]) |
| 180 | if len(pc) == 2 { |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 181 | x[1+j] = uint16(b.regionIndex(pc[1])) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 182 | } |
| 183 | } |
| 184 | paradigmLocales = append(paradigmLocales, x) |
| 185 | } |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 186 | b.w.WriteVar("paradigmLocales", paradigmLocales) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 187 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 188 | b.w.WriteType(mutualIntelligibility{}) |
| 189 | b.w.WriteType(scriptIntelligibility{}) |
| 190 | b.w.WriteType(regionIntelligibility{}) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 191 | |
| 192 | matchLang := []mutualIntelligibility{} |
| 193 | matchScript := []scriptIntelligibility{} |
| 194 | matchRegion := []regionIntelligibility{} |
| 195 | // Convert the languageMatch entries in lists keyed by desired language. |
| 196 | for _, m := range lm[0].LanguageMatch { |
| 197 | // Different versions of CLDR use different separators. |
| 198 | desired := strings.Replace(m.Desired, "-", "_", -1) |
| 199 | supported := strings.Replace(m.Supported, "-", "_", -1) |
| 200 | d := strings.Split(desired, "_") |
| 201 | s := strings.Split(supported, "_") |
| 202 | if len(d) != len(s) { |
| 203 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 204 | continue |
| 205 | } |
| 206 | distance, _ := strconv.ParseInt(m.Distance, 10, 8) |
| 207 | switch len(d) { |
| 208 | case 2: |
| 209 | if desired == supported && desired == "*_*" { |
| 210 | continue |
| 211 | } |
| 212 | // language-script pair. |
| 213 | matchScript = append(matchScript, scriptIntelligibility{ |
| 214 | wantLang: uint16(b.langIndex(d[0])), |
| 215 | haveLang: uint16(b.langIndex(s[0])), |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 216 | wantScript: uint8(b.scriptIndex(d[1])), |
| 217 | haveScript: uint8(b.scriptIndex(s[1])), |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 218 | distance: uint8(distance), |
| 219 | }) |
| 220 | if m.Oneway != "true" { |
| 221 | matchScript = append(matchScript, scriptIntelligibility{ |
| 222 | wantLang: uint16(b.langIndex(s[0])), |
| 223 | haveLang: uint16(b.langIndex(d[0])), |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 224 | wantScript: uint8(b.scriptIndex(s[1])), |
| 225 | haveScript: uint8(b.scriptIndex(d[1])), |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 226 | distance: uint8(distance), |
| 227 | }) |
| 228 | } |
| 229 | case 1: |
| 230 | if desired == supported && desired == "*" { |
| 231 | continue |
| 232 | } |
| 233 | if distance == 1 { |
| 234 | // nb == no is already handled by macro mapping. Check there |
| 235 | // really is only this case. |
| 236 | if d[0] != "no" || s[0] != "nb" { |
| 237 | log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) |
| 238 | } |
| 239 | continue |
| 240 | } |
| 241 | // TODO: consider dropping oneway field and just doubling the entry. |
| 242 | matchLang = append(matchLang, mutualIntelligibility{ |
| 243 | want: uint16(b.langIndex(d[0])), |
| 244 | have: uint16(b.langIndex(s[0])), |
| 245 | distance: uint8(distance), |
| 246 | oneway: m.Oneway == "true", |
| 247 | }) |
| 248 | case 3: |
| 249 | if desired == supported && desired == "*_*_*" { |
| 250 | continue |
| 251 | } |
| 252 | if desired != supported { |
| 253 | // This is now supported by CLDR, but only one case, which |
| 254 | // should already be covered by paradigm locales. For instance, |
| 255 | // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in |
| 256 | // testdata/CLDRLocaleMatcherTest.txt tests this. |
| 257 | if supported != "en_*_GB" { |
| 258 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 259 | } |
| 260 | continue |
| 261 | } |
| 262 | ri := regionIntelligibility{ |
| 263 | lang: b.langIndex(d[0]), |
| 264 | distance: uint8(distance), |
| 265 | } |
| 266 | if d[1] != "*" { |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 267 | ri.script = uint8(b.scriptIndex(d[1])) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 268 | } |
| 269 | switch { |
| 270 | case d[2] == "*": |
| 271 | ri.group = 0x80 // not contained in anything |
| 272 | case strings.HasPrefix(d[2], "$!"): |
| 273 | ri.group = 0x80 |
| 274 | d[2] = "$" + d[2][len("$!"):] |
| 275 | fallthrough |
| 276 | case strings.HasPrefix(d[2], "$"): |
| 277 | ri.group |= idToIndex[d[2]] |
| 278 | } |
| 279 | matchRegion = append(matchRegion, ri) |
| 280 | default: |
| 281 | log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) |
| 282 | } |
| 283 | } |
| 284 | sort.SliceStable(matchLang, func(i, j int) bool { |
| 285 | return matchLang[i].distance < matchLang[j].distance |
| 286 | }) |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 287 | b.w.WriteComment(` |
| 288 | matchLang holds pairs of langIDs of base languages that are typically |
| 289 | mutually intelligible. Each pair is associated with a confidence and |
| 290 | whether the intelligibility goes one or both ways.`) |
| 291 | b.w.WriteVar("matchLang", matchLang) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 292 | |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 293 | b.w.WriteComment(` |
| 294 | matchScript holds pairs of scriptIDs where readers of one script |
| 295 | can typically also read the other. Each is associated with a confidence.`) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 296 | sort.SliceStable(matchScript, func(i, j int) bool { |
| 297 | return matchScript[i].distance < matchScript[j].distance |
| 298 | }) |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 299 | b.w.WriteVar("matchScript", matchScript) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 300 | |
| 301 | sort.SliceStable(matchRegion, func(i, j int) bool { |
| 302 | return matchRegion[i].distance < matchRegion[j].distance |
| 303 | }) |
Scott Baker | 8461e15 | 2019-10-01 14:44:30 -0700 | [diff] [blame] | 304 | b.w.WriteVar("matchRegion", matchRegion) |
khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 305 | } |