blob: 3004eb42c11adbcb4a57f5377ad6767eea0f4e7d [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Language tag table generator.
8// Data read from the web.
9
10package main
11
12import (
khenaidooac637102019-01-14 15:44:34 -050013 "flag"
14 "fmt"
15 "io"
khenaidooac637102019-01-14 15:44:34 -050016 "log"
khenaidooac637102019-01-14 15:44:34 -050017 "sort"
18 "strconv"
19 "strings"
20
21 "golang.org/x/text/internal/gen"
Scott Baker8461e152019-10-01 14:44:30 -070022 "golang.org/x/text/internal/language"
khenaidooac637102019-01-14 15:44:34 -050023 "golang.org/x/text/unicode/cldr"
24)
25
26var (
27 test = flag.Bool("test",
28 false,
29 "test existing tables; can be used to compare web data with package data.")
30 outputFile = flag.String("output",
31 "tables.go",
32 "output file for generated tables")
33)
34
Scott Baker8461e152019-10-01 14:44:30 -070035func main() {
36 gen.Init()
khenaidooac637102019-01-14 15:44:34 -050037
Scott Baker8461e152019-10-01 14:44:30 -070038 w := gen.NewCodeWriter()
39 defer w.WriteGoFile("tables.go", "language")
khenaidooac637102019-01-14 15:44:34 -050040
Scott Baker8461e152019-10-01 14:44:30 -070041 b := newBuilder(w)
42 gen.WriteCLDRVersion(w)
khenaidooac637102019-01-14 15:44:34 -050043
Scott Baker8461e152019-10-01 14:44:30 -070044 b.writeConstants()
45 b.writeMatchData()
khenaidooac637102019-01-14 15:44:34 -050046}
47
48type builder struct {
49 w *gen.CodeWriter
50 hw io.Writer // MultiWriter for w and w.Hash
51 data *cldr.CLDR
52 supp *cldr.SupplementalData
khenaidooac637102019-01-14 15:44:34 -050053}
54
Scott Baker8461e152019-10-01 14:44:30 -070055func (b *builder) langIndex(s string) uint16 {
56 return uint16(language.MustParseBase(s))
57}
58
59func (b *builder) regionIndex(s string) int {
60 return int(language.MustParseRegion(s))
61}
62
63func (b *builder) scriptIndex(s string) int {
64 return int(language.MustParseScript(s))
65}
khenaidooac637102019-01-14 15:44:34 -050066
67func newBuilder(w *gen.CodeWriter) *builder {
68 r := gen.OpenCLDRCoreZip()
69 defer r.Close()
70 d := &cldr.Decoder{}
71 data, err := d.DecodeZip(r)
Scott Baker8461e152019-10-01 14:44:30 -070072 if err != nil {
73 log.Fatal(err)
74 }
khenaidooac637102019-01-14 15:44:34 -050075 b := builder{
76 w: w,
77 hw: io.MultiWriter(w, w.Hash),
78 data: data,
79 supp: data.Supplemental(),
80 }
khenaidooac637102019-01-14 15:44:34 -050081 return &b
82}
83
khenaidooac637102019-01-14 15:44:34 -050084// writeConsts computes f(v) for all v in values and writes the results
85// as constants named _v to a single constant block.
86func (b *builder) writeConsts(f func(string) int, values ...string) {
Scott Baker8461e152019-10-01 14:44:30 -070087 fmt.Fprintln(b.w, "const (")
khenaidooac637102019-01-14 15:44:34 -050088 for _, v := range values {
Scott Baker8461e152019-10-01 14:44:30 -070089 fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
khenaidooac637102019-01-14 15:44:34 -050090 }
Scott Baker8461e152019-10-01 14:44:30 -070091 fmt.Fprintln(b.w, ")")
khenaidooac637102019-01-14 15:44:34 -050092}
93
// TODO: region inclusion data will probably not be used in future matchers.
95
// langConsts lists the base languages for which writeConstants emits a
// constant named _<tag>.
var langConsts = []string{
	"de", "en", "fr", "it",
	"mo", "no", "nb", "pt",
	"sh", "mul", "und",
}
99
// scriptConsts lists the scripts for which writeConstants emits a constant
// named _<tag>.
var scriptConsts = []string{
	"Latn",
	"Hani", "Hans", "Hant",
	"Qaaa", "Qaai", "Qabx",
	"Zinh", "Zyyy", "Zzzz",
}
104
// regionConsts lists the regions for which writeConstants emits a constant
// named _<tag>.
var regionConsts = []string{
	"001", "419",
	"BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC",
	"XK", // Unofficial tag for Kosovo.
}
109
Scott Baker8461e152019-10-01 14:44:30 -0700110func (b *builder) writeConstants() {
111 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
112 b.writeConsts(b.regionIndex, regionConsts...)
113 b.writeConsts(b.scriptIndex, scriptConsts...)
khenaidooac637102019-01-14 15:44:34 -0500114}
115
// mutualIntelligibility is the element type of the generated matchLang
// table: a desired/supported pair of base languages and their distance.
type mutualIntelligibility struct {
	want     uint16 // index of the desired language
	have     uint16 // index of the supported language
	distance uint8
	oneway   bool // set when the match only applies from want to have
}
121
// scriptIntelligibility is the element type of the generated matchScript
// table: a desired/supported language-script pair and their distance.
// Entries are always oneway.
type scriptIntelligibility struct {
	wantLang   uint16
	haveLang   uint16
	wantScript uint8
	haveScript uint8
	distance   uint8
}
128
// regionIntelligibility is the element type of the generated matchRegion
// table: the distance that applies to a language, script and region group.
// Entries are always twoway.
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script index; 0 means any.
	script uint8
	// group is the region group; 0 means any. If bit 7 is set it means
	// inverse.
	group    uint8
	distance uint8
}
136
137// writeMatchData writes tables with languages and scripts for which there is
138// mutual intelligibility. The data is based on CLDR's languageMatching data.
139// Note that we use a different algorithm than the one defined by CLDR and that
140// we slightly modify the data. For example, we convert scores to confidence levels.
141// We also drop all region-related data as we use a different algorithm to
142// determine region equivalence.
143func (b *builder) writeMatchData() {
144 lm := b.supp.LanguageMatching.LanguageMatches
145 cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
146
147 regionHierarchy := map[string][]string{}
148 for _, g := range b.supp.TerritoryContainment.Group {
149 regions := strings.Split(g.Contains, " ")
150 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
151 }
Scott Baker8461e152019-10-01 14:44:30 -0700152 regionToGroups := make([]uint8, language.NumRegions)
khenaidooac637102019-01-14 15:44:34 -0500153
154 idToIndex := map[string]uint8{}
155 for i, mv := range lm[0].MatchVariable {
156 if i > 6 {
157 log.Fatalf("Too many groups: %d", i)
158 }
159 idToIndex[mv.Id] = uint8(i + 1)
160 // TODO: also handle '-'
161 for _, r := range strings.Split(mv.Value, "+") {
162 todo := []string{r}
163 for k := 0; k < len(todo); k++ {
164 r := todo[k]
Scott Baker8461e152019-10-01 14:44:30 -0700165 regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
khenaidooac637102019-01-14 15:44:34 -0500166 todo = append(todo, regionHierarchy[r]...)
167 }
168 }
169 }
Scott Baker8461e152019-10-01 14:44:30 -0700170 b.w.WriteVar("regionToGroups", regionToGroups)
khenaidooac637102019-01-14 15:44:34 -0500171
172 // maps language id to in- and out-of-group region.
173 paradigmLocales := [][3]uint16{}
174 locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
175 for i := 0; i < len(locales); i += 2 {
176 x := [3]uint16{}
177 for j := 0; j < 2; j++ {
178 pc := strings.SplitN(locales[i+j], "-", 2)
179 x[0] = b.langIndex(pc[0])
180 if len(pc) == 2 {
Scott Baker8461e152019-10-01 14:44:30 -0700181 x[1+j] = uint16(b.regionIndex(pc[1]))
khenaidooac637102019-01-14 15:44:34 -0500182 }
183 }
184 paradigmLocales = append(paradigmLocales, x)
185 }
Scott Baker8461e152019-10-01 14:44:30 -0700186 b.w.WriteVar("paradigmLocales", paradigmLocales)
khenaidooac637102019-01-14 15:44:34 -0500187
Scott Baker8461e152019-10-01 14:44:30 -0700188 b.w.WriteType(mutualIntelligibility{})
189 b.w.WriteType(scriptIntelligibility{})
190 b.w.WriteType(regionIntelligibility{})
khenaidooac637102019-01-14 15:44:34 -0500191
192 matchLang := []mutualIntelligibility{}
193 matchScript := []scriptIntelligibility{}
194 matchRegion := []regionIntelligibility{}
195 // Convert the languageMatch entries in lists keyed by desired language.
196 for _, m := range lm[0].LanguageMatch {
197 // Different versions of CLDR use different separators.
198 desired := strings.Replace(m.Desired, "-", "_", -1)
199 supported := strings.Replace(m.Supported, "-", "_", -1)
200 d := strings.Split(desired, "_")
201 s := strings.Split(supported, "_")
202 if len(d) != len(s) {
203 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
204 continue
205 }
206 distance, _ := strconv.ParseInt(m.Distance, 10, 8)
207 switch len(d) {
208 case 2:
209 if desired == supported && desired == "*_*" {
210 continue
211 }
212 // language-script pair.
213 matchScript = append(matchScript, scriptIntelligibility{
214 wantLang: uint16(b.langIndex(d[0])),
215 haveLang: uint16(b.langIndex(s[0])),
Scott Baker8461e152019-10-01 14:44:30 -0700216 wantScript: uint8(b.scriptIndex(d[1])),
217 haveScript: uint8(b.scriptIndex(s[1])),
khenaidooac637102019-01-14 15:44:34 -0500218 distance: uint8(distance),
219 })
220 if m.Oneway != "true" {
221 matchScript = append(matchScript, scriptIntelligibility{
222 wantLang: uint16(b.langIndex(s[0])),
223 haveLang: uint16(b.langIndex(d[0])),
Scott Baker8461e152019-10-01 14:44:30 -0700224 wantScript: uint8(b.scriptIndex(s[1])),
225 haveScript: uint8(b.scriptIndex(d[1])),
khenaidooac637102019-01-14 15:44:34 -0500226 distance: uint8(distance),
227 })
228 }
229 case 1:
230 if desired == supported && desired == "*" {
231 continue
232 }
233 if distance == 1 {
234 // nb == no is already handled by macro mapping. Check there
235 // really is only this case.
236 if d[0] != "no" || s[0] != "nb" {
237 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
238 }
239 continue
240 }
241 // TODO: consider dropping oneway field and just doubling the entry.
242 matchLang = append(matchLang, mutualIntelligibility{
243 want: uint16(b.langIndex(d[0])),
244 have: uint16(b.langIndex(s[0])),
245 distance: uint8(distance),
246 oneway: m.Oneway == "true",
247 })
248 case 3:
249 if desired == supported && desired == "*_*_*" {
250 continue
251 }
252 if desired != supported {
253 // This is now supported by CLDR, but only one case, which
254 // should already be covered by paradigm locales. For instance,
255 // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
256 // testdata/CLDRLocaleMatcherTest.txt tests this.
257 if supported != "en_*_GB" {
258 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
259 }
260 continue
261 }
262 ri := regionIntelligibility{
263 lang: b.langIndex(d[0]),
264 distance: uint8(distance),
265 }
266 if d[1] != "*" {
Scott Baker8461e152019-10-01 14:44:30 -0700267 ri.script = uint8(b.scriptIndex(d[1]))
khenaidooac637102019-01-14 15:44:34 -0500268 }
269 switch {
270 case d[2] == "*":
271 ri.group = 0x80 // not contained in anything
272 case strings.HasPrefix(d[2], "$!"):
273 ri.group = 0x80
274 d[2] = "$" + d[2][len("$!"):]
275 fallthrough
276 case strings.HasPrefix(d[2], "$"):
277 ri.group |= idToIndex[d[2]]
278 }
279 matchRegion = append(matchRegion, ri)
280 default:
281 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
282 }
283 }
284 sort.SliceStable(matchLang, func(i, j int) bool {
285 return matchLang[i].distance < matchLang[j].distance
286 })
Scott Baker8461e152019-10-01 14:44:30 -0700287 b.w.WriteComment(`
288 matchLang holds pairs of langIDs of base languages that are typically
289 mutually intelligible. Each pair is associated with a confidence and
290 whether the intelligibility goes one or both ways.`)
291 b.w.WriteVar("matchLang", matchLang)
khenaidooac637102019-01-14 15:44:34 -0500292
Scott Baker8461e152019-10-01 14:44:30 -0700293 b.w.WriteComment(`
294 matchScript holds pairs of scriptIDs where readers of one script
295 can typically also read the other. Each is associated with a confidence.`)
khenaidooac637102019-01-14 15:44:34 -0500296 sort.SliceStable(matchScript, func(i, j int) bool {
297 return matchScript[i].distance < matchScript[j].distance
298 })
Scott Baker8461e152019-10-01 14:44:30 -0700299 b.w.WriteVar("matchScript", matchScript)
khenaidooac637102019-01-14 15:44:34 -0500300
301 sort.SliceStable(matchRegion, func(i, j int) bool {
302 return matchRegion[i].distance < matchRegion[j].distance
303 })
Scott Baker8461e152019-10-01 14:44:30 -0700304 b.w.WriteVar("matchRegion", matchRegion)
khenaidooac637102019-01-14 15:44:34 -0500305}