blob: b4c835ed4d990c9e5b3c13efaaac953cf86d494a [file] [log] [blame]
Scott Bakereee8dd82019-09-24 12:52:34 -07001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Collation table generator.
8// Data read from the web.
9
10package main
11
12import (
13 "archive/zip"
14 "bufio"
15 "bytes"
16 "flag"
17 "fmt"
18 "io"
19 "io/ioutil"
20 "log"
21 "os"
22 "regexp"
23 "sort"
24 "strconv"
25 "strings"
26 "unicode/utf8"
27
28 "golang.org/x/text/collate"
29 "golang.org/x/text/collate/build"
30 "golang.org/x/text/internal/colltab"
31 "golang.org/x/text/internal/gen"
32 "golang.org/x/text/language"
33 "golang.org/x/text/unicode/cldr"
34)
35
36var (
37 test = flag.Bool("test", false,
38 "test existing tables; can be used to compare web data with package data.")
39 short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
40 draft = flag.Bool("draft", false, `Use draft versions, when available.`)
41 tags = flag.String("tags", "", "build tags to be included after +build directive")
42 pkg = flag.String("package", "collate",
43 "the name of the package in which the generated file is to be included")
44
45 tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
46 "comma-spearated list of tables to generate.")
47 exclude = flagStringSet("exclude", "zh2", "",
48 "comma-separated list of languages to exclude.")
49 include = flagStringSet("include", "", "",
50 "comma-separated list of languages to include. Include trumps exclude.")
51 // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
52 // TODO: Not included: traditional (buggy for Bengali)
53 types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
54 "comma-separated list of types that should be included.")
55)
56
57// stringSet implements an ordered set based on a list. It implements flag.Value
58// to allow a set to be specified as a comma-separated list.
59type stringSet struct {
60 s []string
61 allowed *stringSet
62 dirty bool // needs compaction if true
63 all bool
64 allowAll bool
65}
66
67func flagStringSet(name, def, allowed, usage string) *stringSet {
68 ss := &stringSet{}
69 if allowed != "" {
70 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
71 ss.allowed = &stringSet{}
72 failOnError(ss.allowed.Set(allowed))
73 }
74 ss.Set(def)
75 flag.Var(ss, name, usage)
76 return ss
77}
78
79func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
80 ss := &stringSet{allowAll: true}
81 if allowed == "" {
82 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
83 } else {
84 ss.allowed = &stringSet{}
85 failOnError(ss.allowed.Set(allowed))
86 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
87 }
88 ss.Set(def)
89 return ss
90}
91
92func (ss stringSet) Len() int {
93 return len(ss.s)
94}
95
96func (ss stringSet) String() string {
97 return strings.Join(ss.s, ",")
98}
99
100func (ss *stringSet) Set(s string) error {
101 if ss.allowAll && s == "all" {
102 ss.s = nil
103 ss.all = true
104 return nil
105 }
106 ss.s = ss.s[:0]
107 for _, s := range strings.Split(s, ",") {
108 if s := strings.TrimSpace(s); s != "" {
109 if ss.allowed != nil && !ss.allowed.contains(s) {
110 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
111 }
112 ss.add(s)
113 }
114 }
115 ss.compact()
116 return nil
117}
118
119func (ss *stringSet) add(s string) {
120 ss.s = append(ss.s, s)
121 ss.dirty = true
122}
123
124func (ss *stringSet) values() []string {
125 ss.compact()
126 return ss.s
127}
128
129func (ss *stringSet) contains(s string) bool {
130 if ss.all {
131 return true
132 }
133 for _, v := range ss.s {
134 if v == s {
135 return true
136 }
137 }
138 return false
139}
140
141func (ss *stringSet) compact() {
142 if !ss.dirty {
143 return
144 }
145 a := ss.s
146 sort.Strings(a)
147 k := 0
148 for i := 1; i < len(a); i++ {
149 if a[k] != a[i] {
150 a[k+1] = a[i]
151 k++
152 }
153 }
154 ss.s = a[:k+1]
155 ss.dirty = false
156}
157
158func skipLang(l string) bool {
159 if include.Len() > 0 {
160 return !include.contains(l)
161 }
162 return exclude.contains(l)
163}
164
165// altInclude returns a list of alternatives (for the LDML alt attribute)
166// in order of preference. An empty string in this list indicates the
167// default entry.
168func altInclude() []string {
169 l := []string{}
170 if *short {
171 l = append(l, "short")
172 }
173 l = append(l, "")
174 // TODO: handle draft using cldr.SetDraftLevel
175 if *draft {
176 l = append(l, "proposed")
177 }
178 return l
179}
180
// failOnError logs e and panics if e is non-nil; a nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
186
187func openArchive() *zip.Reader {
188 f := gen.OpenCLDRCoreZip()
189 buffer, err := ioutil.ReadAll(f)
190 f.Close()
191 failOnError(err)
192 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
193 failOnError(err)
194 return archive
195}
196
197// parseUCA parses a Default Unicode Collation Element Table of the format
198// specified in http://www.unicode.org/reports/tr10/#File_Format.
199// It returns the variable top.
200func parseUCA(builder *build.Builder) {
201 var r io.ReadCloser
202 var err error
203 for _, f := range openArchive().File {
204 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
205 r, err = f.Open()
206 }
207 }
208 if r == nil {
209 log.Fatal("File allkeys_CLDR.txt not found in archive.")
210 }
211 failOnError(err)
212 defer r.Close()
213 scanner := bufio.NewScanner(r)
214 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
215 for i := 1; scanner.Scan(); i++ {
216 line := scanner.Text()
217 if len(line) == 0 || line[0] == '#' {
218 continue
219 }
220 if line[0] == '@' {
221 // parse properties
222 switch {
223 case strings.HasPrefix(line[1:], "version "):
224 a := strings.Split(line[1:], " ")
225 if a[1] != gen.UnicodeVersion() {
226 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
227 }
228 case strings.HasPrefix(line[1:], "backwards "):
229 log.Fatalf("%d: unsupported option backwards", i)
230 default:
231 log.Printf("%d: unknown option %s", i, line[1:])
232 }
233 } else {
234 // parse entries
235 part := strings.Split(line, " ; ")
236 if len(part) != 2 {
237 log.Fatalf("%d: production rule without ';': %v", i, line)
238 }
239 lhs := []rune{}
240 for _, v := range strings.Split(part[0], " ") {
241 if v == "" {
242 continue
243 }
244 lhs = append(lhs, rune(convHex(i, v)))
245 }
246 var n int
247 var vars []int
248 rhs := [][]int{}
249 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
250 n += len(m[0])
251 elem := []int{}
252 for _, h := range strings.Split(m[2], ".") {
253 elem = append(elem, convHex(i, h))
254 }
255 if m[1] == "*" {
256 vars = append(vars, i)
257 }
258 rhs = append(rhs, elem)
259 }
260 if len(part[1]) < n+3 || part[1][n+1] != '#' {
261 log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
262 }
263 if *test {
264 testInput.add(string(lhs))
265 }
266 failOnError(builder.Add(lhs, rhs, vars))
267 }
268 }
269 if scanner.Err() != nil {
270 log.Fatal(scanner.Err())
271 }
272}
273
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// it. On failure it terminates the program, prefixing the message with line.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v)
}
281
282var testInput = stringSet{}
283
284var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
285var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
286
287var mainLocales = []string{}
288
// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

// fprint writes p to w as the body of an [exN]string composite literal.
// Each non-empty category is written as an indexed element holding its
// characters joined by single spaces; empty categories are omitted.
func (p charSets) fprint(w io.Writer) {
	// The position of a category in this list is the index it is written
	// with in the generated literal.
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx, cat := range categories {
		chars := p[cat]
		if len(chars) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(chars, " "))
	}
	fmt.Fprintln(w, "\t},")
}
301
// localeChars maps a locale name to its exemplar character sets. It is
// filled in by parseMain and read by printExemplarCharacters.
var localeChars = make(map[string]charSets)

// exemplarHeader is Go source text that printExemplarCharacters writes ahead
// of the exemplar table. The order of its constants corresponds to the
// category order used by charSets.fprint, and exN is the array length used in
// the generated literals.
const exemplarHeader = `
type exemplarType int
const (
	exCharacters exemplarType = iota
	exContractions
	exPunctuation
	exAuxiliary
	exCurrency
	exIndex
	exN
)
`
316
317func printExemplarCharacters(w io.Writer) {
318 fmt.Fprintln(w, exemplarHeader)
319 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
320 for _, loc := range mainLocales {
321 fmt.Fprintf(w, "\t%q: ", loc)
322 localeChars[loc].fprint(w)
323 }
324 fmt.Fprintln(w, "}")
325}
326
327func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
328 r := gen.OpenCLDRCoreZip()
329 data, err := d.DecodeZip(r)
330 failOnError(err)
331 return data
332}
333
334// parseMain parses XML files in the main directory of the CLDR core.zip file.
335func parseMain() {
336 d := &cldr.Decoder{}
337 d.SetDirFilter("main")
338 d.SetSectionFilter("characters")
339 data := decodeCLDR(d)
340 for _, loc := range data.Locales() {
341 x := data.RawLDML(loc)
342 if skipLang(x.Identity.Language.Type) {
343 continue
344 }
345 if x.Characters != nil {
346 x, _ = data.LDML(loc)
347 loc = language.Make(loc).String()
348 for _, ec := range x.Characters.ExemplarCharacters {
349 if ec.Draft != "" {
350 continue
351 }
352 if _, ok := localeChars[loc]; !ok {
353 mainLocales = append(mainLocales, loc)
354 localeChars[loc] = make(charSets)
355 }
356 localeChars[loc][ec.Type] = parseCharacters(ec.Data())
357 }
358 }
359 }
360}
361
// parseCharacters expands an exemplar character set such as "[a-c {ch} \-]"
// into the list of strings it contains. One pair of enclosing square brackets
// is removed if present. Within the set:
//   - items are separated by spaces;
//   - {xyz} denotes the multi-character sequence "xyz";
//   - x-y denotes every character from x through y, inclusive;
//   - a backslash makes the following character literal.
//
// Empty or all-space input yields an empty list. A dangling backslash, a '-'
// without a character on both sides, or a space inside a sequence is fatal.
func parseCharacters(chars string) []string {
	// parseSingle consumes one, possibly backslash-escaped, rune from the
	// front of s, which must not be empty.
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s[0] == '\\' {
			if len(s) == 1 {
				log.Fatalf("dangling escape character in %q", s)
			}
			// Decode the escaped character as a full rune rather than
			// as a single byte.
			s, escaped = s[1:], true
		}
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], escaped
	}

	chars = strings.TrimSpace(chars)
	if n := len(chars) - 1; n > 0 && chars[0] == '[' && chars[n] == ']' {
		chars = chars[1:n]
	}
	list := []string{}
	// last is the most recent single character, the possible start of a
	// range, or 0 if a range cannot start at this point.
	var last rune
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			var buf []rune
			for chars = chars[1:]; len(chars) > 0; {
				var r rune
				r, chars, _ = parseSingle(chars)
				if r == '}' {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				buf = append(buf, r)
			}
			list = append(list, string(buf))
			last = 0
			continue
		}

		// single character
		r, rest, escaped := parseSingle(chars)
		chars = rest
		switch {
		case r == ' ':
			// Separator; leaves last untouched.
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if len(chars) == 0 {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The start of the range was already added when it was read,
			// so continue with the character after it.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
413
// fileRe matches the path of an XML file in a collation directory and
// captures the file name without its extension.
// NOTE(review): not referenced in the visible part of this file — confirm
// whether it is still needed.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

// typeMap translates legacy type keys to their BCP47 equivalent.
// parseCollation consults it only for type names longer than 8 characters.
var typeMap = map[string]string{
	"phonebook":   "phonebk",
	"traditional": "trad",
}
421
422// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
423func parseCollation(b *build.Builder) {
424 d := &cldr.Decoder{}
425 d.SetDirFilter("collation")
426 data := decodeCLDR(d)
427 for _, loc := range data.Locales() {
428 x, err := data.LDML(loc)
429 failOnError(err)
430 if skipLang(x.Identity.Language.Type) {
431 continue
432 }
433 cs := x.Collations.Collation
434 sl := cldr.MakeSlice(&cs)
435 if len(types.s) == 0 {
436 sl.SelectAnyOf("type", x.Collations.Default())
437 } else if !types.all {
438 sl.SelectAnyOf("type", types.s...)
439 }
440 sl.SelectOnePerGroup("alt", altInclude())
441
442 for _, c := range cs {
443 id, err := language.Parse(loc)
444 if err != nil {
445 fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
446 continue
447 }
448 // Support both old- and new-style defaults.
449 d := c.Type
450 if x.Collations.DefaultCollation == nil {
451 d = x.Collations.Default()
452 } else {
453 d = x.Collations.DefaultCollation.Data()
454 }
455 // We assume tables are being built either for search or collation,
456 // but not both. For search the default is always "search".
457 if d != c.Type && c.Type != "search" {
458 typ := c.Type
459 if len(c.Type) > 8 {
460 typ = typeMap[c.Type]
461 }
462 id, err = id.SetTypeForKey("co", typ)
463 failOnError(err)
464 }
465 t := b.Tailoring(id)
466 c.Process(processor{t})
467 }
468 }
469}
470
471type processor struct {
472 t *build.Tailoring
473}
474
475func (p processor) Reset(anchor string, before int) (err error) {
476 if before != 0 {
477 err = p.t.SetAnchorBefore(anchor)
478 } else {
479 err = p.t.SetAnchor(anchor)
480 }
481 failOnError(err)
482 return nil
483}
484
485func (p processor) Insert(level int, str, context, extend string) error {
486 str = context + str
487 if *test {
488 testInput.add(str)
489 }
490 // TODO: mimic bug in old maketables: remove.
491 err := p.t.Insert(colltab.Level(level-1), str, context+extend)
492 failOnError(err)
493 return nil
494}
495
496func (p processor) Index(id string) {
497}
498
499func testCollator(c *collate.Collator) {
500 c0 := collate.New(language.Und)
501
502 // iterator over all characters for all locales and check
503 // whether Key is equal.
504 buf := collate.Buffer{}
505
506 // Add all common and not too uncommon runes to the test set.
507 for i := rune(0); i < 0x30000; i++ {
508 testInput.add(string(i))
509 }
510 for i := rune(0xE0000); i < 0xF0000; i++ {
511 testInput.add(string(i))
512 }
513 for _, str := range testInput.values() {
514 k0 := c0.KeyFromString(&buf, str)
515 k := c.KeyFromString(&buf, str)
516 if !bytes.Equal(k0, k) {
517 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
518 }
519 buf.Reset()
520 }
521 fmt.Println("PASS")
522}
523
524func main() {
525 gen.Init()
526 b := build.NewBuilder()
527 parseUCA(b)
528 if tables.contains("chars") {
529 parseMain()
530 }
531 parseCollation(b)
532
533 c, err := b.Build()
534 failOnError(err)
535
536 if *test {
537 testCollator(collate.NewFromTable(c))
538 } else {
539 w := &bytes.Buffer{}
540
541 gen.WriteUnicodeVersion(w)
542 gen.WriteCLDRVersion(w)
543
544 if tables.contains("collate") {
545 _, err = b.Print(w)
546 failOnError(err)
547 }
548 if tables.contains("chars") {
549 printExemplarCharacters(w)
550 }
551 gen.WriteGoFile("tables.go", *pkg, w.Bytes())
552 }
553}