// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | // This tool generates types for the various XML formats of CLDR. |
| 8 | package main |
| 9 | |
| 10 | import ( |
| 11 | "archive/zip" |
| 12 | "bytes" |
| 13 | "encoding/xml" |
| 14 | "flag" |
| 15 | "fmt" |
| 16 | "io" |
| 17 | "io/ioutil" |
| 18 | "log" |
| 19 | "os" |
| 20 | "regexp" |
| 21 | "strings" |
| 22 | |
| 23 | "golang.org/x/text/internal/gen" |
| 24 | ) |
| 25 | |
// outputFile is the name of the generated Go file, taken from the -output
// flag (default "xml.go"). main passes it to gen.WriteGoFile.
var outputFile = flag.String("output", "xml.go", "output file name")
| 27 | |
| 28 | func main() { |
| 29 | flag.Parse() |
| 30 | |
| 31 | r := gen.OpenCLDRCoreZip() |
| 32 | buffer, err := ioutil.ReadAll(r) |
| 33 | if err != nil { |
| 34 | log.Fatal("Could not read zip file") |
| 35 | } |
| 36 | r.Close() |
| 37 | z, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) |
| 38 | if err != nil { |
| 39 | log.Fatalf("Could not read zip archive: %v", err) |
| 40 | } |
| 41 | |
| 42 | var buf bytes.Buffer |
| 43 | |
| 44 | version := gen.CLDRVersion() |
| 45 | |
| 46 | for _, dtd := range files { |
| 47 | for _, f := range z.File { |
| 48 | if strings.HasSuffix(f.Name, dtd.file+".dtd") { |
| 49 | r, err := f.Open() |
| 50 | failOnError(err) |
| 51 | |
| 52 | b := makeBuilder(&buf, dtd) |
| 53 | b.parseDTD(r) |
| 54 | b.resolve(b.index[dtd.top[0]]) |
| 55 | b.write() |
| 56 | if b.version != "" && version != b.version { |
| 57 | println(f.Name) |
| 58 | log.Fatalf("main: inconsistent versions: found %s; want %s", b.version, version) |
| 59 | } |
| 60 | break |
| 61 | } |
| 62 | } |
| 63 | } |
| 64 | fmt.Fprintln(&buf, "// Version is the version of CLDR from which the XML definitions are generated.") |
| 65 | fmt.Fprintf(&buf, "const Version = %q\n", version) |
| 66 | |
| 67 | gen.WriteGoFile(*outputFile, "cldr", buf.Bytes()) |
| 68 | } |
| 69 | |
// failOnError does nothing when err is nil. Otherwise it writes err's message
// to standard error, prefixed with the file and line of the caller, and
// terminates the program with exit status 1.
func failOnError(err error) {
	if err == nil {
		return
	}
	stderr := log.New(os.Stderr, "", log.Lshortfile)
	// A call depth of 2 attributes the message to failOnError's caller
	// rather than to this function.
	stderr.Output(2, err.Error())
	os.Exit(1)
}
| 76 | |
// dtd holds the configuration data per DTD type. One value describes which
// DTD file to read and how the generated Go types for it are shaped.
type dtd struct {
	file string   // base file name; main selects the zip entry ending in file+".dtd"
	root string   // Go name of the root XML element (used for the type of top[0])
	top  []string // create a different type for this section; top[0] is the element resolved first

	skipElem    []string // hard-coded or deprecated elements
	skipAttr    []string // attributes to exclude
	predefined  []string // hard-coded elements exist of the form <name>Elem
	forceRepeat []string // elements to make slices despite DTD
}
| 88 | |
// files lists the DTDs to process, in the order in which main emits their
// generated types.
var files = []dtd{
	{
		file: "ldmlBCP47",
		root: "LDMLBCP47",
		top:  []string{"ldmlBCP47"},
		skipElem: []string{
			"cldrVersion", // deprecated, not used
		},
	},
	{
		file: "ldmlSupplemental",
		root: "SupplementalData",
		top:  []string{"supplementalData"},
		skipElem: []string{
			"cldrVersion", // deprecated, not used
		},
		forceRepeat: []string{
			"plurals", // data defined in plurals.xml and ordinals.xml
		},
	},
	{
		file: "ldml",
		root: "LDML",
		top: []string{
			"ldml", "collation", "calendar", "timeZoneNames", "localeDisplayNames", "numbers",
		},
		skipElem: []string{
			"cp",       // not used anywhere
			"special",  // not used anywhere
			"fallback", // deprecated, not used
			"alias",    // in Common
			"default",  // in Common
		},
		skipAttr: []string{
			"hiraganaQuarternary", // typo in DTD, correct version included as well
		},
		predefined: []string{"rules"},
	},
}
| 128 | |
// comments maps an XML element name to the doc comment that write emits
// verbatim in front of the generated type for that element. Each value starts
// with a newline so that consecutive type declarations are separated by a
// blank line.
var comments = map[string]string{
	"ldmlBCP47": `
// LDMLBCP47 holds information on allowable values for various variables in LDML.
`,
	"supplementalData": `
// SupplementalData holds information relevant for internationalization
// and proper use of CLDR, but that is not contained in the locale hierarchy.
`,
	"ldml": `
// LDML is the top-level type for locale-specific data.
`,
	"collation": `
// Collation contains rules that specify a certain sort-order,
// as a tailoring of the root order.
// The parsed rules are obtained by passing a RuleProcessor to Collation's
// Process method.
`,
	"calendar": `
// Calendar specifies the fields used for formatting and parsing dates and times.
// The month and quarter names are identified numerically, starting at 1.
// The day (of the week) names are identified with short strings, since there is
// no universally-accepted numeric designation.
`,
	"dates": `
// Dates contains information regarding the format and parsing of dates and times.
`,
	"localeDisplayNames": `
// LocaleDisplayNames specifies localized display names for scripts, languages,
// countries, currencies, and variants.
`,
	"numbers": `
// Numbers supplies information for formatting and parsing numbers and currencies.
`,
}
| 163 | |
// element describes one XML element declared in a DTD. parseDTD fills in
// name, category, attr and signature; resolve fills in sub and sets resolved.
type element struct {
	name      string // XML element name
	category  string // elements contained by this element (raw content model from the <!ELEMENT> directive)
	signature string // category + attrKey* (parseDTD appends "=<attr name>+<attr type>" per kept attribute)

	attr []*attribute // attributes supported by this element.
	sub  []struct {   // parsed and evaluated sub elements of this element.
		e      *element
		repeat bool // true if the element needs to be a slice
	}

	resolved bool // prevent multiple resolutions of this element.
}
| 177 | |
// attribute describes one XML attribute taken from an <!ATTLIST> directive.
type attribute struct {
	name string   // attribute name (first capture group of reAttr)
	key  string   // text of the ATTLIST directive remaining after the keyword and element name
	list []string // tokens reToken finds in the parenthesized value list, if any

	tag string // Go tag
}
| 185 | |
// Regular expressions used by parseDTD to pick apart DTD directives.
var (
	// reHead captures the directive keyword (group 1) and the element name
	// (group 2) at the start of a directive.
	reHead = regexp.MustCompile(` *(\w+) +([\w\-]+)`)
	// reAttr captures, for an ATTLIST body: the attribute name (1), its type
	// as a single word (2) or as a parenthesized value list (3), the keyword
	// following '#' (4), a quoted value after that keyword (5) and a trailing
	// quoted default (6).
	// NOTE(review): `[\.\d+]` is a character class matching exactly one
	// character, so group 5 only captures single-character values; possibly
	// `[\.\d]+` was intended — confirm before changing, since parseDTD stores
	// group 5 as the builder version.
	reAttr = regexp.MustCompile(` *(\w+) *(?:(\w+)|\(([\w\- \|]+)\)) *(?:#([A-Z]*) *(?:\"([\.\d+])\")?)? *("[\w\-:]*")?`)
	// reElem matches the whole body of an ELEMENT directive: EMPTY, ANY, or a
	// parenthesized content model with an optional trailing *, + or ?.
	reElem = regexp.MustCompile(`^ *(EMPTY|ANY|\(.*\)[\*\+\?]?) *$`)
	// reToken is applied to the parenthesized value list of an attribute.
	// NOTE(review): as written it matches one word character followed by a
	// '-'; possibly `[\w\-]+` was intended — confirm.
	reToken = regexp.MustCompile(`\w\-`)
)
| 192 | |
// builder is used to read in the DTD files from CLDR and generate Go code
// to be used with the encoding/xml package.
type builder struct {
	w       io.Writer           // destination for the generated Go code
	index   map[string]*element // elements declared in the DTD, keyed by XML name
	elem    []*element          // elements in the order in which resolve first visited them
	info    dtd                 // configuration for the DTD being processed
	version string              // value recorded by parseDTD from a #FIXED attribute, if any
}
| 202 | |
| 203 | func makeBuilder(w io.Writer, d dtd) builder { |
| 204 | return builder{ |
| 205 | w: w, |
| 206 | index: make(map[string]*element), |
| 207 | elem: []*element{}, |
| 208 | info: d, |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | // parseDTD parses a DTD file. |
| 213 | func (b *builder) parseDTD(r io.Reader) { |
| 214 | for d := xml.NewDecoder(r); ; { |
| 215 | t, err := d.Token() |
| 216 | if t == nil { |
| 217 | break |
| 218 | } |
| 219 | failOnError(err) |
| 220 | dir, ok := t.(xml.Directive) |
| 221 | if !ok { |
| 222 | continue |
| 223 | } |
| 224 | m := reHead.FindSubmatch(dir) |
| 225 | dir = dir[len(m[0]):] |
| 226 | ename := string(m[2]) |
| 227 | el, elementFound := b.index[ename] |
| 228 | switch string(m[1]) { |
| 229 | case "ELEMENT": |
| 230 | if elementFound { |
| 231 | log.Fatal("parseDTD: duplicate entry for element %q", ename) |
| 232 | } |
| 233 | m := reElem.FindSubmatch(dir) |
| 234 | if m == nil { |
| 235 | log.Fatalf("parseDTD: invalid element %q", string(dir)) |
| 236 | } |
| 237 | if len(m[0]) != len(dir) { |
| 238 | log.Fatal("parseDTD: invalid element %q", string(dir), len(dir), len(m[0]), string(m[0])) |
| 239 | } |
| 240 | s := string(m[1]) |
| 241 | el = &element{ |
| 242 | name: ename, |
| 243 | category: s, |
| 244 | } |
| 245 | b.index[ename] = el |
| 246 | case "ATTLIST": |
| 247 | if !elementFound { |
| 248 | log.Fatalf("parseDTD: unknown element %q", ename) |
| 249 | } |
| 250 | s := string(dir) |
| 251 | m := reAttr.FindStringSubmatch(s) |
| 252 | if m == nil { |
| 253 | log.Fatal(fmt.Errorf("parseDTD: invalid attribute %q", string(dir))) |
| 254 | } |
| 255 | if m[4] == "FIXED" { |
| 256 | b.version = m[5] |
| 257 | } else { |
| 258 | switch m[1] { |
| 259 | case "draft", "references", "alt", "validSubLocales", "standard" /* in Common */ : |
| 260 | case "type", "choice": |
| 261 | default: |
| 262 | el.attr = append(el.attr, &attribute{ |
| 263 | name: m[1], |
| 264 | key: s, |
| 265 | list: reToken.FindAllString(m[3], -1), |
| 266 | }) |
| 267 | el.signature = fmt.Sprintf("%s=%s+%s", el.signature, m[1], m[2]) |
| 268 | } |
| 269 | } |
| 270 | } |
| 271 | } |
| 272 | } |
| 273 | |
// reCat matches the next token of an element's content model: optional
// separators (space, ',' or '|') followed, optionally, by "(", ")" or a name
// (group 1) and an occurrence indicator *, + or ? (group 2). Because every
// part is optional it can also match the empty string.
var reCat = regexp.MustCompile(`[ ,\|]*(?:(\(|\)|\#?[\w_-]+)([\*\+\?]?))?`)
| 275 | |
| 276 | // resolve takes a parsed element and converts it into structured data |
| 277 | // that can be used to generate the XML code. |
| 278 | func (b *builder) resolve(e *element) { |
| 279 | if e.resolved { |
| 280 | return |
| 281 | } |
| 282 | b.elem = append(b.elem, e) |
| 283 | e.resolved = true |
| 284 | s := e.category |
| 285 | found := make(map[string]bool) |
| 286 | sequenceStart := []int{} |
| 287 | for len(s) > 0 { |
| 288 | m := reCat.FindStringSubmatch(s) |
| 289 | if m == nil { |
| 290 | log.Fatalf("%s: invalid category string %q", e.name, s) |
| 291 | } |
| 292 | repeat := m[2] == "*" || m[2] == "+" || in(b.info.forceRepeat, m[1]) |
| 293 | switch m[1] { |
| 294 | case "": |
| 295 | case "(": |
| 296 | sequenceStart = append(sequenceStart, len(e.sub)) |
| 297 | case ")": |
| 298 | if len(sequenceStart) == 0 { |
| 299 | log.Fatalf("%s: unmatched closing parenthesis", e.name) |
| 300 | } |
| 301 | for i := sequenceStart[len(sequenceStart)-1]; i < len(e.sub); i++ { |
| 302 | e.sub[i].repeat = e.sub[i].repeat || repeat |
| 303 | } |
| 304 | sequenceStart = sequenceStart[:len(sequenceStart)-1] |
| 305 | default: |
| 306 | if in(b.info.skipElem, m[1]) { |
| 307 | } else if sub, ok := b.index[m[1]]; ok { |
| 308 | if !found[sub.name] { |
| 309 | e.sub = append(e.sub, struct { |
| 310 | e *element |
| 311 | repeat bool |
| 312 | }{sub, repeat}) |
| 313 | found[sub.name] = true |
| 314 | b.resolve(sub) |
| 315 | } |
| 316 | } else if m[1] == "#PCDATA" || m[1] == "ANY" { |
| 317 | } else if m[1] != "EMPTY" { |
| 318 | log.Fatalf("resolve:%s: element %q not found", e.name, m[1]) |
| 319 | } |
| 320 | } |
| 321 | s = s[len(m[0]):] |
| 322 | } |
| 323 | } |
| 324 | |
// in reports whether s is equal to one of the strings in set.
func in(set []string, s string) bool {
	for i := 0; i < len(set); i++ {
		if set[i] == s {
			return true
		}
	}
	return false
}
| 334 | |
// repl turns the separators '-' and '_' into spaces so that strings.Title
// treats the character following each of them as the start of a word.
var repl = strings.NewReplacer("-", " ", "_", " ")

// title converts s to a Go-style name: the first character and each character
// following a '-', '_' or space is put in title case, and all occurrences of
// those separators are removed.
func title(s string) string {
	spaced := repl.Replace(s)
	capitalized := strings.Title(spaced)
	return strings.Replace(capitalized, " ", "", -1)
}
| 342 | |
| 343 | // writeElem generates Go code for a single element, recursively. |
| 344 | func (b *builder) writeElem(tab int, e *element) { |
| 345 | p := func(f string, x ...interface{}) { |
| 346 | f = strings.Replace(f, "\n", "\n"+strings.Repeat("\t", tab), -1) |
| 347 | fmt.Fprintf(b.w, f, x...) |
| 348 | } |
| 349 | if len(e.sub) == 0 && len(e.attr) == 0 { |
| 350 | p("Common") |
| 351 | return |
| 352 | } |
| 353 | p("struct {") |
| 354 | tab++ |
| 355 | p("\nCommon") |
| 356 | for _, attr := range e.attr { |
| 357 | if !in(b.info.skipAttr, attr.name) { |
| 358 | p("\n%s string `xml:\"%s,attr\"`", title(attr.name), attr.name) |
| 359 | } |
| 360 | } |
| 361 | for _, sub := range e.sub { |
| 362 | if in(b.info.predefined, sub.e.name) { |
| 363 | p("\n%sElem", sub.e.name) |
| 364 | continue |
| 365 | } |
| 366 | if in(b.info.skipElem, sub.e.name) { |
| 367 | continue |
| 368 | } |
| 369 | p("\n%s ", title(sub.e.name)) |
| 370 | if sub.repeat { |
| 371 | p("[]") |
| 372 | } |
| 373 | p("*") |
| 374 | if in(b.info.top, sub.e.name) { |
| 375 | p(title(sub.e.name)) |
| 376 | } else { |
| 377 | b.writeElem(tab, sub.e) |
| 378 | } |
| 379 | p(" `xml:\"%s\"`", sub.e.name) |
| 380 | } |
| 381 | tab-- |
| 382 | p("\n}") |
| 383 | } |
| 384 | |
| 385 | // write generates the Go XML code. |
| 386 | func (b *builder) write() { |
| 387 | for i, name := range b.info.top { |
| 388 | e := b.index[name] |
| 389 | if e != nil { |
| 390 | fmt.Fprintf(b.w, comments[name]) |
| 391 | name := title(e.name) |
| 392 | if i == 0 { |
| 393 | name = b.info.root |
| 394 | } |
| 395 | fmt.Fprintf(b.w, "type %s ", name) |
| 396 | b.writeElem(0, e) |
| 397 | fmt.Fprint(b.w, "\n") |
| 398 | } |
| 399 | } |
| 400 | } |