// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | package cldr |
| 6 | |
| 7 | import ( |
| 8 | "bufio" |
| 9 | "encoding/xml" |
| 10 | "errors" |
| 11 | "fmt" |
| 12 | "strconv" |
| 13 | "strings" |
| 14 | "unicode" |
| 15 | "unicode/utf8" |
| 16 | ) |
| 17 | |
// A RuleProcessor receives the collation rules found by Collation's Process
// method, which parses the tailoring and calls one method per rule.
type RuleProcessor interface {
	// Reset is called for a reset rule. anchor is the reset position and
	// before is the level of a "before" qualifier, or 0 if there is none.
	Reset(anchor string, before int) error

	// Insert is called for an ordering rule. level is the strength of the
	// relation (5 denotes the identity relation), str is the string being
	// ordered, and context and extend are the optional context and
	// extension strings.
	Insert(level int, str, context, extend string) error

	// Index is called for a rule whose value starts with the index marker
	// U+FDD0; id is the value with that marker removed.
	Index(id string)
}
| 25 | |
const (
	// cldrIndex is U+FDD0, a Unicode-reserved sentinel value that marks the
	// start of a grouping within an index. A rule whose value starts with
	// this rune is not inserted; it is reported through RuleProcessor.Index.
	// See http://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the fmt format used to represent logical reset
	// positions, such as "first tertiary ignorable".
	specialAnchor = "<%s/>"
)
| 37 | |
| 38 | // Process parses the rules for the tailorings of this collation |
| 39 | // and calls the respective methods of p for each rule found. |
| 40 | func (c Collation) Process(p RuleProcessor) (err error) { |
| 41 | if len(c.Cr) > 0 { |
| 42 | if len(c.Cr) > 1 { |
| 43 | return fmt.Errorf("multiple cr elements, want 0 or 1") |
| 44 | } |
| 45 | return processRules(p, c.Cr[0].Data()) |
| 46 | } |
| 47 | if c.Rules.Any != nil { |
| 48 | return c.processXML(p) |
| 49 | } |
| 50 | return errors.New("no tailoring data") |
| 51 | } |
| 52 | |
| 53 | // processRules parses rules in the Collation Rule Syntax defined in |
| 54 | // http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings. |
| 55 | func processRules(p RuleProcessor, s string) (err error) { |
| 56 | chk := func(s string, e error) string { |
| 57 | if err == nil { |
| 58 | err = e |
| 59 | } |
| 60 | return s |
| 61 | } |
| 62 | i := 0 // Save the line number for use after the loop. |
| 63 | scanner := bufio.NewScanner(strings.NewReader(s)) |
| 64 | for ; scanner.Scan() && err == nil; i++ { |
| 65 | for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) { |
| 66 | level := 5 |
| 67 | var ch byte |
| 68 | switch ch, s = s[0], s[1:]; ch { |
| 69 | case '&': // followed by <anchor> or '[' <key> ']' |
| 70 | if s = skipSpace(s); consume(&s, '[') { |
| 71 | s = chk(parseSpecialAnchor(p, s)) |
| 72 | } else { |
| 73 | s = chk(parseAnchor(p, 0, s)) |
| 74 | } |
| 75 | case '<': // sort relation '<'{1,4}, optionally followed by '*'. |
| 76 | for level = 1; consume(&s, '<'); level++ { |
| 77 | } |
| 78 | if level > 4 { |
| 79 | err = fmt.Errorf("level %d > 4", level) |
| 80 | } |
| 81 | fallthrough |
| 82 | case '=': // identity relation, optionally followed by *. |
| 83 | if consume(&s, '*') { |
| 84 | s = chk(parseSequence(p, level, s)) |
| 85 | } else { |
| 86 | s = chk(parseOrder(p, level, s)) |
| 87 | } |
| 88 | default: |
| 89 | chk("", fmt.Errorf("illegal operator %q", ch)) |
| 90 | break |
| 91 | } |
| 92 | } |
| 93 | } |
| 94 | if chk("", scanner.Err()); err != nil { |
| 95 | return fmt.Errorf("%d: %v", i, err) |
| 96 | } |
| 97 | return nil |
| 98 | } |
| 99 | |
| 100 | // parseSpecialAnchor parses the anchor syntax which is either of the form |
| 101 | // ['before' <level>] <anchor> |
| 102 | // or |
| 103 | // [<label>] |
| 104 | // The starting should already be consumed. |
| 105 | func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) { |
| 106 | i := strings.IndexByte(s, ']') |
| 107 | if i == -1 { |
| 108 | return "", errors.New("unmatched bracket") |
| 109 | } |
| 110 | a := strings.TrimSpace(s[:i]) |
| 111 | s = s[i+1:] |
| 112 | if strings.HasPrefix(a, "before ") { |
| 113 | l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3) |
| 114 | if err != nil { |
| 115 | return s, err |
| 116 | } |
| 117 | return parseAnchor(p, int(l), s) |
| 118 | } |
| 119 | return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0) |
| 120 | } |
| 121 | |
| 122 | func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) { |
| 123 | anchor, s, err := scanString(s) |
| 124 | if err != nil { |
| 125 | return s, err |
| 126 | } |
| 127 | return s, p.Reset(anchor, level) |
| 128 | } |
| 129 | |
| 130 | func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) { |
| 131 | var value, context, extend string |
| 132 | if value, s, err = scanString(s); err != nil { |
| 133 | return s, err |
| 134 | } |
| 135 | if strings.HasPrefix(value, cldrIndex) { |
| 136 | p.Index(value[len(cldrIndex):]) |
| 137 | return |
| 138 | } |
| 139 | if consume(&s, '|') { |
| 140 | if context, s, err = scanString(s); err != nil { |
| 141 | return s, errors.New("missing string after context") |
| 142 | } |
| 143 | } |
| 144 | if consume(&s, '/') { |
| 145 | if extend, s, err = scanString(s); err != nil { |
| 146 | return s, errors.New("missing string after extension") |
| 147 | } |
| 148 | } |
| 149 | return s, p.Insert(level, value, context, extend) |
| 150 | } |
| 151 | |
| 152 | // scanString scans a single input string. |
| 153 | func scanString(s string) (str, tail string, err error) { |
| 154 | if s = skipSpace(s); s == "" { |
| 155 | return s, s, errors.New("missing string") |
| 156 | } |
| 157 | buf := [16]byte{} // small but enough to hold most cases. |
| 158 | value := buf[:0] |
| 159 | for s != "" { |
| 160 | if consume(&s, '\'') { |
| 161 | i := strings.IndexByte(s, '\'') |
| 162 | if i == -1 { |
| 163 | return "", "", errors.New(`unmatched single quote`) |
| 164 | } |
| 165 | if i == 0 { |
| 166 | value = append(value, '\'') |
| 167 | } else { |
| 168 | value = append(value, s[:i]...) |
| 169 | } |
| 170 | s = s[i+1:] |
| 171 | continue |
| 172 | } |
| 173 | r, sz := utf8.DecodeRuneInString(s) |
| 174 | if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) { |
| 175 | break |
| 176 | } |
| 177 | value = append(value, s[:sz]...) |
| 178 | s = s[sz:] |
| 179 | } |
| 180 | return string(value), skipSpace(s), nil |
| 181 | } |
| 182 | |
| 183 | func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) { |
| 184 | if s = skipSpace(s); s == "" { |
| 185 | return s, errors.New("empty sequence") |
| 186 | } |
| 187 | last := rune(0) |
| 188 | for s != "" { |
| 189 | r, sz := utf8.DecodeRuneInString(s) |
| 190 | s = s[sz:] |
| 191 | |
| 192 | if r == '-' { |
| 193 | // We have a range. The first element was already written. |
| 194 | if last == 0 { |
| 195 | return s, errors.New("range without starter value") |
| 196 | } |
| 197 | r, sz = utf8.DecodeRuneInString(s) |
| 198 | s = s[sz:] |
| 199 | if r == utf8.RuneError || r < last { |
| 200 | return s, fmt.Errorf("invalid range %q-%q", last, r) |
| 201 | } |
| 202 | for i := last + 1; i <= r; i++ { |
| 203 | if err := p.Insert(level, string(i), "", ""); err != nil { |
| 204 | return s, err |
| 205 | } |
| 206 | } |
| 207 | last = 0 |
| 208 | continue |
| 209 | } |
| 210 | |
| 211 | if unicode.IsSpace(r) || unicode.IsPunct(r) { |
| 212 | break |
| 213 | } |
| 214 | |
| 215 | // normal case |
| 216 | if err := p.Insert(level, string(r), "", ""); err != nil { |
| 217 | return s, err |
| 218 | } |
| 219 | last = r |
| 220 | } |
| 221 | return s, nil |
| 222 | } |
| 223 | |
// skipSpace returns s without its leading white space, as defined by
// unicode.IsSpace.
func skipSpace(s string) string {
	for i, r := range s {
		if !unicode.IsSpace(r) {
			return s[i:]
		}
	}
	return ""
}
| 227 | |
// consume reports whether the first byte of *s is ch. If it is, consume
// gobbles that byte by advancing *s past it; otherwise *s is left unchanged.
func consume(s *string, ch byte) (ok bool) {
	if str := *s; len(str) > 0 && str[0] == ch {
		*s = str[1:]
		return true
	}
	return false
}
| 237 | |
// The following code parses Collation rules of CLDR version 24 and before.

// lmap maps the first letter of an XML rule tag ("p", "s", "t", "i" and their
// "pc", "sc", "tc", "ic" variants) to the level passed to
// RuleProcessor.Insert.
var lmap = map[byte]int{'p': 1, 's': 2, 't': 3, 'i': 5}
| 246 | |
| 247 | type rulesElem struct { |
| 248 | Rules struct { |
| 249 | Common |
| 250 | Any []*struct { |
| 251 | XMLName xml.Name |
| 252 | rule |
| 253 | } `xml:",any"` |
| 254 | } `xml:"rules"` |
| 255 | } |
| 256 | |
// rule is a single decoded element of the XML collation rule format.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`
	// Before is the value of the "before" attribute, if any.
	Before string `xml:"before,attr"`
	// Any collects the child elements, together with the name of their tag.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
| 265 | |
// emptyValueError is returned by rule.value for a rule that has no character
// data and does not have exactly one child element to stand in for it.
var emptyValueError = errors.New("cldr: empty rule value")
| 267 | |
| 268 | func (r *rule) value() (string, error) { |
| 269 | // Convert hexadecimal Unicode codepoint notation to a string. |
| 270 | s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode) |
| 271 | r.Value = s |
| 272 | if s == "" { |
| 273 | if len(r.Any) != 1 { |
| 274 | return "", emptyValueError |
| 275 | } |
| 276 | r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local) |
| 277 | r.Any = nil |
| 278 | } else if len(r.Any) != 0 { |
| 279 | return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any) |
| 280 | } |
| 281 | return r.Value, nil |
| 282 | } |
| 283 | |
| 284 | func (r rule) process(p RuleProcessor, name, context, extend string) error { |
| 285 | v, err := r.value() |
| 286 | if err != nil { |
| 287 | return err |
| 288 | } |
| 289 | switch name { |
| 290 | case "p", "s", "t", "i": |
| 291 | if strings.HasPrefix(v, cldrIndex) { |
| 292 | p.Index(v[len(cldrIndex):]) |
| 293 | return nil |
| 294 | } |
| 295 | if err := p.Insert(lmap[name[0]], v, context, extend); err != nil { |
| 296 | return err |
| 297 | } |
| 298 | case "pc", "sc", "tc", "ic": |
| 299 | level := lmap[name[0]] |
| 300 | for _, s := range v { |
| 301 | if err := p.Insert(level, string(s), context, extend); err != nil { |
| 302 | return err |
| 303 | } |
| 304 | } |
| 305 | default: |
| 306 | return fmt.Errorf("cldr: unsupported tag: %q", name) |
| 307 | } |
| 308 | return nil |
| 309 | } |
| 310 | |
| 311 | // processXML parses the format of CLDR versions 24 and older. |
| 312 | func (c Collation) processXML(p RuleProcessor) (err error) { |
| 313 | // Collation is generated and defined in xml.go. |
| 314 | var v string |
| 315 | for _, r := range c.Rules.Any { |
| 316 | switch r.XMLName.Local { |
| 317 | case "reset": |
| 318 | level := 0 |
| 319 | switch r.Before { |
| 320 | case "primary", "1": |
| 321 | level = 1 |
| 322 | case "secondary", "2": |
| 323 | level = 2 |
| 324 | case "tertiary", "3": |
| 325 | level = 3 |
| 326 | case "": |
| 327 | default: |
| 328 | return fmt.Errorf("cldr: unknown level %q", r.Before) |
| 329 | } |
| 330 | v, err = r.value() |
| 331 | if err == nil { |
| 332 | err = p.Reset(v, level) |
| 333 | } |
| 334 | case "x": |
| 335 | var context, extend string |
| 336 | for _, r1 := range r.Any { |
| 337 | v, err = r1.value() |
| 338 | switch r1.XMLName.Local { |
| 339 | case "context": |
| 340 | context = v |
| 341 | case "extend": |
| 342 | extend = v |
| 343 | } |
| 344 | } |
| 345 | for _, r1 := range r.Any { |
| 346 | if t := r1.XMLName.Local; t == "context" || t == "extend" { |
| 347 | continue |
| 348 | } |
| 349 | r1.rule.process(p, r1.XMLName.Local, context, extend) |
| 350 | } |
| 351 | default: |
| 352 | err = r.rule.process(p, r.XMLName.Local, "", "") |
| 353 | } |
| 354 | if err != nil { |
| 355 | return err |
| 356 | } |
| 357 | } |
| 358 | return nil |
| 359 | } |