blob: 27c5bac9aa74093b0b68aeb887b0d43c354717d8 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package cldr
6
7import (
8 "bufio"
9 "encoding/xml"
10 "errors"
11 "fmt"
12 "strconv"
13 "strings"
14 "unicode"
15 "unicode/utf8"
16)
17
// RuleProcessor can be passed to Collation's Process method, which
// parses the rules and calls the respective method for each rule found.
type RuleProcessor interface {
	// Reset is called for every reset rule with the anchor string and the
	// "before" level that was specified for it, or 0 if none was given.
	Reset(anchor string, before int) error
	// Insert is called for every relation with the level of the relation,
	// the string being ordered and its optional context and extension
	// strings (empty if absent).
	Insert(level int, str, context, extend string) error
	// Index is called for every string that starts with the index sentinel
	// U+FDD0; id is that string with the sentinel removed.
	Index(id string)
}
25
const (
	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
	// of a grouping within an index.
	// A rule whose string starts with this rune is not inserted; the rest of
	// the string is passed to RuleProcessor.Index instead.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the fmt format (taking the label as its single %s
	// argument) in which to represent logical reset positions, such as
	// "first tertiary ignorable".
	specialAnchor = "<%s/>"
)
37
38// Process parses the rules for the tailorings of this collation
39// and calls the respective methods of p for each rule found.
40func (c Collation) Process(p RuleProcessor) (err error) {
41 if len(c.Cr) > 0 {
42 if len(c.Cr) > 1 {
43 return fmt.Errorf("multiple cr elements, want 0 or 1")
44 }
45 return processRules(p, c.Cr[0].Data())
46 }
47 if c.Rules.Any != nil {
48 return c.processXML(p)
49 }
50 return errors.New("no tailoring data")
51}
52
53// processRules parses rules in the Collation Rule Syntax defined in
54// https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
55func processRules(p RuleProcessor, s string) (err error) {
56 chk := func(s string, e error) string {
57 if err == nil {
58 err = e
59 }
60 return s
61 }
62 i := 0 // Save the line number for use after the loop.
63 scanner := bufio.NewScanner(strings.NewReader(s))
64 for ; scanner.Scan() && err == nil; i++ {
65 for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
66 level := 5
67 var ch byte
68 switch ch, s = s[0], s[1:]; ch {
69 case '&': // followed by <anchor> or '[' <key> ']'
70 if s = skipSpace(s); consume(&s, '[') {
71 s = chk(parseSpecialAnchor(p, s))
72 } else {
73 s = chk(parseAnchor(p, 0, s))
74 }
75 case '<': // sort relation '<'{1,4}, optionally followed by '*'.
76 for level = 1; consume(&s, '<'); level++ {
77 }
78 if level > 4 {
79 err = fmt.Errorf("level %d > 4", level)
80 }
81 fallthrough
82 case '=': // identity relation, optionally followed by *.
83 if consume(&s, '*') {
84 s = chk(parseSequence(p, level, s))
85 } else {
86 s = chk(parseOrder(p, level, s))
87 }
88 default:
89 chk("", fmt.Errorf("illegal operator %q", ch))
90 break
91 }
92 }
93 }
94 if chk("", scanner.Err()); err != nil {
95 return fmt.Errorf("%d: %v", i, err)
96 }
97 return nil
98}
99
100// parseSpecialAnchor parses the anchor syntax which is either of the form
101// ['before' <level>] <anchor>
102// or
103// [<label>]
104// The starting should already be consumed.
105func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
106 i := strings.IndexByte(s, ']')
107 if i == -1 {
108 return "", errors.New("unmatched bracket")
109 }
110 a := strings.TrimSpace(s[:i])
111 s = s[i+1:]
112 if strings.HasPrefix(a, "before ") {
113 l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
114 if err != nil {
115 return s, err
116 }
117 return parseAnchor(p, int(l), s)
118 }
119 return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
120}
121
122func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
123 anchor, s, err := scanString(s)
124 if err != nil {
125 return s, err
126 }
127 return s, p.Reset(anchor, level)
128}
129
130func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
131 var value, context, extend string
132 if value, s, err = scanString(s); err != nil {
133 return s, err
134 }
135 if strings.HasPrefix(value, cldrIndex) {
136 p.Index(value[len(cldrIndex):])
137 return
138 }
139 if consume(&s, '|') {
140 if context, s, err = scanString(s); err != nil {
141 return s, errors.New("missing string after context")
142 }
143 }
144 if consume(&s, '/') {
145 if extend, s, err = scanString(s); err != nil {
146 return s, errors.New("missing string after extension")
147 }
148 }
149 return s, p.Insert(level, value, context, extend)
150}
151
152// scanString scans a single input string.
153func scanString(s string) (str, tail string, err error) {
154 if s = skipSpace(s); s == "" {
155 return s, s, errors.New("missing string")
156 }
157 buf := [16]byte{} // small but enough to hold most cases.
158 value := buf[:0]
159 for s != "" {
160 if consume(&s, '\'') {
161 i := strings.IndexByte(s, '\'')
162 if i == -1 {
163 return "", "", errors.New(`unmatched single quote`)
164 }
165 if i == 0 {
166 value = append(value, '\'')
167 } else {
168 value = append(value, s[:i]...)
169 }
170 s = s[i+1:]
171 continue
172 }
173 r, sz := utf8.DecodeRuneInString(s)
174 if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
175 break
176 }
177 value = append(value, s[:sz]...)
178 s = s[sz:]
179 }
180 return string(value), skipSpace(s), nil
181}
182
183func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
184 if s = skipSpace(s); s == "" {
185 return s, errors.New("empty sequence")
186 }
187 last := rune(0)
188 for s != "" {
189 r, sz := utf8.DecodeRuneInString(s)
190 s = s[sz:]
191
192 if r == '-' {
193 // We have a range. The first element was already written.
194 if last == 0 {
195 return s, errors.New("range without starter value")
196 }
197 r, sz = utf8.DecodeRuneInString(s)
198 s = s[sz:]
199 if r == utf8.RuneError || r < last {
200 return s, fmt.Errorf("invalid range %q-%q", last, r)
201 }
202 for i := last + 1; i <= r; i++ {
203 if err := p.Insert(level, string(i), "", ""); err != nil {
204 return s, err
205 }
206 }
207 last = 0
208 continue
209 }
210
211 if unicode.IsSpace(r) || unicode.IsPunct(r) {
212 break
213 }
214
215 // normal case
216 if err := p.Insert(level, string(r), "", ""); err != nil {
217 return s, err
218 }
219 last = r
220 }
221 return s, nil
222}
223
// skipSpace returns s without its leading white space, as defined by
// unicode.IsSpace.
func skipSpace(s string) string {
	start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
	if start < 0 {
		// Empty or white space only.
		return ""
	}
	return s[start:]
}
227
// consume reports whether the first byte of *s is ch. If so, it gobbles the
// byte by advancing *s past it; otherwise *s is left untouched.
func consume(s *string, ch byte) (ok bool) {
	if str := *s; len(str) > 0 && str[0] == ch {
		*s = str[1:]
		return true
	}
	return false
}
237
238// The following code parses Collation rules of CLDR version 24 and before.
239
// lmap maps the first byte of an XML rule tag name ("p", "s", "t", "i" and
// their "pc", "sc", "tc", "ic" variants) to the level passed to
// RuleProcessor.Insert.
var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
}
246
// rulesElem describes a <rules> element. Its child elements are captured,
// whatever their tag, as rule values together with their XML name.
// NOTE(review): presumably embedded in the generated Collation type in
// xml.go — confirm.
type rulesElem struct {
	Rules struct {
		Common
		Any []*struct {
			XMLName xml.Name
			rule
		} `xml:",any"`
	} `xml:"rules"`
}
256
// rule holds the contents of a single XML rule element.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`
	// Before is the value of the "before" attribute, if any.
	Before string `xml:"before,attr"`
	// Any holds the child elements of the element, whatever their tag.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
265
// emptyValueError is returned by rule.value for a rule that has no character
// data and does not consist of exactly one child element either.
var emptyValueError = errors.New("cldr: empty rule value")
267
268func (r *rule) value() (string, error) {
269 // Convert hexadecimal Unicode codepoint notation to a string.
270 s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
271 r.Value = s
272 if s == "" {
273 if len(r.Any) != 1 {
274 return "", emptyValueError
275 }
276 r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
277 r.Any = nil
278 } else if len(r.Any) != 0 {
279 return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
280 }
281 return r.Value, nil
282}
283
284func (r rule) process(p RuleProcessor, name, context, extend string) error {
285 v, err := r.value()
286 if err != nil {
287 return err
288 }
289 switch name {
290 case "p", "s", "t", "i":
291 if strings.HasPrefix(v, cldrIndex) {
292 p.Index(v[len(cldrIndex):])
293 return nil
294 }
295 if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
296 return err
297 }
298 case "pc", "sc", "tc", "ic":
299 level := lmap[name[0]]
300 for _, s := range v {
301 if err := p.Insert(level, string(s), context, extend); err != nil {
302 return err
303 }
304 }
305 default:
306 return fmt.Errorf("cldr: unsupported tag: %q", name)
307 }
308 return nil
309}
310
311// processXML parses the format of CLDR versions 24 and older.
312func (c Collation) processXML(p RuleProcessor) (err error) {
313 // Collation is generated and defined in xml.go.
314 var v string
315 for _, r := range c.Rules.Any {
316 switch r.XMLName.Local {
317 case "reset":
318 level := 0
319 switch r.Before {
320 case "primary", "1":
321 level = 1
322 case "secondary", "2":
323 level = 2
324 case "tertiary", "3":
325 level = 3
326 case "":
327 default:
328 return fmt.Errorf("cldr: unknown level %q", r.Before)
329 }
330 v, err = r.value()
331 if err == nil {
332 err = p.Reset(v, level)
333 }
334 case "x":
335 var context, extend string
336 for _, r1 := range r.Any {
337 v, err = r1.value()
338 switch r1.XMLName.Local {
339 case "context":
340 context = v
341 case "extend":
342 extend = v
343 }
344 }
345 for _, r1 := range r.Any {
346 if t := r1.XMLName.Local; t == "context" || t == "extend" {
347 continue
348 }
349 r1.rule.process(p, r1.XMLName.Local, context, extend)
350 }
351 default:
352 err = r.rule.process(p, r.XMLName.Local, "", "")
353 }
354 if err != nil {
355 return err
356 }
357 }
358 return nil
359}