blob: 0879bc84c87a2018a2eba7d08d95530d0ab8bde8 [file] [log] [blame]
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
// Package ucd provides a parser for Unicode Character Database files, the
// format of which is defined in https://www.unicode.org/reports/tr44/. See
// https://www.unicode.org/Public/UCD/latest/ucd/ for example files.
//
// It currently does not support substitutions of missing fields.
10package ucd // import "golang.org/x/text/internal/ucd"
11
12import (
13 "bufio"
14 "errors"
15 "fmt"
16 "io"
17 "log"
18 "regexp"
19 "strconv"
20 "strings"
21)
22
// Field indices for UnicodeData.txt.
//
// The constants count up from zero in declaration order, so CodePoint is 0
// and SimpleTitlecaseMapping is 14.
const (
	CodePoint = iota
	Name
	GeneralCategory
	CanonicalCombiningClass
	BidiClass
	DecompMapping
	DecimalValue
	DigitValue
	NumericValue
	BidiMirrored
	Unicode1Name
	ISOComment
	SimpleUppercaseMapping
	SimpleLowercaseMapping
	SimpleTitlecaseMapping
)
41
42// Parse calls f for each entry in the given reader of a UCD file. It will close
43// the reader upon return. It will call log.Fatal if any error occurred.
44//
45// This implements the most common usage pattern of using Parser.
46func Parse(r io.ReadCloser, f func(p *Parser)) {
47 defer r.Close()
48
49 p := New(r)
50 for p.Next() {
51 f(p)
52 }
53 if err := p.Err(); err != nil {
54 r.Close() // os.Exit will cause defers not to be called.
55 log.Fatal(err)
56 }
57}
58
59// An Option is used to configure a Parser.
60type Option func(p *Parser)
61
62func keepRanges(p *Parser) {
63 p.keepRanges = true
64}
65
66var (
67 // KeepRanges prevents the expansion of ranges. The raw ranges can be
68 // obtained by calling Range(0) on the parser.
69 KeepRanges Option = keepRanges
70)
71
72// The Part option register a handler for lines starting with a '@'. The text
73// after a '@' is available as the first field. Comments are handled as usual.
74func Part(f func(p *Parser)) Option {
75 return func(p *Parser) {
76 p.partHandler = f
77 }
78}
79
80// The CommentHandler option passes comments that are on a line by itself to
81// a given handler.
82func CommentHandler(f func(s string)) Option {
83 return func(p *Parser) {
84 p.commentHandler = f
85 }
86}
87
// A Parser parses Unicode Character Database (UCD) files.
type Parser struct {
	scanner *bufio.Scanner

	// Optional callbacks, installed through the Part and CommentHandler
	// options.
	partHandler    func(p *Parser)
	commentHandler func(s string)

	// keepRanges disables the expansion of rune ranges in field 0.
	keepRanges bool

	// err holds the first error that was recorded; line is the line number
	// reported in that error's message.
	err  error
	line int

	// comment and field hold the trailing comment and the ';'-separated
	// fields of the current line.
	comment string
	field   []string

	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange          bool
	rangeStart, rangeEnd rune
}
106
107func (p *Parser) setError(err error, msg string) {
108 if p.err == nil && err != nil {
109 if msg == "" {
110 p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err)
111 } else {
112 p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err)
113 }
114 }
115}
116
117func (p *Parser) getField(i int) string {
118 if i >= len(p.field) {
119 return ""
120 }
121 return p.field[i]
122}
123
124// Err returns a non-nil error if any error occurred during parsing.
125func (p *Parser) Err() error {
126 return p.err
127}
128
129// New returns a Parser for the given Reader.
130func New(r io.Reader, o ...Option) *Parser {
131 p := &Parser{
132 scanner: bufio.NewScanner(r),
133 }
134 for _, f := range o {
135 f(p)
136 }
137 return p
138}
139
140// Next parses the next line in the file. It returns true if a line was parsed
141// and false if it reached the end of the file.
142func (p *Parser) Next() bool {
143 if !p.keepRanges && p.rangeStart < p.rangeEnd {
144 p.rangeStart++
145 return true
146 }
147 p.comment = ""
148 p.field = p.field[:0]
149 p.parsedRange = false
150
151 for p.scanner.Scan() && p.err == nil {
152 p.line++
153 s := p.scanner.Text()
154 if s == "" {
155 continue
156 }
157 if s[0] == '#' {
158 if p.commentHandler != nil {
159 p.commentHandler(strings.TrimSpace(s[1:]))
160 }
161 continue
162 }
163
164 // Parse line
165 if i := strings.IndexByte(s, '#'); i != -1 {
166 p.comment = strings.TrimSpace(s[i+1:])
167 s = s[:i]
168 }
169 if s[0] == '@' {
170 if p.partHandler != nil {
171 p.field = append(p.field, strings.TrimSpace(s[1:]))
172 p.partHandler(p)
173 p.field = p.field[:0]
174 }
175 p.comment = ""
176 continue
177 }
178 for {
179 i := strings.IndexByte(s, ';')
180 if i == -1 {
181 p.field = append(p.field, strings.TrimSpace(s))
182 break
183 }
184 p.field = append(p.field, strings.TrimSpace(s[:i]))
185 s = s[i+1:]
186 }
187 if !p.keepRanges {
188 p.rangeStart, p.rangeEnd = p.getRange(0)
189 }
190 return true
191 }
192 p.setError(p.scanner.Err(), "scanner failed")
193 return false
194}
195
// parseRune parses b as a hexadecimal code point. A leading "U+" is skipped
// when at least one more character follows it. The error, if any, is the one
// returned by strconv.ParseUint.
func parseRune(b string) (rune, error) {
	if len(b) > 2 && strings.HasPrefix(b, "U+") {
		b = b[2:]
	}
	v, err := strconv.ParseUint(b, 16, 32)
	return rune(v), err
}
203
204func (p *Parser) parseRune(s string) rune {
205 x, err := parseRune(s)
206 p.setError(err, "failed to parse rune")
207 return x
208}
209
210// Rune parses and returns field i as a rune.
211func (p *Parser) Rune(i int) rune {
212 if i > 0 || p.keepRanges {
213 return p.parseRune(p.getField(i))
214 }
215 return p.rangeStart
216}
217
218// Runes interprets and returns field i as a sequence of runes.
219func (p *Parser) Runes(i int) (runes []rune) {
220 add := func(s string) {
221 if s = strings.TrimSpace(s); len(s) > 0 {
222 runes = append(runes, p.parseRune(s))
223 }
224 }
225 for b := p.getField(i); ; {
226 i := strings.IndexByte(b, ' ')
227 if i == -1 {
228 add(b)
229 break
230 }
231 add(b[:i])
232 b = b[i+1:]
233 }
234 return
235}
236
// errIncorrectLegacyRange is recorded when a "<..., First>" line is not
// followed by a matching "<..., Last>" line.
var errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

// reRange matches one line of a legacy rune range. Its groups are the code
// point, the name before the comma, the marker after the comma and the rest
// of the line.
var reRange = regexp.MustCompile(`^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$`)
243
244// Range parses and returns field i as a rune range. A range is inclusive at
245// both ends. If the field only has one rune, first and last will be identical.
246// It supports the legacy format for ranges used in UnicodeData.txt.
247func (p *Parser) Range(i int) (first, last rune) {
248 if !p.keepRanges {
249 return p.rangeStart, p.rangeStart
250 }
251 return p.getRange(i)
252}
253
254func (p *Parser) getRange(i int) (first, last rune) {
255 b := p.getField(i)
256 if k := strings.Index(b, ".."); k != -1 {
257 return p.parseRune(b[:k]), p.parseRune(b[k+2:])
258 }
259 // The first field may not be a rune, in which case we may ignore any error
260 // and set the range as 0..0.
261 x, err := parseRune(b)
262 if err != nil {
263 // Disable range parsing henceforth. This ensures that an error will be
264 // returned if the user subsequently will try to parse this field as
265 // a Rune.
266 p.keepRanges = true
267 }
268 // Special case for UnicodeData that was retained for backwards compatibility.
269 if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") {
270 if p.parsedRange {
271 return p.rangeStart, p.rangeEnd
272 }
273 mf := reRange.FindStringSubmatch(p.scanner.Text())
274 p.line++
275 if mf == nil || !p.scanner.Scan() {
276 p.setError(errIncorrectLegacyRange, "")
277 return x, x
278 }
279 // Using Bytes would be more efficient here, but Text is a lot easier
280 // and this is not a frequent case.
281 ml := reRange.FindStringSubmatch(p.scanner.Text())
282 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
283 p.setError(errIncorrectLegacyRange, "")
284 return x, x
285 }
286 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])])
287 p.parsedRange = true
288 return p.rangeStart, p.rangeEnd
289 }
290 return x, x
291}
292
293// bools recognizes all valid UCD boolean values.
294var bools = map[string]bool{
295 "": false,
296 "N": false,
297 "No": false,
298 "F": false,
299 "False": false,
300 "Y": true,
301 "Yes": true,
302 "T": true,
303 "True": true,
304}
305
306// Bool parses and returns field i as a boolean value.
307func (p *Parser) Bool(i int) bool {
308 f := p.getField(i)
309 for s, v := range bools {
310 if f == s {
311 return v
312 }
313 }
314 p.setError(strconv.ErrSyntax, "error parsing bool")
315 return false
316}
317
318// Int parses and returns field i as an integer value.
319func (p *Parser) Int(i int) int {
320 x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
321 p.setError(err, "error parsing int")
322 return int(x)
323}
324
325// Uint parses and returns field i as an unsigned integer value.
326func (p *Parser) Uint(i int) uint {
327 x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
328 p.setError(err, "error parsing uint")
329 return uint(x)
330}
331
332// Float parses and returns field i as a decimal value.
333func (p *Parser) Float(i int) float64 {
334 x, err := strconv.ParseFloat(string(p.getField(i)), 64)
335 p.setError(err, "error parsing float")
336 return x
337}
338
339// String parses and returns field i as a string value.
340func (p *Parser) String(i int) string {
341 return string(p.getField(i))
342}
343
344// Strings parses and returns field i as a space-separated list of strings.
345func (p *Parser) Strings(i int) []string {
346 ss := strings.Split(string(p.getField(i)), " ")
347 for i, s := range ss {
348 ss[i] = strings.TrimSpace(s)
349 }
350 return ss
351}
352
353// Comment returns the comments for the current line.
354func (p *Parser) Comment() string {
355 return string(p.comment)
356}
357
358var errUndefinedEnum = errors.New("ucd: undefined enum value")
359
360// Enum interprets and returns field i as a value that must be one of the values
361// in enum.
362func (p *Parser) Enum(i int, enum ...string) string {
363 f := p.getField(i)
364 for _, s := range enum {
365 if f == s {
366 return s
367 }
368 }
369 p.setError(errUndefinedEnum, "error parsing enum")
370 return ""
371}