// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package language

import (
	"bytes"
	"errors"
	"fmt"
	"sort"

	"golang.org/x/text/internal/tag"
)

// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
	return b > '9'
}

// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
			return false
		}
	}
	return true
}

// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")

// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	v [8]byte
}

// NewValueError creates a new ValueError.
func NewValueError(tag []byte) ValueError {
	var e ValueError
	copy(e.v[:], tag)
	return e
}

func (e ValueError) tag() []byte {
	n := bytes.IndexByte(e.v[:], 0)
	if n == -1 {
		n = 8
	}
	return e.v[:n]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	return string(e.tag())
}
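
// For illustration, a caller of the parsing functions in this file can
// distinguish unknown-but-well-formed subtags from syntax errors roughly
// like this, given an error err returned by Parse:
//
//	if verr, ok := err.(ValueError); ok {
//		_ = verr.Subtag() // well-formed, but not a recognized value
//	} else if err == ErrSyntax {
//		// the tag itself was malformed
//	}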

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
	b     []byte
	bytes [max99thPercentileSize]byte
	token []byte
	start int // start position of the current token
	end   int // end position of the current token
	next  int // next point for scan
	err   error
	done  bool
}

func makeScannerString(s string) scanner {
	scan := scanner{}
	if len(s) <= len(scan.bytes) {
		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
	} else {
		scan.b = []byte(s)
	}
	scan.init()
	return scan
}

// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
	scan := scanner{b: b}
	scan.init()
	return scan
}
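
// A minimal sketch of how the scanner is driven (for illustration only;
// init has already loaded the first token):
//
//	scan := makeScannerString("en-Latn-US")
//	for !scan.done {
//		_ = scan.token // "en", then "Latn", then "US"
//		scan.scan()
//	}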

func (s *scanner) init() {
	for i, c := range s.b {
		if c == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}

// toLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
	for i := start; i < end; i++ {
		c := s.b[i]
		if 'A' <= c && c <= 'Z' {
			s.b[i] += 'a' - 'A'
		}
	}
}

func (s *scanner) setError(e error) {
	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
		s.err = e
	}
}

// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			// The resized content no longer fits in the current buffer:
			// allocate a new one and preserve the prefix. The region
			// [oldStart, end) is left for the caller to fill in.
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			b = s.b[:n]
		}
		// Shift the tail into place; copy handles overlapping slices.
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		s.next = end + (s.next - s.end)
		s.end = end
	}
}
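
// A small worked example of the index arithmetic above, for illustration:
// with s.b == "en-bzzzz-US" and the current token "bzzzz" occupying [3, 8),
// resizeRange(3, 8, 2) shifts the tail "-US" left so that s.b becomes
// "en-bz-US"; the bytes at [3, 5) still hold leftover input for the caller
// (e.g. replace) to overwrite, and s.end and s.next become 5 and 6.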

// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
	s.resizeRange(s.start, s.end, len(repl))
	copy(s.b[s.start:], repl)
}

// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		s.b = s.b[:copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	s.next = s.start
}
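
// For illustration: with the buffer "en-verylongsubtag-US" and the current
// token "verylongsubtag", gobble(ErrSyntax) splices the token and its
// separator out, leaving "en-US", records ErrSyntax, and rewinds next so
// the following scan picks up at "US".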

// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
	diff := end - start
	s.next -= diff
	s.start -= diff
	s.end -= diff
}

// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		if i < 1 || i > 8 || !isAlphaNum(token) {
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}
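
// For illustration: scanning "en--US" gobbles the empty subtag between the
// dashes with ErrSyntax and yields "en" and then "US" over the repaired
// buffer "en-US". Note that scan returns the end position of the previously
// accepted token, not of the one it just produced; parseTag relies on this
// to record where each section of the tag ends.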

// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
	end = s.end
	s.scan()
	for ; len(s.token) >= min; s.scan() {
		end = s.end
	}
	return end
}
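
// For illustration: acceptMinSize(3) is what consumes the value subtags of
// a -u- key. For "u-co-phonebk" the caller reads the key "co" and
// acceptMinSize(3) then consumes "phonebk", stopping at the next two-letter
// key or at the end of the input.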

// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed, it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
func Parse(s string) (t Tag, err error) {
	// TODO: consider supporting old-style locale key-value pairs.
	if s == "" {
		return Und, ErrSyntax
	}
	if len(s) <= maxAltTaglen {
		b := [maxAltTaglen]byte{}
		for i, c := range s {
			// Generating invalid UTF-8 is okay as it won't match.
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			} else if c == '_' {
				c = '-'
			}
			b[i] = byte(c)
		}
		if t, ok := grandfathered(b); ok {
			return t, nil
		}
	}
	scan := makeScannerString(s)
	return parse(&scan, s)
}
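
// A minimal usage sketch, for illustration (the variant "hypovar" below is
// hypothetical and not a registered subtag):
//
//	t, err := Parse("sr_latn_rs")
//	// err == nil; case and separators are canonicalized to sr-Latn-RS.
//
//	t, err = Parse("en-US-hypovar")
//	// "hypovar" is well-formed but unknown: it is stripped from t and err
//	// is a ValueError reporting the subtag.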

func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan)
		if n := len(scan.token); n == 1 {
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}

// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		lang, e := getLangID(scan.token)
		if lang != 0 {
			t.LangID = lang
			copy(scan.b[langStart:], lang.String())
			scan.b[langStart+3] = '-'
			scan.start = langStart + 4
		}
		scan.gobble(e)
		end = scan.scan()
	}
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	scan.toLower(scan.start, len(scan.b))
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
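
// For illustration, the extlang rule above rewrites a primary language
// followed by a known extended language subtag to the extended language
// itself, so a buffer holding zh-yue-HK is reduced to yue-HK, while an
// unknown three-letter subtag in that position is simply dropped with the
// error recorded on the scanner.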

var separator = []byte{'-'}

// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				needSort = true
				// There are no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
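
// For illustration: a registered variant that is repeated, as in
// de-1901-1901, trips the needSort path and is collapsed to a single
// occurrence, while a well-formed but unregistered variant is removed and
// reported through a ValueError.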

type variantsSort struct {
	i []uint8
	v [][]byte
}

func (s variantsSort) Len() int {
	return len(s.i)
}

func (s variantsSort) Swap(i, j int) {
	s.i[i], s.i[j] = s.i[j], s.i[i]
	s.v[i], s.v[j] = s.v[j], s.v[i]
}

func (s variantsSort) Less(i, j int) bool {
	return s.i[i] < s.i[j]
}

type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

func (b bytesSort) Len() int {
	return len(b.b)
}

func (b bytesSort) Swap(i, j int) {
	b.b[i], b.b[j] = b.b[j], b.b[i]
}

func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		if b.b[i][k] == b.b[j][k] {
			continue
		}
		return b.b[i][k] < b.b[j][k]
	}
	return false
}

// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
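
// For illustration: extensions are re-emitted sorted by their singleton,
// with any private-use (-x-) section kept last, so the extension part of
//
//	en-u-co-phonebk-t-und-latn-x-priv
//
// is rewritten as t-und-latn-u-co-phonebk-x-priv.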

// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u':
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			keyEnd := scan.end
			end = scan.acceptMinSize(3)
			// TODO: check key value validity
			if keyEnd == end || bytes.Compare(key, last) != 1 {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart, keyEnd := scan.start, scan.end
					end = scan.acceptMinSize(3)
					if keyEnd != end {
						keys = append(keys, scan.b[keyStart:end])
					} else {
						scan.setError(ErrSyntax)
						end = keyStart
					}
				}
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't':
		scan.scan()
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan)
			scan.toLower(start, end)
		}
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}
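
// For illustration: within a -u- extension the keys are reordered
// alphabetically, so u-nu-latn-co-phonebk is normalized to
// u-co-phonebk-nu-latn, and a key that appears twice with two different
// values causes ErrDuplicateKey to be set on the scanner.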

// getExtension returns the end position of the extension starting at
// position p and the extension itself, including its singleton.
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	if s[p] == 'x' {
		return len(s), s[p:]
	}
	end = nextExtension(s, p)
	return end, s[p:end]
}

// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	for n := len(s) - 3; p < n; {
		if s[p] == '-' {
			if s[p+2] == '-' {
				return p
			}
			p += 3
		} else {
			p++
		}
	}
	return len(s)
}
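
// For illustration: these helpers operate on already-normalized tag strings,
// e.g.
//
//	end, ext := getExtension("en-u-co-phonebk-x-priv", 2)
//	// ext == "u-co-phonebk", end == 15; a subsequent call starting at end
//	// returns the private-use part "x-priv".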