// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package language

import (
	"bytes"
	"errors"
	"fmt"
	"sort"

	"golang.org/x/text/internal/tag"
)

// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
	return b > '9'
}

// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
			return false
		}
	}
	return true
}

// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")

// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	v [8]byte
}

// NewValueError creates a new ValueError.
func NewValueError(tag []byte) ValueError {
	var e ValueError
	copy(e.v[:], tag)
	return e
}

func (e ValueError) tag() []byte {
	n := bytes.IndexByte(e.v[:], 0)
	if n == -1 {
		n = 8
	}
	return e.v[:n]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	return string(e.tag())
}
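
// For illustration, a caller of the parsing functions in this file can
// distinguish unknown-but-well-formed subtags from syntax errors roughly
// like this, given an error err returned by Parse:
//
//	if verr, ok := err.(ValueError); ok {
//		_ = verr.Subtag() // well-formed, but not a recognized value
//	} else if err == ErrSyntax {
//		// the tag itself was malformed
//	}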

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
	b     []byte
	bytes [max99thPercentileSize]byte
	token []byte
	start int // start position of the current token
	end   int // end position of the current token
	next  int // next point for scan
	err   error
	done  bool
}

func makeScannerString(s string) scanner {
	scan := scanner{}
	if len(s) <= len(scan.bytes) {
		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
	} else {
		scan.b = []byte(s)
	}
	scan.init()
	return scan
}

// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
	scan := scanner{b: b}
	scan.init()
	return scan
}
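
// A minimal sketch of how the scanner is driven (for illustration only;
// init has already loaded the first token):
//
//	scan := makeScannerString("en-Latn-US")
//	for !scan.done {
//		_ = scan.token // "en", then "Latn", then "US"
//		scan.scan()
//	}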

func (s *scanner) init() {
	for i, c := range s.b {
		if c == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}

// toLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
	for i := start; i < end; i++ {
		c := s.b[i]
		if 'A' <= c && c <= 'Z' {
			s.b[i] += 'a' - 'A'
		}
	}
}

func (s *scanner) setError(e error) {
	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
		s.err = e
	}
}

// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			// The resized content no longer fits in the current buffer:
			// allocate a new one and preserve the prefix. The region
			// [oldStart, end) is left for the caller to fill in.
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			b = s.b[:n]
		}
		// Shift the tail into place; copy handles overlapping slices.
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		s.next = end + (s.next - s.end)
		s.end = end
	}
}
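
// A small worked example of the index arithmetic above, for illustration:
// with s.b == "en-bzzzz-US" and the current token "bzzzz" occupying [3, 8),
// resizeRange(3, 8, 2) shifts the tail "-US" left so that s.b becomes
// "en-bz-US"; the bytes at [3, 5) still hold leftover input for the caller
// (e.g. replace) to overwrite, and s.end and s.next become 5 and 6.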

// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
	s.resizeRange(s.start, s.end, len(repl))
	copy(s.b[s.start:], repl)
}

// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		s.b = s.b[:copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	s.next = s.start
}
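
// For illustration: with the buffer "en-verylongsubtag-US" and the current
// token "verylongsubtag", gobble(ErrSyntax) splices the token and its
// separator out, leaving "en-US", records ErrSyntax, and rewinds next so
// the following scan picks up at "US".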

// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
	diff := end - start
	s.next -= diff
	s.start -= diff
	s.end -= diff
}

// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		if i < 1 || i > 8 || !isAlphaNum(token) {
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}
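
// For illustration: scanning "en--US" gobbles the empty subtag between the
// dashes with ErrSyntax and yields "en" and then "US" over the repaired
// buffer "en-US". Note that scan returns the end position of the previously
// accepted token, not of the one it just produced; parseTag relies on this
// to record where each section of the tag ends.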

// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
	end = s.end
	s.scan()
	for ; len(s.token) >= min; s.scan() {
		end = s.end
	}
	return end
}
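
// For illustration: acceptMinSize(3) is what consumes the value subtags of
// a -u- key. For "u-co-phonebk" the caller reads the key "co" and
// acceptMinSize(3) then consumes "phonebk", stopping at the next two-letter
// key or at the end of the input.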

// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed, it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
func Parse(s string) (t Tag, err error) {
	// TODO: consider supporting old-style locale key-value pairs.
	if s == "" {
		return Und, ErrSyntax
	}
	if len(s) <= maxAltTaglen {
		b := [maxAltTaglen]byte{}
		for i, c := range s {
			// Generating invalid UTF-8 is okay as it won't match.
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			} else if c == '_' {
				c = '-'
			}
			b[i] = byte(c)
		}
		if t, ok := grandfathered(b); ok {
			return t, nil
		}
	}
	scan := makeScannerString(s)
	return parse(&scan, s)
}
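
// A minimal usage sketch, for illustration (the variant "hypovar" below is
// hypothetical and not a registered subtag):
//
//	t, err := Parse("sr_latn_rs")
//	// err == nil; case and separators are canonicalized to sr-Latn-RS.
//
//	t, err = Parse("en-US-hypovar")
//	// "hypovar" is well-formed but unknown: it is stripped from t and err
//	// is a ValueError reporting the subtag.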

func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan)
		if n := len(scan.token); n == 1 {
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}

// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		lang, e := getLangID(scan.token)
		if lang != 0 {
			t.LangID = lang
			copy(scan.b[langStart:], lang.String())
			scan.b[langStart+3] = '-'
			scan.start = langStart + 4
		}
		scan.gobble(e)
		end = scan.scan()
	}
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	scan.toLower(scan.start, len(scan.b))
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
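
// For illustration, the extlang rule above rewrites a primary language
// followed by a known extended language subtag to the extended language
// itself, so a buffer holding zh-yue-HK is reduced to yue-HK, while an
// unknown three-letter subtag in that position is simply dropped with the
// error recorded on the scanner.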

var separator = []byte{'-'}

// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				needSort = true
				// There are no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
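
// For illustration: a registered variant that is repeated, as in
// de-1901-1901, trips the needSort path and is collapsed to a single
// occurrence, while a well-formed but unregistered variant is removed and
// reported through a ValueError.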

type variantsSort struct {
	i []uint8
	v [][]byte
}

func (s variantsSort) Len() int {
	return len(s.i)
}

func (s variantsSort) Swap(i, j int) {
	s.i[i], s.i[j] = s.i[j], s.i[i]
	s.v[i], s.v[j] = s.v[j], s.v[i]
}

func (s variantsSort) Less(i, j int) bool {
	return s.i[i] < s.i[j]
}

type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

func (b bytesSort) Len() int {
	return len(b.b)
}

func (b bytesSort) Swap(i, j int) {
	b.b[i], b.b[j] = b.b[j], b.b[i]
}

func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		if b.b[i][k] == b.b[j][k] {
			continue
		}
		return b.b[i][k] < b.b[j][k]
	}
	return false
}

// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
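
// For illustration: extensions are re-emitted sorted by their singleton,
// with any private-use (-x-) section kept last, so the extension part of
//
//	en-u-co-phonebk-t-und-latn-x-priv
//
// is rewritten as t-und-latn-u-co-phonebk-x-priv.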

// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u':
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			keyEnd := scan.end
			end = scan.acceptMinSize(3)
			// TODO: check key value validity
			if keyEnd == end || bytes.Compare(key, last) != 1 {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart, keyEnd := scan.start, scan.end
					end = scan.acceptMinSize(3)
					if keyEnd != end {
						keys = append(keys, scan.b[keyStart:end])
					} else {
						scan.setError(ErrSyntax)
						end = keyStart
					}
				}
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't':
		scan.scan()
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan)
			scan.toLower(start, end)
		}
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}
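
// For illustration: within a -u- extension the keys are reordered
// alphabetically, so u-nu-latn-co-phonebk is normalized to
// u-co-phonebk-nu-latn, and a key that appears twice with two different
// values causes ErrDuplicateKey to be set on the scanner.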

// getExtension returns the end position of the extension starting at
// position p and the extension itself, including its singleton.
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	if s[p] == 'x' {
		return len(s), s[p:]
	}
	end = nextExtension(s, p)
	return end, s[p:end]
}

// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	for n := len(s) - 3; p < n; {
		if s[p] == '-' {
			if s[p+2] == '-' {
				return p
			}
			p += 3
		} else {
			p++
		}
	}
	return len(s)
}
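
// For illustration: these helpers operate on already-normalized tag strings,
// e.g.
//
//	end, ext := getExtension("en-u-co-phonebk-x-priv", 2)
//	// ext == "u-co-phonebk", end == 15; a subsequent call starting at end
//	// returns the private-use part "x-priv".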