// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package language
6
7import (
8 "bytes"
9 "fmt"
10 "sort"
11 "strconv"
12
13 "golang.org/x/text/internal/tag"
14)
15
16// findIndex tries to find the given tag in idx and returns a standardized error
17// if it could not be found.
18func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
19 if !tag.FixCase(form, key) {
20 return 0, errSyntax
21 }
22 i := idx.Index(key)
23 if i == -1 {
24 return 0, mkErrInvalid(key)
25 }
26 return i, nil
27}
28
// searchUint returns the smallest index i for which imap[i] >= key, or
// len(imap) if there is no such index. It uses sort.Search, so imap is
// expected to be sorted in ascending order.
func searchUint(imap []uint16, key uint16) int {
	atLeastKey := func(pos int) bool {
		return key <= imap[pos]
	}
	return sort.Search(len(imap), atLeastKey)
}
34
// langID is the numeric identifier of a language. The zero value stands for
// the undetermined language, which is rendered as "und".
type langID uint16
36
37// getLangID returns the langID of s if s is a canonical subtag
38// or langUnknown if s is not a canonical subtag.
39func getLangID(s []byte) (langID, error) {
40 if len(s) == 2 {
41 return getLangISO2(s)
42 }
43 return getLangISO3(s)
44}
45
46// mapLang returns the mapped langID of id according to mapping m.
47func normLang(id langID) (langID, langAliasType) {
48 k := sort.Search(len(langAliasMap), func(i int) bool {
49 return langAliasMap[i].from >= uint16(id)
50 })
51 if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
52 return langID(langAliasMap[k].to), langAliasTypes[k]
53 }
54 return id, langAliasTypeUnknown
55}
56
57// getLangISO2 returns the langID for the given 2-letter ISO language code
58// or unknownLang if this does not exist.
59func getLangISO2(s []byte) (langID, error) {
60 if !tag.FixCase("zz", s) {
61 return 0, errSyntax
62 }
63 if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
64 return langID(i), nil
65 }
66 return 0, mkErrInvalid(s)
67}
68
// base is the radix used by strToInt and intToStr: one digit for each
// letter from 'a' through 'z'.
const base = 'z' - 'a' + 1
70
71func strToInt(s []byte) uint {
72 v := uint(0)
73 for i := 0; i < len(s); i++ {
74 v *= base
75 v += uint(s[i] - 'a')
76 }
77 return v
78}
79
80// converts the given integer to the original ASCII string passed to strToInt.
81// len(s) must match the number of characters obtained.
82func intToStr(v uint, s []byte) {
83 for i := len(s) - 1; i >= 0; i-- {
84 s[i] = byte(v%base) + 'a'
85 v /= base
86 }
87}
88
89// getLangISO3 returns the langID for the given 3-letter ISO language code
90// or unknownLang if this does not exist.
91func getLangISO3(s []byte) (langID, error) {
92 if tag.FixCase("und", s) {
93 // first try to match canonical 3-letter entries
94 for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
95 if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
96 // We treat "und" as special and always translate it to "unspecified".
97 // Note that ZZ and Zzzz are private use and are not treated as
98 // unspecified by default.
99 id := langID(i)
100 if id == nonCanonicalUnd {
101 return 0, nil
102 }
103 return id, nil
104 }
105 }
106 if i := altLangISO3.Index(s); i != -1 {
107 return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
108 }
109 n := strToInt(s)
110 if langNoIndex[n/8]&(1<<(n%8)) != 0 {
111 return langID(n) + langNoIndexOffset, nil
112 }
113 // Check for non-canonical uses of ISO3.
114 for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
115 if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
116 return langID(i), nil
117 }
118 }
119 return 0, mkErrInvalid(s)
120 }
121 return 0, errSyntax
122}
123
124// stringToBuf writes the string to b and returns the number of bytes
125// written. cap(b) must be >= 3.
126func (id langID) stringToBuf(b []byte) int {
127 if id >= langNoIndexOffset {
128 intToStr(uint(id)-langNoIndexOffset, b[:3])
129 return 3
130 } else if id == 0 {
131 return copy(b, "und")
132 }
133 l := lang[id<<2:]
134 if l[3] == 0 {
135 return copy(b, l[:3])
136 }
137 return copy(b, l[:2])
138}
139
140// String returns the BCP 47 representation of the langID.
141// Use b as variable name, instead of id, to ensure the variable
142// used is consistent with that of Base in which this type is embedded.
143func (b langID) String() string {
144 if b == 0 {
145 return "und"
146 } else if b >= langNoIndexOffset {
147 b -= langNoIndexOffset
148 buf := [3]byte{}
149 intToStr(uint(b), buf[:])
150 return string(buf[:])
151 }
152 l := lang.Elem(int(b))
153 if l[3] == 0 {
154 return l[:3]
155 }
156 return l[:2]
157}
158
159// ISO3 returns the ISO 639-3 language code.
160func (b langID) ISO3() string {
161 if b == 0 || b >= langNoIndexOffset {
162 return b.String()
163 }
164 l := lang.Elem(int(b))
165 if l[3] == 0 {
166 return l[:3]
167 } else if l[2] == 0 {
168 return altLangISO3.Elem(int(l[3]))[:3]
169 }
170 // This allocation will only happen for 3-letter ISO codes
171 // that are non-canonical BCP 47 language identifiers.
172 return l[0:1] + l[2:4]
173}
174
175// IsPrivateUse reports whether this language code is reserved for private use.
176func (b langID) IsPrivateUse() bool {
177 return langPrivateStart <= b && b <= langPrivateEnd
178}
179
// regionID is the numeric identifier of a region. The zero value stands for
// an unspecified region, which is rendered as "ZZ".
type regionID uint16
181
182// getRegionID returns the region id for s if s is a valid 2-letter region code
183// or unknownRegion.
184func getRegionID(s []byte) (regionID, error) {
185 if len(s) == 3 {
186 if isAlpha(s[0]) {
187 return getRegionISO3(s)
188 }
189 if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
190 return getRegionM49(int(i))
191 }
192 }
193 return getRegionISO2(s)
194}
195
196// getRegionISO2 returns the regionID for the given 2-letter ISO country code
197// or unknownRegion if this does not exist.
198func getRegionISO2(s []byte) (regionID, error) {
199 i, err := findIndex(regionISO, s, "ZZ")
200 if err != nil {
201 return 0, err
202 }
203 return regionID(i) + isoRegionOffset, nil
204}
205
206// getRegionISO3 returns the regionID for the given 3-letter ISO country code
207// or unknownRegion if this does not exist.
208func getRegionISO3(s []byte) (regionID, error) {
209 if tag.FixCase("ZZZ", s) {
210 for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
211 if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
212 return regionID(i) + isoRegionOffset, nil
213 }
214 }
215 for i := 0; i < len(altRegionISO3); i += 3 {
216 if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
217 return regionID(altRegionIDs[i/3]), nil
218 }
219 }
220 return 0, mkErrInvalid(s)
221 }
222 return 0, errSyntax
223}
224
225func getRegionM49(n int) (regionID, error) {
226 if 0 < n && n <= 999 {
227 const (
228 searchBits = 7
229 regionBits = 9
230 regionMask = 1<<regionBits - 1
231 )
232 idx := n >> searchBits
233 buf := fromM49[m49Index[idx]:m49Index[idx+1]]
234 val := uint16(n) << regionBits // we rely on bits shifting out
235 i := sort.Search(len(buf), func(i int) bool {
236 return buf[i] >= val
237 })
238 if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
239 return regionID(r & regionMask), nil
240 }
241 }
242 var e ValueError
243 fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
244 return 0, e
245}
246
247// normRegion returns a region if r is deprecated or 0 otherwise.
248// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
249// TODO: consider mapping split up regions to new most populous one (like CLDR).
250func normRegion(r regionID) regionID {
251 m := regionOldMap
252 k := sort.Search(len(m), func(i int) bool {
253 return m[i].from >= uint16(r)
254 })
255 if k < len(m) && m[k].from == uint16(r) {
256 return regionID(m[k].to)
257 }
258 return 0
259}
260
// Bit flags combined in the value returned by regionID.typ.
const (
	// iso3166UserAssigned is the flag tested by regionID.IsPrivateUse.
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)
266
267func (r regionID) typ() byte {
268 return regionTypes[r]
269}
270
271// String returns the BCP 47 representation for the region.
272// It returns "ZZ" for an unspecified region.
273func (r regionID) String() string {
274 if r < isoRegionOffset {
275 if r == 0 {
276 return "ZZ"
277 }
278 return fmt.Sprintf("%03d", r.M49())
279 }
280 r -= isoRegionOffset
281 return regionISO.Elem(int(r))[:2]
282}
283
284// ISO3 returns the 3-letter ISO code of r.
285// Note that not all regions have a 3-letter ISO code.
286// In such cases this method returns "ZZZ".
287func (r regionID) ISO3() string {
288 if r < isoRegionOffset {
289 return "ZZZ"
290 }
291 r -= isoRegionOffset
292 reg := regionISO.Elem(int(r))
293 switch reg[2] {
294 case 0:
295 return altRegionISO3[reg[3]:][:3]
296 case ' ':
297 return "ZZZ"
298 }
299 return reg[0:1] + reg[2:4]
300}
301
302// M49 returns the UN M.49 encoding of r, or 0 if this encoding
303// is not defined for r.
304func (r regionID) M49() int {
305 return int(m49[r])
306}
307
308// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
309// may include private-use tags that are assigned by CLDR and used in this
310// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
311func (r regionID) IsPrivateUse() bool {
312 return r.typ()&iso3166UserAssigned != 0
313}
314
// scriptID is the numeric identifier of a script. The zero value stands for
// an unspecified script, which is rendered as "Zzzz".
type scriptID uint8
316
317// getScriptID returns the script id for string s. It assumes that s
318// is of the format [A-Z][a-z]{3}.
319func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
320 i, err := findIndex(idx, s, "Zzzz")
321 return scriptID(i), err
322}
323
324// String returns the script code in title case.
325// It returns "Zzzz" for an unspecified script.
326func (s scriptID) String() string {
327 if s == 0 {
328 return "Zzzz"
329 }
330 return script.Elem(int(s))
331}
332
333// IsPrivateUse reports whether this script code is reserved for private use.
334func (s scriptID) IsPrivateUse() bool {
335 return _Qaaa <= s && s <= _Qabx
336}
337
const (
	// maxAltTaglen is the length of the longest tag used as a key in
	// grandfatheredMap.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the size of the byte-array keys of grandfatheredMap.
	maxLen = maxAltTaglen
)
342
343var (
344 // grandfatheredMap holds a mapping from legacy and grandfathered tags to
345 // their base language or index to more elaborate tag.
346 grandfatheredMap = map[[maxLen]byte]int16{
347 [maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
348 [maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
349 [maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
350 [maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
351 [maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
352 [maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
353 [maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
354 [maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
355 [maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
356 [maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
357 [maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
358 [maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
359 [maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
360 [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
361 [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
362 [maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
363 [maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
364 [maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
365 [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
366 [maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
367
368 // Grandfathered tags with no modern replacement will be converted as
369 // follows:
370 [maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
371 [maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
372 [maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
373 [maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
374 [maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
375 [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
376
377 // CLDR-specific tag.
378 [maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
379 [maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
380 }
381
382 altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
383
384 altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
385)
386
387func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
388 if v, ok := grandfatheredMap[s]; ok {
389 if v < 0 {
390 return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
391 }
392 t.lang = langID(v)
393 return t, true
394 }
395 return t, false
396}