// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | package language |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "fmt" |
| 10 | "sort" |
| 11 | "strconv" |
| 12 | |
| 13 | "golang.org/x/text/internal/tag" |
| 14 | ) |
| 15 | |
| 16 | // findIndex tries to find the given tag in idx and returns a standardized error |
| 17 | // if it could not be found. |
| 18 | func findIndex(idx tag.Index, key []byte, form string) (index int, err error) { |
| 19 | if !tag.FixCase(form, key) { |
| 20 | return 0, errSyntax |
| 21 | } |
| 22 | i := idx.Index(key) |
| 23 | if i == -1 { |
| 24 | return 0, mkErrInvalid(key) |
| 25 | } |
| 26 | return i, nil |
| 27 | } |
| 28 | |
// searchUint returns the smallest index i for which imap[i] >= key, or
// len(imap) if there is no such element. The search is a binary search, so
// imap is expected to be sorted in ascending order.
func searchUint(imap []uint16, key uint16) int {
	atOrAbove := func(pos int) bool { return key <= imap[pos] }
	return sort.Search(len(imap), atOrAbove)
}
| 34 | |
// langID is the compact 16-bit identifier used in this file for a language
// subtag. The value 0 is rendered as "und" by its String method.
type langID uint16
| 36 | |
| 37 | // getLangID returns the langID of s if s is a canonical subtag |
| 38 | // or langUnknown if s is not a canonical subtag. |
| 39 | func getLangID(s []byte) (langID, error) { |
| 40 | if len(s) == 2 { |
| 41 | return getLangISO2(s) |
| 42 | } |
| 43 | return getLangISO3(s) |
| 44 | } |
| 45 | |
| 46 | // mapLang returns the mapped langID of id according to mapping m. |
| 47 | func normLang(id langID) (langID, langAliasType) { |
| 48 | k := sort.Search(len(langAliasMap), func(i int) bool { |
| 49 | return langAliasMap[i].from >= uint16(id) |
| 50 | }) |
| 51 | if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) { |
| 52 | return langID(langAliasMap[k].to), langAliasTypes[k] |
| 53 | } |
| 54 | return id, langAliasTypeUnknown |
| 55 | } |
| 56 | |
| 57 | // getLangISO2 returns the langID for the given 2-letter ISO language code |
| 58 | // or unknownLang if this does not exist. |
| 59 | func getLangISO2(s []byte) (langID, error) { |
| 60 | if !tag.FixCase("zz", s) { |
| 61 | return 0, errSyntax |
| 62 | } |
| 63 | if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 { |
| 64 | return langID(i), nil |
| 65 | } |
| 66 | return 0, mkErrInvalid(s) |
| 67 | } |
| 68 | |
// base is the radix shared by strToInt and intToStr: one digit value for each
// letter from 'a' through 'z'.
const base = 'z' - 'a' + 1

// strToInt reads s as a number in radix base, most significant digit first,
// where each byte c contributes the digit c-'a'. An empty s yields 0. The
// bytes are not validated; callers are expected to pass lower-case letters.
func strToInt(s []byte) uint {
	var v uint
	for _, c := range s {
		v = v*base + uint(c-'a')
	}
	return v
}

// intToStr is the inverse of strToInt: it fills s, from its last byte to its
// first, with the radix-base digits of v written as letters starting at 'a'.
// len(s) must equal the length of the string originally given to strToInt;
// leading positions are padded with 'a' and excess high digits are dropped.
func intToStr(v uint, s []byte) {
	for i := len(s); i > 0; i-- {
		s[i-1] = 'a' + byte(v%base)
		v /= base
	}
}
| 88 | |
| 89 | // getLangISO3 returns the langID for the given 3-letter ISO language code |
| 90 | // or unknownLang if this does not exist. |
| 91 | func getLangISO3(s []byte) (langID, error) { |
| 92 | if tag.FixCase("und", s) { |
| 93 | // first try to match canonical 3-letter entries |
| 94 | for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) { |
| 95 | if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] { |
| 96 | // We treat "und" as special and always translate it to "unspecified". |
| 97 | // Note that ZZ and Zzzz are private use and are not treated as |
| 98 | // unspecified by default. |
| 99 | id := langID(i) |
| 100 | if id == nonCanonicalUnd { |
| 101 | return 0, nil |
| 102 | } |
| 103 | return id, nil |
| 104 | } |
| 105 | } |
| 106 | if i := altLangISO3.Index(s); i != -1 { |
| 107 | return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil |
| 108 | } |
| 109 | n := strToInt(s) |
| 110 | if langNoIndex[n/8]&(1<<(n%8)) != 0 { |
| 111 | return langID(n) + langNoIndexOffset, nil |
| 112 | } |
| 113 | // Check for non-canonical uses of ISO3. |
| 114 | for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) { |
| 115 | if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] { |
| 116 | return langID(i), nil |
| 117 | } |
| 118 | } |
| 119 | return 0, mkErrInvalid(s) |
| 120 | } |
| 121 | return 0, errSyntax |
| 122 | } |
| 123 | |
| 124 | // stringToBuf writes the string to b and returns the number of bytes |
| 125 | // written. cap(b) must be >= 3. |
| 126 | func (id langID) stringToBuf(b []byte) int { |
| 127 | if id >= langNoIndexOffset { |
| 128 | intToStr(uint(id)-langNoIndexOffset, b[:3]) |
| 129 | return 3 |
| 130 | } else if id == 0 { |
| 131 | return copy(b, "und") |
| 132 | } |
| 133 | l := lang[id<<2:] |
| 134 | if l[3] == 0 { |
| 135 | return copy(b, l[:3]) |
| 136 | } |
| 137 | return copy(b, l[:2]) |
| 138 | } |
| 139 | |
| 140 | // String returns the BCP 47 representation of the langID. |
| 141 | // Use b as variable name, instead of id, to ensure the variable |
| 142 | // used is consistent with that of Base in which this type is embedded. |
| 143 | func (b langID) String() string { |
| 144 | if b == 0 { |
| 145 | return "und" |
| 146 | } else if b >= langNoIndexOffset { |
| 147 | b -= langNoIndexOffset |
| 148 | buf := [3]byte{} |
| 149 | intToStr(uint(b), buf[:]) |
| 150 | return string(buf[:]) |
| 151 | } |
| 152 | l := lang.Elem(int(b)) |
| 153 | if l[3] == 0 { |
| 154 | return l[:3] |
| 155 | } |
| 156 | return l[:2] |
| 157 | } |
| 158 | |
| 159 | // ISO3 returns the ISO 639-3 language code. |
| 160 | func (b langID) ISO3() string { |
| 161 | if b == 0 || b >= langNoIndexOffset { |
| 162 | return b.String() |
| 163 | } |
| 164 | l := lang.Elem(int(b)) |
| 165 | if l[3] == 0 { |
| 166 | return l[:3] |
| 167 | } else if l[2] == 0 { |
| 168 | return altLangISO3.Elem(int(l[3]))[:3] |
| 169 | } |
| 170 | // This allocation will only happen for 3-letter ISO codes |
| 171 | // that are non-canonical BCP 47 language identifiers. |
| 172 | return l[0:1] + l[2:4] |
| 173 | } |
| 174 | |
| 175 | // IsPrivateUse reports whether this language code is reserved for private use. |
| 176 | func (b langID) IsPrivateUse() bool { |
| 177 | return langPrivateStart <= b && b <= langPrivateEnd |
| 178 | } |
| 179 | |
// regionID is the compact 16-bit identifier used in this file for a region
// subtag. The value 0 is rendered as "ZZ" by its String method.
type regionID uint16
| 181 | |
| 182 | // getRegionID returns the region id for s if s is a valid 2-letter region code |
| 183 | // or unknownRegion. |
| 184 | func getRegionID(s []byte) (regionID, error) { |
| 185 | if len(s) == 3 { |
| 186 | if isAlpha(s[0]) { |
| 187 | return getRegionISO3(s) |
| 188 | } |
| 189 | if i, err := strconv.ParseUint(string(s), 10, 10); err == nil { |
| 190 | return getRegionM49(int(i)) |
| 191 | } |
| 192 | } |
| 193 | return getRegionISO2(s) |
| 194 | } |
| 195 | |
| 196 | // getRegionISO2 returns the regionID for the given 2-letter ISO country code |
| 197 | // or unknownRegion if this does not exist. |
| 198 | func getRegionISO2(s []byte) (regionID, error) { |
| 199 | i, err := findIndex(regionISO, s, "ZZ") |
| 200 | if err != nil { |
| 201 | return 0, err |
| 202 | } |
| 203 | return regionID(i) + isoRegionOffset, nil |
| 204 | } |
| 205 | |
| 206 | // getRegionISO3 returns the regionID for the given 3-letter ISO country code |
| 207 | // or unknownRegion if this does not exist. |
| 208 | func getRegionISO3(s []byte) (regionID, error) { |
| 209 | if tag.FixCase("ZZZ", s) { |
| 210 | for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) { |
| 211 | if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] { |
| 212 | return regionID(i) + isoRegionOffset, nil |
| 213 | } |
| 214 | } |
| 215 | for i := 0; i < len(altRegionISO3); i += 3 { |
| 216 | if tag.Compare(altRegionISO3[i:i+3], s) == 0 { |
| 217 | return regionID(altRegionIDs[i/3]), nil |
| 218 | } |
| 219 | } |
| 220 | return 0, mkErrInvalid(s) |
| 221 | } |
| 222 | return 0, errSyntax |
| 223 | } |
| 224 | |
| 225 | func getRegionM49(n int) (regionID, error) { |
| 226 | if 0 < n && n <= 999 { |
| 227 | const ( |
| 228 | searchBits = 7 |
| 229 | regionBits = 9 |
| 230 | regionMask = 1<<regionBits - 1 |
| 231 | ) |
| 232 | idx := n >> searchBits |
| 233 | buf := fromM49[m49Index[idx]:m49Index[idx+1]] |
| 234 | val := uint16(n) << regionBits // we rely on bits shifting out |
| 235 | i := sort.Search(len(buf), func(i int) bool { |
| 236 | return buf[i] >= val |
| 237 | }) |
| 238 | if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val { |
| 239 | return regionID(r & regionMask), nil |
| 240 | } |
| 241 | } |
| 242 | var e ValueError |
| 243 | fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n) |
| 244 | return 0, e |
| 245 | } |
| 246 | |
| 247 | // normRegion returns a region if r is deprecated or 0 otherwise. |
| 248 | // TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ). |
| 249 | // TODO: consider mapping split up regions to new most populous one (like CLDR). |
| 250 | func normRegion(r regionID) regionID { |
| 251 | m := regionOldMap |
| 252 | k := sort.Search(len(m), func(i int) bool { |
| 253 | return m[i].from >= uint16(r) |
| 254 | }) |
| 255 | if k < len(m) && m[k].from == uint16(r) { |
| 256 | return regionID(m[k].to) |
| 257 | } |
| 258 | return 0 |
| 259 | } |
| 260 | |
// Bit flags for the per-region type byte returned by regionID.typ.
// IsPrivateUse tests iso3166UserAssigned; the other two flags are not used in
// this part of the file.
const (
	iso3166UserAssigned = 1 << iota // 1
	ccTLD                           // 2
	bcp47Region                     // 4
)
| 266 | |
| 267 | func (r regionID) typ() byte { |
| 268 | return regionTypes[r] |
| 269 | } |
| 270 | |
| 271 | // String returns the BCP 47 representation for the region. |
| 272 | // It returns "ZZ" for an unspecified region. |
| 273 | func (r regionID) String() string { |
| 274 | if r < isoRegionOffset { |
| 275 | if r == 0 { |
| 276 | return "ZZ" |
| 277 | } |
| 278 | return fmt.Sprintf("%03d", r.M49()) |
| 279 | } |
| 280 | r -= isoRegionOffset |
| 281 | return regionISO.Elem(int(r))[:2] |
| 282 | } |
| 283 | |
| 284 | // ISO3 returns the 3-letter ISO code of r. |
| 285 | // Note that not all regions have a 3-letter ISO code. |
| 286 | // In such cases this method returns "ZZZ". |
| 287 | func (r regionID) ISO3() string { |
| 288 | if r < isoRegionOffset { |
| 289 | return "ZZZ" |
| 290 | } |
| 291 | r -= isoRegionOffset |
| 292 | reg := regionISO.Elem(int(r)) |
| 293 | switch reg[2] { |
| 294 | case 0: |
| 295 | return altRegionISO3[reg[3]:][:3] |
| 296 | case ' ': |
| 297 | return "ZZZ" |
| 298 | } |
| 299 | return reg[0:1] + reg[2:4] |
| 300 | } |
| 301 | |
| 302 | // M49 returns the UN M.49 encoding of r, or 0 if this encoding |
| 303 | // is not defined for r. |
| 304 | func (r regionID) M49() int { |
| 305 | return int(m49[r]) |
| 306 | } |
| 307 | |
| 308 | // IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This |
| 309 | // may include private-use tags that are assigned by CLDR and used in this |
| 310 | // implementation. So IsPrivateUse and IsCountry can be simultaneously true. |
| 311 | func (r regionID) IsPrivateUse() bool { |
| 312 | return r.typ()&iso3166UserAssigned != 0 |
| 313 | } |
| 314 | |
// scriptID is the compact 8-bit identifier used in this file for a script
// subtag. The value 0 is rendered as "Zzzz" by its String method.
type scriptID uint8
| 316 | |
| 317 | // getScriptID returns the script id for string s. It assumes that s |
| 318 | // is of the format [A-Z][a-z]{3}. |
| 319 | func getScriptID(idx tag.Index, s []byte) (scriptID, error) { |
| 320 | i, err := findIndex(idx, s, "Zzzz") |
| 321 | return scriptID(i), err |
| 322 | } |
| 323 | |
| 324 | // String returns the script code in title case. |
| 325 | // It returns "Zzzz" for an unspecified script. |
| 326 | func (s scriptID) String() string { |
| 327 | if s == 0 { |
| 328 | return "Zzzz" |
| 329 | } |
| 330 | return script.Elem(int(s)) |
| 331 | } |
| 332 | |
| 333 | // IsPrivateUse reports whether this script code is reserved for private use. |
| 334 | func (s scriptID) IsPrivateUse() bool { |
| 335 | return _Qaaa <= s && s <= _Qabx |
| 336 | } |
| 337 | |
// Sizes of the fixed-width keys used by grandfatheredMap and grandfathered.
const (
	// maxAltTaglen is the length of "en-US-POSIX", the longest key stored in
	// grandfatheredMap.
	maxAltTaglen = len("en-US-POSIX")
	// maxLen is the key width of grandfatheredMap.
	maxLen = maxAltTaglen
)
| 342 | |
| 343 | var ( |
| 344 | // grandfatheredMap holds a mapping from legacy and grandfathered tags to |
| 345 | // their base language or index to more elaborate tag. |
| 346 | grandfatheredMap = map[[maxLen]byte]int16{ |
| 347 | [maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban |
| 348 | [maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami |
| 349 | [maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn |
| 350 | [maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak |
| 351 | [maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon |
| 352 | [maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux |
| 353 | [maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo |
| 354 | [maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn |
| 355 | [maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao |
| 356 | [maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay |
| 357 | [maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu |
| 358 | [maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok |
| 359 | [maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn |
| 360 | [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR |
| 361 | [maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL |
| 362 | [maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE |
| 363 | [maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu |
| 364 | [maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka |
| 365 | [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan |
| 366 | [maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang |
| 367 | |
| 368 | // Grandfathered tags with no modern replacement will be converted as |
| 369 | // follows: |
| 370 | [maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish |
| 371 | [maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed |
| 372 | [maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default |
| 373 | [maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian |
| 374 | [maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo |
| 375 | [maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min |
| 376 | |
| 377 | // CLDR-specific tag. |
| 378 | [maxLen]byte{'r', 'o', 'o', 't'}: 0, // root |
| 379 | [maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX" |
| 380 | } |
| 381 | |
| 382 | altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102} |
| 383 | |
| 384 | altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix" |
| 385 | ) |
| 386 | |
| 387 | func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) { |
| 388 | if v, ok := grandfatheredMap[s]; ok { |
| 389 | if v < 0 { |
| 390 | return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true |
| 391 | } |
| 392 | t.lang = langID(v) |
| 393 | return t, true |
| 394 | } |
| 395 | return t, false |
| 396 | } |