Scott Baker | eee8dd8 | 2019-09-24 12:52:34 -0700 | [diff] [blame^] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | //go:generate go run gen.go gen_common.go -output tables.go |
| 6 | //go:generate go run gen_index.go |
| 7 | |
| 8 | package language |
| 9 | |
| 10 | // TODO: Remove above NOTE after: |
| 11 | // - verifying that tables are dropped correctly (most notably matcher tables). |
| 12 | |
| 13 | import ( |
| 14 | "errors" |
| 15 | "fmt" |
| 16 | "strings" |
| 17 | ) |
| 18 | |
// Buffer sizes used when rendering tags into fixed-size scratch arrays.
const (
	// maxCoreSize bounds the length of a BCP 47 tag that carries neither
	// variants nor extensions: language (at most 3) + '-' + script (4) + '-'
	// + region (at most 3).
	maxCoreSize = 3 + 1 + 4 + 1 + 3

	// max99thPercentileSize is a somewhat arbitrary buffer size, chosen on the
	// presumption that it can hold at least 99% of all BCP 47 tags.
	max99thPercentileSize = 32

	// maxSimpleUExtensionSize bounds the length of a -u extension that holds
	// exactly one key-type pair: "-u-" (3) + key (2) + '-' + value (at most 8).
	maxSimpleUExtensionSize = 3 + 2 + 1 + 8
)
| 32 | |
| 33 | // Tag represents a BCP 47 language tag. It is used to specify an instance of a |
| 34 | // specific language or locale. All language tag values are guaranteed to be |
| 35 | // well-formed. |
| 36 | type Tag struct { |
| 37 | lang langID |
| 38 | region regionID |
| 39 | // TODO: we will soon run out of positions for script. Idea: instead of |
| 40 | // storing lang, region, and script codes, store only the compact index and |
| 41 | // have a lookup table from this code to its expansion. This greatly speeds |
| 42 | // up table lookup, speed up common variant cases. |
| 43 | // This will also immediately free up 3 extra bytes. Also, the pVariant |
| 44 | // field can now be moved to the lookup table, as the compact index uniquely |
| 45 | // determines the offset of a possible variant. |
| 46 | script scriptID |
| 47 | pVariant byte // offset in str, includes preceding '-' |
| 48 | pExt uint16 // offset of first extension, includes preceding '-' |
| 49 | |
| 50 | // str is the string representation of the Tag. It will only be used if the |
| 51 | // tag has variants or extensions. |
| 52 | str string |
| 53 | } |
| 54 | |
| 55 | // Make is a convenience wrapper for Parse that omits the error. |
| 56 | // In case of an error, a sensible default is returned. |
| 57 | func Make(s string) Tag { |
| 58 | return Default.Make(s) |
| 59 | } |
| 60 | |
| 61 | // Make is a convenience wrapper for c.Parse that omits the error. |
| 62 | // In case of an error, a sensible default is returned. |
| 63 | func (c CanonType) Make(s string) Tag { |
| 64 | t, _ := c.Parse(s) |
| 65 | return t |
| 66 | } |
| 67 | |
| 68 | // Raw returns the raw base language, script and region, without making an |
| 69 | // attempt to infer their values. |
| 70 | func (t Tag) Raw() (b Base, s Script, r Region) { |
| 71 | return Base{t.lang}, Script{t.script}, Region{t.region} |
| 72 | } |
| 73 | |
| 74 | // equalTags compares language, script and region subtags only. |
| 75 | func (t Tag) equalTags(a Tag) bool { |
| 76 | return t.lang == a.lang && t.script == a.script && t.region == a.region |
| 77 | } |
| 78 | |
| 79 | // IsRoot returns true if t is equal to language "und". |
| 80 | func (t Tag) IsRoot() bool { |
| 81 | if int(t.pVariant) < len(t.str) { |
| 82 | return false |
| 83 | } |
| 84 | return t.equalTags(und) |
| 85 | } |
| 86 | |
| 87 | // private reports whether the Tag consists solely of a private use tag. |
| 88 | func (t Tag) private() bool { |
| 89 | return t.str != "" && t.pVariant == 0 |
| 90 | } |
| 91 | |
// CanonType is a set of flags that enables or disables the various types of
// canonicalization.
type CanonType int

// Individual canonicalization flags. Each occupies its own bit, so they can
// be combined with |.
const (
	// DeprecatedBase replaces deprecated base languages with their preferred
	// replacements.
	DeprecatedBase CanonType = 1 << iota
	// DeprecatedScript replaces deprecated scripts with their preferred
	// replacements.
	DeprecatedScript
	// DeprecatedRegion replaces deprecated regions with their preferred
	// replacements.
	DeprecatedRegion
	// SuppressScript removes redundant scripts.
	SuppressScript
	// Legacy normalizes legacy encodings. This covers legacy languages
	// defined in CLDR as well as bibliographic codes defined in ISO-639.
	Legacy
	// Macro maps the dominant language of a macro language group to the
	// macro language subtag. For example cmn -> zh.
	Macro
	// CLDR should be used if full compatibility with CLDR is required.
	// language.Tag may differ from CLDR in a few cases; to follow all of
	// CLDR's suggestions, use All|CLDR.
	CLDR
)

// Predefined combinations of the flags above.
const (
	// Raw can be used to Compose or Parse without canonicalization.
	Raw CanonType = 0

	// Deprecated replaces all deprecated tags with their preferred
	// replacements.
	Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion

	// BCP47 selects all canonicalizations recommended by BCP 47.
	BCP47 = Deprecated | SuppressScript

	// All selects all canonicalizations.
	All = BCP47 | Legacy | Macro

	// Default is the canonicalization used by Parse, Make and Compose. To
	// preserve as much information as possible, canonicalizations that remove
	// potentially valuable information are left out. The Matcher is designed
	// to recognize similar tags that would be the same if they were
	// canonicalized using All.
	Default = Deprecated | Legacy

	// canonLang is the set of flags that can affect the base language.
	canonLang = DeprecatedBase | Legacy | Macro

	// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
| 138 | |
// canonicalize applies the canonicalizations selected by c to a copy of t and
// returns that copy together with a flag reporting whether a change was
// recorded. Only the lang, script and region fields are touched here; t.str is
// not regenerated, so callers that need an up-to-date string must call
// remakeString themselves (as CanonType.Canonicalize does).
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
	// Raw means "no canonicalization at all".
	if c == Raw {
		return t, false
	}
	changed := false
	if c&SuppressScript != 0 {
		// Drop the script if it equals the suppress-script entry of the
		// language. Languages at or above langNoIndexOffset have no entry.
		// NOTE(review): the comparison also holds when both the script and the
		// table entry are 0, in which case changed is set although nothing
		// changes — confirm whether that is intended.
		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
			t.script = 0
			changed = true
		}
	}
	if c&canonLang != 0 {
		// The loop only repeats after a deprecated-language replacement (see
		// the continue below); every other path falls through to the break.
		for {
			if l, aliasType := normLang(t.lang); l != t.lang {
				switch aliasType {
				case langLegacy:
					if c&Legacy != 0 {
						// "sh" without an explicit script gets Latn before the
						// language is replaced.
						if t.lang == _sh && t.script == 0 {
							t.script = _Latn
						}
						t.lang = l
						changed = true
					}
				case langMacro:
					if c&Macro != 0 {
						// We deviate here from CLDR. The mapping "nb" -> "no"
						// qualifies as a typical Macro language mapping. However,
						// for legacy reasons, CLDR maps "no", the macro language
						// code for Norwegian, to the dominant variant "nb". This
						// change is currently under consideration for CLDR as well.
						// See http://unicode.org/cldr/trac/ticket/2698 and also
						// http://unicode.org/cldr/trac/ticket/1790 for some of the
						// practical implications. TODO: this check could be removed
						// if CLDR adopts this change.
						if c&CLDR == 0 || t.lang != _nb {
							changed = true
							t.lang = l
						}
					}
				case langDeprecated:
					if c&DeprecatedBase != 0 {
						// "mo" without an explicit region gets region MD before
						// the language is replaced.
						if t.lang == _mo && t.region == 0 {
							t.region = _MD
						}
						t.lang = l
						changed = true
						// Other canonicalization types may still apply.
						continue
					}
				}
			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
				// With both Legacy and CLDR set, "no" is rewritten to "nb".
				t.lang = _nb
				changed = true
			}
			break
		}
	}
	if c&DeprecatedScript != 0 {
		// Qaai is the only script rewritten here; it becomes Zinh.
		if t.script == _Qaai {
			changed = true
			t.script = _Zinh
		}
	}
	if c&DeprecatedRegion != 0 {
		// normRegion returning 0 is treated as "no replacement".
		if r := normRegion(t.region); r != 0 {
			changed = true
			t.region = r
		}
	}
	return t, changed
}
| 212 | |
| 213 | // Canonicalize returns the canonicalized equivalent of the tag. |
| 214 | func (c CanonType) Canonicalize(t Tag) (Tag, error) { |
| 215 | t, changed := t.canonicalize(c) |
| 216 | if changed { |
| 217 | t.remakeString() |
| 218 | } |
| 219 | return t, nil |
| 220 | } |
| 221 | |
// Confidence indicates the level of certainty for a given return value.
// For example, Serbian may be written in Cyrillic or Latin script.
// The confidence level indicates whether a value was explicitly specified,
// whether it is typically the only possible value, or whether there is
// an ambiguity.
type Confidence int

// The defined confidence levels, ordered from least to most certain.
const (
	No    Confidence = iota // full confidence that there was no match
	Low                     // most likely value picked out of a set of alternatives
	High                    // value is generally assumed to be the correct match
	Exact                   // exact match or explicitly specified value
)

// confName holds the names of the defined confidence levels, indexed by value.
var confName = []string{"No", "Low", "High", "Exact"}

// String returns the name of the confidence level ("No", "Low", "High" or
// "Exact"). A value outside the defined range is rendered as "Confidence(N)"
// instead of panicking on an out-of-range index.
func (c Confidence) String() string {
	if c < 0 || int(c) >= len(confName) {
		return fmt.Sprintf("Confidence(%d)", int(c))
	}
	return confName[c]
}
| 241 | |
| 242 | // remakeString is used to update t.str in case lang, script or region changed. |
| 243 | // It is assumed that pExt and pVariant still point to the start of the |
| 244 | // respective parts. |
| 245 | func (t *Tag) remakeString() { |
| 246 | if t.str == "" { |
| 247 | return |
| 248 | } |
| 249 | extra := t.str[t.pVariant:] |
| 250 | if t.pVariant > 0 { |
| 251 | extra = extra[1:] |
| 252 | } |
| 253 | if t.equalTags(und) && strings.HasPrefix(extra, "x-") { |
| 254 | t.str = extra |
| 255 | t.pVariant = 0 |
| 256 | t.pExt = 0 |
| 257 | return |
| 258 | } |
| 259 | var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases. |
| 260 | b := buf[:t.genCoreBytes(buf[:])] |
| 261 | if extra != "" { |
| 262 | diff := len(b) - int(t.pVariant) |
| 263 | b = append(b, '-') |
| 264 | b = append(b, extra...) |
| 265 | t.pVariant = uint8(int(t.pVariant) + diff) |
| 266 | t.pExt = uint16(int(t.pExt) + diff) |
| 267 | } else { |
| 268 | t.pVariant = uint8(len(b)) |
| 269 | t.pExt = uint16(len(b)) |
| 270 | } |
| 271 | t.str = string(b) |
| 272 | } |
| 273 | |
| 274 | // genCoreBytes writes a string for the base languages, script and region tags |
| 275 | // to the given buffer and returns the number of bytes written. It will never |
| 276 | // write more than maxCoreSize bytes. |
| 277 | func (t *Tag) genCoreBytes(buf []byte) int { |
| 278 | n := t.lang.stringToBuf(buf[:]) |
| 279 | if t.script != 0 { |
| 280 | n += copy(buf[n:], "-") |
| 281 | n += copy(buf[n:], t.script.String()) |
| 282 | } |
| 283 | if t.region != 0 { |
| 284 | n += copy(buf[n:], "-") |
| 285 | n += copy(buf[n:], t.region.String()) |
| 286 | } |
| 287 | return n |
| 288 | } |
| 289 | |
| 290 | // String returns the canonical string representation of the language tag. |
| 291 | func (t Tag) String() string { |
| 292 | if t.str != "" { |
| 293 | return t.str |
| 294 | } |
| 295 | if t.script == 0 && t.region == 0 { |
| 296 | return t.lang.String() |
| 297 | } |
| 298 | buf := [maxCoreSize]byte{} |
| 299 | return string(buf[:t.genCoreBytes(buf[:])]) |
| 300 | } |
| 301 | |
| 302 | // MarshalText implements encoding.TextMarshaler. |
| 303 | func (t Tag) MarshalText() (text []byte, err error) { |
| 304 | if t.str != "" { |
| 305 | text = append(text, t.str...) |
| 306 | } else if t.script == 0 && t.region == 0 { |
| 307 | text = append(text, t.lang.String()...) |
| 308 | } else { |
| 309 | buf := [maxCoreSize]byte{} |
| 310 | text = buf[:t.genCoreBytes(buf[:])] |
| 311 | } |
| 312 | return text, nil |
| 313 | } |
| 314 | |
| 315 | // UnmarshalText implements encoding.TextUnmarshaler. |
| 316 | func (t *Tag) UnmarshalText(text []byte) error { |
| 317 | tag, err := Raw.Parse(string(text)) |
| 318 | *t = tag |
| 319 | return err |
| 320 | } |
| 321 | |
| 322 | // Base returns the base language of the language tag. If the base language is |
| 323 | // unspecified, an attempt will be made to infer it from the context. |
| 324 | // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. |
| 325 | func (t Tag) Base() (Base, Confidence) { |
| 326 | if t.lang != 0 { |
| 327 | return Base{t.lang}, Exact |
| 328 | } |
| 329 | c := High |
| 330 | if t.script == 0 && !(Region{t.region}).IsCountry() { |
| 331 | c = Low |
| 332 | } |
| 333 | if tag, err := addTags(t); err == nil && tag.lang != 0 { |
| 334 | return Base{tag.lang}, c |
| 335 | } |
| 336 | return Base{0}, No |
| 337 | } |
| 338 | |
| 339 | // Script infers the script for the language tag. If it was not explicitly given, it will infer |
| 340 | // a most likely candidate. |
| 341 | // If more than one script is commonly used for a language, the most likely one |
| 342 | // is returned with a low confidence indication. For example, it returns (Cyrl, Low) |
| 343 | // for Serbian. |
| 344 | // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined) |
| 345 | // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks |
| 346 | // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts. |
| 347 | // See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for |
| 348 | // unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified. |
| 349 | // Note that an inferred script is never guaranteed to be the correct one. Latin is |
| 350 | // almost exclusively used for Afrikaans, but Arabic has been used for some texts |
| 351 | // in the past. Also, the script that is commonly used may change over time. |
| 352 | // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. |
| 353 | func (t Tag) Script() (Script, Confidence) { |
| 354 | if t.script != 0 { |
| 355 | return Script{t.script}, Exact |
| 356 | } |
| 357 | sc, c := scriptID(_Zzzz), No |
| 358 | if t.lang < langNoIndexOffset { |
| 359 | if scr := scriptID(suppressScript[t.lang]); scr != 0 { |
| 360 | // Note: it is not always the case that a language with a suppress |
| 361 | // script value is only written in one script (e.g. kk, ms, pa). |
| 362 | if t.region == 0 { |
| 363 | return Script{scriptID(scr)}, High |
| 364 | } |
| 365 | sc, c = scr, High |
| 366 | } |
| 367 | } |
| 368 | if tag, err := addTags(t); err == nil { |
| 369 | if tag.script != sc { |
| 370 | sc, c = tag.script, Low |
| 371 | } |
| 372 | } else { |
| 373 | t, _ = (Deprecated | Macro).Canonicalize(t) |
| 374 | if tag, err := addTags(t); err == nil && tag.script != sc { |
| 375 | sc, c = tag.script, Low |
| 376 | } |
| 377 | } |
| 378 | return Script{sc}, c |
| 379 | } |
| 380 | |
| 381 | // Region returns the region for the language tag. If it was not explicitly given, it will |
| 382 | // infer a most likely candidate from the context. |
| 383 | // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change. |
| 384 | func (t Tag) Region() (Region, Confidence) { |
| 385 | if t.region != 0 { |
| 386 | return Region{t.region}, Exact |
| 387 | } |
| 388 | if t, err := addTags(t); err == nil { |
| 389 | return Region{t.region}, Low // TODO: differentiate between high and low. |
| 390 | } |
| 391 | t, _ = (Deprecated | Macro).Canonicalize(t) |
| 392 | if tag, err := addTags(t); err == nil { |
| 393 | return Region{tag.region}, Low |
| 394 | } |
| 395 | return Region{_ZZ}, No // TODO: return world instead of undetermined? |
| 396 | } |
| 397 | |
| 398 | // Variant returns the variants specified explicitly for this language tag. |
| 399 | // or nil if no variant was specified. |
| 400 | func (t Tag) Variants() []Variant { |
| 401 | v := []Variant{} |
| 402 | if int(t.pVariant) < int(t.pExt) { |
| 403 | for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; { |
| 404 | x, str = nextToken(str) |
| 405 | v = append(v, Variant{x}) |
| 406 | } |
| 407 | } |
| 408 | return v |
| 409 | } |
| 410 | |
| 411 | // Parent returns the CLDR parent of t. In CLDR, missing fields in data for a |
| 412 | // specific language are substituted with fields from the parent language. |
| 413 | // The parent for a language may change for newer versions of CLDR. |
| 414 | func (t Tag) Parent() Tag { |
| 415 | if t.str != "" { |
| 416 | // Strip the variants and extensions. |
| 417 | t, _ = Raw.Compose(t.Raw()) |
| 418 | if t.region == 0 && t.script != 0 && t.lang != 0 { |
| 419 | base, _ := addTags(Tag{lang: t.lang}) |
| 420 | if base.script == t.script { |
| 421 | return Tag{lang: t.lang} |
| 422 | } |
| 423 | } |
| 424 | return t |
| 425 | } |
| 426 | if t.lang != 0 { |
| 427 | if t.region != 0 { |
| 428 | maxScript := t.script |
| 429 | if maxScript == 0 { |
| 430 | max, _ := addTags(t) |
| 431 | maxScript = max.script |
| 432 | } |
| 433 | |
| 434 | for i := range parents { |
| 435 | if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript { |
| 436 | for _, r := range parents[i].fromRegion { |
| 437 | if regionID(r) == t.region { |
| 438 | return Tag{ |
| 439 | lang: t.lang, |
| 440 | script: scriptID(parents[i].script), |
| 441 | region: regionID(parents[i].toRegion), |
| 442 | } |
| 443 | } |
| 444 | } |
| 445 | } |
| 446 | } |
| 447 | |
| 448 | // Strip the script if it is the default one. |
| 449 | base, _ := addTags(Tag{lang: t.lang}) |
| 450 | if base.script != maxScript { |
| 451 | return Tag{lang: t.lang, script: maxScript} |
| 452 | } |
| 453 | return Tag{lang: t.lang} |
| 454 | } else if t.script != 0 { |
| 455 | // The parent for an base-script pair with a non-default script is |
| 456 | // "und" instead of the base language. |
| 457 | base, _ := addTags(Tag{lang: t.lang}) |
| 458 | if base.script != t.script { |
| 459 | return und |
| 460 | } |
| 461 | return Tag{lang: t.lang} |
| 462 | } |
| 463 | } |
| 464 | return und |
| 465 | } |
| 466 | |
// nextToken splits the first token off s. The first byte of s (the separator
// preceding the token) is skipped; t is everything up to the next '-' and
// tail is the remainder, starting at that '-'. tail is empty when t was the
// last token. s must not be empty.
func nextToken(s string) (t, tail string) {
	rest := s[1:]
	i := strings.IndexByte(rest, '-')
	if i < 0 {
		return rest, ""
	}
	return rest[:i], rest[i:]
}
| 476 | |
// Extension is a single BCP 47 extension.
type Extension struct {
	// s is the text of the extension; its first byte is the type tag.
	s string
}

// String returns the string representation of the extension, including the
// type tag.
func (e Extension) String() string {
	text := e.s
	return text
}
| 487 | |
| 488 | // ParseExtension parses s as an extension and returns it on success. |
| 489 | func ParseExtension(s string) (e Extension, err error) { |
| 490 | scan := makeScannerString(s) |
| 491 | var end int |
| 492 | if n := len(scan.token); n != 1 { |
| 493 | return Extension{}, errSyntax |
| 494 | } |
| 495 | scan.toLower(0, len(scan.b)) |
| 496 | end = parseExtension(&scan) |
| 497 | if end != len(s) { |
| 498 | return Extension{}, errSyntax |
| 499 | } |
| 500 | return Extension{string(scan.b)}, nil |
| 501 | } |
| 502 | |
| 503 | // Type returns the one-byte extension type of e. It returns 0 for the zero |
| 504 | // exception. |
| 505 | func (e Extension) Type() byte { |
| 506 | if e.s == "" { |
| 507 | return 0 |
| 508 | } |
| 509 | return e.s[0] |
| 510 | } |
| 511 | |
| 512 | // Tokens returns the list of tokens of e. |
| 513 | func (e Extension) Tokens() []string { |
| 514 | return strings.Split(e.s, "-") |
| 515 | } |
| 516 | |
| 517 | // Extension returns the extension of type x for tag t. It will return |
| 518 | // false for ok if t does not have the requested extension. The returned |
| 519 | // extension will be invalid in this case. |
| 520 | func (t Tag) Extension(x byte) (ext Extension, ok bool) { |
| 521 | for i := int(t.pExt); i < len(t.str)-1; { |
| 522 | var ext string |
| 523 | i, ext = getExtension(t.str, i) |
| 524 | if ext[0] == x { |
| 525 | return Extension{ext}, true |
| 526 | } |
| 527 | } |
| 528 | return Extension{}, false |
| 529 | } |
| 530 | |
| 531 | // Extensions returns all extensions of t. |
| 532 | func (t Tag) Extensions() []Extension { |
| 533 | e := []Extension{} |
| 534 | for i := int(t.pExt); i < len(t.str)-1; { |
| 535 | var ext string |
| 536 | i, ext = getExtension(t.str, i) |
| 537 | e = append(e, Extension{ext}) |
| 538 | } |
| 539 | return e |
| 540 | } |
| 541 | |
| 542 | // TypeForKey returns the type associated with the given key, where key and type |
| 543 | // are of the allowed values defined for the Unicode locale extension ('u') in |
| 544 | // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. |
| 545 | // TypeForKey will traverse the inheritance chain to get the correct value. |
| 546 | func (t Tag) TypeForKey(key string) string { |
| 547 | if start, end, _ := t.findTypeForKey(key); end != start { |
| 548 | return t.str[start:end] |
| 549 | } |
| 550 | return "" |
| 551 | } |
| 552 | |
// Errors returned by SetTypeForKey.
var (
	// errPrivateUse is returned when the tag is a pure private use tag.
	errPrivateUse = errors.New("cannot set a key on a private use tag")
	// errInvalidArguments is returned when the key or the value does not have
	// an acceptable length.
	errInvalidArguments = errors.New("invalid key or type")
)
| 557 | |
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse if t consists solely of a private use tag and
// errInvalidArguments if key is not 2 bytes long or a non-empty value is not
// between 3 and 8 bytes long. If the generated key-type pair does not pass
// extension parsing, the scanner's error is returned. On error the tag is
// returned unmodified.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.private() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}

	// Remove the setting if value is "".
	if value == "" {
		start, end, _ := t.findTypeForKey(key)
		// start == end means the key is not present: nothing to remove.
		if start != end {
			// Remove key tag and leading '-'.
			start -= 4

			// Remove a possible empty extension.
			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing is left after the core subtags, so the string form
				// is dropped altogether.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}

	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}

	var (
		// buf has room for the core subtags plus one "-u-kk-vvvvvvvv" pair.
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)

	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}

	// Create new key-type pair and parse it to verify.
	// b is laid out as "u-" + key + "-" + value.
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}

	// Assemble the replacement string.
	if t.str == "" {
		// The tag had no string form: buf already holds core + "-" + b, and
		// both offsets point at the '-' in front of the new extension.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, end, hasExt := t.findTypeForKey(key)
		if start == end {
			// Key not present yet: insert the pair at the returned position.
			// If a -u extension already exists, the "u-" prefix is dropped.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
		} else {
			// Key present: replace only its type.
			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
		}
	}
	return t, nil
}
| 636 | |
// findTypeForKey returns the start and end position in t.str for the type
// corresponding to key or the point at which to insert the key-value pair if
// the type wasn't found (start == end in that case). The hasExt return value
// reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
	p := int(t.pExt)
	// Bail out for an invalid key or when pExt is at the end of the string
	// or 0.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, false
	}
	s := t.str

	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		// An extension type greater than 'u' means there is no -u extension;
		// the insertion point is the hyphen in front of this extension.
		if s[p] > 'u' {
			p--
			return p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++

	// curKey is the key currently being processed.
	curKey := ""

	// Iterate over keys until we get the end of a section.
	for {
		// p points to the hyphen preceding the current token.
		if p3 := p + 3; s[p3] == '-' {
			// Found a key.
			// Check whether we just processed the key that was requested.
			if curKey == key {
				return start, p, true
			}
			// Set to the next key and continue scanning type tokens.
			curKey = s[p+1 : p3]
			// The comparison treats keys as sorted: once past the requested
			// key, p is the insertion point.
			if curKey > key {
				return p, p, true
			}
			// Start of the type token sequence.
			start = p + 4
			// A type is at least 3 characters long.
			p += 7 // 4 + 3
		} else {
			// Attribute or type, which is at least 3 characters long.
			p += 4
		}
		// p points past the third character of a type or attribute.
		max := p + 5 // maximum length of token plus hyphen.
		if len(s) < max {
			max = len(s)
		}
		// Advance p to the next hyphen or to the end of the scan window.
		for ; p < max && s[p] != '-'; p++ {
		}
		// Bail if we have exhausted all tokens or if the next token starts
		// a new extension.
		if p == len(s) || s[p+2] == '-' {
			if curKey == key {
				return start, p, true
			}
			return p, p, true
		}
	}
}
| 704 | |
| 705 | // CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags |
| 706 | // for which data exists in the text repository. The index will change over time |
| 707 | // and should not be stored in persistent storage. Extensions, except for the |
| 708 | // 'va' type of the 'u' extension, are ignored. It will return 0, false if no |
| 709 | // compact tag exists, where 0 is the index for the root language (Und). |
| 710 | func CompactIndex(t Tag) (index int, ok bool) { |
| 711 | // TODO: perhaps give more frequent tags a lower index. |
| 712 | // TODO: we could make the indexes stable. This will excluded some |
| 713 | // possibilities for optimization, so don't do this quite yet. |
| 714 | b, s, r := t.Raw() |
| 715 | if len(t.str) > 0 { |
| 716 | if strings.HasPrefix(t.str, "x-") { |
| 717 | // We have no entries for user-defined tags. |
| 718 | return 0, false |
| 719 | } |
| 720 | if uint16(t.pVariant) != t.pExt { |
| 721 | // There are no tags with variants and an u-va type. |
| 722 | if t.TypeForKey("va") != "" { |
| 723 | return 0, false |
| 724 | } |
| 725 | t, _ = Raw.Compose(b, s, r, t.Variants()) |
| 726 | } else if _, ok := t.Extension('u'); ok { |
| 727 | // Strip all but the 'va' entry. |
| 728 | variant := t.TypeForKey("va") |
| 729 | t, _ = Raw.Compose(b, s, r) |
| 730 | t, _ = t.SetTypeForKey("va", variant) |
| 731 | } |
| 732 | if len(t.str) > 0 { |
| 733 | // We have some variants. |
| 734 | for i, s := range specialTags { |
| 735 | if s == t { |
| 736 | return i + 1, true |
| 737 | } |
| 738 | } |
| 739 | return 0, false |
| 740 | } |
| 741 | } |
| 742 | // No variants specified: just compare core components. |
| 743 | // The key has the form lllssrrr, where l, s, and r are nibbles for |
| 744 | // respectively the langID, scriptID, and regionID. |
| 745 | key := uint32(b.langID) << (8 + 12) |
| 746 | key |= uint32(s.scriptID) << 12 |
| 747 | key |= uint32(r.regionID) |
| 748 | x, ok := coreTags[key] |
| 749 | return int(x), ok |
| 750 | } |
| 751 | |
| 752 | // Base is an ISO 639 language code, used for encoding the base language |
| 753 | // of a language tag. |
| 754 | type Base struct { |
| 755 | langID |
| 756 | } |
| 757 | |
| 758 | // ParseBase parses a 2- or 3-letter ISO 639 code. |
| 759 | // It returns a ValueError if s is a well-formed but unknown language identifier |
| 760 | // or another error if another error occurred. |
| 761 | func ParseBase(s string) (Base, error) { |
| 762 | if n := len(s); n < 2 || 3 < n { |
| 763 | return Base{}, errSyntax |
| 764 | } |
| 765 | var buf [3]byte |
| 766 | l, err := getLangID(buf[:copy(buf[:], s)]) |
| 767 | return Base{l}, err |
| 768 | } |
| 769 | |
| 770 | // Script is a 4-letter ISO 15924 code for representing scripts. |
| 771 | // It is idiomatically represented in title case. |
| 772 | type Script struct { |
| 773 | scriptID |
| 774 | } |
| 775 | |
| 776 | // ParseScript parses a 4-letter ISO 15924 code. |
| 777 | // It returns a ValueError if s is a well-formed but unknown script identifier |
| 778 | // or another error if another error occurred. |
| 779 | func ParseScript(s string) (Script, error) { |
| 780 | if len(s) != 4 { |
| 781 | return Script{}, errSyntax |
| 782 | } |
| 783 | var buf [4]byte |
| 784 | sc, err := getScriptID(script, buf[:copy(buf[:], s)]) |
| 785 | return Script{sc}, err |
| 786 | } |
| 787 | |
| 788 | // Region is an ISO 3166-1 or UN M.49 code for representing countries and regions. |
| 789 | type Region struct { |
| 790 | regionID |
| 791 | } |
| 792 | |
| 793 | // EncodeM49 returns the Region for the given UN M.49 code. |
| 794 | // It returns an error if r is not a valid code. |
| 795 | func EncodeM49(r int) (Region, error) { |
| 796 | rid, err := getRegionM49(r) |
| 797 | return Region{rid}, err |
| 798 | } |
| 799 | |
| 800 | // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code. |
| 801 | // It returns a ValueError if s is a well-formed but unknown region identifier |
| 802 | // or another error if another error occurred. |
| 803 | func ParseRegion(s string) (Region, error) { |
| 804 | if n := len(s); n < 2 || 3 < n { |
| 805 | return Region{}, errSyntax |
| 806 | } |
| 807 | var buf [3]byte |
| 808 | r, err := getRegionID(buf[:copy(buf[:], s)]) |
| 809 | return Region{r}, err |
| 810 | } |
| 811 | |
| 812 | // IsCountry returns whether this region is a country or autonomous area. This |
| 813 | // includes non-standard definitions from CLDR. |
| 814 | func (r Region) IsCountry() bool { |
| 815 | if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK { |
| 816 | return false |
| 817 | } |
| 818 | return true |
| 819 | } |
| 820 | |
| 821 | // IsGroup returns whether this region defines a collection of regions. This |
| 822 | // includes non-standard definitions from CLDR. |
| 823 | func (r Region) IsGroup() bool { |
| 824 | if r.regionID == 0 { |
| 825 | return false |
| 826 | } |
| 827 | return int(regionInclusion[r.regionID]) < len(regionContainment) |
| 828 | } |
| 829 | |
| 830 | // Contains returns whether Region c is contained by Region r. It returns true |
| 831 | // if c == r. |
| 832 | func (r Region) Contains(c Region) bool { |
| 833 | return r.regionID.contains(c.regionID) |
| 834 | } |
| 835 | |
| 836 | func (r regionID) contains(c regionID) bool { |
| 837 | if r == c { |
| 838 | return true |
| 839 | } |
| 840 | g := regionInclusion[r] |
| 841 | if g >= nRegionGroups { |
| 842 | return false |
| 843 | } |
| 844 | m := regionContainment[g] |
| 845 | |
| 846 | d := regionInclusion[c] |
| 847 | b := regionInclusionBits[d] |
| 848 | |
| 849 | // A contained country may belong to multiple disjoint groups. Matching any |
| 850 | // of these indicates containment. If the contained region is a group, it |
| 851 | // must strictly be a subset. |
| 852 | if d >= nRegionGroups { |
| 853 | return b&m != 0 |
| 854 | } |
| 855 | return b&^m == 0 |
| 856 | } |
| 857 | |
| 858 | var errNoTLD = errors.New("language: region is not a valid ccTLD") |
| 859 | |
| 860 | // TLD returns the country code top-level domain (ccTLD). UK is returned for GB. |
| 861 | // In all other cases it returns either the region itself or an error. |
| 862 | // |
| 863 | // This method may return an error for a region for which there exists a |
| 864 | // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The |
| 865 | // region will already be canonicalized it was obtained from a Tag that was |
| 866 | // obtained using any of the default methods. |
| 867 | func (r Region) TLD() (Region, error) { |
| 868 | // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the |
| 869 | // difference between ISO 3166-1 and IANA ccTLD. |
| 870 | if r.regionID == _GB { |
| 871 | r = Region{_UK} |
| 872 | } |
| 873 | if (r.typ() & ccTLD) == 0 { |
| 874 | return Region{}, errNoTLD |
| 875 | } |
| 876 | return r, nil |
| 877 | } |
| 878 | |
| 879 | // Canonicalize returns the region or a possible replacement if the region is |
| 880 | // deprecated. It will not return a replacement for deprecated regions that |
| 881 | // are split into multiple regions. |
| 882 | func (r Region) Canonicalize() Region { |
| 883 | if cr := normRegion(r.regionID); cr != 0 { |
| 884 | return Region{cr} |
| 885 | } |
| 886 | return r |
| 887 | } |
| 888 | |
| 889 | // Variant represents a registered variant of a language as defined by BCP 47. |
| 890 | type Variant struct { |
| 891 | variant string |
| 892 | } |
| 893 | |
| 894 | // ParseVariant parses and returns a Variant. An error is returned if s is not |
| 895 | // a valid variant. |
| 896 | func ParseVariant(s string) (Variant, error) { |
| 897 | s = strings.ToLower(s) |
| 898 | if _, ok := variantIndex[s]; ok { |
| 899 | return Variant{s}, nil |
| 900 | } |
| 901 | return Variant{}, mkErrInvalid([]byte(s)) |
| 902 | } |
| 903 | |
| 904 | // String returns the string representation of the variant. |
| 905 | func (v Variant) String() string { |
| 906 | return v.variant |
| 907 | } |