Scott Baker | eee8dd8 | 2019-09-24 12:52:34 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package language |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "errors" |
| 10 | "fmt" |
| 11 | "sort" |
| 12 | "strconv" |
| 13 | "strings" |
| 14 | |
| 15 | "golang.org/x/text/internal/tag" |
| 16 | ) |
| 17 | |
// isAlpha reports whether b is a letter rather than a digit.
// b must be an ASCII letter or digit; under that precondition any byte
// greater than '9' is a letter.
func isAlpha(b byte) bool {
	return '9' < b
}
| 23 | |
// isAlphaNum reports whether every byte of s is an ASCII letter or digit.
// It returns true for an empty slice.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		switch {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
| 33 | |
// errSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var errSyntax = errors.New("language: tag is not well-formed")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	// v stores the offending subtag; unused trailing bytes stay zero.
	v [8]byte
}

// mkErrInvalid returns a ValueError for the subtag s.
// Only the first 8 bytes of s are retained.
func mkErrInvalid(s []byte) error {
	e := ValueError{}
	copy(e.v[:], s)
	return e
}

// tag returns the stored subtag up to, but not including, the first zero
// byte, or all 8 bytes if there is none.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
		return e.v[:n]
	}
	return e.v[:]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	subtag := e.tag()
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	subtag := e.tag()
	return string(subtag)
}
| 69 | |
| 70 | // scanner is used to scan BCP 47 tokens, which are separated by _ or -. |
| 71 | type scanner struct { |
| 72 | b []byte |
| 73 | bytes [max99thPercentileSize]byte |
| 74 | token []byte |
| 75 | start int // start position of the current token |
| 76 | end int // end position of the current token |
| 77 | next int // next point for scan |
| 78 | err error |
| 79 | done bool |
| 80 | } |
| 81 | |
| 82 | func makeScannerString(s string) scanner { |
| 83 | scan := scanner{} |
| 84 | if len(s) <= len(scan.bytes) { |
| 85 | scan.b = scan.bytes[:copy(scan.bytes[:], s)] |
| 86 | } else { |
| 87 | scan.b = []byte(s) |
| 88 | } |
| 89 | scan.init() |
| 90 | return scan |
| 91 | } |
| 92 | |
| 93 | // makeScanner returns a scanner using b as the input buffer. |
| 94 | // b is not copied and may be modified by the scanner routines. |
| 95 | func makeScanner(b []byte) scanner { |
| 96 | scan := scanner{b: b} |
| 97 | scan.init() |
| 98 | return scan |
| 99 | } |
| 100 | |
| 101 | func (s *scanner) init() { |
| 102 | for i, c := range s.b { |
| 103 | if c == '_' { |
| 104 | s.b[i] = '-' |
| 105 | } |
| 106 | } |
| 107 | s.scan() |
| 108 | } |
| 109 | |
| 110 | // restToLower converts the string between start and end to lower case. |
| 111 | func (s *scanner) toLower(start, end int) { |
| 112 | for i := start; i < end; i++ { |
| 113 | c := s.b[i] |
| 114 | if 'A' <= c && c <= 'Z' { |
| 115 | s.b[i] += 'a' - 'A' |
| 116 | } |
| 117 | } |
| 118 | } |
| 119 | |
| 120 | func (s *scanner) setError(e error) { |
| 121 | if s.err == nil || (e == errSyntax && s.err != errSyntax) { |
| 122 | s.err = e |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | // resizeRange shrinks or grows the array at position oldStart such that |
| 127 | // a new string of size newSize can fit between oldStart and oldEnd. |
| 128 | // Sets the scan point to after the resized range. |
| 129 | func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { |
| 130 | s.start = oldStart |
| 131 | if end := oldStart + newSize; end != oldEnd { |
| 132 | diff := end - oldEnd |
| 133 | if end < cap(s.b) { |
| 134 | b := make([]byte, len(s.b)+diff) |
| 135 | copy(b, s.b[:oldStart]) |
| 136 | copy(b[end:], s.b[oldEnd:]) |
| 137 | s.b = b |
| 138 | } else { |
| 139 | s.b = append(s.b[end:], s.b[oldEnd:]...) |
| 140 | } |
| 141 | s.next = end + (s.next - s.end) |
| 142 | s.end = end |
| 143 | } |
| 144 | } |
| 145 | |
| 146 | // replace replaces the current token with repl. |
| 147 | func (s *scanner) replace(repl string) { |
| 148 | s.resizeRange(s.start, s.end, len(repl)) |
| 149 | copy(s.b[s.start:], repl) |
| 150 | } |
| 151 | |
| 152 | // gobble removes the current token from the input. |
| 153 | // Caller must call scan after calling gobble. |
| 154 | func (s *scanner) gobble(e error) { |
| 155 | s.setError(e) |
| 156 | if s.start == 0 { |
| 157 | s.b = s.b[:+copy(s.b, s.b[s.next:])] |
| 158 | s.end = 0 |
| 159 | } else { |
| 160 | s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] |
| 161 | s.end = s.start - 1 |
| 162 | } |
| 163 | s.next = s.start |
| 164 | } |
| 165 | |
| 166 | // deleteRange removes the given range from s.b before the current token. |
| 167 | func (s *scanner) deleteRange(start, end int) { |
| 168 | s.setError(errSyntax) |
| 169 | s.b = s.b[:start+copy(s.b[start:], s.b[end:])] |
| 170 | diff := end - start |
| 171 | s.next -= diff |
| 172 | s.start -= diff |
| 173 | s.end -= diff |
| 174 | } |
| 175 | |
| 176 | // scan parses the next token of a BCP 47 string. Tokens that are larger |
| 177 | // than 8 characters or include non-alphanumeric characters result in an error |
| 178 | // and are gobbled and removed from the output. |
| 179 | // It returns the end position of the last token consumed. |
| 180 | func (s *scanner) scan() (end int) { |
| 181 | end = s.end |
| 182 | s.token = nil |
| 183 | for s.start = s.next; s.next < len(s.b); { |
| 184 | i := bytes.IndexByte(s.b[s.next:], '-') |
| 185 | if i == -1 { |
| 186 | s.end = len(s.b) |
| 187 | s.next = len(s.b) |
| 188 | i = s.end - s.start |
| 189 | } else { |
| 190 | s.end = s.next + i |
| 191 | s.next = s.end + 1 |
| 192 | } |
| 193 | token := s.b[s.start:s.end] |
| 194 | if i < 1 || i > 8 || !isAlphaNum(token) { |
| 195 | s.gobble(errSyntax) |
| 196 | continue |
| 197 | } |
| 198 | s.token = token |
| 199 | return end |
| 200 | } |
| 201 | if n := len(s.b); n > 0 && s.b[n-1] == '-' { |
| 202 | s.setError(errSyntax) |
| 203 | s.b = s.b[:len(s.b)-1] |
| 204 | } |
| 205 | s.done = true |
| 206 | return end |
| 207 | } |
| 208 | |
| 209 | // acceptMinSize parses multiple tokens of the given size or greater. |
| 210 | // It returns the end position of the last token consumed. |
| 211 | func (s *scanner) acceptMinSize(min int) (end int) { |
| 212 | end = s.end |
| 213 | s.scan() |
| 214 | for ; len(s.token) >= min; s.scan() { |
| 215 | end = s.end |
| 216 | } |
| 217 | return end |
| 218 | } |
| 219 | |
| 220 | // Parse parses the given BCP 47 string and returns a valid Tag. If parsing |
| 221 | // failed it returns an error and any part of the tag that could be parsed. |
| 222 | // If parsing succeeded but an unknown value was found, it returns |
| 223 | // ValueError. The Tag returned in this case is just stripped of the unknown |
| 224 | // value. All other values are preserved. It accepts tags in the BCP 47 format |
| 225 | // and extensions to this standard defined in |
| 226 | // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. |
| 227 | // The resulting tag is canonicalized using the default canonicalization type. |
| 228 | func Parse(s string) (t Tag, err error) { |
| 229 | return Default.Parse(s) |
| 230 | } |
| 231 | |
| 232 | // Parse parses the given BCP 47 string and returns a valid Tag. If parsing |
| 233 | // failed it returns an error and any part of the tag that could be parsed. |
| 234 | // If parsing succeeded but an unknown value was found, it returns |
| 235 | // ValueError. The Tag returned in this case is just stripped of the unknown |
| 236 | // value. All other values are preserved. It accepts tags in the BCP 47 format |
| 237 | // and extensions to this standard defined in |
| 238 | // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. |
| 239 | // The resulting tag is canonicalized using the the canonicalization type c. |
| 240 | func (c CanonType) Parse(s string) (t Tag, err error) { |
| 241 | // TODO: consider supporting old-style locale key-value pairs. |
| 242 | if s == "" { |
| 243 | return und, errSyntax |
| 244 | } |
| 245 | if len(s) <= maxAltTaglen { |
| 246 | b := [maxAltTaglen]byte{} |
| 247 | for i, c := range s { |
| 248 | // Generating invalid UTF-8 is okay as it won't match. |
| 249 | if 'A' <= c && c <= 'Z' { |
| 250 | c += 'a' - 'A' |
| 251 | } else if c == '_' { |
| 252 | c = '-' |
| 253 | } |
| 254 | b[i] = byte(c) |
| 255 | } |
| 256 | if t, ok := grandfathered(b); ok { |
| 257 | return t, nil |
| 258 | } |
| 259 | } |
| 260 | scan := makeScannerString(s) |
| 261 | t, err = parse(&scan, s) |
| 262 | t, changed := t.canonicalize(c) |
| 263 | if changed { |
| 264 | t.remakeString() |
| 265 | } |
| 266 | return t, err |
| 267 | } |
| 268 | |
| 269 | func parse(scan *scanner, s string) (t Tag, err error) { |
| 270 | t = und |
| 271 | var end int |
| 272 | if n := len(scan.token); n <= 1 { |
| 273 | scan.toLower(0, len(scan.b)) |
| 274 | if n == 0 || scan.token[0] != 'x' { |
| 275 | return t, errSyntax |
| 276 | } |
| 277 | end = parseExtensions(scan) |
| 278 | } else if n >= 4 { |
| 279 | return und, errSyntax |
| 280 | } else { // the usual case |
| 281 | t, end = parseTag(scan) |
| 282 | if n := len(scan.token); n == 1 { |
| 283 | t.pExt = uint16(end) |
| 284 | end = parseExtensions(scan) |
| 285 | } else if end < len(scan.b) { |
| 286 | scan.setError(errSyntax) |
| 287 | scan.b = scan.b[:end] |
| 288 | } |
| 289 | } |
| 290 | if int(t.pVariant) < len(scan.b) { |
| 291 | if end < len(s) { |
| 292 | s = s[:end] |
| 293 | } |
| 294 | if len(s) > 0 && tag.Compare(s, scan.b) == 0 { |
| 295 | t.str = s |
| 296 | } else { |
| 297 | t.str = string(scan.b) |
| 298 | } |
| 299 | } else { |
| 300 | t.pVariant, t.pExt = 0, 0 |
| 301 | } |
| 302 | return t, scan.err |
| 303 | } |
| 304 | |
| 305 | // parseTag parses language, script, region and variants. |
| 306 | // It returns a Tag and the end position in the input that was parsed. |
| 307 | func parseTag(scan *scanner) (t Tag, end int) { |
| 308 | var e error |
| 309 | // TODO: set an error if an unknown lang, script or region is encountered. |
| 310 | t.lang, e = getLangID(scan.token) |
| 311 | scan.setError(e) |
| 312 | scan.replace(t.lang.String()) |
| 313 | langStart := scan.start |
| 314 | end = scan.scan() |
| 315 | for len(scan.token) == 3 && isAlpha(scan.token[0]) { |
| 316 | // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent |
| 317 | // to a tag of the form <extlang>. |
| 318 | lang, e := getLangID(scan.token) |
| 319 | if lang != 0 { |
| 320 | t.lang = lang |
| 321 | copy(scan.b[langStart:], lang.String()) |
| 322 | scan.b[langStart+3] = '-' |
| 323 | scan.start = langStart + 4 |
| 324 | } |
| 325 | scan.gobble(e) |
| 326 | end = scan.scan() |
| 327 | } |
| 328 | if len(scan.token) == 4 && isAlpha(scan.token[0]) { |
| 329 | t.script, e = getScriptID(script, scan.token) |
| 330 | if t.script == 0 { |
| 331 | scan.gobble(e) |
| 332 | } |
| 333 | end = scan.scan() |
| 334 | } |
| 335 | if n := len(scan.token); n >= 2 && n <= 3 { |
| 336 | t.region, e = getRegionID(scan.token) |
| 337 | if t.region == 0 { |
| 338 | scan.gobble(e) |
| 339 | } else { |
| 340 | scan.replace(t.region.String()) |
| 341 | } |
| 342 | end = scan.scan() |
| 343 | } |
| 344 | scan.toLower(scan.start, len(scan.b)) |
| 345 | t.pVariant = byte(end) |
| 346 | end = parseVariants(scan, end, t) |
| 347 | t.pExt = uint16(end) |
| 348 | return t, end |
| 349 | } |
| 350 | |
| 351 | var separator = []byte{'-'} |
| 352 | |
| 353 | // parseVariants scans tokens as long as each token is a valid variant string. |
| 354 | // Duplicate variants are removed. |
| 355 | func parseVariants(scan *scanner, end int, t Tag) int { |
| 356 | start := scan.start |
| 357 | varIDBuf := [4]uint8{} |
| 358 | variantBuf := [4][]byte{} |
| 359 | varID := varIDBuf[:0] |
| 360 | variant := variantBuf[:0] |
| 361 | last := -1 |
| 362 | needSort := false |
| 363 | for ; len(scan.token) >= 4; scan.scan() { |
| 364 | // TODO: measure the impact of needing this conversion and redesign |
| 365 | // the data structure if there is an issue. |
| 366 | v, ok := variantIndex[string(scan.token)] |
| 367 | if !ok { |
| 368 | // unknown variant |
| 369 | // TODO: allow user-defined variants? |
| 370 | scan.gobble(mkErrInvalid(scan.token)) |
| 371 | continue |
| 372 | } |
| 373 | varID = append(varID, v) |
| 374 | variant = append(variant, scan.token) |
| 375 | if !needSort { |
| 376 | if last < int(v) { |
| 377 | last = int(v) |
| 378 | } else { |
| 379 | needSort = true |
| 380 | // There is no legal combinations of more than 7 variants |
| 381 | // (and this is by no means a useful sequence). |
| 382 | const maxVariants = 8 |
| 383 | if len(varID) > maxVariants { |
| 384 | break |
| 385 | } |
| 386 | } |
| 387 | } |
| 388 | end = scan.end |
| 389 | } |
| 390 | if needSort { |
| 391 | sort.Sort(variantsSort{varID, variant}) |
| 392 | k, l := 0, -1 |
| 393 | for i, v := range varID { |
| 394 | w := int(v) |
| 395 | if l == w { |
| 396 | // Remove duplicates. |
| 397 | continue |
| 398 | } |
| 399 | varID[k] = varID[i] |
| 400 | variant[k] = variant[i] |
| 401 | k++ |
| 402 | l = w |
| 403 | } |
| 404 | if str := bytes.Join(variant[:k], separator); len(str) == 0 { |
| 405 | end = start - 1 |
| 406 | } else { |
| 407 | scan.resizeRange(start, end, len(str)) |
| 408 | copy(scan.b[scan.start:], str) |
| 409 | end = scan.end |
| 410 | } |
| 411 | } |
| 412 | return end |
| 413 | } |
| 414 | |
// variantsSort implements sort.Interface over two parallel slices: it orders
// by the values in i and applies every swap to v as well.
type variantsSort struct {
	i []uint8
	v [][]byte
}

// Len returns the number of entries in i.
func (s variantsSort) Len() int {
	return len(s.i)
}

// Swap exchanges entries i and j in both slices.
func (s variantsSort) Swap(i, j int) {
	s.v[i], s.v[j] = s.v[j], s.v[i]
	s.i[i], s.i[j] = s.i[j], s.i[i]
}

// Less reports whether the value at i is smaller than the value at j.
func (s variantsSort) Less(i, j int) bool {
	a, b := s.i[i], s.i[j]
	return a < b
}
| 432 | |
// bytesSort implements sort.Interface, ordering byte slices as
// bytes.Compare does.
type bytesSort [][]byte

// Len returns the number of byte slices.
func (b bytesSort) Len() int {
	return len(b)
}

// Swap exchanges the slices at i and j.
func (b bytesSort) Swap(i, j int) {
	b[j], b[i] = b[i], b[j]
}

// Less reports whether the slice at i sorts before the slice at j.
func (b bytesSort) Less(i, j int) bool {
	return bytes.Compare(b[i], b[j]) < 0
}
| 446 | |
| 447 | // parseExtensions parses and normalizes the extensions in the buffer. |
| 448 | // It returns the last position of scan.b that is part of any extension. |
| 449 | // It also trims scan.b to remove excess parts accordingly. |
| 450 | func parseExtensions(scan *scanner) int { |
| 451 | start := scan.start |
| 452 | exts := [][]byte{} |
| 453 | private := []byte{} |
| 454 | end := scan.end |
| 455 | for len(scan.token) == 1 { |
| 456 | extStart := scan.start |
| 457 | ext := scan.token[0] |
| 458 | end = parseExtension(scan) |
| 459 | extension := scan.b[extStart:end] |
| 460 | if len(extension) < 3 || (ext != 'x' && len(extension) < 4) { |
| 461 | scan.setError(errSyntax) |
| 462 | end = extStart |
| 463 | continue |
| 464 | } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) { |
| 465 | scan.b = scan.b[:end] |
| 466 | return end |
| 467 | } else if ext == 'x' { |
| 468 | private = extension |
| 469 | break |
| 470 | } |
| 471 | exts = append(exts, extension) |
| 472 | } |
| 473 | sort.Sort(bytesSort(exts)) |
| 474 | if len(private) > 0 { |
| 475 | exts = append(exts, private) |
| 476 | } |
| 477 | scan.b = scan.b[:start] |
| 478 | if len(exts) > 0 { |
| 479 | scan.b = append(scan.b, bytes.Join(exts, separator)...) |
| 480 | } else if start > 0 { |
| 481 | // Strip trailing '-'. |
| 482 | scan.b = scan.b[:start-1] |
| 483 | } |
| 484 | return end |
| 485 | } |
| 486 | |
| 487 | // parseExtension parses a single extension and returns the position of |
| 488 | // the extension end. |
| 489 | func parseExtension(scan *scanner) int { |
| 490 | start, end := scan.start, scan.end |
| 491 | switch scan.token[0] { |
| 492 | case 'u': |
| 493 | attrStart := end |
| 494 | scan.scan() |
| 495 | for last := []byte{}; len(scan.token) > 2; scan.scan() { |
| 496 | if bytes.Compare(scan.token, last) != -1 { |
| 497 | // Attributes are unsorted. Start over from scratch. |
| 498 | p := attrStart + 1 |
| 499 | scan.next = p |
| 500 | attrs := [][]byte{} |
| 501 | for scan.scan(); len(scan.token) > 2; scan.scan() { |
| 502 | attrs = append(attrs, scan.token) |
| 503 | end = scan.end |
| 504 | } |
| 505 | sort.Sort(bytesSort(attrs)) |
| 506 | copy(scan.b[p:], bytes.Join(attrs, separator)) |
| 507 | break |
| 508 | } |
| 509 | last = scan.token |
| 510 | end = scan.end |
| 511 | } |
| 512 | var last, key []byte |
| 513 | for attrEnd := end; len(scan.token) == 2; last = key { |
| 514 | key = scan.token |
| 515 | keyEnd := scan.end |
| 516 | end = scan.acceptMinSize(3) |
| 517 | // TODO: check key value validity |
| 518 | if keyEnd == end || bytes.Compare(key, last) != 1 { |
| 519 | // We have an invalid key or the keys are not sorted. |
| 520 | // Start scanning keys from scratch and reorder. |
| 521 | p := attrEnd + 1 |
| 522 | scan.next = p |
| 523 | keys := [][]byte{} |
| 524 | for scan.scan(); len(scan.token) == 2; { |
| 525 | keyStart, keyEnd := scan.start, scan.end |
| 526 | end = scan.acceptMinSize(3) |
| 527 | if keyEnd != end { |
| 528 | keys = append(keys, scan.b[keyStart:end]) |
| 529 | } else { |
| 530 | scan.setError(errSyntax) |
| 531 | end = keyStart |
| 532 | } |
| 533 | } |
| 534 | sort.Sort(bytesSort(keys)) |
| 535 | reordered := bytes.Join(keys, separator) |
| 536 | if e := p + len(reordered); e < end { |
| 537 | scan.deleteRange(e, end) |
| 538 | end = e |
| 539 | } |
| 540 | copy(scan.b[p:], bytes.Join(keys, separator)) |
| 541 | break |
| 542 | } |
| 543 | } |
| 544 | case 't': |
| 545 | scan.scan() |
| 546 | if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) { |
| 547 | _, end = parseTag(scan) |
| 548 | scan.toLower(start, end) |
| 549 | } |
| 550 | for len(scan.token) == 2 && !isAlpha(scan.token[1]) { |
| 551 | end = scan.acceptMinSize(3) |
| 552 | } |
| 553 | case 'x': |
| 554 | end = scan.acceptMinSize(1) |
| 555 | default: |
| 556 | end = scan.acceptMinSize(2) |
| 557 | } |
| 558 | return end |
| 559 | } |
| 560 | |
| 561 | // Compose creates a Tag from individual parts, which may be of type Tag, Base, |
| 562 | // Script, Region, Variant, []Variant, Extension, []Extension or error. If a |
| 563 | // Base, Script or Region or slice of type Variant or Extension is passed more |
| 564 | // than once, the latter will overwrite the former. Variants and Extensions are |
| 565 | // accumulated, but if two extensions of the same type are passed, the latter |
| 566 | // will replace the former. A Tag overwrites all former values and typically |
| 567 | // only makes sense as the first argument. The resulting tag is returned after |
| 568 | // canonicalizing using the Default CanonType. If one or more errors are |
| 569 | // encountered, one of the errors is returned. |
| 570 | func Compose(part ...interface{}) (t Tag, err error) { |
| 571 | return Default.Compose(part...) |
| 572 | } |
| 573 | |
| 574 | // Compose creates a Tag from individual parts, which may be of type Tag, Base, |
| 575 | // Script, Region, Variant, []Variant, Extension, []Extension or error. If a |
| 576 | // Base, Script or Region or slice of type Variant or Extension is passed more |
| 577 | // than once, the latter will overwrite the former. Variants and Extensions are |
| 578 | // accumulated, but if two extensions of the same type are passed, the latter |
| 579 | // will replace the former. A Tag overwrites all former values and typically |
| 580 | // only makes sense as the first argument. The resulting tag is returned after |
| 581 | // canonicalizing using CanonType c. If one or more errors are encountered, |
| 582 | // one of the errors is returned. |
| 583 | func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { |
| 584 | var b builder |
| 585 | if err = b.update(part...); err != nil { |
| 586 | return und, err |
| 587 | } |
| 588 | t, _ = b.tag.canonicalize(c) |
| 589 | |
| 590 | if len(b.ext) > 0 || len(b.variant) > 0 { |
| 591 | sort.Sort(sortVariant(b.variant)) |
| 592 | sort.Strings(b.ext) |
| 593 | if b.private != "" { |
| 594 | b.ext = append(b.ext, b.private) |
| 595 | } |
| 596 | n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...) |
| 597 | buf := make([]byte, n) |
| 598 | p := t.genCoreBytes(buf) |
| 599 | t.pVariant = byte(p) |
| 600 | p += appendTokens(buf[p:], b.variant...) |
| 601 | t.pExt = uint16(p) |
| 602 | p += appendTokens(buf[p:], b.ext...) |
| 603 | t.str = string(buf[:p]) |
| 604 | } else if b.private != "" { |
| 605 | t.str = b.private |
| 606 | t.remakeString() |
| 607 | } |
| 608 | return |
| 609 | } |
| 610 | |
| 611 | type builder struct { |
| 612 | tag Tag |
| 613 | |
| 614 | private string // the x extension |
| 615 | ext []string |
| 616 | variant []string |
| 617 | |
| 618 | err error |
| 619 | } |
| 620 | |
| 621 | func (b *builder) addExt(e string) { |
| 622 | if e == "" { |
| 623 | } else if e[0] == 'x' { |
| 624 | b.private = e |
| 625 | } else { |
| 626 | b.ext = append(b.ext, e) |
| 627 | } |
| 628 | } |
| 629 | |
| 630 | var errInvalidArgument = errors.New("invalid Extension or Variant") |
| 631 | |
| 632 | func (b *builder) update(part ...interface{}) (err error) { |
| 633 | replace := func(l *[]string, s string, eq func(a, b string) bool) bool { |
| 634 | if s == "" { |
| 635 | b.err = errInvalidArgument |
| 636 | return true |
| 637 | } |
| 638 | for i, v := range *l { |
| 639 | if eq(v, s) { |
| 640 | (*l)[i] = s |
| 641 | return true |
| 642 | } |
| 643 | } |
| 644 | return false |
| 645 | } |
| 646 | for _, x := range part { |
| 647 | switch v := x.(type) { |
| 648 | case Tag: |
| 649 | b.tag.lang = v.lang |
| 650 | b.tag.region = v.region |
| 651 | b.tag.script = v.script |
| 652 | if v.str != "" { |
| 653 | b.variant = nil |
| 654 | for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; { |
| 655 | x, s = nextToken(s) |
| 656 | b.variant = append(b.variant, x) |
| 657 | } |
| 658 | b.ext, b.private = nil, "" |
| 659 | for i, e := int(v.pExt), ""; i < len(v.str); { |
| 660 | i, e = getExtension(v.str, i) |
| 661 | b.addExt(e) |
| 662 | } |
| 663 | } |
| 664 | case Base: |
| 665 | b.tag.lang = v.langID |
| 666 | case Script: |
| 667 | b.tag.script = v.scriptID |
| 668 | case Region: |
| 669 | b.tag.region = v.regionID |
| 670 | case Variant: |
| 671 | if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) { |
| 672 | b.variant = append(b.variant, v.variant) |
| 673 | } |
| 674 | case Extension: |
| 675 | if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) { |
| 676 | b.addExt(v.s) |
| 677 | } |
| 678 | case []Variant: |
| 679 | b.variant = nil |
| 680 | for _, x := range v { |
| 681 | b.update(x) |
| 682 | } |
| 683 | case []Extension: |
| 684 | b.ext, b.private = nil, "" |
| 685 | for _, e := range v { |
| 686 | b.update(e) |
| 687 | } |
| 688 | // TODO: support parsing of raw strings based on morphology or just extensions? |
| 689 | case error: |
| 690 | err = v |
| 691 | } |
| 692 | } |
| 693 | return |
| 694 | } |
| 695 | |
// tokenLen returns the number of bytes needed to write the given tokens
// with one separator byte in front of each.
func tokenLen(token ...string) (n int) {
	for i := range token {
		n += 1 + len(token[i])
	}
	return n
}
| 702 | |
// appendTokens writes each token into b preceded by a '-' and returns the
// number of bytes the tokens and separators occupy. b must be large enough
// to hold them.
func appendTokens(b []byte, token ...string) int {
	pos := 0
	for _, t := range token {
		b[pos] = '-'
		pos++
		copy(b[pos:], t)
		pos += len(t)
	}
	return pos
}
| 712 | |
| 713 | type sortVariant []string |
| 714 | |
| 715 | func (s sortVariant) Len() int { |
| 716 | return len(s) |
| 717 | } |
| 718 | |
| 719 | func (s sortVariant) Swap(i, j int) { |
| 720 | s[j], s[i] = s[i], s[j] |
| 721 | } |
| 722 | |
| 723 | func (s sortVariant) Less(i, j int) bool { |
| 724 | return variantIndex[s[i]] < variantIndex[s[j]] |
| 725 | } |
| 726 | |
// findExt returns the index of the first entry in list whose first byte is
// x, or -1 if there is none. Every entry must be non-empty.
func findExt(list []string, x byte) int {
	for i := range list {
		if list[i][0] == x {
			return i
		}
	}
	return -1
}
| 735 | |
| 736 | // getExtension returns the name, body and end position of the extension. |
| 737 | func getExtension(s string, p int) (end int, ext string) { |
| 738 | if s[p] == '-' { |
| 739 | p++ |
| 740 | } |
| 741 | if s[p] == 'x' { |
| 742 | return len(s), s[p:] |
| 743 | } |
| 744 | end = nextExtension(s, p) |
| 745 | return end, s[p:end] |
| 746 | } |
| 747 | |
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the position of the
// leading '-' of that pattern, or len(s) if there is no such pattern.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		switch {
		case s[p] != '-':
			p++
		case s[p+2] == '-':
			return p
		default:
			p += 3
		}
	}
	return len(s)
}
| 765 | |
| 766 | var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") |
| 767 | |
| 768 | // ParseAcceptLanguage parses the contents of an Accept-Language header as |
| 769 | // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and |
| 770 | // a list of corresponding quality weights. It is more permissive than RFC 2616 |
| 771 | // and may return non-nil slices even if the input is not valid. |
| 772 | // The Tags will be sorted by highest weight first and then by first occurrence. |
| 773 | // Tags with a weight of zero will be dropped. An error will be returned if the |
| 774 | // input could not be parsed. |
| 775 | func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { |
| 776 | var entry string |
| 777 | for s != "" { |
| 778 | if entry, s = split(s, ','); entry == "" { |
| 779 | continue |
| 780 | } |
| 781 | |
| 782 | entry, weight := split(entry, ';') |
| 783 | |
| 784 | // Scan the language. |
| 785 | t, err := Parse(entry) |
| 786 | if err != nil { |
| 787 | id, ok := acceptFallback[entry] |
| 788 | if !ok { |
| 789 | return nil, nil, err |
| 790 | } |
| 791 | t = Tag{lang: id} |
| 792 | } |
| 793 | |
| 794 | // Scan the optional weight. |
| 795 | w := 1.0 |
| 796 | if weight != "" { |
| 797 | weight = consume(weight, 'q') |
| 798 | weight = consume(weight, '=') |
| 799 | // consume returns the empty string when a token could not be |
| 800 | // consumed, resulting in an error for ParseFloat. |
| 801 | if w, err = strconv.ParseFloat(weight, 32); err != nil { |
| 802 | return nil, nil, errInvalidWeight |
| 803 | } |
| 804 | // Drop tags with a quality weight of 0. |
| 805 | if w <= 0 { |
| 806 | continue |
| 807 | } |
| 808 | } |
| 809 | |
| 810 | tag = append(tag, t) |
| 811 | q = append(q, float32(w)) |
| 812 | } |
| 813 | sortStable(&tagSort{tag, q}) |
| 814 | return tag, q, nil |
| 815 | } |
| 816 | |
// consume removes a leading token c from s and returns the remainder with
// surrounding white space trimmed, or the empty string if s does not start
// with c.
func consume(s string, c byte) string {
	if len(s) > 0 && s[0] == c {
		return strings.TrimSpace(s[1:])
	}
	return ""
}
| 825 | |
// split cuts s at the first occurrence of c and returns the parts before and
// after it, each with surrounding white space trimmed. If c does not occur
// in s, head is the trimmed s and tail is empty.
func split(s string, c byte) (head, tail string) {
	i := strings.IndexByte(s, c)
	if i < 0 {
		return strings.TrimSpace(s), ""
	}
	return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
}
| 832 | |
| 833 | // Add hack mapping to deal with a small number of cases that that occur |
| 834 | // in Accept-Language (with reasonable frequency). |
| 835 | var acceptFallback = map[string]langID{ |
| 836 | "english": _en, |
| 837 | "deutsch": _de, |
| 838 | "italian": _it, |
| 839 | "french": _fr, |
| 840 | "*": _mul, // defined in the spec to match all languages. |
| 841 | } |
| 842 | |
| 843 | type tagSort struct { |
| 844 | tag []Tag |
| 845 | q []float32 |
| 846 | } |
| 847 | |
| 848 | func (s *tagSort) Len() int { |
| 849 | return len(s.q) |
| 850 | } |
| 851 | |
| 852 | func (s *tagSort) Less(i, j int) bool { |
| 853 | return s.q[i] > s.q[j] |
| 854 | } |
| 855 | |
| 856 | func (s *tagSort) Swap(i, j int) { |
| 857 | s.tag[i], s.tag[j] = s.tag[j], s.tag[i] |
| 858 | s.q[i], s.q[j] = s.q[j], s.q[i] |
| 859 | } |