William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package language |
| 6 | |
| 7 | import ( |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 8 | "errors" |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 9 | "strconv" |
| 10 | "strings" |
| 11 | |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 12 | "golang.org/x/text/internal/language" |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 13 | ) |
| 14 | |
// A ValueError is the error reported by the parsing functions for input that
// is well-formed but contains a subtag that is not recognized as a valid
// value.
type ValueError interface {
	// Subtag returns the subtag for which the error occurred.
	Subtag() string

	error
}
| 24 | |
| 25 | // Parse parses the given BCP 47 string and returns a valid Tag. If parsing |
| 26 | // failed it returns an error and any part of the tag that could be parsed. |
| 27 | // If parsing succeeded but an unknown value was found, it returns |
| 28 | // ValueError. The Tag returned in this case is just stripped of the unknown |
| 29 | // value. All other values are preserved. It accepts tags in the BCP 47 format |
| 30 | // and extensions to this standard defined in |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 31 | // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 32 | // The resulting tag is canonicalized using the default canonicalization type. |
| 33 | func Parse(s string) (t Tag, err error) { |
| 34 | return Default.Parse(s) |
| 35 | } |
| 36 | |
| 37 | // Parse parses the given BCP 47 string and returns a valid Tag. If parsing |
| 38 | // failed it returns an error and any part of the tag that could be parsed. |
| 39 | // If parsing succeeded but an unknown value was found, it returns |
| 40 | // ValueError. The Tag returned in this case is just stripped of the unknown |
| 41 | // value. All other values are preserved. It accepts tags in the BCP 47 format |
| 42 | // and extensions to this standard defined in |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 43 | // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. |
| 44 | // The resulting tag is canonicalized using the canonicalization type c. |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 45 | func (c CanonType) Parse(s string) (t Tag, err error) { |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 46 | tt, err := language.Parse(s) |
| 47 | if err != nil { |
| 48 | return makeTag(tt), err |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 49 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 50 | tt, changed := canonicalize(c, tt) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 51 | if changed { |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 52 | tt.RemakeString() |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 53 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 54 | return makeTag(tt), err |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 55 | } |
| 56 | |
| 57 | // Compose creates a Tag from individual parts, which may be of type Tag, Base, |
| 58 | // Script, Region, Variant, []Variant, Extension, []Extension or error. If a |
| 59 | // Base, Script or Region or slice of type Variant or Extension is passed more |
| 60 | // than once, the latter will overwrite the former. Variants and Extensions are |
| 61 | // accumulated, but if two extensions of the same type are passed, the latter |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 62 | // will replace the former. For -u extensions, though, the key-type pairs are |
| 63 | // added, where later values overwrite older ones. A Tag overwrites all former |
| 64 | // values and typically only makes sense as the first argument. The resulting |
| 65 | // tag is returned after canonicalizing using the Default CanonType. If one or |
| 66 | // more errors are encountered, one of the errors is returned. |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 67 | func Compose(part ...interface{}) (t Tag, err error) { |
| 68 | return Default.Compose(part...) |
| 69 | } |
| 70 | |
| 71 | // Compose creates a Tag from individual parts, which may be of type Tag, Base, |
| 72 | // Script, Region, Variant, []Variant, Extension, []Extension or error. If a |
| 73 | // Base, Script or Region or slice of type Variant or Extension is passed more |
| 74 | // than once, the latter will overwrite the former. Variants and Extensions are |
| 75 | // accumulated, but if two extensions of the same type are passed, the latter |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 76 | // will replace the former. For -u extensions, though, the key-type pairs are |
| 77 | // added, where later values overwrite older ones. A Tag overwrites all former |
| 78 | // values and typically only makes sense as the first argument. The resulting |
| 79 | // tag is returned after canonicalizing using CanonType c. If one or more errors |
| 80 | // are encountered, one of the errors is returned. |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 81 | func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 82 | var b language.Builder |
| 83 | if err = update(&b, part...); err != nil { |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 84 | return und, err |
| 85 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 86 | b.Tag, _ = canonicalize(c, b.Tag) |
| 87 | return makeTag(b.Make()), err |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 88 | } |
| 89 | |
| 90 | var errInvalidArgument = errors.New("invalid Extension or Variant") |
| 91 | |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 92 | func update(b *language.Builder, part ...interface{}) (err error) { |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 93 | for _, x := range part { |
| 94 | switch v := x.(type) { |
| 95 | case Tag: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 96 | b.SetTag(v.tag()) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 97 | case Base: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 98 | b.Tag.LangID = v.langID |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 99 | case Script: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 100 | b.Tag.ScriptID = v.scriptID |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 101 | case Region: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 102 | b.Tag.RegionID = v.regionID |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 103 | case Variant: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 104 | if v.variant == "" { |
| 105 | err = errInvalidArgument |
| 106 | break |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 107 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 108 | b.AddVariant(v.variant) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 109 | case Extension: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 110 | if v.s == "" { |
| 111 | err = errInvalidArgument |
| 112 | break |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 113 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 114 | b.SetExt(v.s) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 115 | case []Variant: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 116 | b.ClearVariants() |
| 117 | for _, v := range v { |
| 118 | b.AddVariant(v.variant) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 119 | } |
| 120 | case []Extension: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 121 | b.ClearExtensions() |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 122 | for _, e := range v { |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 123 | b.SetExt(e.s) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 124 | } |
| 125 | // TODO: support parsing of raw strings based on morphology or just extensions? |
| 126 | case error: |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 127 | if v != nil { |
| 128 | err = v |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 129 | } |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 130 | } |
| 131 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 132 | return |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 133 | } |
| 134 | |
| 135 | var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") |
| 136 | |
| 137 | // ParseAcceptLanguage parses the contents of an Accept-Language header as |
| 138 | // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and |
| 139 | // a list of corresponding quality weights. It is more permissive than RFC 2616 |
| 140 | // and may return non-nil slices even if the input is not valid. |
| 141 | // The Tags will be sorted by highest weight first and then by first occurrence. |
| 142 | // Tags with a weight of zero will be dropped. An error will be returned if the |
| 143 | // input could not be parsed. |
| 144 | func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { |
| 145 | var entry string |
| 146 | for s != "" { |
| 147 | if entry, s = split(s, ','); entry == "" { |
| 148 | continue |
| 149 | } |
| 150 | |
| 151 | entry, weight := split(entry, ';') |
| 152 | |
| 153 | // Scan the language. |
| 154 | t, err := Parse(entry) |
| 155 | if err != nil { |
| 156 | id, ok := acceptFallback[entry] |
| 157 | if !ok { |
| 158 | return nil, nil, err |
| 159 | } |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 160 | t = makeTag(language.Tag{LangID: id}) |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 161 | } |
| 162 | |
| 163 | // Scan the optional weight. |
| 164 | w := 1.0 |
| 165 | if weight != "" { |
| 166 | weight = consume(weight, 'q') |
| 167 | weight = consume(weight, '=') |
| 168 | // consume returns the empty string when a token could not be |
| 169 | // consumed, resulting in an error for ParseFloat. |
| 170 | if w, err = strconv.ParseFloat(weight, 32); err != nil { |
| 171 | return nil, nil, errInvalidWeight |
| 172 | } |
| 173 | // Drop tags with a quality weight of 0. |
| 174 | if w <= 0 { |
| 175 | continue |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | tag = append(tag, t) |
| 180 | q = append(q, float32(w)) |
| 181 | } |
| 182 | sortStable(&tagSort{tag, q}) |
| 183 | return tag, q, nil |
| 184 | } |
| 185 | |
// consume strips the leading token c from s and returns what follows it with
// surrounding white space trimmed. If s is empty or does not start with c,
// the empty string is returned.
func consume(s string, c byte) string {
	if len(s) > 0 && s[0] == c {
		rest := s[1:]
		return strings.TrimSpace(rest)
	}
	return ""
}
| 194 | |
// split cuts s at the first occurrence of c. It returns the text before c as
// head and the text after it as tail, both with surrounding white space
// trimmed. If c does not occur in s, head is the trimmed s and tail is empty.
func split(s string, c byte) (head, tail string) {
	i := strings.IndexByte(s, c)
	if i < 0 {
		return strings.TrimSpace(s), ""
	}
	head = strings.TrimSpace(s[:i])
	tail = strings.TrimSpace(s[i+1:])
	return head, tail
}
| 201 | |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 202 | // Add hack mapping to deal with a small number of cases that occur |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 203 | // in Accept-Language (with reasonable frequency). |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 204 | var acceptFallback = map[string]language.Language{ |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 205 | "english": _en, |
| 206 | "deutsch": _de, |
| 207 | "italian": _it, |
| 208 | "french": _fr, |
| 209 | "*": _mul, // defined in the spec to match all languages. |
| 210 | } |
| 211 | |
| 212 | type tagSort struct { |
| 213 | tag []Tag |
| 214 | q []float32 |
| 215 | } |
| 216 | |
| 217 | func (s *tagSort) Len() int { |
| 218 | return len(s.q) |
| 219 | } |
| 220 | |
| 221 | func (s *tagSort) Less(i, j int) bool { |
| 222 | return s.q[i] > s.q[j] |
| 223 | } |
| 224 | |
| 225 | func (s *tagSort) Swap(i, j int) { |
| 226 | s.tag[i], s.tag[j] = s.tag[j], s.tag[i] |
| 227 | s.q[i], s.q[j] = s.q[j], s.q[i] |
| 228 | } |