Scott Baker | eee8dd8 | 2019-09-24 12:52:34 -0700 | [diff] [blame^] | 1 | // Copyright 2014 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package collate |
| 6 | |
| 7 | import ( |
| 8 | "sort" |
| 9 | |
| 10 | "golang.org/x/text/internal/colltab" |
| 11 | "golang.org/x/text/language" |
| 12 | "golang.org/x/text/unicode/norm" |
| 13 | ) |
| 14 | |
| 15 | // newCollator creates a new collator with default options configured. |
| 16 | func newCollator(t colltab.Weighter) *Collator { |
| 17 | // Initialize a collator with default options. |
| 18 | c := &Collator{ |
| 19 | options: options{ |
| 20 | ignore: [colltab.NumLevels]bool{ |
| 21 | colltab.Quaternary: true, |
| 22 | colltab.Identity: true, |
| 23 | }, |
| 24 | f: norm.NFD, |
| 25 | t: t, |
| 26 | }, |
| 27 | } |
| 28 | |
| 29 | // TODO: store vt in tags or remove. |
| 30 | c.variableTop = t.Top() |
| 31 | |
| 32 | return c |
| 33 | } |
| 34 | |
| 35 | // An Option is used to change the behavior of a Collator. Options override the |
| 36 | // settings passed through the locale identifier. |
| 37 | type Option struct { |
| 38 | priority int |
| 39 | f func(o *options) |
| 40 | } |
| 41 | |
| 42 | type prioritizedOptions []Option |
| 43 | |
| 44 | func (p prioritizedOptions) Len() int { |
| 45 | return len(p) |
| 46 | } |
| 47 | |
| 48 | func (p prioritizedOptions) Swap(i, j int) { |
| 49 | p[i], p[j] = p[j], p[i] |
| 50 | } |
| 51 | |
| 52 | func (p prioritizedOptions) Less(i, j int) bool { |
| 53 | return p[i].priority < p[j].priority |
| 54 | } |
| 55 | |
| 56 | type options struct { |
| 57 | // ignore specifies which levels to ignore. |
| 58 | ignore [colltab.NumLevels]bool |
| 59 | |
| 60 | // caseLevel is true if there is an additional level of case matching |
| 61 | // between the secondary and tertiary levels. |
| 62 | caseLevel bool |
| 63 | |
| 64 | // backwards specifies the order of sorting at the secondary level. |
| 65 | // This option exists predominantly to support reverse sorting of accents in French. |
| 66 | backwards bool |
| 67 | |
| 68 | // numeric specifies whether any sequence of decimal digits (category is Nd) |
| 69 | // is sorted at a primary level with its numeric value. |
| 70 | // For example, "A-21" < "A-123". |
| 71 | // This option is set by wrapping the main Weighter with NewNumericWeighter. |
| 72 | numeric bool |
| 73 | |
| 74 | // alternate specifies an alternative handling of variables. |
| 75 | alternate alternateHandling |
| 76 | |
| 77 | // variableTop is the largest primary value that is considered to be |
| 78 | // variable. |
| 79 | variableTop uint32 |
| 80 | |
| 81 | t colltab.Weighter |
| 82 | |
| 83 | f norm.Form |
| 84 | } |
| 85 | |
| 86 | func (o *options) setOptions(opts []Option) { |
| 87 | sort.Sort(prioritizedOptions(opts)) |
| 88 | for _, x := range opts { |
| 89 | x.f(o) |
| 90 | } |
| 91 | } |
| 92 | |
| 93 | // OptionsFromTag extracts the BCP47 collation options from the tag and |
| 94 | // configures a collator accordingly. These options are set before any other |
| 95 | // option. |
| 96 | func OptionsFromTag(t language.Tag) Option { |
| 97 | return Option{0, func(o *options) { |
| 98 | o.setFromTag(t) |
| 99 | }} |
| 100 | } |
| 101 | |
| 102 | func (o *options) setFromTag(t language.Tag) { |
| 103 | o.caseLevel = ldmlBool(t, o.caseLevel, "kc") |
| 104 | o.backwards = ldmlBool(t, o.backwards, "kb") |
| 105 | o.numeric = ldmlBool(t, o.numeric, "kn") |
| 106 | |
| 107 | // Extract settings from the BCP47 u extension. |
| 108 | switch t.TypeForKey("ks") { // strength |
| 109 | case "level1": |
| 110 | o.ignore[colltab.Secondary] = true |
| 111 | o.ignore[colltab.Tertiary] = true |
| 112 | case "level2": |
| 113 | o.ignore[colltab.Tertiary] = true |
| 114 | case "level3", "": |
| 115 | // The default. |
| 116 | case "level4": |
| 117 | o.ignore[colltab.Quaternary] = false |
| 118 | case "identic": |
| 119 | o.ignore[colltab.Quaternary] = false |
| 120 | o.ignore[colltab.Identity] = false |
| 121 | } |
| 122 | |
| 123 | switch t.TypeForKey("ka") { |
| 124 | case "shifted": |
| 125 | o.alternate = altShifted |
| 126 | // The following two types are not official BCP47, but we support them to |
| 127 | // give access to this otherwise hidden functionality. The name blanked is |
| 128 | // derived from the LDML name blanked and posix reflects the main use of |
| 129 | // the shift-trimmed option. |
| 130 | case "blanked": |
| 131 | o.alternate = altBlanked |
| 132 | case "posix": |
| 133 | o.alternate = altShiftTrimmed |
| 134 | } |
| 135 | |
| 136 | // TODO: caseFirst ("kf"), reorder ("kr"), and maybe variableTop ("vt"). |
| 137 | |
| 138 | // Not used: |
| 139 | // - normalization ("kk", not necessary for this implementation) |
| 140 | // - hiraganaQuatenary ("kh", obsolete) |
| 141 | } |
| 142 | |
| 143 | func ldmlBool(t language.Tag, old bool, key string) bool { |
| 144 | switch t.TypeForKey(key) { |
| 145 | case "true": |
| 146 | return true |
| 147 | case "false": |
| 148 | return false |
| 149 | default: |
| 150 | return old |
| 151 | } |
| 152 | } |
| 153 | |
| 154 | var ( |
| 155 | // IgnoreCase sets case-insensitive comparison. |
| 156 | IgnoreCase Option = ignoreCase |
| 157 | ignoreCase = Option{3, ignoreCaseF} |
| 158 | |
| 159 | // IgnoreDiacritics causes diacritical marks to be ignored. ("o" == "รถ"). |
| 160 | IgnoreDiacritics Option = ignoreDiacritics |
| 161 | ignoreDiacritics = Option{3, ignoreDiacriticsF} |
| 162 | |
| 163 | // IgnoreWidth causes full-width characters to match their half-width |
| 164 | // equivalents. |
| 165 | IgnoreWidth Option = ignoreWidth |
| 166 | ignoreWidth = Option{2, ignoreWidthF} |
| 167 | |
| 168 | // Loose sets the collator to ignore diacritics, case and weight. |
| 169 | Loose Option = loose |
| 170 | loose = Option{4, looseF} |
| 171 | |
| 172 | // Force ordering if strings are equivalent but not equal. |
| 173 | Force Option = force |
| 174 | force = Option{5, forceF} |
| 175 | |
| 176 | // Numeric specifies that numbers should sort numerically ("2" < "12"). |
| 177 | Numeric Option = numeric |
| 178 | numeric = Option{5, numericF} |
| 179 | ) |
| 180 | |
| 181 | func ignoreWidthF(o *options) { |
| 182 | o.ignore[colltab.Tertiary] = true |
| 183 | o.caseLevel = true |
| 184 | } |
| 185 | |
| 186 | func ignoreDiacriticsF(o *options) { |
| 187 | o.ignore[colltab.Secondary] = true |
| 188 | } |
| 189 | |
| 190 | func ignoreCaseF(o *options) { |
| 191 | o.ignore[colltab.Tertiary] = true |
| 192 | o.caseLevel = false |
| 193 | } |
| 194 | |
| 195 | func looseF(o *options) { |
| 196 | ignoreWidthF(o) |
| 197 | ignoreDiacriticsF(o) |
| 198 | ignoreCaseF(o) |
| 199 | } |
| 200 | |
| 201 | func forceF(o *options) { |
| 202 | o.ignore[colltab.Identity] = false |
| 203 | } |
| 204 | |
| 205 | func numericF(o *options) { o.numeric = true } |
| 206 | |
| 207 | // Reorder overrides the pre-defined ordering of scripts and character sets. |
| 208 | func Reorder(s ...string) Option { |
| 209 | // TODO: need fractional weights to implement this. |
| 210 | panic("TODO: implement") |
| 211 | } |
| 212 | |
// TODO: consider making these public again. These options cannot be fully
// specified in BCP47, so an API interface seems warranted. Still, a
// higher-level interface would be nice (e.g. a POSIX option for enabling
// altShiftTrimmed).
| 216 | |
// alternateHandling identifies the various ways in which variables are handled.
// A rune with a primary weight lower than the variable top is considered a
// variable.
// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
type alternateHandling int

// The supported ways of handling variables. altNonIgnorable is the zero value.
const (
	// altNonIgnorable turns off special handling of variables.
	altNonIgnorable alternateHandling = iota

	// altBlanked sets variables and all subsequent primary ignorables to be
	// ignorable at all levels. This is identical to removing all variables
	// and subsequent primary ignorables from the input.
	altBlanked

	// altShifted sets variables to be ignorable for levels one through three
	// and adds a fourth level based on the values of the ignored levels.
	altShifted

	// altShiftTrimmed is a slight variant of altShifted that is used to
	// emulate POSIX.
	altShiftTrimmed
)