blob: 19cb546751fba6c9901c6bc58e6d537a68d5643b [file] [log] [blame]
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package collate
6
7import (
8 "sort"
9
10 "golang.org/x/text/internal/colltab"
11 "golang.org/x/text/language"
12 "golang.org/x/text/unicode/norm"
13)
14
15// newCollator creates a new collator with default options configured.
16func newCollator(t colltab.Weighter) *Collator {
17 // Initialize a collator with default options.
18 c := &Collator{
19 options: options{
20 ignore: [colltab.NumLevels]bool{
21 colltab.Quaternary: true,
22 colltab.Identity: true,
23 },
24 f: norm.NFD,
25 t: t,
26 },
27 }
28
29 // TODO: store vt in tags or remove.
30 c.variableTop = t.Top()
31
32 return c
33}
34
35// An Option is used to change the behavior of a Collator. Options override the
36// settings passed through the locale identifier.
37type Option struct {
38 priority int
39 f func(o *options)
40}
41
42type prioritizedOptions []Option
43
44func (p prioritizedOptions) Len() int {
45 return len(p)
46}
47
48func (p prioritizedOptions) Swap(i, j int) {
49 p[i], p[j] = p[j], p[i]
50}
51
52func (p prioritizedOptions) Less(i, j int) bool {
53 return p[i].priority < p[j].priority
54}
55
56type options struct {
57 // ignore specifies which levels to ignore.
58 ignore [colltab.NumLevels]bool
59
60 // caseLevel is true if there is an additional level of case matching
61 // between the secondary and tertiary levels.
62 caseLevel bool
63
64 // backwards specifies the order of sorting at the secondary level.
65 // This option exists predominantly to support reverse sorting of accents in French.
66 backwards bool
67
68 // numeric specifies whether any sequence of decimal digits (category is Nd)
69 // is sorted at a primary level with its numeric value.
70 // For example, "A-21" < "A-123".
71 // This option is set by wrapping the main Weighter with NewNumericWeighter.
72 numeric bool
73
74 // alternate specifies an alternative handling of variables.
75 alternate alternateHandling
76
77 // variableTop is the largest primary value that is considered to be
78 // variable.
79 variableTop uint32
80
81 t colltab.Weighter
82
83 f norm.Form
84}
85
86func (o *options) setOptions(opts []Option) {
87 sort.Sort(prioritizedOptions(opts))
88 for _, x := range opts {
89 x.f(o)
90 }
91}
92
93// OptionsFromTag extracts the BCP47 collation options from the tag and
94// configures a collator accordingly. These options are set before any other
95// option.
96func OptionsFromTag(t language.Tag) Option {
97 return Option{0, func(o *options) {
98 o.setFromTag(t)
99 }}
100}
101
102func (o *options) setFromTag(t language.Tag) {
103 o.caseLevel = ldmlBool(t, o.caseLevel, "kc")
104 o.backwards = ldmlBool(t, o.backwards, "kb")
105 o.numeric = ldmlBool(t, o.numeric, "kn")
106
107 // Extract settings from the BCP47 u extension.
108 switch t.TypeForKey("ks") { // strength
109 case "level1":
110 o.ignore[colltab.Secondary] = true
111 o.ignore[colltab.Tertiary] = true
112 case "level2":
113 o.ignore[colltab.Tertiary] = true
114 case "level3", "":
115 // The default.
116 case "level4":
117 o.ignore[colltab.Quaternary] = false
118 case "identic":
119 o.ignore[colltab.Quaternary] = false
120 o.ignore[colltab.Identity] = false
121 }
122
123 switch t.TypeForKey("ka") {
124 case "shifted":
125 o.alternate = altShifted
126 // The following two types are not official BCP47, but we support them to
127 // give access to this otherwise hidden functionality. The name blanked is
128 // derived from the LDML name blanked and posix reflects the main use of
129 // the shift-trimmed option.
130 case "blanked":
131 o.alternate = altBlanked
132 case "posix":
133 o.alternate = altShiftTrimmed
134 }
135
136 // TODO: caseFirst ("kf"), reorder ("kr"), and maybe variableTop ("vt").
137
138 // Not used:
139 // - normalization ("kk", not necessary for this implementation)
140 // - hiraganaQuatenary ("kh", obsolete)
141}
142
143func ldmlBool(t language.Tag, old bool, key string) bool {
144 switch t.TypeForKey(key) {
145 case "true":
146 return true
147 case "false":
148 return false
149 default:
150 return old
151 }
152}
153
154var (
155 // IgnoreCase sets case-insensitive comparison.
156 IgnoreCase Option = ignoreCase
157 ignoreCase = Option{3, ignoreCaseF}
158
159 // IgnoreDiacritics causes diacritical marks to be ignored. ("o" == "รถ").
160 IgnoreDiacritics Option = ignoreDiacritics
161 ignoreDiacritics = Option{3, ignoreDiacriticsF}
162
163 // IgnoreWidth causes full-width characters to match their half-width
164 // equivalents.
165 IgnoreWidth Option = ignoreWidth
166 ignoreWidth = Option{2, ignoreWidthF}
167
Abhilash S.L3b494632019-07-16 15:51:09 +0530168 // Loose sets the collator to ignore diacritics, case and width.
William Kurkianea869482019-04-09 15:16:11 -0400169 Loose Option = loose
170 loose = Option{4, looseF}
171
172 // Force ordering if strings are equivalent but not equal.
173 Force Option = force
174 force = Option{5, forceF}
175
176 // Numeric specifies that numbers should sort numerically ("2" < "12").
177 Numeric Option = numeric
178 numeric = Option{5, numericF}
179)
180
181func ignoreWidthF(o *options) {
182 o.ignore[colltab.Tertiary] = true
183 o.caseLevel = true
184}
185
186func ignoreDiacriticsF(o *options) {
187 o.ignore[colltab.Secondary] = true
188}
189
190func ignoreCaseF(o *options) {
191 o.ignore[colltab.Tertiary] = true
192 o.caseLevel = false
193}
194
195func looseF(o *options) {
196 ignoreWidthF(o)
197 ignoreDiacriticsF(o)
198 ignoreCaseF(o)
199}
200
201func forceF(o *options) {
202 o.ignore[colltab.Identity] = false
203}
204
205func numericF(o *options) { o.numeric = true }
206
207// Reorder overrides the pre-defined ordering of scripts and character sets.
208func Reorder(s ...string) Option {
209 // TODO: need fractional weights to implement this.
210 panic("TODO: implement")
211}
212
213// TODO: consider making these public again. These options cannot be fully
214// specified in BCP47, so an API interface seems warranted. Still a higher-level
215// interface would be nice (e.g. a POSIX option for enabling altShiftTrimmed)
216
// alternateHandling identifies the various ways in which variables are handled.
// A rune with a primary weight lower than the variable top is considered a
// variable.
// See https://www.unicode.org/reports/tr10/#Variable_Weighting for details.
type alternateHandling int

// The supported ways of handling variables.
const (
	// altNonIgnorable turns off special handling of variables. It is the
	// zero value.
	altNonIgnorable alternateHandling = iota

	// altBlanked makes variables and all subsequent primary ignorables
	// ignorable at all levels. This is identical to removing all variables
	// and subsequent primary ignorables from the input.
	altBlanked

	// altShifted makes variables ignorable for levels one through three and
	// adds a fourth level based on the values of the ignored levels.
	altShifted

	// altShiftTrimmed is a slight variant of altShifted that is used to
	// emulate POSIX.
	altShiftTrimmed
)
239)