blob: 53b819cc3d5f36475db3796434255082d20df4cc [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7import (
8 "unicode"
9 "unicode/utf8"
10)
11
12// NewNumericWeighter wraps w to replace individual digits to sort based on their
13// numeric value.
14//
15// Weighter w must have a free primary weight after the primary weight for 9.
16// If this is not the case, numeric value will sort at the same primary level
17// as the first primary sorting after 9.
18func NewNumericWeighter(w Weighter) Weighter {
19 getElem := func(s string) Elem {
20 elems, _ := w.AppendNextString(nil, s)
21 return elems[0]
22 }
23 nine := getElem("9")
24
25 // Numbers should order before zero, but the DUCET has no room for this.
26 // TODO: move before zero once we use fractional collation elements.
27 ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0)
28
29 return &numericWeighter{
30 Weighter: w,
31
32 // We assume that w sorts digits of different kinds in order of numeric
33 // value and that the tertiary weight order is preserved.
34 //
35 // TODO: evaluate whether it is worth basing the ranges on the Elem
36 // encoding itself once the move to fractional weights is complete.
37 zero: getElem("0"),
38 zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO
39 zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO
40 nine: nine,
41 nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE
42 numberStart: ns,
43 }
44}
45
46// A numericWeighter translates a stream of digits into a stream of weights
47// representing the numeric value.
48type numericWeighter struct {
49 Weighter
50
51 // The Elems below all demarcate boundaries of specific ranges. With the
52 // current element encoding digits are in two ranges: normal (default
53 // tertiary value) and special. For most languages, digits have collation
54 // elements in the normal range.
55 //
56 // Note: the range tests are very specific for the element encoding used by
57 // this implementation. The tests in collate_test.go are designed to fail
58 // if this code is not updated when an encoding has changed.
59
60 zero Elem // normal digit zero
61 zeroSpecialLo Elem // special digit zero, low tertiary value
62 zeroSpecialHi Elem // special digit zero, high tertiary value
63 nine Elem // normal digit nine
64 nineSpecialHi Elem // special digit nine
65 numberStart Elem
66}
67
68// AppendNext calls the namesake of the underlying weigher, but replaces single
69// digits with weights representing their value.
70func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) {
71 ce, n = nw.Weighter.AppendNext(buf, s)
72 nc := numberConverter{
73 elems: buf,
74 w: nw,
75 b: s,
76 }
77 isZero, ok := nc.checkNextDigit(ce)
78 if !ok {
79 return ce, n
80 }
81 // ce might have been grown already, so take it instead of buf.
82 nc.init(ce, len(buf), isZero)
83 for n < len(s) {
84 ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:])
85 nc.b = s
86 n += sz
87 if !nc.update(ce) {
88 break
89 }
90 }
91 return nc.result(), n
92}
93
94// AppendNextString calls the namesake of the underlying weigher, but replaces
95// single digits with weights representing their value.
96func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) {
97 ce, n = nw.Weighter.AppendNextString(buf, s)
98 nc := numberConverter{
99 elems: buf,
100 w: nw,
101 s: s,
102 }
103 isZero, ok := nc.checkNextDigit(ce)
104 if !ok {
105 return ce, n
106 }
107 nc.init(ce, len(buf), isZero)
108 for n < len(s) {
109 ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:])
110 nc.s = s
111 n += sz
112 if !nc.update(ce) {
113 break
114 }
115 }
116 return nc.result(), n
117}
118
119type numberConverter struct {
120 w *numericWeighter
121
122 elems []Elem
123 nDigits int
124 lenIndex int
125
126 s string // set if the input was of type string
127 b []byte // set if the input was of type []byte
128}
129
130// init completes initialization of a numberConverter and prepares it for adding
131// more digits. elems is assumed to have a digit starting at oldLen.
132func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
133 // Insert a marker indicating the start of a number and a placeholder
134 // for the number of digits.
135 if isZero {
136 elems = append(elems[:oldLen], nc.w.numberStart, 0)
137 } else {
138 elems = append(elems, 0, 0)
139 copy(elems[oldLen+2:], elems[oldLen:])
140 elems[oldLen] = nc.w.numberStart
141 elems[oldLen+1] = 0
142
143 nc.nDigits = 1
144 }
145 nc.elems = elems
146 nc.lenIndex = oldLen + 1
147}
148
149// checkNextDigit reports whether bufNew adds a single digit relative to the old
150// buffer. If it does, it also reports whether this digit is zero.
151func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) {
152 if len(nc.elems) >= len(bufNew) {
153 return false, false
154 }
155 e := bufNew[len(nc.elems)]
156 if e < nc.w.zeroSpecialLo || nc.w.nine < e {
157 // Not a number.
158 return false, false
159 }
160 if e < nc.w.zero {
161 if e > nc.w.nineSpecialHi {
162 // Not a number.
163 return false, false
164 }
165 if !nc.isDigit() {
166 return false, false
167 }
168 isZero = e <= nc.w.zeroSpecialHi
169 } else {
170 // This is the common case if we encounter a digit.
171 isZero = e == nc.w.zero
172 }
173 // Test the remaining added collation elements have a zero primary value.
174 if n := len(bufNew) - len(nc.elems); n > 1 {
175 for i := len(nc.elems) + 1; i < len(bufNew); i++ {
176 if bufNew[i].Primary() != 0 {
177 return false, false
178 }
179 }
180 // In some rare cases, collation elements will encode runes in
181 // unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371)
182 // are not in Nd. Also some digits that clearly belong in unicode.No,
183 // like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have
184 // collation elements indistinguishable from normal digits.
185 // Unfortunately, this means we need to make this check for nearly all
186 // non-Latin digits.
187 //
188 // TODO: check the performance impact and find something better if it is
189 // an issue.
190 if !nc.isDigit() {
191 return false, false
192 }
193 }
194 return isZero, true
195}
196
197func (nc *numberConverter) isDigit() bool {
198 if nc.b != nil {
199 r, _ := utf8.DecodeRune(nc.b)
200 return unicode.In(r, unicode.Nd)
201 }
202 r, _ := utf8.DecodeRuneInString(nc.s)
203 return unicode.In(r, unicode.Nd)
204}
205
206// We currently support a maximum of about 2M digits (the number of primary
207// values). Such numbers will compare correctly against small numbers, but their
208// comparison against other large numbers is undefined.
209//
210// TODO: define a proper fallback, such as comparing large numbers textually or
211// actually allowing numbers of unlimited length.
212//
213// TODO: cap this to a lower number (like 100) and maybe allow a larger number
214// in an option?
215const maxDigits = 1<<maxPrimaryBits - 1
216
217func (nc *numberConverter) update(elems []Elem) bool {
218 isZero, ok := nc.checkNextDigit(elems)
219 if nc.nDigits == 0 && isZero {
220 return true
221 }
222 nc.elems = elems
223 if !ok {
224 return false
225 }
226 nc.nDigits++
227 return nc.nDigits < maxDigits
228}
229
230// result fills in the length element for the digit sequence and returns the
231// completed collation elements.
232func (nc *numberConverter) result() []Elem {
233 e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0)
234 nc.elems[nc.lenIndex] = e
235 return nc.elems
236}