William Kurkian | 6ea97f8 | 2019-03-13 15:51:55 -0400 | [diff] [blame^] | 1 | // Copyright 2014 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package colltab |
| 6 | |
| 7 | import ( |
| 8 | "unicode" |
| 9 | "unicode/utf8" |
| 10 | ) |
| 11 | |
| 12 | // NewNumericWeighter wraps w to replace individual digits to sort based on their |
| 13 | // numeric value. |
| 14 | // |
| 15 | // Weighter w must have a free primary weight after the primary weight for 9. |
| 16 | // If this is not the case, numeric value will sort at the same primary level |
| 17 | // as the first primary sorting after 9. |
| 18 | func NewNumericWeighter(w Weighter) Weighter { |
| 19 | getElem := func(s string) Elem { |
| 20 | elems, _ := w.AppendNextString(nil, s) |
| 21 | return elems[0] |
| 22 | } |
| 23 | nine := getElem("9") |
| 24 | |
| 25 | // Numbers should order before zero, but the DUCET has no room for this. |
| 26 | // TODO: move before zero once we use fractional collation elements. |
| 27 | ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0) |
| 28 | |
| 29 | return &numericWeighter{ |
| 30 | Weighter: w, |
| 31 | |
| 32 | // We assume that w sorts digits of different kinds in order of numeric |
| 33 | // value and that the tertiary weight order is preserved. |
| 34 | // |
| 35 | // TODO: evaluate whether it is worth basing the ranges on the Elem |
| 36 | // encoding itself once the move to fractional weights is complete. |
| 37 | zero: getElem("0"), |
| 38 | zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO |
| 39 | zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO |
| 40 | nine: nine, |
| 41 | nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE |
| 42 | numberStart: ns, |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | // A numericWeighter translates a stream of digits into a stream of weights |
| 47 | // representing the numeric value. |
| 48 | type numericWeighter struct { |
| 49 | Weighter |
| 50 | |
| 51 | // The Elems below all demarcate boundaries of specific ranges. With the |
| 52 | // current element encoding digits are in two ranges: normal (default |
| 53 | // tertiary value) and special. For most languages, digits have collation |
| 54 | // elements in the normal range. |
| 55 | // |
| 56 | // Note: the range tests are very specific for the element encoding used by |
| 57 | // this implementation. The tests in collate_test.go are designed to fail |
| 58 | // if this code is not updated when an encoding has changed. |
| 59 | |
| 60 | zero Elem // normal digit zero |
| 61 | zeroSpecialLo Elem // special digit zero, low tertiary value |
| 62 | zeroSpecialHi Elem // special digit zero, high tertiary value |
| 63 | nine Elem // normal digit nine |
| 64 | nineSpecialHi Elem // special digit nine |
| 65 | numberStart Elem |
| 66 | } |
| 67 | |
| 68 | // AppendNext calls the namesake of the underlying weigher, but replaces single |
| 69 | // digits with weights representing their value. |
| 70 | func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) { |
| 71 | ce, n = nw.Weighter.AppendNext(buf, s) |
| 72 | nc := numberConverter{ |
| 73 | elems: buf, |
| 74 | w: nw, |
| 75 | b: s, |
| 76 | } |
| 77 | isZero, ok := nc.checkNextDigit(ce) |
| 78 | if !ok { |
| 79 | return ce, n |
| 80 | } |
| 81 | // ce might have been grown already, so take it instead of buf. |
| 82 | nc.init(ce, len(buf), isZero) |
| 83 | for n < len(s) { |
| 84 | ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:]) |
| 85 | nc.b = s |
| 86 | n += sz |
| 87 | if !nc.update(ce) { |
| 88 | break |
| 89 | } |
| 90 | } |
| 91 | return nc.result(), n |
| 92 | } |
| 93 | |
| 94 | // AppendNextString calls the namesake of the underlying weigher, but replaces |
| 95 | // single digits with weights representing their value. |
| 96 | func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) { |
| 97 | ce, n = nw.Weighter.AppendNextString(buf, s) |
| 98 | nc := numberConverter{ |
| 99 | elems: buf, |
| 100 | w: nw, |
| 101 | s: s, |
| 102 | } |
| 103 | isZero, ok := nc.checkNextDigit(ce) |
| 104 | if !ok { |
| 105 | return ce, n |
| 106 | } |
| 107 | nc.init(ce, len(buf), isZero) |
| 108 | for n < len(s) { |
| 109 | ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:]) |
| 110 | nc.s = s |
| 111 | n += sz |
| 112 | if !nc.update(ce) { |
| 113 | break |
| 114 | } |
| 115 | } |
| 116 | return nc.result(), n |
| 117 | } |
| 118 | |
| 119 | type numberConverter struct { |
| 120 | w *numericWeighter |
| 121 | |
| 122 | elems []Elem |
| 123 | nDigits int |
| 124 | lenIndex int |
| 125 | |
| 126 | s string // set if the input was of type string |
| 127 | b []byte // set if the input was of type []byte |
| 128 | } |
| 129 | |
| 130 | // init completes initialization of a numberConverter and prepares it for adding |
| 131 | // more digits. elems is assumed to have a digit starting at oldLen. |
| 132 | func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) { |
| 133 | // Insert a marker indicating the start of a number and and a placeholder |
| 134 | // for the number of digits. |
| 135 | if isZero { |
| 136 | elems = append(elems[:oldLen], nc.w.numberStart, 0) |
| 137 | } else { |
| 138 | elems = append(elems, 0, 0) |
| 139 | copy(elems[oldLen+2:], elems[oldLen:]) |
| 140 | elems[oldLen] = nc.w.numberStart |
| 141 | elems[oldLen+1] = 0 |
| 142 | |
| 143 | nc.nDigits = 1 |
| 144 | } |
| 145 | nc.elems = elems |
| 146 | nc.lenIndex = oldLen + 1 |
| 147 | } |
| 148 | |
| 149 | // checkNextDigit reports whether bufNew adds a single digit relative to the old |
| 150 | // buffer. If it does, it also reports whether this digit is zero. |
| 151 | func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) { |
| 152 | if len(nc.elems) >= len(bufNew) { |
| 153 | return false, false |
| 154 | } |
| 155 | e := bufNew[len(nc.elems)] |
| 156 | if e < nc.w.zeroSpecialLo || nc.w.nine < e { |
| 157 | // Not a number. |
| 158 | return false, false |
| 159 | } |
| 160 | if e < nc.w.zero { |
| 161 | if e > nc.w.nineSpecialHi { |
| 162 | // Not a number. |
| 163 | return false, false |
| 164 | } |
| 165 | if !nc.isDigit() { |
| 166 | return false, false |
| 167 | } |
| 168 | isZero = e <= nc.w.zeroSpecialHi |
| 169 | } else { |
| 170 | // This is the common case if we encounter a digit. |
| 171 | isZero = e == nc.w.zero |
| 172 | } |
| 173 | // Test the remaining added collation elements have a zero primary value. |
| 174 | if n := len(bufNew) - len(nc.elems); n > 1 { |
| 175 | for i := len(nc.elems) + 1; i < len(bufNew); i++ { |
| 176 | if bufNew[i].Primary() != 0 { |
| 177 | return false, false |
| 178 | } |
| 179 | } |
| 180 | // In some rare cases, collation elements will encode runes in |
| 181 | // unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371) |
| 182 | // are not in Nd. Also some digits that clearly belong in unicode.No, |
| 183 | // like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have |
| 184 | // collation elements indistinguishable from normal digits. |
| 185 | // Unfortunately, this means we need to make this check for nearly all |
| 186 | // non-Latin digits. |
| 187 | // |
| 188 | // TODO: check the performance impact and find something better if it is |
| 189 | // an issue. |
| 190 | if !nc.isDigit() { |
| 191 | return false, false |
| 192 | } |
| 193 | } |
| 194 | return isZero, true |
| 195 | } |
| 196 | |
| 197 | func (nc *numberConverter) isDigit() bool { |
| 198 | if nc.b != nil { |
| 199 | r, _ := utf8.DecodeRune(nc.b) |
| 200 | return unicode.In(r, unicode.Nd) |
| 201 | } |
| 202 | r, _ := utf8.DecodeRuneInString(nc.s) |
| 203 | return unicode.In(r, unicode.Nd) |
| 204 | } |
| 205 | |
| 206 | // We currently support a maximum of about 2M digits (the number of primary |
| 207 | // values). Such numbers will compare correctly against small numbers, but their |
| 208 | // comparison against other large numbers is undefined. |
| 209 | // |
| 210 | // TODO: define a proper fallback, such as comparing large numbers textually or |
| 211 | // actually allowing numbers of unlimited length. |
| 212 | // |
| 213 | // TODO: cap this to a lower number (like 100) and maybe allow a larger number |
| 214 | // in an option? |
| 215 | const maxDigits = 1<<maxPrimaryBits - 1 |
| 216 | |
| 217 | func (nc *numberConverter) update(elems []Elem) bool { |
| 218 | isZero, ok := nc.checkNextDigit(elems) |
| 219 | if nc.nDigits == 0 && isZero { |
| 220 | return true |
| 221 | } |
| 222 | nc.elems = elems |
| 223 | if !ok { |
| 224 | return false |
| 225 | } |
| 226 | nc.nDigits++ |
| 227 | return nc.nDigits < maxDigits |
| 228 | } |
| 229 | |
| 230 | // result fills in the length element for the digit sequence and returns the |
| 231 | // completed collation elements. |
| 232 | func (nc *numberConverter) result() []Elem { |
| 233 | e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0) |
| 234 | nc.elems[nc.lenIndex] = e |
| 235 | return nc.elems |
| 236 | } |