khenaidoo | ac63710 | 2019-01-14 15:44:34 -0500 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package colltab |
| 6 | |
| 7 | import ( |
| 8 | "unicode/utf8" |
| 9 | |
| 10 | "golang.org/x/text/unicode/norm" |
| 11 | ) |
| 12 | |
| 13 | // Table holds all collation data for a given collation ordering. |
| 14 | type Table struct { |
| 15 | Index Trie // main trie |
| 16 | |
| 17 | // expansion info |
| 18 | ExpandElem []uint32 |
| 19 | |
| 20 | // contraction info |
| 21 | ContractTries ContractTrieSet |
| 22 | ContractElem []uint32 |
| 23 | MaxContractLen int |
| 24 | VariableTop uint32 |
| 25 | } |
| 26 | |
| 27 | func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) { |
| 28 | return t.appendNext(w, source{bytes: b}) |
| 29 | } |
| 30 | |
| 31 | func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) { |
| 32 | return t.appendNext(w, source{str: s}) |
| 33 | } |
| 34 | |
| 35 | func (t *Table) Start(p int, b []byte) int { |
| 36 | // TODO: implement |
| 37 | panic("not implemented") |
| 38 | } |
| 39 | |
| 40 | func (t *Table) StartString(p int, s string) int { |
| 41 | // TODO: implement |
| 42 | panic("not implemented") |
| 43 | } |
| 44 | |
| 45 | func (t *Table) Domain() []string { |
| 46 | // TODO: implement |
| 47 | panic("not implemented") |
| 48 | } |
| 49 | |
| 50 | func (t *Table) Top() uint32 { |
| 51 | return t.VariableTop |
| 52 | } |
| 53 | |
| 54 | type source struct { |
| 55 | str string |
| 56 | bytes []byte |
| 57 | } |
| 58 | |
| 59 | func (src *source) lookup(t *Table) (ce Elem, sz int) { |
| 60 | if src.bytes == nil { |
| 61 | return t.Index.lookupString(src.str) |
| 62 | } |
| 63 | return t.Index.lookup(src.bytes) |
| 64 | } |
| 65 | |
| 66 | func (src *source) tail(sz int) { |
| 67 | if src.bytes == nil { |
| 68 | src.str = src.str[sz:] |
| 69 | } else { |
| 70 | src.bytes = src.bytes[sz:] |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | func (src *source) nfd(buf []byte, end int) []byte { |
| 75 | if src.bytes == nil { |
| 76 | return norm.NFD.AppendString(buf[:0], src.str[:end]) |
| 77 | } |
| 78 | return norm.NFD.Append(buf[:0], src.bytes[:end]...) |
| 79 | } |
| 80 | |
| 81 | func (src *source) rune() (r rune, sz int) { |
| 82 | if src.bytes == nil { |
| 83 | return utf8.DecodeRuneInString(src.str) |
| 84 | } |
| 85 | return utf8.DecodeRune(src.bytes) |
| 86 | } |
| 87 | |
| 88 | func (src *source) properties(f norm.Form) norm.Properties { |
| 89 | if src.bytes == nil { |
| 90 | return f.PropertiesString(src.str) |
| 91 | } |
| 92 | return f.Properties(src.bytes) |
| 93 | } |
| 94 | |
| 95 | // appendNext appends the weights corresponding to the next rune or |
| 96 | // contraction in s. If a contraction is matched to a discontinuous |
| 97 | // sequence of runes, the weights for the interstitial runes are |
| 98 | // appended as well. It returns a new slice that includes the appended |
| 99 | // weights and the number of bytes consumed from s. |
| 100 | func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) { |
| 101 | ce, sz := src.lookup(t) |
| 102 | tp := ce.ctype() |
| 103 | if tp == ceNormal { |
| 104 | if ce == 0 { |
| 105 | r, _ := src.rune() |
| 106 | const ( |
| 107 | hangulSize = 3 |
| 108 | firstHangul = 0xAC00 |
| 109 | lastHangul = 0xD7A3 |
| 110 | ) |
| 111 | if r >= firstHangul && r <= lastHangul { |
| 112 | // TODO: performance can be considerably improved here. |
| 113 | n = sz |
| 114 | var buf [16]byte // Used for decomposing Hangul. |
| 115 | for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] { |
| 116 | ce, sz = t.Index.lookup(b) |
| 117 | w = append(w, ce) |
| 118 | } |
| 119 | return w, n |
| 120 | } |
| 121 | ce = makeImplicitCE(implicitPrimary(r)) |
| 122 | } |
| 123 | w = append(w, ce) |
| 124 | } else if tp == ceExpansionIndex { |
| 125 | w = t.appendExpansion(w, ce) |
| 126 | } else if tp == ceContractionIndex { |
| 127 | n := 0 |
| 128 | src.tail(sz) |
| 129 | if src.bytes == nil { |
| 130 | w, n = t.matchContractionString(w, ce, src.str) |
| 131 | } else { |
| 132 | w, n = t.matchContraction(w, ce, src.bytes) |
| 133 | } |
| 134 | sz += n |
| 135 | } else if tp == ceDecompose { |
| 136 | // Decompose using NFKD and replace tertiary weights. |
| 137 | t1, t2 := splitDecompose(ce) |
| 138 | i := len(w) |
| 139 | nfkd := src.properties(norm.NFKD).Decomposition() |
| 140 | for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] { |
| 141 | w, p = t.appendNext(w, source{bytes: nfkd}) |
| 142 | } |
| 143 | w[i] = w[i].updateTertiary(t1) |
| 144 | if i++; i < len(w) { |
| 145 | w[i] = w[i].updateTertiary(t2) |
| 146 | for i++; i < len(w); i++ { |
| 147 | w[i] = w[i].updateTertiary(maxTertiary) |
| 148 | } |
| 149 | } |
| 150 | } |
| 151 | return w, sz |
| 152 | } |
| 153 | |
| 154 | func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem { |
| 155 | i := splitExpandIndex(ce) |
| 156 | n := int(t.ExpandElem[i]) |
| 157 | i++ |
| 158 | for _, ce := range t.ExpandElem[i : i+n] { |
| 159 | w = append(w, Elem(ce)) |
| 160 | } |
| 161 | return w |
| 162 | } |
| 163 | |
| 164 | func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) { |
| 165 | index, n, offset := splitContractIndex(ce) |
| 166 | |
| 167 | scan := t.ContractTries.scanner(index, n, suffix) |
| 168 | buf := [norm.MaxSegmentSize]byte{} |
| 169 | bufp := 0 |
| 170 | p := scan.scan(0) |
| 171 | |
| 172 | if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf { |
| 173 | // By now we should have filtered most cases. |
| 174 | p0 := p |
| 175 | bufn := 0 |
| 176 | rune := norm.NFD.Properties(suffix[p:]) |
| 177 | p += rune.Size() |
| 178 | if rune.LeadCCC() != 0 { |
| 179 | prevCC := rune.TrailCCC() |
| 180 | // A gap may only occur in the last normalization segment. |
| 181 | // This also ensures that len(scan.s) < norm.MaxSegmentSize. |
| 182 | if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 { |
| 183 | scan.s = suffix[:p+end] |
| 184 | } |
| 185 | for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf { |
| 186 | rune = norm.NFD.Properties(suffix[p:]) |
| 187 | if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc { |
| 188 | break |
| 189 | } |
| 190 | prevCC = rune.TrailCCC() |
| 191 | if pp := scan.scan(p); pp != p { |
| 192 | // Copy the interstitial runes for later processing. |
| 193 | bufn += copy(buf[bufn:], suffix[p0:p]) |
| 194 | if scan.pindex == pp { |
| 195 | bufp = bufn |
| 196 | } |
| 197 | p, p0 = pp, pp |
| 198 | } else { |
| 199 | p += rune.Size() |
| 200 | } |
| 201 | } |
| 202 | } |
| 203 | } |
| 204 | // Append weights for the matched contraction, which may be an expansion. |
| 205 | i, n := scan.result() |
| 206 | ce = Elem(t.ContractElem[i+offset]) |
| 207 | if ce.ctype() == ceNormal { |
| 208 | w = append(w, ce) |
| 209 | } else { |
| 210 | w = t.appendExpansion(w, ce) |
| 211 | } |
| 212 | // Append weights for the runes in the segment not part of the contraction. |
| 213 | for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] { |
| 214 | w, p = t.appendNext(w, source{bytes: b}) |
| 215 | } |
| 216 | return w, n |
| 217 | } |
| 218 | |
| 219 | // TODO: unify the two implementations. This is best done after first simplifying |
| 220 | // the algorithm taking into account the inclusion of both NFC and NFD forms |
| 221 | // in the table. |
| 222 | func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) { |
| 223 | index, n, offset := splitContractIndex(ce) |
| 224 | |
| 225 | scan := t.ContractTries.scannerString(index, n, suffix) |
| 226 | buf := [norm.MaxSegmentSize]byte{} |
| 227 | bufp := 0 |
| 228 | p := scan.scan(0) |
| 229 | |
| 230 | if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf { |
| 231 | // By now we should have filtered most cases. |
| 232 | p0 := p |
| 233 | bufn := 0 |
| 234 | rune := norm.NFD.PropertiesString(suffix[p:]) |
| 235 | p += rune.Size() |
| 236 | if rune.LeadCCC() != 0 { |
| 237 | prevCC := rune.TrailCCC() |
| 238 | // A gap may only occur in the last normalization segment. |
| 239 | // This also ensures that len(scan.s) < norm.MaxSegmentSize. |
| 240 | if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 { |
| 241 | scan.s = suffix[:p+end] |
| 242 | } |
| 243 | for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf { |
| 244 | rune = norm.NFD.PropertiesString(suffix[p:]) |
| 245 | if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc { |
| 246 | break |
| 247 | } |
| 248 | prevCC = rune.TrailCCC() |
| 249 | if pp := scan.scan(p); pp != p { |
| 250 | // Copy the interstitial runes for later processing. |
| 251 | bufn += copy(buf[bufn:], suffix[p0:p]) |
| 252 | if scan.pindex == pp { |
| 253 | bufp = bufn |
| 254 | } |
| 255 | p, p0 = pp, pp |
| 256 | } else { |
| 257 | p += rune.Size() |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | } |
| 262 | // Append weights for the matched contraction, which may be an expansion. |
| 263 | i, n := scan.result() |
| 264 | ce = Elem(t.ContractElem[i+offset]) |
| 265 | if ce.ctype() == ceNormal { |
| 266 | w = append(w, ce) |
| 267 | } else { |
| 268 | w = t.appendExpansion(w, ce) |
| 269 | } |
| 270 | // Append weights for the runes in the segment not part of the contraction. |
| 271 | for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] { |
| 272 | w, p = t.appendNext(w, source{bytes: b}) |
| 273 | } |
| 274 | return w, n |
| 275 | } |