blob: e26e36da93542cdafb23581da1954f3e1b002dfd [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7import (
8 "unicode/utf8"
9
10 "golang.org/x/text/unicode/norm"
11)
12
13// Table holds all collation data for a given collation ordering.
14type Table struct {
15 Index Trie // main trie
16
17 // expansion info
18 ExpandElem []uint32
19
20 // contraction info
21 ContractTries ContractTrieSet
22 ContractElem []uint32
23 MaxContractLen int
24 VariableTop uint32
25}
26
27func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
28 return t.appendNext(w, source{bytes: b})
29}
30
31func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
32 return t.appendNext(w, source{str: s})
33}
34
35func (t *Table) Start(p int, b []byte) int {
36 // TODO: implement
37 panic("not implemented")
38}
39
40func (t *Table) StartString(p int, s string) int {
41 // TODO: implement
42 panic("not implemented")
43}
44
45func (t *Table) Domain() []string {
46 // TODO: implement
47 panic("not implemented")
48}
49
50func (t *Table) Top() uint32 {
51 return t.VariableTop
52}
53
54type source struct {
55 str string
56 bytes []byte
57}
58
59func (src *source) lookup(t *Table) (ce Elem, sz int) {
60 if src.bytes == nil {
61 return t.Index.lookupString(src.str)
62 }
63 return t.Index.lookup(src.bytes)
64}
65
66func (src *source) tail(sz int) {
67 if src.bytes == nil {
68 src.str = src.str[sz:]
69 } else {
70 src.bytes = src.bytes[sz:]
71 }
72}
73
74func (src *source) nfd(buf []byte, end int) []byte {
75 if src.bytes == nil {
76 return norm.NFD.AppendString(buf[:0], src.str[:end])
77 }
78 return norm.NFD.Append(buf[:0], src.bytes[:end]...)
79}
80
81func (src *source) rune() (r rune, sz int) {
82 if src.bytes == nil {
83 return utf8.DecodeRuneInString(src.str)
84 }
85 return utf8.DecodeRune(src.bytes)
86}
87
88func (src *source) properties(f norm.Form) norm.Properties {
89 if src.bytes == nil {
90 return f.PropertiesString(src.str)
91 }
92 return f.Properties(src.bytes)
93}
94
95// appendNext appends the weights corresponding to the next rune or
96// contraction in s. If a contraction is matched to a discontinuous
97// sequence of runes, the weights for the interstitial runes are
98// appended as well. It returns a new slice that includes the appended
99// weights and the number of bytes consumed from s.
100func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
101 ce, sz := src.lookup(t)
102 tp := ce.ctype()
103 if tp == ceNormal {
104 if ce == 0 {
105 r, _ := src.rune()
106 const (
107 hangulSize = 3
108 firstHangul = 0xAC00
109 lastHangul = 0xD7A3
110 )
111 if r >= firstHangul && r <= lastHangul {
112 // TODO: performance can be considerably improved here.
113 n = sz
114 var buf [16]byte // Used for decomposing Hangul.
115 for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
116 ce, sz = t.Index.lookup(b)
117 w = append(w, ce)
118 }
119 return w, n
120 }
121 ce = makeImplicitCE(implicitPrimary(r))
122 }
123 w = append(w, ce)
124 } else if tp == ceExpansionIndex {
125 w = t.appendExpansion(w, ce)
126 } else if tp == ceContractionIndex {
127 n := 0
128 src.tail(sz)
129 if src.bytes == nil {
130 w, n = t.matchContractionString(w, ce, src.str)
131 } else {
132 w, n = t.matchContraction(w, ce, src.bytes)
133 }
134 sz += n
135 } else if tp == ceDecompose {
136 // Decompose using NFKD and replace tertiary weights.
137 t1, t2 := splitDecompose(ce)
138 i := len(w)
139 nfkd := src.properties(norm.NFKD).Decomposition()
140 for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
141 w, p = t.appendNext(w, source{bytes: nfkd})
142 }
143 w[i] = w[i].updateTertiary(t1)
144 if i++; i < len(w) {
145 w[i] = w[i].updateTertiary(t2)
146 for i++; i < len(w); i++ {
147 w[i] = w[i].updateTertiary(maxTertiary)
148 }
149 }
150 }
151 return w, sz
152}
153
154func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
155 i := splitExpandIndex(ce)
156 n := int(t.ExpandElem[i])
157 i++
158 for _, ce := range t.ExpandElem[i : i+n] {
159 w = append(w, Elem(ce))
160 }
161 return w
162}
163
164func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
165 index, n, offset := splitContractIndex(ce)
166
167 scan := t.ContractTries.scanner(index, n, suffix)
168 buf := [norm.MaxSegmentSize]byte{}
169 bufp := 0
170 p := scan.scan(0)
171
172 if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
173 // By now we should have filtered most cases.
174 p0 := p
175 bufn := 0
176 rune := norm.NFD.Properties(suffix[p:])
177 p += rune.Size()
178 if rune.LeadCCC() != 0 {
179 prevCC := rune.TrailCCC()
180 // A gap may only occur in the last normalization segment.
181 // This also ensures that len(scan.s) < norm.MaxSegmentSize.
182 if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
183 scan.s = suffix[:p+end]
184 }
185 for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
186 rune = norm.NFD.Properties(suffix[p:])
187 if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
188 break
189 }
190 prevCC = rune.TrailCCC()
191 if pp := scan.scan(p); pp != p {
192 // Copy the interstitial runes for later processing.
193 bufn += copy(buf[bufn:], suffix[p0:p])
194 if scan.pindex == pp {
195 bufp = bufn
196 }
197 p, p0 = pp, pp
198 } else {
199 p += rune.Size()
200 }
201 }
202 }
203 }
204 // Append weights for the matched contraction, which may be an expansion.
205 i, n := scan.result()
206 ce = Elem(t.ContractElem[i+offset])
207 if ce.ctype() == ceNormal {
208 w = append(w, ce)
209 } else {
210 w = t.appendExpansion(w, ce)
211 }
212 // Append weights for the runes in the segment not part of the contraction.
213 for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
214 w, p = t.appendNext(w, source{bytes: b})
215 }
216 return w, n
217}
218
219// TODO: unify the two implementations. This is best done after first simplifying
220// the algorithm taking into account the inclusion of both NFC and NFD forms
221// in the table.
222func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
223 index, n, offset := splitContractIndex(ce)
224
225 scan := t.ContractTries.scannerString(index, n, suffix)
226 buf := [norm.MaxSegmentSize]byte{}
227 bufp := 0
228 p := scan.scan(0)
229
230 if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
231 // By now we should have filtered most cases.
232 p0 := p
233 bufn := 0
234 rune := norm.NFD.PropertiesString(suffix[p:])
235 p += rune.Size()
236 if rune.LeadCCC() != 0 {
237 prevCC := rune.TrailCCC()
238 // A gap may only occur in the last normalization segment.
239 // This also ensures that len(scan.s) < norm.MaxSegmentSize.
240 if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
241 scan.s = suffix[:p+end]
242 }
243 for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
244 rune = norm.NFD.PropertiesString(suffix[p:])
245 if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
246 break
247 }
248 prevCC = rune.TrailCCC()
249 if pp := scan.scan(p); pp != p {
250 // Copy the interstitial runes for later processing.
251 bufn += copy(buf[bufn:], suffix[p0:p])
252 if scan.pindex == pp {
253 bufp = bufn
254 }
255 p, p0 = pp, pp
256 } else {
257 p += rune.Size()
258 }
259 }
260 }
261 }
262 // Append weights for the matched contraction, which may be an expansion.
263 i, n := scan.result()
264 ce = Elem(t.ContractElem[i+offset])
265 if ce.ctype() == ceNormal {
266 w = append(w, ce)
267 } else {
268 w = t.appendExpansion(w, ce)
269 }
270 // Append weights for the runes in the segment not part of the contraction.
271 for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
272 w, p = t.appendNext(w, source{bytes: b})
273 }
274 return w, n
275}