blob: e2df64f0c656bc38f23b27c0fc27f51759e66f10 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package build
6
7import (
8 "fmt"
9 "io"
10 "reflect"
11 "sort"
12 "strings"
13
14 "golang.org/x/text/internal/colltab"
15)
16
17// This file contains code for detecting contractions and generating
18// the necessary tables.
19// Any Unicode Collation Algorithm (UCA) table entry that has more than
20// one rune one the left-hand side is called a contraction.
21// See https://www.unicode.org/reports/tr10/#Contractions for more details.
22//
23// We define the following terms:
24// initial: a rune that appears as the first rune in a contraction.
25// suffix: a sequence of runes succeeding the initial rune
26// in a given contraction.
27// non-initial: a rune that appears in a suffix.
28//
29// A rune may be both an initial and a non-initial and may be so in
30// many contractions. An initial may typically also appear by itself.
31// In case of ambiguities, the UCA requires we match the longest
32// contraction.
33//
34// Many contraction rules share the same set of possible suffixes.
35// We store sets of suffixes in a trie that associates an index with
36// each suffix in the set. This index can be used to look up a
37// collation element associated with the (starter rune, suffix) pair.
38//
39// The trie is defined on a UTF-8 byte sequence.
40// The overall trie is represented as an array of ctEntries. Each node of the trie
41// is represented as a subsequence of ctEntries, where each entry corresponds to
42// a possible match of a next character in the search string. An entry
43// also includes the length and offset to the next sequence of entries
44// to check in case of a match.
45
46const (
47 final = 0
48 noIndex = 0xFF
49)
50
51// ctEntry associates to a matching byte an offset and/or next sequence of
52// bytes to check. A ctEntry c is called final if a match means that the
53// longest suffix has been found. An entry c is final if c.N == 0.
54// A single final entry can match a range of characters to an offset.
55// A non-final entry always matches a single byte. Note that a non-final
56// entry might still resemble a completed suffix.
57// Examples:
58// The suffix strings "ab" and "ac" can be represented as:
59// []ctEntry{
60// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF.
61// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
62// }
63//
64// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
65// []ctEntry{
66// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
67// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
68// {'d', 'd', final, 3}, // "abd" -> 3
69// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
70// {'d', 'd', final, 4}, // "abcd" -> 4
71// }
72// See genStateTests in contract_test.go for more examples.
73type ctEntry struct {
74 L uint8 // non-final: byte value to match; final: lowest match in range.
75 H uint8 // non-final: relative index to next block; final: highest match in range.
76 N uint8 // non-final: length of next block; final: final
77 I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
78}
79
80// contractTrieSet holds a set of contraction tries. The tries are stored
81// consecutively in the entry field.
82type contractTrieSet []struct{ l, h, n, i uint8 }
83
84// ctHandle is used to identify a trie in the trie set, consisting in an offset
85// in the array and the size of the first node.
86type ctHandle struct {
87 index, n int
88}
89
90// appendTrie adds a new trie for the given suffixes to the trie set and returns
91// a handle to it. The handle will be invalid on error.
92func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
93 es := make([]stridx, len(suffixes))
94 for i, s := range suffixes {
95 es[i].str = s
96 }
97 sort.Sort(offsetSort(es))
98 for i := range es {
99 es[i].index = i + 1
100 }
101 sort.Sort(genidxSort(es))
102 i := len(*ct)
103 n, err := genStates(ct, es)
104 if err != nil {
105 *ct = (*ct)[:i]
106 return ctHandle{}, err
107 }
108 return ctHandle{i, n}, nil
109}
110
111// genStates generates ctEntries for a given suffix set and returns
112// the number of entries for the first node.
113func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
114 if len(sis) == 0 {
115 return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
116 }
117 start := len(*ct)
118 // create entries for differing first bytes.
119 for _, si := range sis {
120 s := si.str
121 if len(s) == 0 {
122 continue
123 }
124 added := false
125 c := s[0]
126 if len(s) > 1 {
127 for j := len(*ct) - 1; j >= start; j-- {
128 if (*ct)[j].L == c {
129 added = true
130 break
131 }
132 }
133 if !added {
134 *ct = append(*ct, ctEntry{L: c, I: noIndex})
135 }
136 } else {
137 for j := len(*ct) - 1; j >= start; j-- {
138 // Update the offset for longer suffixes with the same byte.
139 if (*ct)[j].L == c {
140 (*ct)[j].I = uint8(si.index)
141 added = true
142 }
143 // Extend range of final ctEntry, if possible.
144 if (*ct)[j].H+1 == c {
145 (*ct)[j].H = c
146 added = true
147 }
148 }
149 if !added {
150 *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
151 }
152 }
153 }
154 n := len(*ct) - start
155 // Append nodes for the remainder of the suffixes for each ctEntry.
156 sp := 0
157 for i, end := start, len(*ct); i < end; i++ {
158 fe := (*ct)[i]
159 if fe.H == 0 { // uninitialized non-final
160 ln := len(*ct) - start - n
161 if ln > 0xFF {
162 return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
163 }
164 fe.H = uint8(ln)
165 // Find first non-final strings with same byte as current entry.
166 for ; sis[sp].str[0] != fe.L; sp++ {
167 }
168 se := sp + 1
169 for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
170 }
171 sl := sis[sp:se]
172 sp = se
173 for i, si := range sl {
174 sl[i].str = si.str[1:]
175 }
176 nn, err := genStates(ct, sl)
177 if err != nil {
178 return 0, err
179 }
180 fe.N = uint8(nn)
181 (*ct)[i] = fe
182 }
183 }
184 sort.Sort(entrySort((*ct)[start : start+n]))
185 return n, nil
186}
187
188// There may be both a final and non-final entry for a byte if the byte
189// is implied in a range of matches in the final entry.
190// We need to ensure that the non-final entry comes first in that case.
191type entrySort colltab.ContractTrieSet
192
193func (fe entrySort) Len() int { return len(fe) }
194func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
195func (fe entrySort) Less(i, j int) bool {
196 return fe[i].L > fe[j].L
197}
198
199// stridx is used for sorting suffixes and their associated offsets.
200type stridx struct {
201 str string
202 index int
203}
204
205// For computing the offsets, we first sort by size, and then by string.
206// This ensures that strings that only differ in the last byte by 1
207// are sorted consecutively in increasing order such that they can
208// be packed as a range in a final ctEntry.
209type offsetSort []stridx
210
211func (si offsetSort) Len() int { return len(si) }
212func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
213func (si offsetSort) Less(i, j int) bool {
214 if len(si[i].str) != len(si[j].str) {
215 return len(si[i].str) > len(si[j].str)
216 }
217 return si[i].str < si[j].str
218}
219
220// For indexing, we want to ensure that strings are sorted in string order, where
221// for strings with the same prefix, we put longer strings before shorter ones.
222type genidxSort []stridx
223
224func (si genidxSort) Len() int { return len(si) }
225func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
226func (si genidxSort) Less(i, j int) bool {
227 if strings.HasPrefix(si[j].str, si[i].str) {
228 return false
229 }
230 if strings.HasPrefix(si[i].str, si[j].str) {
231 return true
232 }
233 return si[i].str < si[j].str
234}
235
236// lookup matches the longest suffix in str and returns the associated offset
237// and the number of bytes consumed.
238func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
239 states := (*ct)[h.index:]
240 p := 0
241 n := h.n
242 for i := 0; i < n && p < len(str); {
243 e := states[i]
244 c := str[p]
245 if c >= e.L {
246 if e.L == c {
247 p++
248 if e.I != noIndex {
249 index, ns = int(e.I), p
250 }
251 if e.N != final {
252 // set to new state
253 i, states, n = 0, states[int(e.H)+n:], int(e.N)
254 } else {
255 return
256 }
257 continue
258 } else if e.N == final && c <= e.H {
259 p++
260 return int(c-e.L) + int(e.I), p
261 }
262 }
263 i++
264 }
265 return
266}
267
268// print writes the contractTrieSet t as compilable Go code to w. It returns
269// the total number of bytes written and the size of the resulting data structure in bytes.
270func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
271 update3 := func(nn, sz int, e error) {
272 n += nn
273 if err == nil {
274 err = e
275 }
276 size += sz
277 }
278 update2 := func(nn int, e error) { update3(nn, 0, e) }
279
280 update3(printArray(*t, w, name))
281 update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
282 update3(printStruct(*t, w, name))
283 update2(fmt.Fprintln(w))
284 return
285}
286
287func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
288 p := func(f string, a ...interface{}) {
289 nn, e := fmt.Fprintf(w, f, a...)
290 n += nn
291 if err == nil {
292 err = e
293 }
294 }
295 size = len(ct) * 4
296 p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
297 p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
298 for _, fe := range ct {
299 p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
300 }
301 p("}\n")
302 return
303}
304
305func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
306 n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
307 size = int(reflect.TypeOf(ct).Size())
308 return
309}