William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package build |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "io" |
| 10 | "reflect" |
| 11 | "sort" |
| 12 | "strings" |
| 13 | |
| 14 | "golang.org/x/text/internal/colltab" |
| 15 | ) |
| 16 | |
| 17 | // This file contains code for detecting contractions and generating |
| 18 | // the necessary tables. |
| 19 | // Any Unicode Collation Algorithm (UCA) table entry that has more than |
| 20 | // one rune one the left-hand side is called a contraction. |
Abhilash S.L | 3b49463 | 2019-07-16 15:51:09 +0530 | [diff] [blame] | 21 | // See https://www.unicode.org/reports/tr10/#Contractions for more details. |
William Kurkian | ea86948 | 2019-04-09 15:16:11 -0400 | [diff] [blame] | 22 | // |
| 23 | // We define the following terms: |
| 24 | // initial: a rune that appears as the first rune in a contraction. |
| 25 | // suffix: a sequence of runes succeeding the initial rune |
| 26 | // in a given contraction. |
| 27 | // non-initial: a rune that appears in a suffix. |
| 28 | // |
| 29 | // A rune may be both an initial and a non-initial and may be so in |
| 30 | // many contractions. An initial may typically also appear by itself. |
| 31 | // In case of ambiguities, the UCA requires we match the longest |
| 32 | // contraction. |
| 33 | // |
| 34 | // Many contraction rules share the same set of possible suffixes. |
| 35 | // We store sets of suffixes in a trie that associates an index with |
| 36 | // each suffix in the set. This index can be used to look up a |
| 37 | // collation element associated with the (starter rune, suffix) pair. |
| 38 | // |
| 39 | // The trie is defined on a UTF-8 byte sequence. |
| 40 | // The overall trie is represented as an array of ctEntries. Each node of the trie |
| 41 | // is represented as a subsequence of ctEntries, where each entry corresponds to |
| 42 | // a possible match of a next character in the search string. An entry |
| 43 | // also includes the length and offset to the next sequence of entries |
| 44 | // to check in case of a match. |
| 45 | |
| 46 | const ( |
| 47 | final = 0 |
| 48 | noIndex = 0xFF |
| 49 | ) |
| 50 | |
| 51 | // ctEntry associates to a matching byte an offset and/or next sequence of |
| 52 | // bytes to check. A ctEntry c is called final if a match means that the |
| 53 | // longest suffix has been found. An entry c is final if c.N == 0. |
| 54 | // A single final entry can match a range of characters to an offset. |
| 55 | // A non-final entry always matches a single byte. Note that a non-final |
| 56 | // entry might still resemble a completed suffix. |
| 57 | // Examples: |
| 58 | // The suffix strings "ab" and "ac" can be represented as: |
| 59 | // []ctEntry{ |
| 60 | // {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. |
| 61 | // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 |
| 62 | // } |
| 63 | // |
| 64 | // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: |
| 65 | // []ctEntry{ |
| 66 | // {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. |
| 67 | // {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. |
| 68 | // {'d', 'd', final, 3}, // "abd" -> 3 |
| 69 | // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. |
| 70 | // {'d', 'd', final, 4}, // "abcd" -> 4 |
| 71 | // } |
| 72 | // See genStateTests in contract_test.go for more examples. |
| 73 | type ctEntry struct { |
| 74 | L uint8 // non-final: byte value to match; final: lowest match in range. |
| 75 | H uint8 // non-final: relative index to next block; final: highest match in range. |
| 76 | N uint8 // non-final: length of next block; final: final |
| 77 | I uint8 // result offset. Will be noIndex if more bytes are needed to complete. |
| 78 | } |
| 79 | |
| 80 | // contractTrieSet holds a set of contraction tries. The tries are stored |
| 81 | // consecutively in the entry field. |
| 82 | type contractTrieSet []struct{ l, h, n, i uint8 } |
| 83 | |
| 84 | // ctHandle is used to identify a trie in the trie set, consisting in an offset |
| 85 | // in the array and the size of the first node. |
| 86 | type ctHandle struct { |
| 87 | index, n int |
| 88 | } |
| 89 | |
| 90 | // appendTrie adds a new trie for the given suffixes to the trie set and returns |
| 91 | // a handle to it. The handle will be invalid on error. |
| 92 | func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) { |
| 93 | es := make([]stridx, len(suffixes)) |
| 94 | for i, s := range suffixes { |
| 95 | es[i].str = s |
| 96 | } |
| 97 | sort.Sort(offsetSort(es)) |
| 98 | for i := range es { |
| 99 | es[i].index = i + 1 |
| 100 | } |
| 101 | sort.Sort(genidxSort(es)) |
| 102 | i := len(*ct) |
| 103 | n, err := genStates(ct, es) |
| 104 | if err != nil { |
| 105 | *ct = (*ct)[:i] |
| 106 | return ctHandle{}, err |
| 107 | } |
| 108 | return ctHandle{i, n}, nil |
| 109 | } |
| 110 | |
| 111 | // genStates generates ctEntries for a given suffix set and returns |
| 112 | // the number of entries for the first node. |
| 113 | func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) { |
| 114 | if len(sis) == 0 { |
| 115 | return 0, fmt.Errorf("genStates: list of suffices must be non-empty") |
| 116 | } |
| 117 | start := len(*ct) |
| 118 | // create entries for differing first bytes. |
| 119 | for _, si := range sis { |
| 120 | s := si.str |
| 121 | if len(s) == 0 { |
| 122 | continue |
| 123 | } |
| 124 | added := false |
| 125 | c := s[0] |
| 126 | if len(s) > 1 { |
| 127 | for j := len(*ct) - 1; j >= start; j-- { |
| 128 | if (*ct)[j].L == c { |
| 129 | added = true |
| 130 | break |
| 131 | } |
| 132 | } |
| 133 | if !added { |
| 134 | *ct = append(*ct, ctEntry{L: c, I: noIndex}) |
| 135 | } |
| 136 | } else { |
| 137 | for j := len(*ct) - 1; j >= start; j-- { |
| 138 | // Update the offset for longer suffixes with the same byte. |
| 139 | if (*ct)[j].L == c { |
| 140 | (*ct)[j].I = uint8(si.index) |
| 141 | added = true |
| 142 | } |
| 143 | // Extend range of final ctEntry, if possible. |
| 144 | if (*ct)[j].H+1 == c { |
| 145 | (*ct)[j].H = c |
| 146 | added = true |
| 147 | } |
| 148 | } |
| 149 | if !added { |
| 150 | *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)}) |
| 151 | } |
| 152 | } |
| 153 | } |
| 154 | n := len(*ct) - start |
| 155 | // Append nodes for the remainder of the suffixes for each ctEntry. |
| 156 | sp := 0 |
| 157 | for i, end := start, len(*ct); i < end; i++ { |
| 158 | fe := (*ct)[i] |
| 159 | if fe.H == 0 { // uninitialized non-final |
| 160 | ln := len(*ct) - start - n |
| 161 | if ln > 0xFF { |
| 162 | return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) |
| 163 | } |
| 164 | fe.H = uint8(ln) |
| 165 | // Find first non-final strings with same byte as current entry. |
| 166 | for ; sis[sp].str[0] != fe.L; sp++ { |
| 167 | } |
| 168 | se := sp + 1 |
| 169 | for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ { |
| 170 | } |
| 171 | sl := sis[sp:se] |
| 172 | sp = se |
| 173 | for i, si := range sl { |
| 174 | sl[i].str = si.str[1:] |
| 175 | } |
| 176 | nn, err := genStates(ct, sl) |
| 177 | if err != nil { |
| 178 | return 0, err |
| 179 | } |
| 180 | fe.N = uint8(nn) |
| 181 | (*ct)[i] = fe |
| 182 | } |
| 183 | } |
| 184 | sort.Sort(entrySort((*ct)[start : start+n])) |
| 185 | return n, nil |
| 186 | } |
| 187 | |
| 188 | // There may be both a final and non-final entry for a byte if the byte |
| 189 | // is implied in a range of matches in the final entry. |
| 190 | // We need to ensure that the non-final entry comes first in that case. |
| 191 | type entrySort colltab.ContractTrieSet |
| 192 | |
| 193 | func (fe entrySort) Len() int { return len(fe) } |
| 194 | func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } |
| 195 | func (fe entrySort) Less(i, j int) bool { |
| 196 | return fe[i].L > fe[j].L |
| 197 | } |
| 198 | |
| 199 | // stridx is used for sorting suffixes and their associated offsets. |
| 200 | type stridx struct { |
| 201 | str string |
| 202 | index int |
| 203 | } |
| 204 | |
| 205 | // For computing the offsets, we first sort by size, and then by string. |
| 206 | // This ensures that strings that only differ in the last byte by 1 |
| 207 | // are sorted consecutively in increasing order such that they can |
| 208 | // be packed as a range in a final ctEntry. |
| 209 | type offsetSort []stridx |
| 210 | |
| 211 | func (si offsetSort) Len() int { return len(si) } |
| 212 | func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } |
| 213 | func (si offsetSort) Less(i, j int) bool { |
| 214 | if len(si[i].str) != len(si[j].str) { |
| 215 | return len(si[i].str) > len(si[j].str) |
| 216 | } |
| 217 | return si[i].str < si[j].str |
| 218 | } |
| 219 | |
| 220 | // For indexing, we want to ensure that strings are sorted in string order, where |
| 221 | // for strings with the same prefix, we put longer strings before shorter ones. |
| 222 | type genidxSort []stridx |
| 223 | |
| 224 | func (si genidxSort) Len() int { return len(si) } |
| 225 | func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } |
| 226 | func (si genidxSort) Less(i, j int) bool { |
| 227 | if strings.HasPrefix(si[j].str, si[i].str) { |
| 228 | return false |
| 229 | } |
| 230 | if strings.HasPrefix(si[i].str, si[j].str) { |
| 231 | return true |
| 232 | } |
| 233 | return si[i].str < si[j].str |
| 234 | } |
| 235 | |
| 236 | // lookup matches the longest suffix in str and returns the associated offset |
| 237 | // and the number of bytes consumed. |
| 238 | func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) { |
| 239 | states := (*ct)[h.index:] |
| 240 | p := 0 |
| 241 | n := h.n |
| 242 | for i := 0; i < n && p < len(str); { |
| 243 | e := states[i] |
| 244 | c := str[p] |
| 245 | if c >= e.L { |
| 246 | if e.L == c { |
| 247 | p++ |
| 248 | if e.I != noIndex { |
| 249 | index, ns = int(e.I), p |
| 250 | } |
| 251 | if e.N != final { |
| 252 | // set to new state |
| 253 | i, states, n = 0, states[int(e.H)+n:], int(e.N) |
| 254 | } else { |
| 255 | return |
| 256 | } |
| 257 | continue |
| 258 | } else if e.N == final && c <= e.H { |
| 259 | p++ |
| 260 | return int(c-e.L) + int(e.I), p |
| 261 | } |
| 262 | } |
| 263 | i++ |
| 264 | } |
| 265 | return |
| 266 | } |
| 267 | |
| 268 | // print writes the contractTrieSet t as compilable Go code to w. It returns |
| 269 | // the total number of bytes written and the size of the resulting data structure in bytes. |
| 270 | func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { |
| 271 | update3 := func(nn, sz int, e error) { |
| 272 | n += nn |
| 273 | if err == nil { |
| 274 | err = e |
| 275 | } |
| 276 | size += sz |
| 277 | } |
| 278 | update2 := func(nn int, e error) { update3(nn, 0, e) } |
| 279 | |
| 280 | update3(printArray(*t, w, name)) |
| 281 | update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) |
| 282 | update3(printStruct(*t, w, name)) |
| 283 | update2(fmt.Fprintln(w)) |
| 284 | return |
| 285 | } |
| 286 | |
| 287 | func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { |
| 288 | p := func(f string, a ...interface{}) { |
| 289 | nn, e := fmt.Fprintf(w, f, a...) |
| 290 | n += nn |
| 291 | if err == nil { |
| 292 | err = e |
| 293 | } |
| 294 | } |
| 295 | size = len(ct) * 4 |
| 296 | p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) |
| 297 | p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct)) |
| 298 | for _, fe := range ct { |
| 299 | p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I) |
| 300 | } |
| 301 | p("}\n") |
| 302 | return |
| 303 | } |
| 304 | |
| 305 | func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { |
| 306 | n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name) |
| 307 | size = int(reflect.TypeOf(ct).Size()) |
| 308 | return |
| 309 | } |