blob: 04fc3bfb72b0395fa1e79fea9cabefef2493b6b7 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package build
6
7import (
8 "fmt"
9 "unicode"
10
11 "golang.org/x/text/internal/colltab"
12)
13
14const (
15 defaultSecondary = 0x20
16 defaultTertiary = 0x2
17 maxTertiary = 0x1F
18)
19
20type rawCE struct {
21 w []int
22 ccc uint8
23}
24
25func makeRawCE(w []int, ccc uint8) rawCE {
26 ce := rawCE{w: make([]int, 4), ccc: ccc}
27 copy(ce.w, w)
28 return ce
29}
30
31// A collation element is represented as an uint32.
32// In the typical case, a rune maps to a single collation element. If a rune
33// can be the start of a contraction or expands into multiple collation elements,
34// then the collation element that is associated with a rune will have a special
35// form to represent such m to n mappings. Such special collation elements
36// have a value >= 0x80000000.
37
38const (
39 maxPrimaryBits = 21
40 maxSecondaryBits = 12
41 maxTertiaryBits = 8
42)
43
44func makeCE(ce rawCE) (uint32, error) {
45 v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
46 return uint32(v), e
47}
48
49// For contractions, collation elements are of the form
50// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
51// - n* is the size of the first node in the contraction trie.
52// - i* is the index of the first node in the contraction trie.
53// - b* is the offset into the contraction collation element table.
54// See contract.go for details on the contraction trie.
55const (
56 contractID = 0xC0000000
57 maxNBits = 4
58 maxTrieIndexBits = 12
59 maxContractOffsetBits = 13
60)
61
62func makeContractIndex(h ctHandle, offset int) (uint32, error) {
63 if h.n >= 1<<maxNBits {
64 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
65 }
66 if h.index >= 1<<maxTrieIndexBits {
67 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
68 }
69 if offset >= 1<<maxContractOffsetBits {
70 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
71 }
72 ce := uint32(contractID)
73 ce += uint32(offset << (maxNBits + maxTrieIndexBits))
74 ce += uint32(h.index << maxNBits)
75 ce += uint32(h.n)
76 return ce, nil
77}
78
79// For expansions, collation elements are of the form
80// 11100000 00000000 bbbbbbbb bbbbbbbb,
81// where b* is the index into the expansion sequence table.
82const (
83 expandID = 0xE0000000
84 maxExpandIndexBits = 16
85)
86
87func makeExpandIndex(index int) (uint32, error) {
88 if index >= 1<<maxExpandIndexBits {
89 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
90 }
91 return expandID + uint32(index), nil
92}
93
94// Each list of collation elements corresponding to an expansion starts with
95// a header indicating the length of the sequence.
96func makeExpansionHeader(n int) (uint32, error) {
97 return uint32(n), nil
98}
99
100// Some runes can be expanded using NFKD decomposition. Instead of storing the full
101// sequence of collation elements, we decompose the rune and lookup the collation
102// elements for each rune in the decomposition and modify the tertiary weights.
103// The collation element, in this case, is of the form
104// 11110000 00000000 wwwwwwww vvvvvvvv, where
105// - v* is the replacement tertiary weight for the first rune,
106// - w* is the replacement tertiary weight for the second rune,
107// Tertiary weights of subsequent runes should be replaced with maxTertiary.
108// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
109const (
110 decompID = 0xF0000000
111)
112
113func makeDecompose(t1, t2 int) (uint32, error) {
114 if t1 >= 256 || t1 < 0 {
115 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
116 }
117 if t2 >= 256 || t2 < 0 {
118 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
119 }
120 return uint32(t2<<8+t1) + decompID, nil
121}
122
123const (
124 // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
125 minUnified rune = 0x4E00
126 maxUnified = 0x9FFF
127 minCompatibility = 0xF900
128 maxCompatibility = 0xFAFF
129 minRare = 0x3400
130 maxRare = 0x4DBF
131)
132const (
133 commonUnifiedOffset = 0x10000
134 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
135 otherOffset = 0x50000 // largest rune in rare is U+2FA1D
136 illegalOffset = otherOffset + int(unicode.MaxRune)
137 maxPrimary = illegalOffset + 1
138)
139
140// implicitPrimary returns the primary weight for the a rune
141// for which there is no entry for the rune in the collation table.
142// We take a different approach from the one specified in
143// https://unicode.org/reports/tr10/#Implicit_Weights,
144// but preserve the resulting relative ordering of the runes.
145func implicitPrimary(r rune) int {
146 if unicode.Is(unicode.Ideographic, r) {
147 if r >= minUnified && r <= maxUnified {
148 // The most common case for CJK.
149 return int(r) + commonUnifiedOffset
150 }
151 if r >= minCompatibility && r <= maxCompatibility {
152 // This will typically not hit. The DUCET explicitly specifies mappings
153 // for all characters that do not decompose.
154 return int(r) + commonUnifiedOffset
155 }
156 return int(r) + rareUnifiedOffset
157 }
158 return int(r) + otherOffset
159}
160
161// convertLargeWeights converts collation elements with large
162// primaries (either double primaries or for illegal runes)
163// to our own representation.
164// A CJK character C is represented in the DUCET as
165// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
166// We will rewrite these characters to a single CE.
167// We assume the CJK values start at 0x8000.
168// See https://unicode.org/reports/tr10/#Implicit_Weights
169func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
170 const (
171 cjkPrimaryStart = 0xFB40
172 rarePrimaryStart = 0xFB80
173 otherPrimaryStart = 0xFBC0
174 illegalPrimary = 0xFFFE
175 highBitsMask = 0x3F
176 lowBitsMask = 0x7FFF
177 lowBitsFlag = 0x8000
178 shiftBits = 15
179 )
180 for i := 0; i < len(elems); i++ {
181 ce := elems[i].w
182 p := ce[0]
183 if p < cjkPrimaryStart {
184 continue
185 }
186 if p > 0xFFFF {
187 return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
188 }
189 if p >= illegalPrimary {
190 ce[0] = illegalOffset + p - illegalPrimary
191 } else {
192 if i+1 >= len(elems) {
193 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
194 }
195 if elems[i+1].w[0]&lowBitsFlag == 0 {
196 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
197 }
198 np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
199 switch {
200 case p < rarePrimaryStart:
201 np += commonUnifiedOffset
202 case p < otherPrimaryStart:
203 np += rareUnifiedOffset
204 default:
205 p += otherOffset
206 }
207 ce[0] = np
208 for j := i + 1; j+1 < len(elems); j++ {
209 elems[j] = elems[j+1]
210 }
211 elems = elems[:len(elems)-1]
212 }
213 }
214 return elems, nil
215}
216
217// nextWeight computes the first possible collation weights following elems
218// for the given level.
219func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
220 if level == colltab.Identity {
221 next := make([]rawCE, len(elems))
222 copy(next, elems)
223 return next
224 }
225 next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
226 next[0].w[level]++
227 if level < colltab.Secondary {
228 next[0].w[colltab.Secondary] = defaultSecondary
229 }
230 if level < colltab.Tertiary {
231 next[0].w[colltab.Tertiary] = defaultTertiary
232 }
233 // Filter entries that cannot influence ordering.
234 for _, ce := range elems[1:] {
235 skip := true
236 for i := colltab.Primary; i < level; i++ {
237 skip = skip && ce.w[i] == 0
238 }
239 if !skip {
240 next = append(next, ce)
241 }
242 }
243 return next
244}
245
246func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
247 for ; i < len(elems) && elems[i].w[level] == 0; i++ {
248 }
249 if i < len(elems) {
250 return i, elems[i].w[level]
251 }
252 return i, 0
253}
254
255// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
256// It also returns the collation level at which the difference is found.
257func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
258 for level := colltab.Primary; level < colltab.Identity; level++ {
259 var va, vb int
260 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
261 ia, va = nextVal(a, ia, level)
262 ib, vb = nextVal(b, ib, level)
263 if va != vb {
264 if va < vb {
265 return -1, level
266 } else {
267 return 1, level
268 }
269 }
270 }
271 }
272 return 0, colltab.Identity
273}
274
275func equalCE(a, b rawCE) bool {
276 for i := 0; i < 3; i++ {
277 if b.w[i] != a.w[i] {
278 return false
279 }
280 }
281 return true
282}
283
284func equalCEArrays(a, b []rawCE) bool {
285 if len(a) != len(b) {
286 return false
287 }
288 for i := range a {
289 if !equalCE(a[i], b[i]) {
290 return false
291 }
292 }
293 return true
294}