Scott Baker | e7144bc | 2019-10-01 14:16:47 -0700 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package build |
| 6 | |
| 7 | import ( |
| 8 | "fmt" |
| 9 | "unicode" |
| 10 | |
| 11 | "golang.org/x/text/internal/colltab" |
| 12 | ) |
| 13 | |
| 14 | const ( |
| 15 | defaultSecondary = 0x20 |
| 16 | defaultTertiary = 0x2 |
| 17 | maxTertiary = 0x1F |
| 18 | ) |
| 19 | |
| 20 | type rawCE struct { |
| 21 | w []int |
| 22 | ccc uint8 |
| 23 | } |
| 24 | |
| 25 | func makeRawCE(w []int, ccc uint8) rawCE { |
| 26 | ce := rawCE{w: make([]int, 4), ccc: ccc} |
| 27 | copy(ce.w, w) |
| 28 | return ce |
| 29 | } |
| 30 | |
| 31 | // A collation element is represented as an uint32. |
| 32 | // In the typical case, a rune maps to a single collation element. If a rune |
| 33 | // can be the start of a contraction or expands into multiple collation elements, |
| 34 | // then the collation element that is associated with a rune will have a special |
| 35 | // form to represent such m to n mappings. Such special collation elements |
| 36 | // have a value >= 0x80000000. |
| 37 | |
| 38 | const ( |
| 39 | maxPrimaryBits = 21 |
| 40 | maxSecondaryBits = 12 |
| 41 | maxTertiaryBits = 8 |
| 42 | ) |
| 43 | |
| 44 | func makeCE(ce rawCE) (uint32, error) { |
| 45 | v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc) |
| 46 | return uint32(v), e |
| 47 | } |
| 48 | |
| 49 | // For contractions, collation elements are of the form |
| 50 | // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where |
| 51 | // - n* is the size of the first node in the contraction trie. |
| 52 | // - i* is the index of the first node in the contraction trie. |
| 53 | // - b* is the offset into the contraction collation element table. |
| 54 | // See contract.go for details on the contraction trie. |
| 55 | const ( |
| 56 | contractID = 0xC0000000 |
| 57 | maxNBits = 4 |
| 58 | maxTrieIndexBits = 12 |
| 59 | maxContractOffsetBits = 13 |
| 60 | ) |
| 61 | |
| 62 | func makeContractIndex(h ctHandle, offset int) (uint32, error) { |
| 63 | if h.n >= 1<<maxNBits { |
| 64 | return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits) |
| 65 | } |
| 66 | if h.index >= 1<<maxTrieIndexBits { |
| 67 | return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits) |
| 68 | } |
| 69 | if offset >= 1<<maxContractOffsetBits { |
| 70 | return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits) |
| 71 | } |
| 72 | ce := uint32(contractID) |
| 73 | ce += uint32(offset << (maxNBits + maxTrieIndexBits)) |
| 74 | ce += uint32(h.index << maxNBits) |
| 75 | ce += uint32(h.n) |
| 76 | return ce, nil |
| 77 | } |
| 78 | |
| 79 | // For expansions, collation elements are of the form |
| 80 | // 11100000 00000000 bbbbbbbb bbbbbbbb, |
| 81 | // where b* is the index into the expansion sequence table. |
| 82 | const ( |
| 83 | expandID = 0xE0000000 |
| 84 | maxExpandIndexBits = 16 |
| 85 | ) |
| 86 | |
| 87 | func makeExpandIndex(index int) (uint32, error) { |
| 88 | if index >= 1<<maxExpandIndexBits { |
| 89 | return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits) |
| 90 | } |
| 91 | return expandID + uint32(index), nil |
| 92 | } |
| 93 | |
| 94 | // Each list of collation elements corresponding to an expansion starts with |
| 95 | // a header indicating the length of the sequence. |
| 96 | func makeExpansionHeader(n int) (uint32, error) { |
| 97 | return uint32(n), nil |
| 98 | } |
| 99 | |
| 100 | // Some runes can be expanded using NFKD decomposition. Instead of storing the full |
| 101 | // sequence of collation elements, we decompose the rune and lookup the collation |
| 102 | // elements for each rune in the decomposition and modify the tertiary weights. |
| 103 | // The collation element, in this case, is of the form |
| 104 | // 11110000 00000000 wwwwwwww vvvvvvvv, where |
| 105 | // - v* is the replacement tertiary weight for the first rune, |
| 106 | // - w* is the replacement tertiary weight for the second rune, |
| 107 | // Tertiary weights of subsequent runes should be replaced with maxTertiary. |
| 108 | // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. |
| 109 | const ( |
| 110 | decompID = 0xF0000000 |
| 111 | ) |
| 112 | |
| 113 | func makeDecompose(t1, t2 int) (uint32, error) { |
| 114 | if t1 >= 256 || t1 < 0 { |
| 115 | return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1) |
| 116 | } |
| 117 | if t2 >= 256 || t2 < 0 { |
| 118 | return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2) |
| 119 | } |
| 120 | return uint32(t2<<8+t1) + decompID, nil |
| 121 | } |
| 122 | |
| 123 | const ( |
| 124 | // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. |
| 125 | minUnified rune = 0x4E00 |
| 126 | maxUnified = 0x9FFF |
| 127 | minCompatibility = 0xF900 |
| 128 | maxCompatibility = 0xFAFF |
| 129 | minRare = 0x3400 |
| 130 | maxRare = 0x4DBF |
| 131 | ) |
| 132 | const ( |
| 133 | commonUnifiedOffset = 0x10000 |
| 134 | rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF |
| 135 | otherOffset = 0x50000 // largest rune in rare is U+2FA1D |
| 136 | illegalOffset = otherOffset + int(unicode.MaxRune) |
| 137 | maxPrimary = illegalOffset + 1 |
| 138 | ) |
| 139 | |
| 140 | // implicitPrimary returns the primary weight for the a rune |
| 141 | // for which there is no entry for the rune in the collation table. |
| 142 | // We take a different approach from the one specified in |
| 143 | // http://unicode.org/reports/tr10/#Implicit_Weights, |
| 144 | // but preserve the resulting relative ordering of the runes. |
| 145 | func implicitPrimary(r rune) int { |
| 146 | if unicode.Is(unicode.Ideographic, r) { |
| 147 | if r >= minUnified && r <= maxUnified { |
| 148 | // The most common case for CJK. |
| 149 | return int(r) + commonUnifiedOffset |
| 150 | } |
| 151 | if r >= minCompatibility && r <= maxCompatibility { |
| 152 | // This will typically not hit. The DUCET explicitly specifies mappings |
| 153 | // for all characters that do not decompose. |
| 154 | return int(r) + commonUnifiedOffset |
| 155 | } |
| 156 | return int(r) + rareUnifiedOffset |
| 157 | } |
| 158 | return int(r) + otherOffset |
| 159 | } |
| 160 | |
| 161 | // convertLargeWeights converts collation elements with large |
| 162 | // primaries (either double primaries or for illegal runes) |
| 163 | // to our own representation. |
| 164 | // A CJK character C is represented in the DUCET as |
| 165 | // [.FBxx.0020.0002.C][.BBBB.0000.0000.C] |
| 166 | // We will rewrite these characters to a single CE. |
| 167 | // We assume the CJK values start at 0x8000. |
| 168 | // See http://unicode.org/reports/tr10/#Implicit_Weights |
| 169 | func convertLargeWeights(elems []rawCE) (res []rawCE, err error) { |
| 170 | const ( |
| 171 | cjkPrimaryStart = 0xFB40 |
| 172 | rarePrimaryStart = 0xFB80 |
| 173 | otherPrimaryStart = 0xFBC0 |
| 174 | illegalPrimary = 0xFFFE |
| 175 | highBitsMask = 0x3F |
| 176 | lowBitsMask = 0x7FFF |
| 177 | lowBitsFlag = 0x8000 |
| 178 | shiftBits = 15 |
| 179 | ) |
| 180 | for i := 0; i < len(elems); i++ { |
| 181 | ce := elems[i].w |
| 182 | p := ce[0] |
| 183 | if p < cjkPrimaryStart { |
| 184 | continue |
| 185 | } |
| 186 | if p > 0xFFFF { |
| 187 | return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) |
| 188 | } |
| 189 | if p >= illegalPrimary { |
| 190 | ce[0] = illegalOffset + p - illegalPrimary |
| 191 | } else { |
| 192 | if i+1 >= len(elems) { |
| 193 | return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) |
| 194 | } |
| 195 | if elems[i+1].w[0]&lowBitsFlag == 0 { |
| 196 | return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) |
| 197 | } |
| 198 | np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask |
| 199 | switch { |
| 200 | case p < rarePrimaryStart: |
| 201 | np += commonUnifiedOffset |
| 202 | case p < otherPrimaryStart: |
| 203 | np += rareUnifiedOffset |
| 204 | default: |
| 205 | p += otherOffset |
| 206 | } |
| 207 | ce[0] = np |
| 208 | for j := i + 1; j+1 < len(elems); j++ { |
| 209 | elems[j] = elems[j+1] |
| 210 | } |
| 211 | elems = elems[:len(elems)-1] |
| 212 | } |
| 213 | } |
| 214 | return elems, nil |
| 215 | } |
| 216 | |
| 217 | // nextWeight computes the first possible collation weights following elems |
| 218 | // for the given level. |
| 219 | func nextWeight(level colltab.Level, elems []rawCE) []rawCE { |
| 220 | if level == colltab.Identity { |
| 221 | next := make([]rawCE, len(elems)) |
| 222 | copy(next, elems) |
| 223 | return next |
| 224 | } |
| 225 | next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)} |
| 226 | next[0].w[level]++ |
| 227 | if level < colltab.Secondary { |
| 228 | next[0].w[colltab.Secondary] = defaultSecondary |
| 229 | } |
| 230 | if level < colltab.Tertiary { |
| 231 | next[0].w[colltab.Tertiary] = defaultTertiary |
| 232 | } |
| 233 | // Filter entries that cannot influence ordering. |
| 234 | for _, ce := range elems[1:] { |
| 235 | skip := true |
| 236 | for i := colltab.Primary; i < level; i++ { |
| 237 | skip = skip && ce.w[i] == 0 |
| 238 | } |
| 239 | if !skip { |
| 240 | next = append(next, ce) |
| 241 | } |
| 242 | } |
| 243 | return next |
| 244 | } |
| 245 | |
| 246 | func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) { |
| 247 | for ; i < len(elems) && elems[i].w[level] == 0; i++ { |
| 248 | } |
| 249 | if i < len(elems) { |
| 250 | return i, elems[i].w[level] |
| 251 | } |
| 252 | return i, 0 |
| 253 | } |
| 254 | |
| 255 | // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise. |
| 256 | // It also returns the collation level at which the difference is found. |
| 257 | func compareWeights(a, b []rawCE) (result int, level colltab.Level) { |
| 258 | for level := colltab.Primary; level < colltab.Identity; level++ { |
| 259 | var va, vb int |
| 260 | for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 { |
| 261 | ia, va = nextVal(a, ia, level) |
| 262 | ib, vb = nextVal(b, ib, level) |
| 263 | if va != vb { |
| 264 | if va < vb { |
| 265 | return -1, level |
| 266 | } else { |
| 267 | return 1, level |
| 268 | } |
| 269 | } |
| 270 | } |
| 271 | } |
| 272 | return 0, colltab.Identity |
| 273 | } |
| 274 | |
| 275 | func equalCE(a, b rawCE) bool { |
| 276 | for i := 0; i < 3; i++ { |
| 277 | if b.w[i] != a.w[i] { |
| 278 | return false |
| 279 | } |
| 280 | } |
| 281 | return true |
| 282 | } |
| 283 | |
| 284 | func equalCEArrays(a, b []rawCE) bool { |
| 285 | if len(a) != len(b) { |
| 286 | return false |
| 287 | } |
| 288 | for i := range a { |
| 289 | if !equalCE(a[i], b[i]) { |
| 290 | return false |
| 291 | } |
| 292 | } |
| 293 | return true |
| 294 | } |