Don Newton | 98fd881 | 2019-09-23 15:15:02 -0400 | [diff] [blame] | 1 | // Copyright 2011 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package norm |
| 6 | |
Don Newton | 7577f07 | 2020-01-06 12:41:11 -0500 | [diff] [blame] | 7 | import "encoding/binary" |
| 8 | |
Don Newton | 98fd881 | 2019-09-23 15:15:02 -0400 | [diff] [blame] | 9 | // This file contains Form-specific logic and wrappers for data in tables.go. |
| 10 | |
| 11 | // Rune info is stored in a separate trie per composing form. A composing form |
| 12 | // and its corresponding decomposing form share the same trie. Each trie maps |
| 13 | // a rune to a uint16. The values take two forms. For v >= 0x8000: |
| 14 | // bits |
| 15 | // 15: 1 (inverse of NFD_QC bit of qcInfo) |
| 16 | // 13..7: qcInfo (see below). isYesD is always true (no decompostion). |
| 17 | // 6..0: ccc (compressed CCC value). |
| 18 | // For v < 0x8000, the respective rune has a decomposition and v is an index |
| 19 | // into a byte array of UTF-8 decomposition sequences and additional info and |
| 20 | // has the form: |
| 21 | // <header> <decomp_byte>* [<tccc> [<lccc>]] |
| 22 | // The header contains the number of bytes in the decomposition (excluding this |
| 23 | // length byte). The two most significant bits of this length byte correspond |
| 24 | // to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1. |
| 25 | // The byte sequence is followed by a trailing and leading CCC if the values |
| 26 | // for these are not zero. The value of v determines which ccc are appended |
| 27 | // to the sequences. For v < firstCCC, there are none, for v >= firstCCC, |
| 28 | // the sequence is followed by a trailing ccc, and for v >= firstLeadingCC |
| 29 | // there is an additional leading ccc. The value of tccc itself is the |
| 30 | // trailing CCC shifted left 2 bits. The two least-significant bits of tccc |
| 31 | // are the number of trailing non-starters. |
| 32 | |
| 33 | const ( |
| 34 | qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo |
| 35 | headerLenMask = 0x3F // extract the length value from the header byte |
| 36 | headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte |
| 37 | ) |
| 38 | |
| 39 | // Properties provides access to normalization properties of a rune. |
| 40 | type Properties struct { |
| 41 | pos uint8 // start position in reorderBuffer; used in composition.go |
| 42 | size uint8 // length of UTF-8 encoding of this rune |
| 43 | ccc uint8 // leading canonical combining class (ccc if not decomposition) |
| 44 | tccc uint8 // trailing canonical combining class (ccc if not decomposition) |
| 45 | nLead uint8 // number of leading non-starters. |
| 46 | flags qcInfo // quick check flags |
| 47 | index uint16 |
| 48 | } |
| 49 | |
| 50 | // functions dispatchable per form |
| 51 | type lookupFunc func(b input, i int) Properties |
| 52 | |
| 53 | // formInfo holds Form-specific functions and tables. |
| 54 | type formInfo struct { |
| 55 | form Form |
| 56 | composing, compatibility bool // form type |
| 57 | info lookupFunc |
| 58 | nextMain iterFunc |
| 59 | } |
| 60 | |
| 61 | var formTable = []*formInfo{{ |
| 62 | form: NFC, |
| 63 | composing: true, |
| 64 | compatibility: false, |
| 65 | info: lookupInfoNFC, |
| 66 | nextMain: nextComposed, |
| 67 | }, { |
| 68 | form: NFD, |
| 69 | composing: false, |
| 70 | compatibility: false, |
| 71 | info: lookupInfoNFC, |
| 72 | nextMain: nextDecomposed, |
| 73 | }, { |
| 74 | form: NFKC, |
| 75 | composing: true, |
| 76 | compatibility: true, |
| 77 | info: lookupInfoNFKC, |
| 78 | nextMain: nextComposed, |
| 79 | }, { |
| 80 | form: NFKD, |
| 81 | composing: false, |
| 82 | compatibility: true, |
| 83 | info: lookupInfoNFKC, |
| 84 | nextMain: nextDecomposed, |
| 85 | }} |
| 86 | |
| 87 | // We do not distinguish between boundaries for NFC, NFD, etc. to avoid |
| 88 | // unexpected behavior for the user. For example, in NFD, there is a boundary |
| 89 | // after 'a'. However, 'a' might combine with modifiers, so from the application's |
| 90 | // perspective it is not a good boundary. We will therefore always use the |
| 91 | // boundaries for the combining variants. |
| 92 | |
| 93 | // BoundaryBefore returns true if this rune starts a new segment and |
| 94 | // cannot combine with any rune on the left. |
| 95 | func (p Properties) BoundaryBefore() bool { |
| 96 | if p.ccc == 0 && !p.combinesBackward() { |
| 97 | return true |
| 98 | } |
| 99 | // We assume that the CCC of the first character in a decomposition |
| 100 | // is always non-zero if different from info.ccc and that we can return |
| 101 | // false at this point. This is verified by maketables. |
| 102 | return false |
| 103 | } |
| 104 | |
| 105 | // BoundaryAfter returns true if runes cannot combine with or otherwise |
| 106 | // interact with this or previous runes. |
| 107 | func (p Properties) BoundaryAfter() bool { |
| 108 | // TODO: loosen these conditions. |
| 109 | return p.isInert() |
| 110 | } |
| 111 | |
| 112 | // We pack quick check data in 4 bits: |
| 113 | // 5: Combines forward (0 == false, 1 == true) |
| 114 | // 4..3: NFC_QC Yes(00), No (10), or Maybe (11) |
| 115 | // 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. |
| 116 | // 1..0: Number of trailing non-starters. |
| 117 | // |
| 118 | // When all 4 bits are zero, the character is inert, meaning it is never |
| 119 | // influenced by normalization. |
| 120 | type qcInfo uint8 |
| 121 | |
| 122 | func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } |
| 123 | func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } |
| 124 | |
| 125 | func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } |
| 126 | func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe |
| 127 | func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD |
| 128 | |
| 129 | func (p Properties) isInert() bool { |
| 130 | return p.flags&qcInfoMask == 0 && p.ccc == 0 |
| 131 | } |
| 132 | |
| 133 | func (p Properties) multiSegment() bool { |
| 134 | return p.index >= firstMulti && p.index < endMulti |
| 135 | } |
| 136 | |
| 137 | func (p Properties) nLeadingNonStarters() uint8 { |
| 138 | return p.nLead |
| 139 | } |
| 140 | |
| 141 | func (p Properties) nTrailingNonStarters() uint8 { |
| 142 | return uint8(p.flags & 0x03) |
| 143 | } |
| 144 | |
| 145 | // Decomposition returns the decomposition for the underlying rune |
| 146 | // or nil if there is none. |
| 147 | func (p Properties) Decomposition() []byte { |
| 148 | // TODO: create the decomposition for Hangul? |
| 149 | if p.index == 0 { |
| 150 | return nil |
| 151 | } |
| 152 | i := p.index |
| 153 | n := decomps[i] & headerLenMask |
| 154 | i++ |
| 155 | return decomps[i : i+uint16(n)] |
| 156 | } |
| 157 | |
| 158 | // Size returns the length of UTF-8 encoding of the rune. |
| 159 | func (p Properties) Size() int { |
| 160 | return int(p.size) |
| 161 | } |
| 162 | |
| 163 | // CCC returns the canonical combining class of the underlying rune. |
| 164 | func (p Properties) CCC() uint8 { |
| 165 | if p.index >= firstCCCZeroExcept { |
| 166 | return 0 |
| 167 | } |
| 168 | return ccc[p.ccc] |
| 169 | } |
| 170 | |
| 171 | // LeadCCC returns the CCC of the first rune in the decomposition. |
| 172 | // If there is no decomposition, LeadCCC equals CCC. |
| 173 | func (p Properties) LeadCCC() uint8 { |
| 174 | return ccc[p.ccc] |
| 175 | } |
| 176 | |
| 177 | // TrailCCC returns the CCC of the last rune in the decomposition. |
| 178 | // If there is no decomposition, TrailCCC equals CCC. |
| 179 | func (p Properties) TrailCCC() uint8 { |
| 180 | return ccc[p.tccc] |
| 181 | } |
| 182 | |
Don Newton | 7577f07 | 2020-01-06 12:41:11 -0500 | [diff] [blame] | 183 | func buildRecompMap() { |
| 184 | recompMap = make(map[uint32]rune, len(recompMapPacked)/8) |
| 185 | var buf [8]byte |
| 186 | for i := 0; i < len(recompMapPacked); i += 8 { |
| 187 | copy(buf[:], recompMapPacked[i:i+8]) |
| 188 | key := binary.BigEndian.Uint32(buf[:4]) |
| 189 | val := binary.BigEndian.Uint32(buf[4:]) |
| 190 | recompMap[key] = rune(val) |
| 191 | } |
| 192 | } |
| 193 | |
Don Newton | 98fd881 | 2019-09-23 15:15:02 -0400 | [diff] [blame] | 194 | // Recomposition |
| 195 | // We use 32-bit keys instead of 64-bit for the two codepoint keys. |
| 196 | // This clips off the bits of three entries, but we know this will not |
| 197 | // result in a collision. In the unlikely event that changes to |
| 198 | // UnicodeData.txt introduce collisions, the compiler will catch it. |
| 199 | // Note that the recomposition map for NFC and NFKC are identical. |
| 200 | |
| 201 | // combine returns the combined rune or 0 if it doesn't exist. |
Don Newton | 7577f07 | 2020-01-06 12:41:11 -0500 | [diff] [blame] | 202 | // |
| 203 | // The caller is responsible for calling |
| 204 | // recompMapOnce.Do(buildRecompMap) sometime before this is called. |
Don Newton | 98fd881 | 2019-09-23 15:15:02 -0400 | [diff] [blame] | 205 | func combine(a, b rune) rune { |
| 206 | key := uint32(uint16(a))<<16 + uint32(uint16(b)) |
Don Newton | 7577f07 | 2020-01-06 12:41:11 -0500 | [diff] [blame] | 207 | if recompMap == nil { |
| 208 | panic("caller error") // see func comment |
| 209 | } |
Don Newton | 98fd881 | 2019-09-23 15:15:02 -0400 | [diff] [blame] | 210 | return recompMap[key] |
| 211 | } |
| 212 | |
| 213 | func lookupInfoNFC(b input, i int) Properties { |
| 214 | v, sz := b.charinfoNFC(i) |
| 215 | return compInfo(v, sz) |
| 216 | } |
| 217 | |
| 218 | func lookupInfoNFKC(b input, i int) Properties { |
| 219 | v, sz := b.charinfoNFKC(i) |
| 220 | return compInfo(v, sz) |
| 221 | } |
| 222 | |
| 223 | // Properties returns properties for the first rune in s. |
| 224 | func (f Form) Properties(s []byte) Properties { |
| 225 | if f == NFC || f == NFD { |
| 226 | return compInfo(nfcData.lookup(s)) |
| 227 | } |
| 228 | return compInfo(nfkcData.lookup(s)) |
| 229 | } |
| 230 | |
| 231 | // PropertiesString returns properties for the first rune in s. |
| 232 | func (f Form) PropertiesString(s string) Properties { |
| 233 | if f == NFC || f == NFD { |
| 234 | return compInfo(nfcData.lookupString(s)) |
| 235 | } |
| 236 | return compInfo(nfkcData.lookupString(s)) |
| 237 | } |
| 238 | |
| 239 | // compInfo converts the information contained in v and sz |
| 240 | // to a Properties. See the comment at the top of the file |
| 241 | // for more information on the format. |
| 242 | func compInfo(v uint16, sz int) Properties { |
| 243 | if v == 0 { |
| 244 | return Properties{size: uint8(sz)} |
| 245 | } else if v >= 0x8000 { |
| 246 | p := Properties{ |
| 247 | size: uint8(sz), |
| 248 | ccc: uint8(v), |
| 249 | tccc: uint8(v), |
| 250 | flags: qcInfo(v >> 8), |
| 251 | } |
| 252 | if p.ccc > 0 || p.combinesBackward() { |
| 253 | p.nLead = uint8(p.flags & 0x3) |
| 254 | } |
| 255 | return p |
| 256 | } |
| 257 | // has decomposition |
| 258 | h := decomps[v] |
| 259 | f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 |
| 260 | p := Properties{size: uint8(sz), flags: f, index: v} |
| 261 | if v >= firstCCC { |
| 262 | v += uint16(h&headerLenMask) + 1 |
| 263 | c := decomps[v] |
| 264 | p.tccc = c >> 2 |
| 265 | p.flags |= qcInfo(c & 0x3) |
| 266 | if v >= firstLeadingCCC { |
| 267 | p.nLead = c & 0x3 |
| 268 | if v >= firstStarterWithNLead { |
| 269 | // We were tricked. Remove the decomposition. |
| 270 | p.flags &= 0x03 |
| 271 | p.index = 0 |
| 272 | return p |
| 273 | } |
| 274 | p.ccc = decomps[v+1] |
| 275 | } |
| 276 | } |
| 277 | return p |
| 278 | } |