blob: 526c7033ac464cc1fe840feac6bb84d81917514c [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package norm
6
Don Newton7577f072020-01-06 12:41:11 -05007import "encoding/binary"
8
Don Newton98fd8812019-09-23 15:15:02 -04009// This file contains Form-specific logic and wrappers for data in tables.go.
10
11// Rune info is stored in a separate trie per composing form. A composing form
12// and its corresponding decomposing form share the same trie. Each trie maps
13// a rune to a uint16. The values take two forms. For v >= 0x8000:
14// bits
15// 15: 1 (inverse of NFD_QC bit of qcInfo)
16// 13..7: qcInfo (see below). isYesD is always true (no decompostion).
17// 6..0: ccc (compressed CCC value).
18// For v < 0x8000, the respective rune has a decomposition and v is an index
19// into a byte array of UTF-8 decomposition sequences and additional info and
20// has the form:
21// <header> <decomp_byte>* [<tccc> [<lccc>]]
22// The header contains the number of bytes in the decomposition (excluding this
23// length byte). The two most significant bits of this length byte correspond
24// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
25// The byte sequence is followed by a trailing and leading CCC if the values
26// for these are not zero. The value of v determines which ccc are appended
27// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
28// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
29// there is an additional leading ccc. The value of tccc itself is the
30// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
31// are the number of trailing non-starters.
32
33const (
34 qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
35 headerLenMask = 0x3F // extract the length value from the header byte
36 headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
37)
38
39// Properties provides access to normalization properties of a rune.
40type Properties struct {
41 pos uint8 // start position in reorderBuffer; used in composition.go
42 size uint8 // length of UTF-8 encoding of this rune
43 ccc uint8 // leading canonical combining class (ccc if not decomposition)
44 tccc uint8 // trailing canonical combining class (ccc if not decomposition)
45 nLead uint8 // number of leading non-starters.
46 flags qcInfo // quick check flags
47 index uint16
48}
49
50// functions dispatchable per form
51type lookupFunc func(b input, i int) Properties
52
53// formInfo holds Form-specific functions and tables.
54type formInfo struct {
55 form Form
56 composing, compatibility bool // form type
57 info lookupFunc
58 nextMain iterFunc
59}
60
61var formTable = []*formInfo{{
62 form: NFC,
63 composing: true,
64 compatibility: false,
65 info: lookupInfoNFC,
66 nextMain: nextComposed,
67}, {
68 form: NFD,
69 composing: false,
70 compatibility: false,
71 info: lookupInfoNFC,
72 nextMain: nextDecomposed,
73}, {
74 form: NFKC,
75 composing: true,
76 compatibility: true,
77 info: lookupInfoNFKC,
78 nextMain: nextComposed,
79}, {
80 form: NFKD,
81 composing: false,
82 compatibility: true,
83 info: lookupInfoNFKC,
84 nextMain: nextDecomposed,
85}}
86
87// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
88// unexpected behavior for the user. For example, in NFD, there is a boundary
89// after 'a'. However, 'a' might combine with modifiers, so from the application's
90// perspective it is not a good boundary. We will therefore always use the
91// boundaries for the combining variants.
92
93// BoundaryBefore returns true if this rune starts a new segment and
94// cannot combine with any rune on the left.
95func (p Properties) BoundaryBefore() bool {
96 if p.ccc == 0 && !p.combinesBackward() {
97 return true
98 }
99 // We assume that the CCC of the first character in a decomposition
100 // is always non-zero if different from info.ccc and that we can return
101 // false at this point. This is verified by maketables.
102 return false
103}
104
105// BoundaryAfter returns true if runes cannot combine with or otherwise
106// interact with this or previous runes.
107func (p Properties) BoundaryAfter() bool {
108 // TODO: loosen these conditions.
109 return p.isInert()
110}
111
112// We pack quick check data in 4 bits:
113// 5: Combines forward (0 == false, 1 == true)
114// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
115// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
116// 1..0: Number of trailing non-starters.
117//
118// When all 4 bits are zero, the character is inert, meaning it is never
119// influenced by normalization.
120type qcInfo uint8
121
122func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
123func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
124
125func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
126func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
127func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
128
129func (p Properties) isInert() bool {
130 return p.flags&qcInfoMask == 0 && p.ccc == 0
131}
132
133func (p Properties) multiSegment() bool {
134 return p.index >= firstMulti && p.index < endMulti
135}
136
137func (p Properties) nLeadingNonStarters() uint8 {
138 return p.nLead
139}
140
141func (p Properties) nTrailingNonStarters() uint8 {
142 return uint8(p.flags & 0x03)
143}
144
145// Decomposition returns the decomposition for the underlying rune
146// or nil if there is none.
147func (p Properties) Decomposition() []byte {
148 // TODO: create the decomposition for Hangul?
149 if p.index == 0 {
150 return nil
151 }
152 i := p.index
153 n := decomps[i] & headerLenMask
154 i++
155 return decomps[i : i+uint16(n)]
156}
157
158// Size returns the length of UTF-8 encoding of the rune.
159func (p Properties) Size() int {
160 return int(p.size)
161}
162
163// CCC returns the canonical combining class of the underlying rune.
164func (p Properties) CCC() uint8 {
165 if p.index >= firstCCCZeroExcept {
166 return 0
167 }
168 return ccc[p.ccc]
169}
170
171// LeadCCC returns the CCC of the first rune in the decomposition.
172// If there is no decomposition, LeadCCC equals CCC.
173func (p Properties) LeadCCC() uint8 {
174 return ccc[p.ccc]
175}
176
177// TrailCCC returns the CCC of the last rune in the decomposition.
178// If there is no decomposition, TrailCCC equals CCC.
179func (p Properties) TrailCCC() uint8 {
180 return ccc[p.tccc]
181}
182
Don Newton7577f072020-01-06 12:41:11 -0500183func buildRecompMap() {
184 recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
185 var buf [8]byte
186 for i := 0; i < len(recompMapPacked); i += 8 {
187 copy(buf[:], recompMapPacked[i:i+8])
188 key := binary.BigEndian.Uint32(buf[:4])
189 val := binary.BigEndian.Uint32(buf[4:])
190 recompMap[key] = rune(val)
191 }
192}
193
Don Newton98fd8812019-09-23 15:15:02 -0400194// Recomposition
195// We use 32-bit keys instead of 64-bit for the two codepoint keys.
196// This clips off the bits of three entries, but we know this will not
197// result in a collision. In the unlikely event that changes to
198// UnicodeData.txt introduce collisions, the compiler will catch it.
199// Note that the recomposition map for NFC and NFKC are identical.
200
201// combine returns the combined rune or 0 if it doesn't exist.
Don Newton7577f072020-01-06 12:41:11 -0500202//
203// The caller is responsible for calling
204// recompMapOnce.Do(buildRecompMap) sometime before this is called.
Don Newton98fd8812019-09-23 15:15:02 -0400205func combine(a, b rune) rune {
206 key := uint32(uint16(a))<<16 + uint32(uint16(b))
Don Newton7577f072020-01-06 12:41:11 -0500207 if recompMap == nil {
208 panic("caller error") // see func comment
209 }
Don Newton98fd8812019-09-23 15:15:02 -0400210 return recompMap[key]
211}
212
213func lookupInfoNFC(b input, i int) Properties {
214 v, sz := b.charinfoNFC(i)
215 return compInfo(v, sz)
216}
217
218func lookupInfoNFKC(b input, i int) Properties {
219 v, sz := b.charinfoNFKC(i)
220 return compInfo(v, sz)
221}
222
223// Properties returns properties for the first rune in s.
224func (f Form) Properties(s []byte) Properties {
225 if f == NFC || f == NFD {
226 return compInfo(nfcData.lookup(s))
227 }
228 return compInfo(nfkcData.lookup(s))
229}
230
231// PropertiesString returns properties for the first rune in s.
232func (f Form) PropertiesString(s string) Properties {
233 if f == NFC || f == NFD {
234 return compInfo(nfcData.lookupString(s))
235 }
236 return compInfo(nfkcData.lookupString(s))
237}
238
239// compInfo converts the information contained in v and sz
240// to a Properties. See the comment at the top of the file
241// for more information on the format.
242func compInfo(v uint16, sz int) Properties {
243 if v == 0 {
244 return Properties{size: uint8(sz)}
245 } else if v >= 0x8000 {
246 p := Properties{
247 size: uint8(sz),
248 ccc: uint8(v),
249 tccc: uint8(v),
250 flags: qcInfo(v >> 8),
251 }
252 if p.ccc > 0 || p.combinesBackward() {
253 p.nLead = uint8(p.flags & 0x3)
254 }
255 return p
256 }
257 // has decomposition
258 h := decomps[v]
259 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
260 p := Properties{size: uint8(sz), flags: f, index: v}
261 if v >= firstCCC {
262 v += uint16(h&headerLenMask) + 1
263 c := decomps[v]
264 p.tccc = c >> 2
265 p.flags |= qcInfo(c & 0x3)
266 if v >= firstLeadingCCC {
267 p.nLead = c & 0x3
268 if v >= firstStarterWithNLead {
269 // We were tricked. Remove the decomposition.
270 p.flags &= 0x03
271 p.index = 0
272 return p
273 }
274 p.ccc = decomps[v+1]
275 }
276 }
277 return p
278}