blob: e67e7655c547e22cfe36f9250d5f3e0e080cdc69 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package norm
6
Don Newton98fd8812019-09-23 15:15:02 -04007// This file contains Form-specific logic and wrappers for data in tables.go.
8
9// Rune info is stored in a separate trie per composing form. A composing form
10// and its corresponding decomposing form share the same trie. Each trie maps
11// a rune to a uint16. The values take two forms. For v >= 0x8000:
12// bits
13// 15: 1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
16// For v < 0x8000, the respective rune has a decomposition and v is an index
17// into a byte array of UTF-8 decomposition sequences and additional info and
18// has the form:
19// <header> <decomp_byte>* [<tccc> [<lccc>]]
20// The header contains the number of bytes in the decomposition (excluding this
21// length byte). The two most significant bits of this length byte correspond
22// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
23// The byte sequence is followed by a trailing and leading CCC if the values
24// for these are not zero. The value of v determines which ccc are appended
25// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
27// there is an additional leading ccc. The value of tccc itself is the
28// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
29// are the number of trailing non-starters.
30
// Bit masks used when decoding qcInfo values and decomposition header bytes.
const (
	// qcInfoMask clears all but the relevant (six low) bits in a qcInfo.
	qcInfoMask = 1<<6 - 1
	// headerLenMask extracts the length value (six low bits) from the
	// header byte.
	headerLenMask = 1<<6 - 1
	// headerFlagsMask extracts the qcInfo bits (two high bits) from the
	// header byte.
	headerFlagsMask = 0x3 << 6
)
36
// Properties provides access to normalization properties of a rune.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	// index is the offset of this rune's entry in decomps; it is zero when
	// there is no decomposition to report.
	index uint16
}
47
// lookupFunc is the signature of the per-form property lookup functions
// (functions dispatchable per form). It returns the Properties for the
// input b at position i.
type lookupFunc func(b input, i int) Properties
50
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form       // the normalization form described by this entry
	composing, compatibility bool       // form type
	info                     lookupFunc // property lookup used for this form
	nextMain                 iterFunc   // iteration function used for this form
}
58
// formTable lists the formInfo for each of the four normalization forms, in
// the order NFC, NFD, NFKC, NFKD. A composing form and its decomposing
// counterpart use the same lookup function and differ only in nextMain.
// NOTE(review): presumably indexed by Form value; confirm against callers.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
84
85// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
86// unexpected behavior for the user. For example, in NFD, there is a boundary
87// after 'a'. However, 'a' might combine with modifiers, so from the application's
88// perspective it is not a good boundary. We will therefore always use the
89// boundaries for the combining variants.
90
91// BoundaryBefore returns true if this rune starts a new segment and
92// cannot combine with any rune on the left.
93func (p Properties) BoundaryBefore() bool {
94 if p.ccc == 0 && !p.combinesBackward() {
95 return true
96 }
97 // We assume that the CCC of the first character in a decomposition
98 // is always non-zero if different from info.ccc and that we can return
99 // false at this point. This is verified by maketables.
100 return false
101}
102
103// BoundaryAfter returns true if runes cannot combine with or otherwise
104// interact with this or previous runes.
105func (p Properties) BoundaryAfter() bool {
106 // TODO: loosen these conditions.
107 return p.isInert()
108}
109
// We pack quick check data in 6 bits:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero and the CCC is zero as well, the character is
// inert, meaning it is never influenced by normalization.
type qcInfo uint8
119
120func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
121func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
122
123func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
124func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
125func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
126
127func (p Properties) isInert() bool {
128 return p.flags&qcInfoMask == 0 && p.ccc == 0
129}
130
131func (p Properties) multiSegment() bool {
132 return p.index >= firstMulti && p.index < endMulti
133}
134
135func (p Properties) nLeadingNonStarters() uint8 {
136 return p.nLead
137}
138
139func (p Properties) nTrailingNonStarters() uint8 {
140 return uint8(p.flags & 0x03)
141}
142
143// Decomposition returns the decomposition for the underlying rune
144// or nil if there is none.
145func (p Properties) Decomposition() []byte {
146 // TODO: create the decomposition for Hangul?
147 if p.index == 0 {
148 return nil
149 }
150 i := p.index
151 n := decomps[i] & headerLenMask
152 i++
153 return decomps[i : i+uint16(n)]
154}
155
// Size returns the length of UTF-8 encoding of the rune.
func (p Properties) Size() int {
	// size is stored as a uint8; widen it for the public API.
	return int(p.size)
}
160
161// CCC returns the canonical combining class of the underlying rune.
162func (p Properties) CCC() uint8 {
163 if p.index >= firstCCCZeroExcept {
164 return 0
165 }
166 return ccc[p.ccc]
167}
168
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	// p.ccc holds a compressed value; the ccc table expands it.
	return ccc[p.ccc]
}
174
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	// p.tccc holds a compressed value; the ccc table expands it.
	return ccc[p.tccc]
}
180
Don Newton98fd8812019-09-23 15:15:02 -0400181// Recomposition
182// We use 32-bit keys instead of 64-bit for the two codepoint keys.
183// This clips off the bits of three entries, but we know this will not
184// result in a collision. In the unlikely event that changes to
185// UnicodeData.txt introduce collisions, the compiler will catch it.
186// Note that the recomposition map for NFC and NFKC are identical.
187
188// combine returns the combined rune or 0 if it doesn't exist.
Don Newton98fd8812019-09-23 15:15:02 -0400189func combine(a, b rune) rune {
190 key := uint32(uint16(a))<<16 + uint32(uint16(b))
Don Newton98fd8812019-09-23 15:15:02 -0400191 return recompMap[key]
192}
193
194func lookupInfoNFC(b input, i int) Properties {
195 v, sz := b.charinfoNFC(i)
196 return compInfo(v, sz)
197}
198
199func lookupInfoNFKC(b input, i int) Properties {
200 v, sz := b.charinfoNFKC(i)
201 return compInfo(v, sz)
202}
203
204// Properties returns properties for the first rune in s.
205func (f Form) Properties(s []byte) Properties {
206 if f == NFC || f == NFD {
207 return compInfo(nfcData.lookup(s))
208 }
209 return compInfo(nfkcData.lookup(s))
210}
211
212// PropertiesString returns properties for the first rune in s.
213func (f Form) PropertiesString(s string) Properties {
214 if f == NFC || f == NFD {
215 return compInfo(nfcData.lookupString(s))
216 }
217 return compInfo(nfkcData.lookupString(s))
218}
219
220// compInfo converts the information contained in v and sz
221// to a Properties. See the comment at the top of the file
222// for more information on the format.
223func compInfo(v uint16, sz int) Properties {
224 if v == 0 {
225 return Properties{size: uint8(sz)}
226 } else if v >= 0x8000 {
227 p := Properties{
228 size: uint8(sz),
229 ccc: uint8(v),
230 tccc: uint8(v),
231 flags: qcInfo(v >> 8),
232 }
233 if p.ccc > 0 || p.combinesBackward() {
234 p.nLead = uint8(p.flags & 0x3)
235 }
236 return p
237 }
238 // has decomposition
239 h := decomps[v]
240 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
241 p := Properties{size: uint8(sz), flags: f, index: v}
242 if v >= firstCCC {
243 v += uint16(h&headerLenMask) + 1
244 c := decomps[v]
245 p.tccc = c >> 2
246 p.flags |= qcInfo(c & 0x3)
247 if v >= firstLeadingCCC {
248 p.nLead = c & 0x3
249 if v >= firstStarterWithNLead {
250 // We were tricked. Remove the decomposition.
251 p.flags &= 0x03
252 p.index = 0
253 return p
254 }
255 p.ccc = decomps[v+1]
256 }
257 }
258 return p
259}