blob: fbb846b4cfaee36c243e7a185b24420f3f022d82 [file] [log] [blame]
Naveen Sampath04696f72022-06-13 15:19:14 +05301package uniseg
2
3// The states of the grapheme cluster parser.
4const (
5 grAny = iota
6 grCR
7 grControlLF
8 grL
9 grLVV
10 grLVTT
11 grPrepend
12 grExtendedPictographic
13 grExtendedPictographicZWJ
14 grRIOdd
15 grRIEven
16)
17
18// The grapheme cluster parser's breaking instructions.
19const (
20 grNoBoundary = iota
21 grBoundary
22)
23
24// The grapheme cluster parser's state transitions. Maps (state, property) to
25// (new state, breaking instruction, rule number). The breaking instruction
26// always refers to the boundary between the last and next code point.
27//
28// This map is queried as follows:
29//
30// 1. Find specific state + specific property. Stop if found.
31// 2. Find specific state + any property.
32// 3. Find any state + specific property.
33// 4. If only (2) or (3) (but not both) was found, stop.
34// 5. If both (2) and (3) were found, use state and breaking instruction from
35// the transition with the lower rule number, prefer (3) if rule numbers
36// are equal. Stop.
37// 6. Assume grAny and grBoundary.
38var grTransitions = map[[2]int][3]int{
39 // GB5
40 {grAny, prCR}: {grCR, grBoundary, 50},
41 {grAny, prLF}: {grControlLF, grBoundary, 50},
42 {grAny, prControl}: {grControlLF, grBoundary, 50},
43
44 // GB4
45 {grCR, prAny}: {grAny, grBoundary, 40},
46 {grControlLF, prAny}: {grAny, grBoundary, 40},
47
48 // GB3.
49 {grCR, prLF}: {grAny, grNoBoundary, 30},
50
51 // GB6.
52 {grAny, prL}: {grL, grBoundary, 9990},
53 {grL, prL}: {grL, grNoBoundary, 60},
54 {grL, prV}: {grLVV, grNoBoundary, 60},
55 {grL, prLV}: {grLVV, grNoBoundary, 60},
56 {grL, prLVT}: {grLVTT, grNoBoundary, 60},
57
58 // GB7.
59 {grAny, prLV}: {grLVV, grBoundary, 9990},
60 {grAny, prV}: {grLVV, grBoundary, 9990},
61 {grLVV, prV}: {grLVV, grNoBoundary, 70},
62 {grLVV, prT}: {grLVTT, grNoBoundary, 70},
63
64 // GB8.
65 {grAny, prLVT}: {grLVTT, grBoundary, 9990},
66 {grAny, prT}: {grLVTT, grBoundary, 9990},
67 {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
68
69 // GB9.
70 {grAny, prExtend}: {grAny, grNoBoundary, 90},
71 {grAny, prZWJ}: {grAny, grNoBoundary, 90},
72
73 // GB9a.
74 {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
75
76 // GB9b.
77 {grAny, prPreprend}: {grPrepend, grBoundary, 9990},
78 {grPrepend, prAny}: {grAny, grNoBoundary, 92},
79
80 // GB11.
81 {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
82 {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
83 {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
84 {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
85
86 // GB12 / GB13.
87 {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
88 {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
89 {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
90}
91
92// Graphemes implements an iterator over Unicode extended grapheme clusters,
93// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
94// "user-perceived characters". These characters often consist of multiple
95// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
96// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
97// woman) and the rules described in Annex #29 must be applied to group those
98// code points into clusters perceived by the user as one character.
99type Graphemes struct {
100 // The code points over which this class iterates.
101 codePoints []rune
102
103 // The (byte-based) indices of the code points into the original string plus
104 // len(original string). Thus, len(indices) = len(codePoints) + 1.
105 indices []int
106
107 // The current grapheme cluster to be returned. These are indices into
108 // codePoints/indices. If start == end, we either haven't started iterating
109 // yet (0) or the iteration has already completed (1).
110 start, end int
111
112 // The index of the next code point to be parsed.
113 pos int
114
115 // The current state of the code point parser.
116 state int
117}
118
119// NewGraphemes returns a new grapheme cluster iterator.
120func NewGraphemes(s string) *Graphemes {
121 g := &Graphemes{}
122 for index, codePoint := range s {
123 g.codePoints = append(g.codePoints, codePoint)
124 g.indices = append(g.indices, index)
125 }
126 g.indices = append(g.indices, len(s))
127 g.Next() // Parse ahead.
128 return g
129}
130
131// Next advances the iterator by one grapheme cluster and returns false if no
132// clusters are left. This function must be called before the first cluster is
133// accessed.
134func (g *Graphemes) Next() bool {
135 g.start = g.end
136
137 // The state transition gives us a boundary instruction BEFORE the next code
138 // point so we always need to stay ahead by one code point.
139
140 // Parse the next code point.
141 for g.pos <= len(g.codePoints) {
142 // GB2.
143 if g.pos == len(g.codePoints) {
144 g.end = g.pos
145 g.pos++
146 break
147 }
148
149 // Determine the property of the next character.
150 nextProperty := property(g.codePoints[g.pos])
151 g.pos++
152
153 // Find the applicable transition.
154 var boundary bool
155 transition, ok := grTransitions[[2]int{g.state, nextProperty}]
156 if ok {
157 // We have a specific transition. We'll use it.
158 g.state = transition[0]
159 boundary = transition[1] == grBoundary
160 } else {
161 // No specific transition found. Try the less specific ones.
162 transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
163 transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
164 if okAnyProp && okAnyState {
165 // Both apply. We'll use a mix (see comments for grTransitions).
166 g.state = transAnyState[0]
167 boundary = transAnyState[1] == grBoundary
168 if transAnyProp[2] < transAnyState[2] {
169 g.state = transAnyProp[0]
170 boundary = transAnyProp[1] == grBoundary
171 }
172 } else if okAnyProp {
173 // We only have a specific state.
174 g.state = transAnyProp[0]
175 boundary = transAnyProp[1] == grBoundary
176 // This branch will probably never be reached because okAnyState will
177 // always be true given the current transition map. But we keep it here
178 // for future modifications to the transition map where this may not be
179 // true anymore.
180 } else if okAnyState {
181 // We only have a specific property.
182 g.state = transAnyState[0]
183 boundary = transAnyState[1] == grBoundary
184 } else {
185 // No known transition. GB999: Any x Any.
186 g.state = grAny
187 boundary = true
188 }
189 }
190
191 // If we found a cluster boundary, let's stop here. The current cluster will
192 // be the one that just ended.
193 if g.pos-1 == 0 /* GB1 */ || boundary {
194 g.end = g.pos - 1
195 break
196 }
197 }
198
199 return g.start != g.end
200}
201
202// Runes returns a slice of runes (code points) which corresponds to the current
203// grapheme cluster. If the iterator is already past the end or Next() has not
204// yet been called, nil is returned.
205func (g *Graphemes) Runes() []rune {
206 if g.start == g.end {
207 return nil
208 }
209 return g.codePoints[g.start:g.end]
210}
211
212// Str returns a substring of the original string which corresponds to the
213// current grapheme cluster. If the iterator is already past the end or Next()
214// has not yet been called, an empty string is returned.
215func (g *Graphemes) Str() string {
216 if g.start == g.end {
217 return ""
218 }
219 return string(g.codePoints[g.start:g.end])
220}
221
222// Bytes returns a byte slice which corresponds to the current grapheme cluster.
223// If the iterator is already past the end or Next() has not yet been called,
224// nil is returned.
225func (g *Graphemes) Bytes() []byte {
226 if g.start == g.end {
227 return nil
228 }
229 return []byte(string(g.codePoints[g.start:g.end]))
230}
231
232// Positions returns the interval of the current grapheme cluster as byte
233// positions into the original string. The first returned value "from" indexes
234// the first byte and the second returned value "to" indexes the first byte that
235// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
236// the original string "str". If Next() has not yet been called, both values are
237// 0. If the iterator is already past the end, both values are 1.
238func (g *Graphemes) Positions() (int, int) {
239 return g.indices[g.start], g.indices[g.end]
240}
241
242// Reset puts the iterator into its initial state such that the next call to
243// Next() sets it to the first grapheme cluster again.
244func (g *Graphemes) Reset() {
245 g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
246 g.Next() // Parse ahead again.
247}
248
249// GraphemeClusterCount returns the number of user-perceived characters
250// (grapheme clusters) for the given string. To calculate this number, it
251// iterates through the string using the Graphemes iterator.
252func GraphemeClusterCount(s string) (n int) {
253 g := NewGraphemes(s)
254 for g.Next() {
255 n++
256 }
257 return
258}