Naveen Sampath | 04696f7 | 2022-06-13 15:19:14 +0530 | [diff] [blame] | 1 | package uniseg |
| 2 | |
| 3 | // The states of the grapheme cluster parser. |
| 4 | const ( |
| 5 | grAny = iota |
| 6 | grCR |
| 7 | grControlLF |
| 8 | grL |
| 9 | grLVV |
| 10 | grLVTT |
| 11 | grPrepend |
| 12 | grExtendedPictographic |
| 13 | grExtendedPictographicZWJ |
| 14 | grRIOdd |
| 15 | grRIEven |
| 16 | ) |
| 17 | |
| 18 | // The grapheme cluster parser's breaking instructions. |
| 19 | const ( |
| 20 | grNoBoundary = iota |
| 21 | grBoundary |
| 22 | ) |
| 23 | |
| 24 | // The grapheme cluster parser's state transitions. Maps (state, property) to |
| 25 | // (new state, breaking instruction, rule number). The breaking instruction |
| 26 | // always refers to the boundary between the last and next code point. |
| 27 | // |
| 28 | // This map is queried as follows: |
| 29 | // |
| 30 | // 1. Find specific state + specific property. Stop if found. |
| 31 | // 2. Find specific state + any property. |
| 32 | // 3. Find any state + specific property. |
| 33 | // 4. If only (2) or (3) (but not both) was found, stop. |
| 34 | // 5. If both (2) and (3) were found, use state and breaking instruction from |
| 35 | // the transition with the lower rule number, prefer (3) if rule numbers |
| 36 | // are equal. Stop. |
| 37 | // 6. Assume grAny and grBoundary. |
| 38 | var grTransitions = map[[2]int][3]int{ |
| 39 | // GB5 |
| 40 | {grAny, prCR}: {grCR, grBoundary, 50}, |
| 41 | {grAny, prLF}: {grControlLF, grBoundary, 50}, |
| 42 | {grAny, prControl}: {grControlLF, grBoundary, 50}, |
| 43 | |
| 44 | // GB4 |
| 45 | {grCR, prAny}: {grAny, grBoundary, 40}, |
| 46 | {grControlLF, prAny}: {grAny, grBoundary, 40}, |
| 47 | |
| 48 | // GB3. |
| 49 | {grCR, prLF}: {grAny, grNoBoundary, 30}, |
| 50 | |
| 51 | // GB6. |
| 52 | {grAny, prL}: {grL, grBoundary, 9990}, |
| 53 | {grL, prL}: {grL, grNoBoundary, 60}, |
| 54 | {grL, prV}: {grLVV, grNoBoundary, 60}, |
| 55 | {grL, prLV}: {grLVV, grNoBoundary, 60}, |
| 56 | {grL, prLVT}: {grLVTT, grNoBoundary, 60}, |
| 57 | |
| 58 | // GB7. |
| 59 | {grAny, prLV}: {grLVV, grBoundary, 9990}, |
| 60 | {grAny, prV}: {grLVV, grBoundary, 9990}, |
| 61 | {grLVV, prV}: {grLVV, grNoBoundary, 70}, |
| 62 | {grLVV, prT}: {grLVTT, grNoBoundary, 70}, |
| 63 | |
| 64 | // GB8. |
| 65 | {grAny, prLVT}: {grLVTT, grBoundary, 9990}, |
| 66 | {grAny, prT}: {grLVTT, grBoundary, 9990}, |
| 67 | {grLVTT, prT}: {grLVTT, grNoBoundary, 80}, |
| 68 | |
| 69 | // GB9. |
| 70 | {grAny, prExtend}: {grAny, grNoBoundary, 90}, |
| 71 | {grAny, prZWJ}: {grAny, grNoBoundary, 90}, |
| 72 | |
| 73 | // GB9a. |
| 74 | {grAny, prSpacingMark}: {grAny, grNoBoundary, 91}, |
| 75 | |
| 76 | // GB9b. |
| 77 | {grAny, prPreprend}: {grPrepend, grBoundary, 9990}, |
| 78 | {grPrepend, prAny}: {grAny, grNoBoundary, 92}, |
| 79 | |
| 80 | // GB11. |
| 81 | {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990}, |
| 82 | {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110}, |
| 83 | {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110}, |
| 84 | {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110}, |
| 85 | |
| 86 | // GB12 / GB13. |
| 87 | {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990}, |
| 88 | {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120}, |
| 89 | {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120}, |
| 90 | } |
| 91 | |
| 92 | // Graphemes implements an iterator over Unicode extended grapheme clusters, |
| 93 | // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to |
| 94 | // "user-perceived characters". These characters often consist of multiple |
| 95 | // code points (e.g. the "woman kissing woman" emoji consists of 8 code points: |
| 96 | // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + |
| 97 | // woman) and the rules described in Annex #29 must be applied to group those |
| 98 | // code points into clusters perceived by the user as one character. |
| 99 | type Graphemes struct { |
| 100 | // The code points over which this class iterates. |
| 101 | codePoints []rune |
| 102 | |
| 103 | // The (byte-based) indices of the code points into the original string plus |
| 104 | // len(original string). Thus, len(indices) = len(codePoints) + 1. |
| 105 | indices []int |
| 106 | |
| 107 | // The current grapheme cluster to be returned. These are indices into |
| 108 | // codePoints/indices. If start == end, we either haven't started iterating |
| 109 | // yet (0) or the iteration has already completed (1). |
| 110 | start, end int |
| 111 | |
| 112 | // The index of the next code point to be parsed. |
| 113 | pos int |
| 114 | |
| 115 | // The current state of the code point parser. |
| 116 | state int |
| 117 | } |
| 118 | |
| 119 | // NewGraphemes returns a new grapheme cluster iterator. |
| 120 | func NewGraphemes(s string) *Graphemes { |
| 121 | g := &Graphemes{} |
| 122 | for index, codePoint := range s { |
| 123 | g.codePoints = append(g.codePoints, codePoint) |
| 124 | g.indices = append(g.indices, index) |
| 125 | } |
| 126 | g.indices = append(g.indices, len(s)) |
| 127 | g.Next() // Parse ahead. |
| 128 | return g |
| 129 | } |
| 130 | |
| 131 | // Next advances the iterator by one grapheme cluster and returns false if no |
| 132 | // clusters are left. This function must be called before the first cluster is |
| 133 | // accessed. |
| 134 | func (g *Graphemes) Next() bool { |
| 135 | g.start = g.end |
| 136 | |
| 137 | // The state transition gives us a boundary instruction BEFORE the next code |
| 138 | // point so we always need to stay ahead by one code point. |
| 139 | |
| 140 | // Parse the next code point. |
| 141 | for g.pos <= len(g.codePoints) { |
| 142 | // GB2. |
| 143 | if g.pos == len(g.codePoints) { |
| 144 | g.end = g.pos |
| 145 | g.pos++ |
| 146 | break |
| 147 | } |
| 148 | |
| 149 | // Determine the property of the next character. |
| 150 | nextProperty := property(g.codePoints[g.pos]) |
| 151 | g.pos++ |
| 152 | |
| 153 | // Find the applicable transition. |
| 154 | var boundary bool |
| 155 | transition, ok := grTransitions[[2]int{g.state, nextProperty}] |
| 156 | if ok { |
| 157 | // We have a specific transition. We'll use it. |
| 158 | g.state = transition[0] |
| 159 | boundary = transition[1] == grBoundary |
| 160 | } else { |
| 161 | // No specific transition found. Try the less specific ones. |
| 162 | transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}] |
| 163 | transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}] |
| 164 | if okAnyProp && okAnyState { |
| 165 | // Both apply. We'll use a mix (see comments for grTransitions). |
| 166 | g.state = transAnyState[0] |
| 167 | boundary = transAnyState[1] == grBoundary |
| 168 | if transAnyProp[2] < transAnyState[2] { |
| 169 | g.state = transAnyProp[0] |
| 170 | boundary = transAnyProp[1] == grBoundary |
| 171 | } |
| 172 | } else if okAnyProp { |
| 173 | // We only have a specific state. |
| 174 | g.state = transAnyProp[0] |
| 175 | boundary = transAnyProp[1] == grBoundary |
| 176 | // This branch will probably never be reached because okAnyState will |
| 177 | // always be true given the current transition map. But we keep it here |
| 178 | // for future modifications to the transition map where this may not be |
| 179 | // true anymore. |
| 180 | } else if okAnyState { |
| 181 | // We only have a specific property. |
| 182 | g.state = transAnyState[0] |
| 183 | boundary = transAnyState[1] == grBoundary |
| 184 | } else { |
| 185 | // No known transition. GB999: Any x Any. |
| 186 | g.state = grAny |
| 187 | boundary = true |
| 188 | } |
| 189 | } |
| 190 | |
| 191 | // If we found a cluster boundary, let's stop here. The current cluster will |
| 192 | // be the one that just ended. |
| 193 | if g.pos-1 == 0 /* GB1 */ || boundary { |
| 194 | g.end = g.pos - 1 |
| 195 | break |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | return g.start != g.end |
| 200 | } |
| 201 | |
| 202 | // Runes returns a slice of runes (code points) which corresponds to the current |
| 203 | // grapheme cluster. If the iterator is already past the end or Next() has not |
| 204 | // yet been called, nil is returned. |
| 205 | func (g *Graphemes) Runes() []rune { |
| 206 | if g.start == g.end { |
| 207 | return nil |
| 208 | } |
| 209 | return g.codePoints[g.start:g.end] |
| 210 | } |
| 211 | |
| 212 | // Str returns a substring of the original string which corresponds to the |
| 213 | // current grapheme cluster. If the iterator is already past the end or Next() |
| 214 | // has not yet been called, an empty string is returned. |
| 215 | func (g *Graphemes) Str() string { |
| 216 | if g.start == g.end { |
| 217 | return "" |
| 218 | } |
| 219 | return string(g.codePoints[g.start:g.end]) |
| 220 | } |
| 221 | |
| 222 | // Bytes returns a byte slice which corresponds to the current grapheme cluster. |
| 223 | // If the iterator is already past the end or Next() has not yet been called, |
| 224 | // nil is returned. |
| 225 | func (g *Graphemes) Bytes() []byte { |
| 226 | if g.start == g.end { |
| 227 | return nil |
| 228 | } |
| 229 | return []byte(string(g.codePoints[g.start:g.end])) |
| 230 | } |
| 231 | |
| 232 | // Positions returns the interval of the current grapheme cluster as byte |
| 233 | // positions into the original string. The first returned value "from" indexes |
| 234 | // the first byte and the second returned value "to" indexes the first byte that |
| 235 | // is not included anymore, i.e. str[from:to] is the current grapheme cluster of |
| 236 | // the original string "str". If Next() has not yet been called, both values are |
| 237 | // 0. If the iterator is already past the end, both values are 1. |
| 238 | func (g *Graphemes) Positions() (int, int) { |
| 239 | return g.indices[g.start], g.indices[g.end] |
| 240 | } |
| 241 | |
| 242 | // Reset puts the iterator into its initial state such that the next call to |
| 243 | // Next() sets it to the first grapheme cluster again. |
| 244 | func (g *Graphemes) Reset() { |
| 245 | g.start, g.end, g.pos, g.state = 0, 0, 0, grAny |
| 246 | g.Next() // Parse ahead again. |
| 247 | } |
| 248 | |
| 249 | // GraphemeClusterCount returns the number of user-perceived characters |
| 250 | // (grapheme clusters) for the given string. To calculate this number, it |
| 251 | // iterates through the string using the Graphemes iterator. |
| 252 | func GraphemeClusterCount(s string) (n int) { |
| 253 | g := NewGraphemes(s) |
| 254 | for g.Next() { |
| 255 | n++ |
| 256 | } |
| 257 | return |
| 258 | } |