| package uniseg |
| |
| // The states of the grapheme cluster parser. |
| const ( |
| grAny = iota |
| grCR |
| grControlLF |
| grL |
| grLVV |
| grLVTT |
| grPrepend |
| grExtendedPictographic |
| grExtendedPictographicZWJ |
| grRIOdd |
| grRIEven |
| ) |
| |
| // The grapheme cluster parser's breaking instructions. |
| const ( |
| grNoBoundary = iota |
| grBoundary |
| ) |
| |
| // The grapheme cluster parser's state transitions. Maps (state, property) to |
| // (new state, breaking instruction, rule number). The breaking instruction |
| // always refers to the boundary between the last and next code point. |
| // |
| // This map is queried as follows: |
| // |
| // 1. Find specific state + specific property. Stop if found. |
| // 2. Find specific state + any property. |
| // 3. Find any state + specific property. |
| // 4. If only (2) or (3) (but not both) was found, stop. |
| // 5. If both (2) and (3) were found, use state and breaking instruction from |
| // the transition with the lower rule number, prefer (3) if rule numbers |
| // are equal. Stop. |
| // 6. Assume grAny and grBoundary. |
| var grTransitions = map[[2]int][3]int{ |
| // GB5 |
| {grAny, prCR}: {grCR, grBoundary, 50}, |
| {grAny, prLF}: {grControlLF, grBoundary, 50}, |
| {grAny, prControl}: {grControlLF, grBoundary, 50}, |
| |
| // GB4 |
| {grCR, prAny}: {grAny, grBoundary, 40}, |
| {grControlLF, prAny}: {grAny, grBoundary, 40}, |
| |
| // GB3. |
| {grCR, prLF}: {grAny, grNoBoundary, 30}, |
| |
| // GB6. |
| {grAny, prL}: {grL, grBoundary, 9990}, |
| {grL, prL}: {grL, grNoBoundary, 60}, |
| {grL, prV}: {grLVV, grNoBoundary, 60}, |
| {grL, prLV}: {grLVV, grNoBoundary, 60}, |
| {grL, prLVT}: {grLVTT, grNoBoundary, 60}, |
| |
| // GB7. |
| {grAny, prLV}: {grLVV, grBoundary, 9990}, |
| {grAny, prV}: {grLVV, grBoundary, 9990}, |
| {grLVV, prV}: {grLVV, grNoBoundary, 70}, |
| {grLVV, prT}: {grLVTT, grNoBoundary, 70}, |
| |
| // GB8. |
| {grAny, prLVT}: {grLVTT, grBoundary, 9990}, |
| {grAny, prT}: {grLVTT, grBoundary, 9990}, |
| {grLVTT, prT}: {grLVTT, grNoBoundary, 80}, |
| |
| // GB9. |
| {grAny, prExtend}: {grAny, grNoBoundary, 90}, |
| {grAny, prZWJ}: {grAny, grNoBoundary, 90}, |
| |
| // GB9a. |
| {grAny, prSpacingMark}: {grAny, grNoBoundary, 91}, |
| |
| // GB9b. |
| {grAny, prPreprend}: {grPrepend, grBoundary, 9990}, |
| {grPrepend, prAny}: {grAny, grNoBoundary, 92}, |
| |
| // GB11. |
| {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990}, |
| {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110}, |
| {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110}, |
| {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110}, |
| |
| // GB12 / GB13. |
| {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990}, |
| {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120}, |
| {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120}, |
| } |
| |
| // Graphemes implements an iterator over Unicode extended grapheme clusters, |
| // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to |
| // "user-perceived characters". These characters often consist of multiple |
| // code points (e.g. the "woman kissing woman" emoji consists of 8 code points: |
| // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + |
| // woman) and the rules described in Annex #29 must be applied to group those |
| // code points into clusters perceived by the user as one character. |
| type Graphemes struct { |
| // The code points over which this class iterates. |
| codePoints []rune |
| |
| // The (byte-based) indices of the code points into the original string plus |
| // len(original string). Thus, len(indices) = len(codePoints) + 1. |
| indices []int |
| |
| // The current grapheme cluster to be returned. These are indices into |
| // codePoints/indices. If start == end, we either haven't started iterating |
| // yet (0) or the iteration has already completed (1). |
| start, end int |
| |
| // The index of the next code point to be parsed. |
| pos int |
| |
| // The current state of the code point parser. |
| state int |
| } |
| |
| // NewGraphemes returns a new grapheme cluster iterator. |
| func NewGraphemes(s string) *Graphemes { |
| g := &Graphemes{} |
| for index, codePoint := range s { |
| g.codePoints = append(g.codePoints, codePoint) |
| g.indices = append(g.indices, index) |
| } |
| g.indices = append(g.indices, len(s)) |
| g.Next() // Parse ahead. |
| return g |
| } |
| |
| // Next advances the iterator by one grapheme cluster and returns false if no |
| // clusters are left. This function must be called before the first cluster is |
| // accessed. |
| func (g *Graphemes) Next() bool { |
| g.start = g.end |
| |
| // The state transition gives us a boundary instruction BEFORE the next code |
| // point so we always need to stay ahead by one code point. |
| |
| // Parse the next code point. |
| for g.pos <= len(g.codePoints) { |
| // GB2. |
| if g.pos == len(g.codePoints) { |
| g.end = g.pos |
| g.pos++ |
| break |
| } |
| |
| // Determine the property of the next character. |
| nextProperty := property(g.codePoints[g.pos]) |
| g.pos++ |
| |
| // Find the applicable transition. |
| var boundary bool |
| transition, ok := grTransitions[[2]int{g.state, nextProperty}] |
| if ok { |
| // We have a specific transition. We'll use it. |
| g.state = transition[0] |
| boundary = transition[1] == grBoundary |
| } else { |
| // No specific transition found. Try the less specific ones. |
| transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}] |
| transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}] |
| if okAnyProp && okAnyState { |
| // Both apply. We'll use a mix (see comments for grTransitions). |
| g.state = transAnyState[0] |
| boundary = transAnyState[1] == grBoundary |
| if transAnyProp[2] < transAnyState[2] { |
| g.state = transAnyProp[0] |
| boundary = transAnyProp[1] == grBoundary |
| } |
| } else if okAnyProp { |
| // We only have a specific state. |
| g.state = transAnyProp[0] |
| boundary = transAnyProp[1] == grBoundary |
| // This branch will probably never be reached because okAnyState will |
| // always be true given the current transition map. But we keep it here |
| // for future modifications to the transition map where this may not be |
| // true anymore. |
| } else if okAnyState { |
| // We only have a specific property. |
| g.state = transAnyState[0] |
| boundary = transAnyState[1] == grBoundary |
| } else { |
| // No known transition. GB999: Any x Any. |
| g.state = grAny |
| boundary = true |
| } |
| } |
| |
| // If we found a cluster boundary, let's stop here. The current cluster will |
| // be the one that just ended. |
| if g.pos-1 == 0 /* GB1 */ || boundary { |
| g.end = g.pos - 1 |
| break |
| } |
| } |
| |
| return g.start != g.end |
| } |
| |
| // Runes returns a slice of runes (code points) which corresponds to the current |
| // grapheme cluster. If the iterator is already past the end or Next() has not |
| // yet been called, nil is returned. |
| func (g *Graphemes) Runes() []rune { |
| if g.start == g.end { |
| return nil |
| } |
| return g.codePoints[g.start:g.end] |
| } |
| |
| // Str returns a substring of the original string which corresponds to the |
| // current grapheme cluster. If the iterator is already past the end or Next() |
| // has not yet been called, an empty string is returned. |
| func (g *Graphemes) Str() string { |
| if g.start == g.end { |
| return "" |
| } |
| return string(g.codePoints[g.start:g.end]) |
| } |
| |
| // Bytes returns a byte slice which corresponds to the current grapheme cluster. |
| // If the iterator is already past the end or Next() has not yet been called, |
| // nil is returned. |
| func (g *Graphemes) Bytes() []byte { |
| if g.start == g.end { |
| return nil |
| } |
| return []byte(string(g.codePoints[g.start:g.end])) |
| } |
| |
| // Positions returns the interval of the current grapheme cluster as byte |
| // positions into the original string. The first returned value "from" indexes |
| // the first byte and the second returned value "to" indexes the first byte that |
| // is not included anymore, i.e. str[from:to] is the current grapheme cluster of |
| // the original string "str". If Next() has not yet been called, both values are |
| // 0. If the iterator is already past the end, both values are 1. |
| func (g *Graphemes) Positions() (int, int) { |
| return g.indices[g.start], g.indices[g.end] |
| } |
| |
| // Reset puts the iterator into its initial state such that the next call to |
| // Next() sets it to the first grapheme cluster again. |
| func (g *Graphemes) Reset() { |
| g.start, g.end, g.pos, g.state = 0, 0, 0, grAny |
| g.Next() // Parse ahead again. |
| } |
| |
| // GraphemeClusterCount returns the number of user-perceived characters |
| // (grapheme clusters) for the given string. To calculate this number, it |
| // iterates through the string using the Graphemes iterator. |
| func GraphemeClusterCount(s string) (n int) { |
| g := NewGraphemes(s) |
| for g.Next() { |
| n++ |
| } |
| return |
| } |