blob: 25649d4f55ffd0a65a7e4e385416c654a46389e8 [file] [log] [blame]
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package colltab
6
7import "unicode/utf8"
8
// ContractTrieSet is a flat table of trie entries that ctScanner and
// ctScannerString walk to match contractions against an input sequence.
// For a description of ContractTrieSet, see text/collate/build/contract.go.
//
// The scanners in this file read the fields of an entry as follows:
//   - L is the byte that the current input byte is compared against.
//   - N is the number of entries in the follow-up block of states, or 0
//     if a match on this entry ends the scan.
//   - H is, when N is non-zero, the offset of the follow-up block counted
//     from the end of the current block; when N is zero it is the
//     inclusive upper bound of a byte range that starts at L.
//   - I is the result index recorded on a match of L (0xFF means the match
//     records no index). For a byte c inside a range, the recorded index
//     is I + (c - L).
type ContractTrieSet []struct {
	L uint8
	H uint8
	N uint8
	I uint8
}
12
13// ctScanner is used to match a trie to an input sequence.
14// A contraction may match a non-contiguous sequence of bytes in an input string.
15// For example, if there is a contraction for <a, combining_ring>, it should match
16// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
17// not block combining_ring.
18// ctScanner does not automatically skip over non-blocking non-starters, but rather
19// retains the state of the last match and leaves it up to the user to continue
20// the match at the appropriate points.
21type ctScanner struct {
22 states ContractTrieSet
23 s []byte
24 n int
25 index int
26 pindex int
27 done bool
28}
29
30type ctScannerString struct {
31 states ContractTrieSet
32 s string
33 n int
34 index int
35 pindex int
36 done bool
37}
38
39func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
40 return ctScanner{s: b, states: t[index:], n: n}
41}
42
43func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
44 return ctScannerString{s: str, states: t[index:], n: n}
45}
46
47// result returns the offset i and bytes consumed p so far. If no suffix
48// matched, i and p will be 0.
49func (s *ctScanner) result() (i, p int) {
50 return s.index, s.pindex
51}
52
53func (s *ctScannerString) result() (i, p int) {
54 return s.index, s.pindex
55}
56
const (
	// final is the N value of a trie entry that has no follow-up states;
	// a match on such an entry ends the scan.
	final = 0

	// noIndex is the I value of a trie entry whose match does not record
	// a result index.
	noIndex = 0xFF
)
61
62// scan matches the longest suffix at the current location in the input
63// and returns the number of bytes consumed.
64func (s *ctScanner) scan(p int) int {
65 pr := p // the p at the rune start
66 str := s.s
67 states, n := s.states, s.n
68 for i := 0; i < n && p < len(str); {
69 e := states[i]
70 c := str[p]
71 // TODO: a significant number of contractions are of a form that
72 // cannot match discontiguous UTF-8 in a normalized string. We could let
73 // a negative value of e.n mean that we can set s.done = true and avoid
74 // the need for additional matches.
75 if c >= e.L {
76 if e.L == c {
77 p++
78 if e.I != noIndex {
79 s.index = int(e.I)
80 s.pindex = p
81 }
82 if e.N != final {
83 i, states, n = 0, states[int(e.H)+n:], int(e.N)
84 if p >= len(str) || utf8.RuneStart(str[p]) {
85 s.states, s.n, pr = states, n, p
86 }
87 } else {
88 s.done = true
89 return p
90 }
91 continue
92 } else if e.N == final && c <= e.H {
93 p++
94 s.done = true
95 s.index = int(c-e.L) + int(e.I)
96 s.pindex = p
97 return p
98 }
99 }
100 i++
101 }
102 return pr
103}
104
105// scan is a verbatim copy of ctScanner.scan.
106func (s *ctScannerString) scan(p int) int {
107 pr := p // the p at the rune start
108 str := s.s
109 states, n := s.states, s.n
110 for i := 0; i < n && p < len(str); {
111 e := states[i]
112 c := str[p]
113 // TODO: a significant number of contractions are of a form that
114 // cannot match discontiguous UTF-8 in a normalized string. We could let
115 // a negative value of e.n mean that we can set s.done = true and avoid
116 // the need for additional matches.
117 if c >= e.L {
118 if e.L == c {
119 p++
120 if e.I != noIndex {
121 s.index = int(e.I)
122 s.pindex = p
123 }
124 if e.N != final {
125 i, states, n = 0, states[int(e.H)+n:], int(e.N)
126 if p >= len(str) || utf8.RuneStart(str[p]) {
127 s.states, s.n, pr = states, n, p
128 }
129 } else {
130 s.done = true
131 return p
132 }
133 continue
134 } else if e.N == final && c <= e.H {
135 p++
136 s.done = true
137 s.index = int(c-e.L) + int(e.I)
138 s.pindex = p
139 return p
140 }
141 }
142 i++
143 }
144 return pr
145}