blob: 18539397914bfb0dfe473d0b9f3e55189d35f8b4 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package bidi
6
7import (
8 "container/list"
9 "fmt"
10 "sort"
11)
12
13// This file contains a port of the reference implementation of the
14// Bidi Parentheses Algorithm:
Don Newton7577f072020-01-06 12:41:11 -050015// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
Don Newton98fd8812019-09-23 15:15:02 -040016//
17// The implementation in this file covers definitions BD14-BD16 and rule N0
18// of UAX#9.
19//
20// Some preprocessing is done for each rune before data is passed to this
21// algorithm:
22// - opening and closing brackets are identified
23// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
24// is identical for the opening and closing bracket. It is left to do these
25// mappings.
26// - The BPA algorithm requires that bracket characters that are canonical
27// equivalents of each other be able to be substituted for each other.
28// It is the responsibility of the caller to do this canonicalization.
29//
30// In implementing BD16, this implementation departs slightly from the "logical"
31// algorithm defined in UAX#9. In particular, the stack referenced there
32// supports operations that go beyond a "basic" stack. An equivalent
33// implementation based on a linked list is used here.
34
35// Bidi_Paired_Bracket_Type
36// BD14. An opening paired bracket is a character whose
37// Bidi_Paired_Bracket_Type property value is Open.
38//
39// BD15. A closing paired bracket is a character whose
40// Bidi_Paired_Bracket_Type property value is Close.
41type bracketType byte
42
43const (
44 bpNone bracketType = iota
45 bpOpen
46 bpClose
47)
48
49// bracketPair holds a pair of index values for opening and closing bracket
50// location of a bracket pair.
51type bracketPair struct {
52 opener int
53 closer int
54}
55
56func (b *bracketPair) String() string {
57 return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
58}
59
60// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
61type bracketPairs []bracketPair
62
63func (b bracketPairs) Len() int { return len(b) }
64func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
65func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
66
67// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
68//
69// For each rune, it takes the indexes into the original string, the class the
70// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
71// takes the direction type for the start-of-sentence and the embedding level.
72//
73// The identifiers for bracket types are the rune of the canonicalized opening
74// bracket for brackets (open or close) or 0 for runes that are not brackets.
75func resolvePairedBrackets(s *isolatingRunSequence) {
76 p := bracketPairer{
77 sos: s.sos,
78 openers: list.New(),
79 codesIsolatedRun: s.types,
80 indexes: s.indexes,
81 }
82 dirEmbed := L
83 if s.level&1 != 0 {
84 dirEmbed = R
85 }
86 p.locateBrackets(s.p.pairTypes, s.p.pairValues)
87 p.resolveBrackets(dirEmbed, s.p.initialTypes)
88}
89
90type bracketPairer struct {
91 sos Class // direction corresponding to start of sequence
92
93 // The following is a restatement of BD 16 using non-algorithmic language.
94 //
95 // A bracket pair is a pair of characters consisting of an opening
96 // paired bracket and a closing paired bracket such that the
97 // Bidi_Paired_Bracket property value of the former equals the latter,
98 // subject to the following constraints.
99 // - both characters of a pair occur in the same isolating run sequence
100 // - the closing character of a pair follows the opening character
101 // - any bracket character can belong at most to one pair, the earliest possible one
102 // - any bracket character not part of a pair is treated like an ordinary character
103 // - pairs may nest properly, but their spans may not overlap otherwise
104
105 // Bracket characters with canonical decompositions are supposed to be
106 // treated as if they had been normalized, to allow normalized and non-
107 // normalized text to give the same result. In this implementation that step
108 // is pushed out to the caller. The caller has to ensure that the pairValue
109 // slices contain the rune of the opening bracket after normalization for
110 // any opening or closing bracket.
111
112 openers *list.List // list of positions for opening brackets
113
114 // bracket pair positions sorted by location of opening bracket
115 pairPositions bracketPairs
116
117 codesIsolatedRun []Class // directional bidi codes for an isolated run
118 indexes []int // array of index values into the original string
119
120}
121
122// matchOpener reports whether characters at given positions form a matching
123// bracket pair.
124func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
125 return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
126}
127
128const maxPairingDepth = 63
129
130// locateBrackets locates matching bracket pairs according to BD16.
131//
132// This implementation uses a linked list instead of a stack, because, while
133// elements are added at the front (like a push) they are not generally removed
134// in atomic 'pop' operations, reducing the benefit of the stack archetype.
135func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
136 // traverse the run
137 // do that explicitly (not in a for-each) so we can record position
138 for i, index := range p.indexes {
139
140 // look at the bracket type for each character
141 if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
142 // continue scanning
143 continue
144 }
145 switch pairTypes[index] {
146 case bpOpen:
147 // check if maximum pairing depth reached
148 if p.openers.Len() == maxPairingDepth {
149 p.openers.Init()
150 return
151 }
152 // remember opener location, most recent first
153 p.openers.PushFront(i)
154
155 case bpClose:
156 // see if there is a match
157 count := 0
158 for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
159 count++
160 opener := elem.Value.(int)
161 if p.matchOpener(pairValues, opener, i) {
162 // if the opener matches, add nested pair to the ordered list
163 p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
164 // remove up to and including matched opener
165 for ; count > 0; count-- {
166 p.openers.Remove(p.openers.Front())
167 }
168 break
169 }
170 }
171 sort.Sort(p.pairPositions)
172 // if we get here, the closing bracket matched no openers
173 // and gets ignored
174 }
175 }
176}
177
178// Bracket pairs within an isolating run sequence are processed as units so
179// that both the opening and the closing paired bracket in a pair resolve to
180// the same direction.
181//
182// N0. Process bracket pairs in an isolating run sequence sequentially in
183// the logical order of the text positions of the opening paired brackets
184// using the logic given below. Within this scope, bidirectional types EN
185// and AN are treated as R.
186//
187// Identify the bracket pairs in the current isolating run sequence
188// according to BD16. For each bracket-pair element in the list of pairs of
189// text positions:
190//
191// a Inspect the bidirectional types of the characters enclosed within the
192// bracket pair.
193//
194// b If any strong type (either L or R) matching the embedding direction is
195// found, set the type for both brackets in the pair to match the embedding
196// direction.
197//
198// o [ e ] o -> o e e e o
199//
200// o [ o e ] -> o e o e e
201//
202// o [ NI e ] -> o e NI e e
203//
204// c Otherwise, if a strong type (opposite the embedding direction) is
205// found, test for adjacent strong types as follows: 1 First, check
206// backwards before the opening paired bracket until the first strong type
207// (L, R, or sos) is found. If that first preceding strong type is opposite
208// the embedding direction, then set the type for both brackets in the pair
209// to that type. 2 Otherwise, set the type for both brackets in the pair to
210// the embedding direction.
211//
212// o [ o ] e -> o o o o e
213//
214// o [ o NI ] o -> o o o NI o o
215//
216// e [ o ] o -> e e o e o
217//
218// e [ o ] e -> e e o e e
219//
220// e ( o [ o ] NI ) e -> e e o o o o NI e e
221//
222// d Otherwise, do not set the type for the current bracket pair. Note that
223// if the enclosed text contains no strong types the paired brackets will
224// both resolve to the same level when resolved individually using rules N1
225// and N2.
226//
227// e ( NI ) o -> e ( NI ) o
228
229// getStrongTypeN0 maps character's directional code to strong type as required
230// by rule N0.
231//
232// TODO: have separate type for "strong" directionality.
233func (p *bracketPairer) getStrongTypeN0(index int) Class {
234 switch p.codesIsolatedRun[index] {
235 // in the scope of N0, number types are treated as R
236 case EN, AN, AL, R:
237 return R
238 case L:
239 return L
240 default:
241 return ON
242 }
243}
244
245// classifyPairContent reports the strong types contained inside a Bracket Pair,
246// assuming the given embedding direction.
247//
248// It returns ON if no strong type is found. If a single strong type is found,
Don Newton7577f072020-01-06 12:41:11 -0500249// it returns this type. Otherwise it returns the embedding direction.
Don Newton98fd8812019-09-23 15:15:02 -0400250//
251// TODO: use separate type for "strong" directionality.
252func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
253 dirOpposite := ON
254 for i := loc.opener + 1; i < loc.closer; i++ {
255 dir := p.getStrongTypeN0(i)
256 if dir == ON {
257 continue
258 }
259 if dir == dirEmbed {
260 return dir // type matching embedding direction found
261 }
262 dirOpposite = dir
263 }
264 // return ON if no strong type found, or class opposite to dirEmbed
265 return dirOpposite
266}
267
268// classBeforePair determines which strong types are present before a Bracket
269// Pair. Return R or L if strong type found, otherwise ON.
270func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
271 for i := loc.opener - 1; i >= 0; i-- {
272 if dir := p.getStrongTypeN0(i); dir != ON {
273 return dir
274 }
275 }
276 // no strong types found, return sos
277 return p.sos
278}
279
280// assignBracketType implements rule N0 for a single bracket pair.
281func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
282 // rule "N0, a", inspect contents of pair
283 dirPair := p.classifyPairContent(loc, dirEmbed)
284
285 // dirPair is now L, R, or N (no strong type found)
286
287 // the following logical tests are performed out of order compared to
288 // the statement of the rules but yield the same results
289 if dirPair == ON {
290 return // case "d" - nothing to do
291 }
292
293 if dirPair != dirEmbed {
294 // case "c": strong type found, opposite - check before (c.1)
295 dirPair = p.classBeforePair(loc)
296 if dirPair == dirEmbed || dirPair == ON {
297 // no strong opposite type found before - use embedding (c.2)
298 dirPair = dirEmbed
299 }
300 }
301 // else: case "b", strong type found matching embedding,
302 // no explicit action needed, as dirPair is already set to embedding
303 // direction
304
305 // set the bracket types to the type found
306 p.setBracketsToType(loc, dirPair, initialTypes)
307}
308
309func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
310 p.codesIsolatedRun[loc.opener] = dirPair
311 p.codesIsolatedRun[loc.closer] = dirPair
312
313 for i := loc.opener + 1; i < loc.closer; i++ {
314 index := p.indexes[i]
315 if initialTypes[index] != NSM {
316 break
317 }
318 p.codesIsolatedRun[i] = dirPair
319 }
320
321 for i := loc.closer + 1; i < len(p.indexes); i++ {
322 index := p.indexes[i]
323 if initialTypes[index] != NSM {
324 break
325 }
326 p.codesIsolatedRun[i] = dirPair
327 }
328}
329
330// resolveBrackets implements rule N0 for a list of pairs.
331func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
332 for _, loc := range p.pairPositions {
333 p.assignBracketType(loc, dirEmbed, initialTypes)
334 }
335}