Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame^] | 1 | // Copyright 2015 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package bidi |
| 6 | |
| 7 | import ( |
| 8 | "container/list" |
| 9 | "fmt" |
| 10 | "sort" |
| 11 | ) |
| 12 | |
| 13 | // This file contains a port of the reference implementation of the |
| 14 | // Bidi Parentheses Algorithm: |
| 15 | // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java |
| 16 | // |
| 17 | // The implementation in this file covers definitions BD14-BD16 and rule N0 |
| 18 | // of UAX#9. |
| 19 | // |
| 20 | // Some preprocessing is done for each rune before data is passed to this |
| 21 | // algorithm: |
| 22 | // - opening and closing brackets are identified |
| 23 | // - a bracket pair type, like '(' and ')' is assigned a unique identifier that |
| 24 | // is identical for the opening and closing bracket. It is left to do these |
| 25 | // mappings. |
| 26 | // - The BPA algorithm requires that bracket characters that are canonical |
| 27 | // equivalents of each other be able to be substituted for each other. |
| 28 | // It is the responsibility of the caller to do this canonicalization. |
| 29 | // |
| 30 | // In implementing BD16, this implementation departs slightly from the "logical" |
| 31 | // algorithm defined in UAX#9. In particular, the stack referenced there |
| 32 | // supports operations that go beyond a "basic" stack. An equivalent |
| 33 | // implementation based on a linked list is used here. |
| 34 | |
| 35 | // Bidi_Paired_Bracket_Type |
| 36 | // BD14. An opening paired bracket is a character whose |
| 37 | // Bidi_Paired_Bracket_Type property value is Open. |
| 38 | // |
| 39 | // BD15. A closing paired bracket is a character whose |
| 40 | // Bidi_Paired_Bracket_Type property value is Close. |
| 41 | type bracketType byte |
| 42 | |
| 43 | const ( |
| 44 | bpNone bracketType = iota |
| 45 | bpOpen |
| 46 | bpClose |
| 47 | ) |
| 48 | |
| 49 | // bracketPair holds a pair of index values for opening and closing bracket |
| 50 | // location of a bracket pair. |
| 51 | type bracketPair struct { |
| 52 | opener int |
| 53 | closer int |
| 54 | } |
| 55 | |
| 56 | func (b *bracketPair) String() string { |
| 57 | return fmt.Sprintf("(%v, %v)", b.opener, b.closer) |
| 58 | } |
| 59 | |
| 60 | // bracketPairs is a slice of bracketPairs with a sort.Interface implementation. |
| 61 | type bracketPairs []bracketPair |
| 62 | |
| 63 | func (b bracketPairs) Len() int { return len(b) } |
| 64 | func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
| 65 | func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener } |
| 66 | |
| 67 | // resolvePairedBrackets runs the paired bracket part of the UBA algorithm. |
| 68 | // |
| 69 | // For each rune, it takes the indexes into the original string, the class the |
| 70 | // bracket type (in pairTypes) and the bracket identifier (pairValues). It also |
| 71 | // takes the direction type for the start-of-sentence and the embedding level. |
| 72 | // |
| 73 | // The identifiers for bracket types are the rune of the canonicalized opening |
| 74 | // bracket for brackets (open or close) or 0 for runes that are not brackets. |
| 75 | func resolvePairedBrackets(s *isolatingRunSequence) { |
| 76 | p := bracketPairer{ |
| 77 | sos: s.sos, |
| 78 | openers: list.New(), |
| 79 | codesIsolatedRun: s.types, |
| 80 | indexes: s.indexes, |
| 81 | } |
| 82 | dirEmbed := L |
| 83 | if s.level&1 != 0 { |
| 84 | dirEmbed = R |
| 85 | } |
| 86 | p.locateBrackets(s.p.pairTypes, s.p.pairValues) |
| 87 | p.resolveBrackets(dirEmbed, s.p.initialTypes) |
| 88 | } |
| 89 | |
| 90 | type bracketPairer struct { |
| 91 | sos Class // direction corresponding to start of sequence |
| 92 | |
| 93 | // The following is a restatement of BD 16 using non-algorithmic language. |
| 94 | // |
| 95 | // A bracket pair is a pair of characters consisting of an opening |
| 96 | // paired bracket and a closing paired bracket such that the |
| 97 | // Bidi_Paired_Bracket property value of the former equals the latter, |
| 98 | // subject to the following constraints. |
| 99 | // - both characters of a pair occur in the same isolating run sequence |
| 100 | // - the closing character of a pair follows the opening character |
| 101 | // - any bracket character can belong at most to one pair, the earliest possible one |
| 102 | // - any bracket character not part of a pair is treated like an ordinary character |
| 103 | // - pairs may nest properly, but their spans may not overlap otherwise |
| 104 | |
| 105 | // Bracket characters with canonical decompositions are supposed to be |
| 106 | // treated as if they had been normalized, to allow normalized and non- |
| 107 | // normalized text to give the same result. In this implementation that step |
| 108 | // is pushed out to the caller. The caller has to ensure that the pairValue |
| 109 | // slices contain the rune of the opening bracket after normalization for |
| 110 | // any opening or closing bracket. |
| 111 | |
| 112 | openers *list.List // list of positions for opening brackets |
| 113 | |
| 114 | // bracket pair positions sorted by location of opening bracket |
| 115 | pairPositions bracketPairs |
| 116 | |
| 117 | codesIsolatedRun []Class // directional bidi codes for an isolated run |
| 118 | indexes []int // array of index values into the original string |
| 119 | |
| 120 | } |
| 121 | |
| 122 | // matchOpener reports whether characters at given positions form a matching |
| 123 | // bracket pair. |
| 124 | func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool { |
| 125 | return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]] |
| 126 | } |
| 127 | |
| 128 | const maxPairingDepth = 63 |
| 129 | |
| 130 | // locateBrackets locates matching bracket pairs according to BD16. |
| 131 | // |
| 132 | // This implementation uses a linked list instead of a stack, because, while |
| 133 | // elements are added at the front (like a push) they are not generally removed |
| 134 | // in atomic 'pop' operations, reducing the benefit of the stack archetype. |
| 135 | func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) { |
| 136 | // traverse the run |
| 137 | // do that explicitly (not in a for-each) so we can record position |
| 138 | for i, index := range p.indexes { |
| 139 | |
| 140 | // look at the bracket type for each character |
| 141 | if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON { |
| 142 | // continue scanning |
| 143 | continue |
| 144 | } |
| 145 | switch pairTypes[index] { |
| 146 | case bpOpen: |
| 147 | // check if maximum pairing depth reached |
| 148 | if p.openers.Len() == maxPairingDepth { |
| 149 | p.openers.Init() |
| 150 | return |
| 151 | } |
| 152 | // remember opener location, most recent first |
| 153 | p.openers.PushFront(i) |
| 154 | |
| 155 | case bpClose: |
| 156 | // see if there is a match |
| 157 | count := 0 |
| 158 | for elem := p.openers.Front(); elem != nil; elem = elem.Next() { |
| 159 | count++ |
| 160 | opener := elem.Value.(int) |
| 161 | if p.matchOpener(pairValues, opener, i) { |
| 162 | // if the opener matches, add nested pair to the ordered list |
| 163 | p.pairPositions = append(p.pairPositions, bracketPair{opener, i}) |
| 164 | // remove up to and including matched opener |
| 165 | for ; count > 0; count-- { |
| 166 | p.openers.Remove(p.openers.Front()) |
| 167 | } |
| 168 | break |
| 169 | } |
| 170 | } |
| 171 | sort.Sort(p.pairPositions) |
| 172 | // if we get here, the closing bracket matched no openers |
| 173 | // and gets ignored |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | // Bracket pairs within an isolating run sequence are processed as units so |
| 179 | // that both the opening and the closing paired bracket in a pair resolve to |
| 180 | // the same direction. |
| 181 | // |
| 182 | // N0. Process bracket pairs in an isolating run sequence sequentially in |
| 183 | // the logical order of the text positions of the opening paired brackets |
| 184 | // using the logic given below. Within this scope, bidirectional types EN |
| 185 | // and AN are treated as R. |
| 186 | // |
| 187 | // Identify the bracket pairs in the current isolating run sequence |
| 188 | // according to BD16. For each bracket-pair element in the list of pairs of |
| 189 | // text positions: |
| 190 | // |
| 191 | // a Inspect the bidirectional types of the characters enclosed within the |
| 192 | // bracket pair. |
| 193 | // |
| 194 | // b If any strong type (either L or R) matching the embedding direction is |
| 195 | // found, set the type for both brackets in the pair to match the embedding |
| 196 | // direction. |
| 197 | // |
| 198 | // o [ e ] o -> o e e e o |
| 199 | // |
| 200 | // o [ o e ] -> o e o e e |
| 201 | // |
| 202 | // o [ NI e ] -> o e NI e e |
| 203 | // |
| 204 | // c Otherwise, if a strong type (opposite the embedding direction) is |
| 205 | // found, test for adjacent strong types as follows: 1 First, check |
| 206 | // backwards before the opening paired bracket until the first strong type |
| 207 | // (L, R, or sos) is found. If that first preceding strong type is opposite |
| 208 | // the embedding direction, then set the type for both brackets in the pair |
| 209 | // to that type. 2 Otherwise, set the type for both brackets in the pair to |
| 210 | // the embedding direction. |
| 211 | // |
| 212 | // o [ o ] e -> o o o o e |
| 213 | // |
| 214 | // o [ o NI ] o -> o o o NI o o |
| 215 | // |
| 216 | // e [ o ] o -> e e o e o |
| 217 | // |
| 218 | // e [ o ] e -> e e o e e |
| 219 | // |
| 220 | // e ( o [ o ] NI ) e -> e e o o o o NI e e |
| 221 | // |
| 222 | // d Otherwise, do not set the type for the current bracket pair. Note that |
| 223 | // if the enclosed text contains no strong types the paired brackets will |
| 224 | // both resolve to the same level when resolved individually using rules N1 |
| 225 | // and N2. |
| 226 | // |
| 227 | // e ( NI ) o -> e ( NI ) o |
| 228 | |
| 229 | // getStrongTypeN0 maps character's directional code to strong type as required |
| 230 | // by rule N0. |
| 231 | // |
| 232 | // TODO: have separate type for "strong" directionality. |
| 233 | func (p *bracketPairer) getStrongTypeN0(index int) Class { |
| 234 | switch p.codesIsolatedRun[index] { |
| 235 | // in the scope of N0, number types are treated as R |
| 236 | case EN, AN, AL, R: |
| 237 | return R |
| 238 | case L: |
| 239 | return L |
| 240 | default: |
| 241 | return ON |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | // classifyPairContent reports the strong types contained inside a Bracket Pair, |
| 246 | // assuming the given embedding direction. |
| 247 | // |
| 248 | // It returns ON if no strong type is found. If a single strong type is found, |
| 249 | // it returns this type. Otherwise it returns the embedding direction. |
| 250 | // |
| 251 | // TODO: use separate type for "strong" directionality. |
| 252 | func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class { |
| 253 | dirOpposite := ON |
| 254 | for i := loc.opener + 1; i < loc.closer; i++ { |
| 255 | dir := p.getStrongTypeN0(i) |
| 256 | if dir == ON { |
| 257 | continue |
| 258 | } |
| 259 | if dir == dirEmbed { |
| 260 | return dir // type matching embedding direction found |
| 261 | } |
| 262 | dirOpposite = dir |
| 263 | } |
| 264 | // return ON if no strong type found, or class opposite to dirEmbed |
| 265 | return dirOpposite |
| 266 | } |
| 267 | |
| 268 | // classBeforePair determines which strong types are present before a Bracket |
| 269 | // Pair. Return R or L if strong type found, otherwise ON. |
| 270 | func (p *bracketPairer) classBeforePair(loc bracketPair) Class { |
| 271 | for i := loc.opener - 1; i >= 0; i-- { |
| 272 | if dir := p.getStrongTypeN0(i); dir != ON { |
| 273 | return dir |
| 274 | } |
| 275 | } |
| 276 | // no strong types found, return sos |
| 277 | return p.sos |
| 278 | } |
| 279 | |
| 280 | // assignBracketType implements rule N0 for a single bracket pair. |
| 281 | func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) { |
| 282 | // rule "N0, a", inspect contents of pair |
| 283 | dirPair := p.classifyPairContent(loc, dirEmbed) |
| 284 | |
| 285 | // dirPair is now L, R, or N (no strong type found) |
| 286 | |
| 287 | // the following logical tests are performed out of order compared to |
| 288 | // the statement of the rules but yield the same results |
| 289 | if dirPair == ON { |
| 290 | return // case "d" - nothing to do |
| 291 | } |
| 292 | |
| 293 | if dirPair != dirEmbed { |
| 294 | // case "c": strong type found, opposite - check before (c.1) |
| 295 | dirPair = p.classBeforePair(loc) |
| 296 | if dirPair == dirEmbed || dirPair == ON { |
| 297 | // no strong opposite type found before - use embedding (c.2) |
| 298 | dirPair = dirEmbed |
| 299 | } |
| 300 | } |
| 301 | // else: case "b", strong type found matching embedding, |
| 302 | // no explicit action needed, as dirPair is already set to embedding |
| 303 | // direction |
| 304 | |
| 305 | // set the bracket types to the type found |
| 306 | p.setBracketsToType(loc, dirPair, initialTypes) |
| 307 | } |
| 308 | |
| 309 | func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) { |
| 310 | p.codesIsolatedRun[loc.opener] = dirPair |
| 311 | p.codesIsolatedRun[loc.closer] = dirPair |
| 312 | |
| 313 | for i := loc.opener + 1; i < loc.closer; i++ { |
| 314 | index := p.indexes[i] |
| 315 | if initialTypes[index] != NSM { |
| 316 | break |
| 317 | } |
| 318 | p.codesIsolatedRun[i] = dirPair |
| 319 | } |
| 320 | |
| 321 | for i := loc.closer + 1; i < len(p.indexes); i++ { |
| 322 | index := p.indexes[i] |
| 323 | if initialTypes[index] != NSM { |
| 324 | break |
| 325 | } |
| 326 | p.codesIsolatedRun[i] = dirPair |
| 327 | } |
| 328 | } |
| 329 | |
| 330 | // resolveBrackets implements rule N0 for a list of pairs. |
| 331 | func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) { |
| 332 | for _, loc := range p.pairPositions { |
| 333 | p.assignBracketType(loc, dirEmbed, initialTypes) |
| 334 | } |
| 335 | } |