// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | package bidi |
| 6 | |
| 7 | import "log" |
| 8 | |
| 9 | // This implementation is a port based on the reference implementation found at: |
| 10 | // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/ |
| 11 | // |
| 12 | // described in Unicode Bidirectional Algorithm (UAX #9). |
| 13 | // |
| 14 | // Input: |
| 15 | // There are two levels of input to the algorithm, since clients may prefer to |
| 16 | // supply some information from out-of-band sources rather than relying on the |
| 17 | // default behavior. |
| 18 | // |
| 19 | // - Bidi class array |
| 20 | // - Bidi class array, with externally supplied base line direction |
| 21 | // |
| 22 | // Output: |
| 23 | // Output is separated into several stages: |
| 24 | // |
| 25 | // - levels array over entire paragraph |
| 26 | // - reordering array over entire paragraph |
| 27 | // - levels array over line |
| 28 | // - reordering array over line |
| 29 | // |
| 30 | // Note that for conformance to the Unicode Bidirectional Algorithm, |
| 31 | // implementations are only required to generate correct reordering and |
| 32 | // character directionality (odd or even levels) over a line. Generating |
| 33 | // identical level arrays over a line is not required. Bidi explicit format |
| 34 | // codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and |
| 35 | // positions as long as the rest of the input is properly reordered. |
| 36 | // |
| 37 | // As the algorithm is defined to operate on a single paragraph at a time, this |
| 38 | // implementation is written to handle single paragraphs. Thus rule P1 is |
| 39 | // presumed by this implementation-- the data provided to the implementation is |
| 40 | // assumed to be a single paragraph, and either contains no 'B' codes, or a |
| 41 | // single 'B' code at the end of the input. 'B' is allowed as input to |
| 42 | // illustrate how the algorithm assigns it a level. |
| 43 | // |
| 44 | // Also note that rules L3 and L4 depend on the rendering engine that uses the |
| 45 | // result of the bidi algorithm. This implementation assumes that the rendering |
| 46 | // engine expects combining marks in visual order (e.g. to the left of their |
| 47 | // base character in RTL runs) and that it adjusts the glyphs used to render |
| 48 | // mirrored characters that are in RTL runs so that they render appropriately. |
| 49 | |
// A level is the embedding level assigned to a character. Characters at an
// even level are ordered left-to-right and characters at an odd level are
// ordered right-to-left. The value -1 is set aside for an undefined order.
type level int8

// implicitLevel is the reserved level (-1) that stands for an undefined order.
const implicitLevel = level(-1)
| 56 | |
| 57 | // in returns if x is equal to any of the values in set. |
| 58 | func (c Class) in(set ...Class) bool { |
| 59 | for _, s := range set { |
| 60 | if c == s { |
| 61 | return true |
| 62 | } |
| 63 | } |
| 64 | return false |
| 65 | } |
| 66 | |
| 67 | // A paragraph contains the state of a paragraph. |
| 68 | type paragraph struct { |
| 69 | initialTypes []Class |
| 70 | |
| 71 | // Arrays of properties needed for paired bracket evaluation in N0 |
| 72 | pairTypes []bracketType // paired Bracket types for paragraph |
| 73 | pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone |
| 74 | |
| 75 | embeddingLevel level // default: = implicitLevel; |
| 76 | |
| 77 | // at the paragraph levels |
| 78 | resultTypes []Class |
| 79 | resultLevels []level |
| 80 | |
| 81 | // Index of matching PDI for isolate initiator characters. For other |
| 82 | // characters, the value of matchingPDI will be set to -1. For isolate |
| 83 | // initiators with no matching PDI, matchingPDI will be set to the length of |
| 84 | // the input string. |
| 85 | matchingPDI []int |
| 86 | |
| 87 | // Index of matching isolate initiator for PDI characters. For other |
| 88 | // characters, and for PDIs with no matching isolate initiator, the value of |
| 89 | // matchingIsolateInitiator will be set to -1. |
| 90 | matchingIsolateInitiator []int |
| 91 | } |
| 92 | |
| 93 | // newParagraph initializes a paragraph. The user needs to supply a few arrays |
| 94 | // corresponding to the preprocessed text input. The types correspond to the |
| 95 | // Unicode BiDi classes for each rune. pairTypes indicates the bracket type for |
| 96 | // each rune. pairValues provides a unique bracket class identifier for each |
| 97 | // rune (suggested is the rune of the open bracket for opening and matching |
| 98 | // close brackets, after normalization). The embedding levels are optional, but |
| 99 | // may be supplied to encode embedding levels of styled text. |
| 100 | // |
| 101 | // TODO: return an error. |
| 102 | func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) *paragraph { |
| 103 | validateTypes(types) |
| 104 | validatePbTypes(pairTypes) |
| 105 | validatePbValues(pairValues, pairTypes) |
| 106 | validateParagraphEmbeddingLevel(levels) |
| 107 | |
| 108 | p := ¶graph{ |
| 109 | initialTypes: append([]Class(nil), types...), |
| 110 | embeddingLevel: levels, |
| 111 | |
| 112 | pairTypes: pairTypes, |
| 113 | pairValues: pairValues, |
| 114 | |
| 115 | resultTypes: append([]Class(nil), types...), |
| 116 | } |
| 117 | p.run() |
| 118 | return p |
| 119 | } |
| 120 | |
| 121 | func (p *paragraph) Len() int { return len(p.initialTypes) } |
| 122 | |
| 123 | // The algorithm. Does not include line-based processing (Rules L1, L2). |
| 124 | // These are applied later in the line-based phase of the algorithm. |
| 125 | func (p *paragraph) run() { |
| 126 | p.determineMatchingIsolates() |
| 127 | |
| 128 | // 1) determining the paragraph level |
| 129 | // Rule P1 is the requirement for entering this algorithm. |
| 130 | // Rules P2, P3. |
| 131 | // If no externally supplied paragraph embedding level, use default. |
| 132 | if p.embeddingLevel == implicitLevel { |
| 133 | p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len()) |
| 134 | } |
| 135 | |
| 136 | // Initialize result levels to paragraph embedding level. |
| 137 | p.resultLevels = make([]level, p.Len()) |
| 138 | setLevels(p.resultLevels, p.embeddingLevel) |
| 139 | |
| 140 | // 2) Explicit levels and directions |
| 141 | // Rules X1-X8. |
| 142 | p.determineExplicitEmbeddingLevels() |
| 143 | |
| 144 | // Rule X9. |
| 145 | // We do not remove the embeddings, the overrides, the PDFs, and the BNs |
| 146 | // from the string explicitly. But they are not copied into isolating run |
| 147 | // sequences when they are created, so they are removed for all |
| 148 | // practical purposes. |
| 149 | |
| 150 | // Rule X10. |
| 151 | // Run remainder of algorithm one isolating run sequence at a time |
| 152 | for _, seq := range p.determineIsolatingRunSequences() { |
| 153 | // 3) resolving weak types |
| 154 | // Rules W1-W7. |
| 155 | seq.resolveWeakTypes() |
| 156 | |
| 157 | // 4a) resolving paired brackets |
| 158 | // Rule N0 |
| 159 | resolvePairedBrackets(seq) |
| 160 | |
| 161 | // 4b) resolving neutral types |
| 162 | // Rules N1-N3. |
| 163 | seq.resolveNeutralTypes() |
| 164 | |
| 165 | // 5) resolving implicit embedding levels |
| 166 | // Rules I1, I2. |
| 167 | seq.resolveImplicitLevels() |
| 168 | |
| 169 | // Apply the computed levels and types |
| 170 | seq.applyLevelsAndTypes() |
| 171 | } |
| 172 | |
| 173 | // Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and |
| 174 | // BNs. This is for convenience, so the resulting level array will have |
| 175 | // a value for every character. |
| 176 | p.assignLevelsToCharactersRemovedByX9() |
| 177 | } |
| 178 | |
| 179 | // determineMatchingIsolates determines the matching PDI for each isolate |
| 180 | // initiator and vice versa. |
| 181 | // |
| 182 | // Definition BD9. |
| 183 | // |
| 184 | // At the end of this function: |
| 185 | // |
| 186 | // - The member variable matchingPDI is set to point to the index of the |
| 187 | // matching PDI character for each isolate initiator character. If there is |
| 188 | // no matching PDI, it is set to the length of the input text. For other |
| 189 | // characters, it is set to -1. |
| 190 | // - The member variable matchingIsolateInitiator is set to point to the |
| 191 | // index of the matching isolate initiator character for each PDI character. |
| 192 | // If there is no matching isolate initiator, or the character is not a PDI, |
| 193 | // it is set to -1. |
| 194 | func (p *paragraph) determineMatchingIsolates() { |
| 195 | p.matchingPDI = make([]int, p.Len()) |
| 196 | p.matchingIsolateInitiator = make([]int, p.Len()) |
| 197 | |
| 198 | for i := range p.matchingIsolateInitiator { |
| 199 | p.matchingIsolateInitiator[i] = -1 |
| 200 | } |
| 201 | |
| 202 | for i := range p.matchingPDI { |
| 203 | p.matchingPDI[i] = -1 |
| 204 | |
| 205 | if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) { |
| 206 | depthCounter := 1 |
| 207 | for j := i + 1; j < p.Len(); j++ { |
| 208 | if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) { |
| 209 | depthCounter++ |
| 210 | } else if u == PDI { |
| 211 | if depthCounter--; depthCounter == 0 { |
| 212 | p.matchingPDI[i] = j |
| 213 | p.matchingIsolateInitiator[j] = i |
| 214 | break |
| 215 | } |
| 216 | } |
| 217 | } |
| 218 | if p.matchingPDI[i] == -1 { |
| 219 | p.matchingPDI[i] = p.Len() |
| 220 | } |
| 221 | } |
| 222 | } |
| 223 | } |
| 224 | |
| 225 | // determineParagraphEmbeddingLevel reports the resolved paragraph direction of |
| 226 | // the substring limited by the given range [start, end). |
| 227 | // |
| 228 | // Determines the paragraph level based on rules P2, P3. This is also used |
| 229 | // in rule X5c to find if an FSI should resolve to LRI or RLI. |
| 230 | func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level { |
| 231 | var strongType Class = unknownClass |
| 232 | |
| 233 | // Rule P2. |
| 234 | for i := start; i < end; i++ { |
| 235 | if t := p.resultTypes[i]; t.in(L, AL, R) { |
| 236 | strongType = t |
| 237 | break |
| 238 | } else if t.in(FSI, LRI, RLI) { |
| 239 | i = p.matchingPDI[i] // skip over to the matching PDI |
| 240 | if i > end { |
| 241 | log.Panic("assert (i <= end)") |
| 242 | } |
| 243 | } |
| 244 | } |
| 245 | // Rule P3. |
| 246 | switch strongType { |
| 247 | case unknownClass: // none found |
| 248 | // default embedding level when no strong types found is 0. |
| 249 | return 0 |
| 250 | case L: |
| 251 | return 0 |
| 252 | default: // AL, R |
| 253 | return 1 |
| 254 | } |
| 255 | } |
| 256 | |
| 257 | const maxDepth = 125 |
| 258 | |
| 259 | // This stack will store the embedding levels and override and isolated |
| 260 | // statuses |
| 261 | type directionalStatusStack struct { |
| 262 | stackCounter int |
| 263 | embeddingLevelStack [maxDepth + 1]level |
| 264 | overrideStatusStack [maxDepth + 1]Class |
| 265 | isolateStatusStack [maxDepth + 1]bool |
| 266 | } |
| 267 | |
| 268 | func (s *directionalStatusStack) empty() { s.stackCounter = 0 } |
| 269 | func (s *directionalStatusStack) pop() { s.stackCounter-- } |
| 270 | func (s *directionalStatusStack) depth() int { return s.stackCounter } |
| 271 | |
| 272 | func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) { |
| 273 | s.embeddingLevelStack[s.stackCounter] = level |
| 274 | s.overrideStatusStack[s.stackCounter] = overrideStatus |
| 275 | s.isolateStatusStack[s.stackCounter] = isolateStatus |
| 276 | s.stackCounter++ |
| 277 | } |
| 278 | |
| 279 | func (s *directionalStatusStack) lastEmbeddingLevel() level { |
| 280 | return s.embeddingLevelStack[s.stackCounter-1] |
| 281 | } |
| 282 | |
| 283 | func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class { |
| 284 | return s.overrideStatusStack[s.stackCounter-1] |
| 285 | } |
| 286 | |
| 287 | func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool { |
| 288 | return s.isolateStatusStack[s.stackCounter-1] |
| 289 | } |
| 290 | |
| 291 | // Determine explicit levels using rules X1 - X8 |
| 292 | func (p *paragraph) determineExplicitEmbeddingLevels() { |
| 293 | var stack directionalStatusStack |
| 294 | var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int |
| 295 | |
| 296 | // Rule X1. |
| 297 | stack.push(p.embeddingLevel, ON, false) |
| 298 | |
| 299 | for i, t := range p.resultTypes { |
| 300 | // Rules X2, X3, X4, X5, X5a, X5b, X5c |
| 301 | switch t { |
| 302 | case RLE, LRE, RLO, LRO, RLI, LRI, FSI: |
| 303 | isIsolate := t.in(RLI, LRI, FSI) |
| 304 | isRTL := t.in(RLE, RLO, RLI) |
| 305 | |
| 306 | // override if this is an FSI that resolves to RLI |
| 307 | if t == FSI { |
| 308 | isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1) |
| 309 | } |
| 310 | if isIsolate { |
| 311 | p.resultLevels[i] = stack.lastEmbeddingLevel() |
| 312 | if stack.lastDirectionalOverrideStatus() != ON { |
| 313 | p.resultTypes[i] = stack.lastDirectionalOverrideStatus() |
| 314 | } |
| 315 | } |
| 316 | |
| 317 | var newLevel level |
| 318 | if isRTL { |
| 319 | // least greater odd |
| 320 | newLevel = (stack.lastEmbeddingLevel() + 1) | 1 |
| 321 | } else { |
| 322 | // least greater even |
| 323 | newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1 |
| 324 | } |
| 325 | |
| 326 | if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 { |
| 327 | if isIsolate { |
| 328 | validIsolateCount++ |
| 329 | } |
| 330 | // Push new embedding level, override status, and isolated |
| 331 | // status. |
| 332 | // No check for valid stack counter, since the level check |
| 333 | // suffices. |
| 334 | switch t { |
| 335 | case LRO: |
| 336 | stack.push(newLevel, L, isIsolate) |
| 337 | case RLO: |
| 338 | stack.push(newLevel, R, isIsolate) |
| 339 | default: |
| 340 | stack.push(newLevel, ON, isIsolate) |
| 341 | } |
| 342 | // Not really part of the spec |
| 343 | if !isIsolate { |
| 344 | p.resultLevels[i] = newLevel |
| 345 | } |
| 346 | } else { |
| 347 | // This is an invalid explicit formatting character, |
| 348 | // so apply the "Otherwise" part of rules X2-X5b. |
| 349 | if isIsolate { |
| 350 | overflowIsolateCount++ |
| 351 | } else { // !isIsolate |
| 352 | if overflowIsolateCount == 0 { |
| 353 | overflowEmbeddingCount++ |
| 354 | } |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | // Rule X6a |
| 359 | case PDI: |
| 360 | if overflowIsolateCount > 0 { |
| 361 | overflowIsolateCount-- |
| 362 | } else if validIsolateCount == 0 { |
| 363 | // do nothing |
| 364 | } else { |
| 365 | overflowEmbeddingCount = 0 |
| 366 | for !stack.lastDirectionalIsolateStatus() { |
| 367 | stack.pop() |
| 368 | } |
| 369 | stack.pop() |
| 370 | validIsolateCount-- |
| 371 | } |
| 372 | p.resultLevels[i] = stack.lastEmbeddingLevel() |
| 373 | |
| 374 | // Rule X7 |
| 375 | case PDF: |
| 376 | // Not really part of the spec |
| 377 | p.resultLevels[i] = stack.lastEmbeddingLevel() |
| 378 | |
| 379 | if overflowIsolateCount > 0 { |
| 380 | // do nothing |
| 381 | } else if overflowEmbeddingCount > 0 { |
| 382 | overflowEmbeddingCount-- |
| 383 | } else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 { |
| 384 | stack.pop() |
| 385 | } |
| 386 | |
| 387 | case B: // paragraph separator. |
| 388 | // Rule X8. |
| 389 | |
| 390 | // These values are reset for clarity, in this implementation B |
| 391 | // can only occur as the last code in the array. |
| 392 | stack.empty() |
| 393 | overflowIsolateCount = 0 |
| 394 | overflowEmbeddingCount = 0 |
| 395 | validIsolateCount = 0 |
| 396 | p.resultLevels[i] = p.embeddingLevel |
| 397 | |
| 398 | default: |
| 399 | p.resultLevels[i] = stack.lastEmbeddingLevel() |
| 400 | if stack.lastDirectionalOverrideStatus() != ON { |
| 401 | p.resultTypes[i] = stack.lastDirectionalOverrideStatus() |
| 402 | } |
| 403 | } |
| 404 | } |
| 405 | } |
| 406 | |
| 407 | type isolatingRunSequence struct { |
| 408 | p *paragraph |
| 409 | |
| 410 | indexes []int // indexes to the original string |
| 411 | |
| 412 | types []Class // type of each character using the index |
| 413 | resolvedLevels []level // resolved levels after application of rules |
| 414 | level level |
| 415 | sos, eos Class |
| 416 | } |
| 417 | |
| 418 | func (i *isolatingRunSequence) Len() int { return len(i.indexes) } |
| 419 | |
| 420 | func maxLevel(a, b level) level { |
| 421 | if a > b { |
| 422 | return a |
| 423 | } |
| 424 | return b |
| 425 | } |
| 426 | |
| 427 | // Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types, |
| 428 | // either L or R, for each isolating run sequence. |
| 429 | func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence { |
| 430 | length := len(indexes) |
| 431 | types := make([]Class, length) |
| 432 | for i, x := range indexes { |
| 433 | types[i] = p.resultTypes[x] |
| 434 | } |
| 435 | |
| 436 | // assign level, sos and eos |
| 437 | prevChar := indexes[0] - 1 |
| 438 | for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) { |
| 439 | prevChar-- |
| 440 | } |
| 441 | prevLevel := p.embeddingLevel |
| 442 | if prevChar >= 0 { |
| 443 | prevLevel = p.resultLevels[prevChar] |
| 444 | } |
| 445 | |
| 446 | var succLevel level |
| 447 | lastType := types[length-1] |
| 448 | if lastType.in(LRI, RLI, FSI) { |
| 449 | succLevel = p.embeddingLevel |
| 450 | } else { |
| 451 | // the first character after the end of run sequence |
| 452 | limit := indexes[length-1] + 1 |
| 453 | for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ { |
| 454 | |
| 455 | } |
| 456 | succLevel = p.embeddingLevel |
| 457 | if limit < p.Len() { |
| 458 | succLevel = p.resultLevels[limit] |
| 459 | } |
| 460 | } |
| 461 | level := p.resultLevels[indexes[0]] |
| 462 | return &isolatingRunSequence{ |
| 463 | p: p, |
| 464 | indexes: indexes, |
| 465 | types: types, |
| 466 | level: level, |
| 467 | sos: typeForLevel(maxLevel(prevLevel, level)), |
| 468 | eos: typeForLevel(maxLevel(succLevel, level)), |
| 469 | } |
| 470 | } |
| 471 | |
| 472 | // Resolving weak types Rules W1-W7. |
| 473 | // |
| 474 | // Note that some weak types (EN, AN) remain after this processing is |
| 475 | // complete. |
| 476 | func (s *isolatingRunSequence) resolveWeakTypes() { |
| 477 | |
| 478 | // on entry, only these types remain |
| 479 | s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI) |
| 480 | |
| 481 | // Rule W1. |
| 482 | // Changes all NSMs. |
| 483 | preceedingCharacterType := s.sos |
| 484 | for i, t := range s.types { |
| 485 | if t == NSM { |
| 486 | s.types[i] = preceedingCharacterType |
| 487 | } else { |
| 488 | if t.in(LRI, RLI, FSI, PDI) { |
| 489 | preceedingCharacterType = ON |
| 490 | } |
| 491 | preceedingCharacterType = t |
| 492 | } |
| 493 | } |
| 494 | |
| 495 | // Rule W2. |
| 496 | // EN does not change at the start of the run, because sos != AL. |
| 497 | for i, t := range s.types { |
| 498 | if t == EN { |
| 499 | for j := i - 1; j >= 0; j-- { |
| 500 | if t := s.types[j]; t.in(L, R, AL) { |
| 501 | if t == AL { |
| 502 | s.types[i] = AN |
| 503 | } |
| 504 | break |
| 505 | } |
| 506 | } |
| 507 | } |
| 508 | } |
| 509 | |
| 510 | // Rule W3. |
| 511 | for i, t := range s.types { |
| 512 | if t == AL { |
| 513 | s.types[i] = R |
| 514 | } |
| 515 | } |
| 516 | |
| 517 | // Rule W4. |
| 518 | // Since there must be values on both sides for this rule to have an |
| 519 | // effect, the scan skips the first and last value. |
| 520 | // |
| 521 | // Although the scan proceeds left to right, and changes the type |
| 522 | // values in a way that would appear to affect the computations |
| 523 | // later in the scan, there is actually no problem. A change in the |
| 524 | // current value can only affect the value to its immediate right, |
| 525 | // and only affect it if it is ES or CS. But the current value can |
| 526 | // only change if the value to its right is not ES or CS. Thus |
| 527 | // either the current value will not change, or its change will have |
| 528 | // no effect on the remainder of the analysis. |
| 529 | |
| 530 | for i := 1; i < s.Len()-1; i++ { |
| 531 | t := s.types[i] |
| 532 | if t == ES || t == CS { |
| 533 | prevSepType := s.types[i-1] |
| 534 | succSepType := s.types[i+1] |
| 535 | if prevSepType == EN && succSepType == EN { |
| 536 | s.types[i] = EN |
| 537 | } else if s.types[i] == CS && prevSepType == AN && succSepType == AN { |
| 538 | s.types[i] = AN |
| 539 | } |
| 540 | } |
| 541 | } |
| 542 | |
| 543 | // Rule W5. |
| 544 | for i, t := range s.types { |
| 545 | if t == ET { |
| 546 | // locate end of sequence |
| 547 | runStart := i |
| 548 | runEnd := s.findRunLimit(runStart, ET) |
| 549 | |
| 550 | // check values at ends of sequence |
| 551 | t := s.sos |
| 552 | if runStart > 0 { |
| 553 | t = s.types[runStart-1] |
| 554 | } |
| 555 | if t != EN { |
| 556 | t = s.eos |
| 557 | if runEnd < len(s.types) { |
| 558 | t = s.types[runEnd] |
| 559 | } |
| 560 | } |
| 561 | if t == EN { |
| 562 | setTypes(s.types[runStart:runEnd], EN) |
| 563 | } |
| 564 | // continue at end of sequence |
| 565 | i = runEnd |
| 566 | } |
| 567 | } |
| 568 | |
| 569 | // Rule W6. |
| 570 | for i, t := range s.types { |
| 571 | if t.in(ES, ET, CS) { |
| 572 | s.types[i] = ON |
| 573 | } |
| 574 | } |
| 575 | |
| 576 | // Rule W7. |
| 577 | for i, t := range s.types { |
| 578 | if t == EN { |
| 579 | // set default if we reach start of run |
| 580 | prevStrongType := s.sos |
| 581 | for j := i - 1; j >= 0; j-- { |
| 582 | t = s.types[j] |
| 583 | if t == L || t == R { // AL's have been changed to R |
| 584 | prevStrongType = t |
| 585 | break |
| 586 | } |
| 587 | } |
| 588 | if prevStrongType == L { |
| 589 | s.types[i] = L |
| 590 | } |
| 591 | } |
| 592 | } |
| 593 | } |
| 594 | |
| 595 | // 6) resolving neutral types Rules N1-N2. |
| 596 | func (s *isolatingRunSequence) resolveNeutralTypes() { |
| 597 | |
| 598 | // on entry, only these types can be in resultTypes |
| 599 | s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI) |
| 600 | |
| 601 | for i, t := range s.types { |
| 602 | switch t { |
| 603 | case WS, ON, B, S, RLI, LRI, FSI, PDI: |
| 604 | // find bounds of run of neutrals |
| 605 | runStart := i |
| 606 | runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI) |
| 607 | |
| 608 | // determine effective types at ends of run |
| 609 | var leadType, trailType Class |
| 610 | |
| 611 | // Note that the character found can only be L, R, AN, or |
| 612 | // EN. |
| 613 | if runStart == 0 { |
| 614 | leadType = s.sos |
| 615 | } else { |
| 616 | leadType = s.types[runStart-1] |
| 617 | if leadType.in(AN, EN) { |
| 618 | leadType = R |
| 619 | } |
| 620 | } |
| 621 | if runEnd == len(s.types) { |
| 622 | trailType = s.eos |
| 623 | } else { |
| 624 | trailType = s.types[runEnd] |
| 625 | if trailType.in(AN, EN) { |
| 626 | trailType = R |
| 627 | } |
| 628 | } |
| 629 | |
| 630 | var resolvedType Class |
| 631 | if leadType == trailType { |
| 632 | // Rule N1. |
| 633 | resolvedType = leadType |
| 634 | } else { |
| 635 | // Rule N2. |
| 636 | // Notice the embedding level of the run is used, not |
| 637 | // the paragraph embedding level. |
| 638 | resolvedType = typeForLevel(s.level) |
| 639 | } |
| 640 | |
| 641 | setTypes(s.types[runStart:runEnd], resolvedType) |
| 642 | |
| 643 | // skip over run of (former) neutrals |
| 644 | i = runEnd |
| 645 | } |
| 646 | } |
| 647 | } |
| 648 | |
| 649 | func setLevels(levels []level, newLevel level) { |
| 650 | for i := range levels { |
| 651 | levels[i] = newLevel |
| 652 | } |
| 653 | } |
| 654 | |
| 655 | func setTypes(types []Class, newType Class) { |
| 656 | for i := range types { |
| 657 | types[i] = newType |
| 658 | } |
| 659 | } |
| 660 | |
| 661 | // 7) resolving implicit embedding levels Rules I1, I2. |
| 662 | func (s *isolatingRunSequence) resolveImplicitLevels() { |
| 663 | |
| 664 | // on entry, only these types can be in resultTypes |
| 665 | s.assertOnly(L, R, EN, AN) |
| 666 | |
| 667 | s.resolvedLevels = make([]level, len(s.types)) |
| 668 | setLevels(s.resolvedLevels, s.level) |
| 669 | |
| 670 | if (s.level & 1) == 0 { // even level |
| 671 | for i, t := range s.types { |
| 672 | // Rule I1. |
| 673 | if t == L { |
| 674 | // no change |
| 675 | } else if t == R { |
| 676 | s.resolvedLevels[i] += 1 |
| 677 | } else { // t == AN || t == EN |
| 678 | s.resolvedLevels[i] += 2 |
| 679 | } |
| 680 | } |
| 681 | } else { // odd level |
| 682 | for i, t := range s.types { |
| 683 | // Rule I2. |
| 684 | if t == R { |
| 685 | // no change |
| 686 | } else { // t == L || t == AN || t == EN |
| 687 | s.resolvedLevels[i] += 1 |
| 688 | } |
| 689 | } |
| 690 | } |
| 691 | } |
| 692 | |
| 693 | // Applies the levels and types resolved in rules W1-I2 to the |
| 694 | // resultLevels array. |
| 695 | func (s *isolatingRunSequence) applyLevelsAndTypes() { |
| 696 | for i, x := range s.indexes { |
| 697 | s.p.resultTypes[x] = s.types[i] |
| 698 | s.p.resultLevels[x] = s.resolvedLevels[i] |
| 699 | } |
| 700 | } |
| 701 | |
| 702 | // Return the limit of the run consisting only of the types in validSet |
| 703 | // starting at index. This checks the value at index, and will return |
| 704 | // index if that value is not in validSet. |
| 705 | func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int { |
| 706 | loop: |
| 707 | for ; index < len(s.types); index++ { |
| 708 | t := s.types[index] |
| 709 | for _, valid := range validSet { |
| 710 | if t == valid { |
| 711 | continue loop |
| 712 | } |
| 713 | } |
| 714 | return index // didn't find a match in validSet |
| 715 | } |
| 716 | return len(s.types) |
| 717 | } |
| 718 | |
| 719 | // Algorithm validation. Assert that all values in types are in the |
| 720 | // provided set. |
| 721 | func (s *isolatingRunSequence) assertOnly(codes ...Class) { |
| 722 | loop: |
| 723 | for i, t := range s.types { |
| 724 | for _, c := range codes { |
| 725 | if t == c { |
| 726 | continue loop |
| 727 | } |
| 728 | } |
| 729 | log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i]) |
| 730 | } |
| 731 | } |
| 732 | |
| 733 | // determineLevelRuns returns an array of level runs. Each level run is |
| 734 | // described as an array of indexes into the input string. |
| 735 | // |
| 736 | // Determines the level runs. Rule X9 will be applied in determining the |
| 737 | // runs, in the way that makes sure the characters that are supposed to be |
| 738 | // removed are not included in the runs. |
| 739 | func (p *paragraph) determineLevelRuns() [][]int { |
| 740 | run := []int{} |
| 741 | allRuns := [][]int{} |
| 742 | currentLevel := implicitLevel |
| 743 | |
| 744 | for i := range p.initialTypes { |
| 745 | if !isRemovedByX9(p.initialTypes[i]) { |
| 746 | if p.resultLevels[i] != currentLevel { |
| 747 | // we just encountered a new run; wrap up last run |
| 748 | if currentLevel >= 0 { // only wrap it up if there was a run |
| 749 | allRuns = append(allRuns, run) |
| 750 | run = nil |
| 751 | } |
| 752 | // Start new run |
| 753 | currentLevel = p.resultLevels[i] |
| 754 | } |
| 755 | run = append(run, i) |
| 756 | } |
| 757 | } |
| 758 | // Wrap up the final run, if any |
| 759 | if len(run) > 0 { |
| 760 | allRuns = append(allRuns, run) |
| 761 | } |
| 762 | return allRuns |
| 763 | } |
| 764 | |
| 765 | // Definition BD13. Determine isolating run sequences. |
| 766 | func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence { |
| 767 | levelRuns := p.determineLevelRuns() |
| 768 | |
| 769 | // Compute the run that each character belongs to |
| 770 | runForCharacter := make([]int, p.Len()) |
| 771 | for i, run := range levelRuns { |
| 772 | for _, index := range run { |
| 773 | runForCharacter[index] = i |
| 774 | } |
| 775 | } |
| 776 | |
| 777 | sequences := []*isolatingRunSequence{} |
| 778 | |
| 779 | var currentRunSequence []int |
| 780 | |
| 781 | for _, run := range levelRuns { |
| 782 | first := run[0] |
| 783 | if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 { |
| 784 | currentRunSequence = nil |
| 785 | // int run = i; |
| 786 | for { |
| 787 | // Copy this level run into currentRunSequence |
| 788 | currentRunSequence = append(currentRunSequence, run...) |
| 789 | |
| 790 | last := currentRunSequence[len(currentRunSequence)-1] |
| 791 | lastT := p.initialTypes[last] |
| 792 | if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() { |
| 793 | run = levelRuns[runForCharacter[p.matchingPDI[last]]] |
| 794 | } else { |
| 795 | break |
| 796 | } |
| 797 | } |
| 798 | sequences = append(sequences, p.isolatingRunSequence(currentRunSequence)) |
| 799 | } |
| 800 | } |
| 801 | return sequences |
| 802 | } |
| 803 | |
| 804 | // Assign level information to characters removed by rule X9. This is for |
| 805 | // ease of relating the level information to the original input data. Note |
| 806 | // that the levels assigned to these codes are arbitrary, they're chosen so |
| 807 | // as to avoid breaking level runs. |
| 808 | func (p *paragraph) assignLevelsToCharactersRemovedByX9() { |
| 809 | for i, t := range p.initialTypes { |
| 810 | if t.in(LRE, RLE, LRO, RLO, PDF, BN) { |
| 811 | p.resultTypes[i] = t |
| 812 | p.resultLevels[i] = -1 |
| 813 | } |
| 814 | } |
| 815 | // now propagate forward the levels information (could have |
| 816 | // propagated backward, the main thing is not to introduce a level |
| 817 | // break where one doesn't already exist). |
| 818 | |
| 819 | if p.resultLevels[0] == -1 { |
| 820 | p.resultLevels[0] = p.embeddingLevel |
| 821 | } |
| 822 | for i := 1; i < len(p.initialTypes); i++ { |
| 823 | if p.resultLevels[i] == -1 { |
| 824 | p.resultLevels[i] = p.resultLevels[i-1] |
| 825 | } |
| 826 | } |
| 827 | // Embedding information is for informational purposes only so need not be |
| 828 | // adjusted. |
| 829 | } |
| 830 | |
| 831 | // |
| 832 | // Output |
| 833 | // |
| 834 | |
| 835 | // getLevels computes levels array breaking lines at offsets in linebreaks. |
| 836 | // Rule L1. |
| 837 | // |
| 838 | // The linebreaks array must include at least one value. The values must be |
| 839 | // in strictly increasing order (no duplicates) between 1 and the length of |
| 840 | // the text, inclusive. The last value must be the length of the text. |
| 841 | func (p *paragraph) getLevels(linebreaks []int) []level { |
| 842 | // Note that since the previous processing has removed all |
| 843 | // P, S, and WS values from resultTypes, the values referred to |
| 844 | // in these rules are the initial types, before any processing |
| 845 | // has been applied (including processing of overrides). |
| 846 | // |
| 847 | // This example implementation has reinserted explicit format codes |
| 848 | // and BN, in order that the levels array correspond to the |
| 849 | // initial text. Their final placement is not normative. |
| 850 | // These codes are treated like WS in this implementation, |
| 851 | // so they don't interrupt sequences of WS. |
| 852 | |
| 853 | validateLineBreaks(linebreaks, p.Len()) |
| 854 | |
| 855 | result := append([]level(nil), p.resultLevels...) |
| 856 | |
| 857 | // don't worry about linebreaks since if there is a break within |
| 858 | // a series of WS values preceding S, the linebreak itself |
| 859 | // causes the reset. |
| 860 | for i, t := range p.initialTypes { |
| 861 | if t.in(B, S) { |
| 862 | // Rule L1, clauses one and two. |
| 863 | result[i] = p.embeddingLevel |
| 864 | |
| 865 | // Rule L1, clause three. |
| 866 | for j := i - 1; j >= 0; j-- { |
| 867 | if isWhitespace(p.initialTypes[j]) { // including format codes |
| 868 | result[j] = p.embeddingLevel |
| 869 | } else { |
| 870 | break |
| 871 | } |
| 872 | } |
| 873 | } |
| 874 | } |
| 875 | |
| 876 | // Rule L1, clause four. |
| 877 | start := 0 |
| 878 | for _, limit := range linebreaks { |
| 879 | for j := limit - 1; j >= start; j-- { |
| 880 | if isWhitespace(p.initialTypes[j]) { // including format codes |
| 881 | result[j] = p.embeddingLevel |
| 882 | } else { |
| 883 | break |
| 884 | } |
| 885 | } |
| 886 | start = limit |
| 887 | } |
| 888 | |
| 889 | return result |
| 890 | } |
| 891 | |
| 892 | // getReordering returns the reordering of lines from a visual index to a |
| 893 | // logical index for line breaks at the given offsets. |
| 894 | // |
| 895 | // Lines are concatenated from left to right. So for example, the fifth |
| 896 | // character from the left on the third line is |
| 897 | // |
| 898 | // getReordering(linebreaks)[linebreaks[1] + 4] |
| 899 | // |
| 900 | // (linebreaks[1] is the position after the last character of the second |
| 901 | // line, which is also the index of the first character on the third line, |
| 902 | // and adding four gets the fifth character from the left). |
| 903 | // |
| 904 | // The linebreaks array must include at least one value. The values must be |
| 905 | // in strictly increasing order (no duplicates) between 1 and the length of |
| 906 | // the text, inclusive. The last value must be the length of the text. |
| 907 | func (p *paragraph) getReordering(linebreaks []int) []int { |
| 908 | validateLineBreaks(linebreaks, p.Len()) |
| 909 | |
| 910 | return computeMultilineReordering(p.getLevels(linebreaks), linebreaks) |
| 911 | } |
| 912 | |
| 913 | // Return multiline reordering array for a given level array. Reordering |
| 914 | // does not occur across a line break. |
| 915 | func computeMultilineReordering(levels []level, linebreaks []int) []int { |
| 916 | result := make([]int, len(levels)) |
| 917 | |
| 918 | start := 0 |
| 919 | for _, limit := range linebreaks { |
| 920 | tempLevels := make([]level, limit-start) |
| 921 | copy(tempLevels, levels[start:]) |
| 922 | |
| 923 | for j, order := range computeReordering(tempLevels) { |
| 924 | result[start+j] = order + start |
| 925 | } |
| 926 | start = limit |
| 927 | } |
| 928 | return result |
| 929 | } |
| 930 | |
| 931 | // Return reordering array for a given level array. This reorders a single |
| 932 | // line. The reordering is a visual to logical map. For example, the |
| 933 | // leftmost char is string.charAt(order[0]). Rule L2. |
| 934 | func computeReordering(levels []level) []int { |
| 935 | result := make([]int, len(levels)) |
| 936 | // initialize order |
| 937 | for i := range result { |
| 938 | result[i] = i |
| 939 | } |
| 940 | |
| 941 | // locate highest level found on line. |
| 942 | // Note the rules say text, but no reordering across line bounds is |
| 943 | // performed, so this is sufficient. |
| 944 | highestLevel := level(0) |
| 945 | lowestOddLevel := level(maxDepth + 2) |
| 946 | for _, level := range levels { |
| 947 | if level > highestLevel { |
| 948 | highestLevel = level |
| 949 | } |
| 950 | if level&1 != 0 && level < lowestOddLevel { |
| 951 | lowestOddLevel = level |
| 952 | } |
| 953 | } |
| 954 | |
| 955 | for level := highestLevel; level >= lowestOddLevel; level-- { |
| 956 | for i := 0; i < len(levels); i++ { |
| 957 | if levels[i] >= level { |
| 958 | // find range of text at or above this level |
| 959 | start := i |
| 960 | limit := i + 1 |
| 961 | for limit < len(levels) && levels[limit] >= level { |
| 962 | limit++ |
| 963 | } |
| 964 | |
| 965 | for j, k := start, limit-1; j < k; j, k = j+1, k-1 { |
| 966 | result[j], result[k] = result[k], result[j] |
| 967 | } |
| 968 | // skip to end of level run |
| 969 | i = limit |
| 970 | } |
| 971 | } |
| 972 | } |
| 973 | |
| 974 | return result |
| 975 | } |
| 976 | |
| 977 | // isWhitespace reports whether the type is considered a whitespace type for the |
| 978 | // line break rules. |
| 979 | func isWhitespace(c Class) bool { |
| 980 | switch c { |
| 981 | case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS: |
| 982 | return true |
| 983 | } |
| 984 | return false |
| 985 | } |
| 986 | |
| 987 | // isRemovedByX9 reports whether the type is one of the types removed in X9. |
| 988 | func isRemovedByX9(c Class) bool { |
| 989 | switch c { |
| 990 | case LRE, RLE, LRO, RLO, PDF, BN: |
| 991 | return true |
| 992 | } |
| 993 | return false |
| 994 | } |
| 995 | |
| 996 | // typeForLevel reports the strong type (L or R) corresponding to the level. |
| 997 | func typeForLevel(level level) Class { |
| 998 | if (level & 0x1) == 0 { |
| 999 | return L |
| 1000 | } |
| 1001 | return R |
| 1002 | } |
| 1003 | |
| 1004 | // TODO: change validation to not panic |
| 1005 | |
| 1006 | func validateTypes(types []Class) { |
| 1007 | if len(types) == 0 { |
| 1008 | log.Panic("types is null") |
| 1009 | } |
| 1010 | for i, t := range types[:len(types)-1] { |
| 1011 | if t == B { |
| 1012 | log.Panicf("B type before end of paragraph at index: %d", i) |
| 1013 | } |
| 1014 | } |
| 1015 | } |
| 1016 | |
| 1017 | func validateParagraphEmbeddingLevel(embeddingLevel level) { |
| 1018 | if embeddingLevel != implicitLevel && |
| 1019 | embeddingLevel != 0 && |
| 1020 | embeddingLevel != 1 { |
| 1021 | log.Panicf("illegal paragraph embedding level: %d", embeddingLevel) |
| 1022 | } |
| 1023 | } |
| 1024 | |
// validateLineBreaks panics unless linebreaks is strictly increasing,
// starts above zero, and ends exactly at textLength.
func validateLineBreaks(linebreaks []int, textLength int) {
	last := 0
	for i := 0; i < len(linebreaks); i++ {
		cur := linebreaks[i]
		if cur <= last {
			log.Panicf("bad linebreak: %d at index: %d", cur, i)
		}
		last = cur
	}
	// With no entries at all, last is still 0 here.
	if last != textLength {
		log.Panicf("last linebreak was %d, want %d", last, textLength)
	}
}
| 1037 | |
| 1038 | func validatePbTypes(pairTypes []bracketType) { |
| 1039 | if len(pairTypes) == 0 { |
| 1040 | log.Panic("pairTypes is null") |
| 1041 | } |
| 1042 | for i, pt := range pairTypes { |
| 1043 | switch pt { |
| 1044 | case bpNone, bpOpen, bpClose: |
| 1045 | default: |
| 1046 | log.Panicf("illegal pairType value at %d: %v", i, pairTypes[i]) |
| 1047 | } |
| 1048 | } |
| 1049 | } |
| 1050 | |
| 1051 | func validatePbValues(pairValues []rune, pairTypes []bracketType) { |
| 1052 | if pairValues == nil { |
| 1053 | log.Panic("pairValues is null") |
| 1054 | } |
| 1055 | if len(pairTypes) != len(pairValues) { |
| 1056 | log.Panic("pairTypes is different length from pairValues") |
| 1057 | } |
| 1058 | } |