blob: e4c0811016c2acd3f54b0ec93d59f84f8cbba190 [file] [log] [blame]
khenaidooac637102019-01-14 15:44:34 -05001// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package bidi
6
khenaidood948f772021-08-11 17:49:24 -04007import (
8 "fmt"
9 "log"
10)
khenaidooac637102019-01-14 15:44:34 -050011
12// This implementation is a port based on the reference implementation found at:
Scott Baker8461e152019-10-01 14:44:30 -070013// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/
khenaidooac637102019-01-14 15:44:34 -050014//
15// described in Unicode Bidirectional Algorithm (UAX #9).
16//
17// Input:
18// There are two levels of input to the algorithm, since clients may prefer to
19// supply some information from out-of-band sources rather than relying on the
20// default behavior.
21//
22// - Bidi class array
23// - Bidi class array, with externally supplied base line direction
24//
25// Output:
26// Output is separated into several stages:
27//
28// - levels array over entire paragraph
29// - reordering array over entire paragraph
30// - levels array over line
31// - reordering array over line
32//
33// Note that for conformance to the Unicode Bidirectional Algorithm,
34// implementations are only required to generate correct reordering and
35// character directionality (odd or even levels) over a line. Generating
36// identical level arrays over a line is not required. Bidi explicit format
37// codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and
38// positions as long as the rest of the input is properly reordered.
39//
40// As the algorithm is defined to operate on a single paragraph at a time, this
41// implementation is written to handle single paragraphs. Thus rule P1 is
42// presumed by this implementation-- the data provided to the implementation is
43// assumed to be a single paragraph, and either contains no 'B' codes, or a
44// single 'B' code at the end of the input. 'B' is allowed as input to
45// illustrate how the algorithm assigns it a level.
46//
47// Also note that rules L3 and L4 depend on the rendering engine that uses the
48// result of the bidi algorithm. This implementation assumes that the rendering
49// engine expects combining marks in visual order (e.g. to the left of their
50// base character in RTL runs) and that it adjusts the glyphs used to render
51// mirrored characters that are in RTL runs so that they render appropriately.
52
53// level is the embedding level of a character. Even embedding levels indicate
54// left-to-right order and odd levels indicate right-to-left order. The special
55// level of -1 is reserved for undefined order.
56type level int8
57
58const implicitLevel level = -1
59
60// in returns if x is equal to any of the values in set.
61func (c Class) in(set ...Class) bool {
62 for _, s := range set {
63 if c == s {
64 return true
65 }
66 }
67 return false
68}
69
70// A paragraph contains the state of a paragraph.
71type paragraph struct {
72 initialTypes []Class
73
74 // Arrays of properties needed for paired bracket evaluation in N0
75 pairTypes []bracketType // paired Bracket types for paragraph
76 pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone
77
78 embeddingLevel level // default: = implicitLevel;
79
80 // at the paragraph levels
81 resultTypes []Class
82 resultLevels []level
83
84 // Index of matching PDI for isolate initiator characters. For other
85 // characters, the value of matchingPDI will be set to -1. For isolate
86 // initiators with no matching PDI, matchingPDI will be set to the length of
87 // the input string.
88 matchingPDI []int
89
90 // Index of matching isolate initiator for PDI characters. For other
91 // characters, and for PDIs with no matching isolate initiator, the value of
92 // matchingIsolateInitiator will be set to -1.
93 matchingIsolateInitiator []int
94}
95
96// newParagraph initializes a paragraph. The user needs to supply a few arrays
97// corresponding to the preprocessed text input. The types correspond to the
98// Unicode BiDi classes for each rune. pairTypes indicates the bracket type for
99// each rune. pairValues provides a unique bracket class identifier for each
100// rune (suggested is the rune of the open bracket for opening and matching
101// close brackets, after normalization). The embedding levels are optional, but
102// may be supplied to encode embedding levels of styled text.
khenaidood948f772021-08-11 17:49:24 -0400103func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) (*paragraph, error) {
104 var err error
105 if err = validateTypes(types); err != nil {
106 return nil, err
107 }
108 if err = validatePbTypes(pairTypes); err != nil {
109 return nil, err
110 }
111 if err = validatePbValues(pairValues, pairTypes); err != nil {
112 return nil, err
113 }
114 if err = validateParagraphEmbeddingLevel(levels); err != nil {
115 return nil, err
116 }
khenaidooac637102019-01-14 15:44:34 -0500117
118 p := &paragraph{
119 initialTypes: append([]Class(nil), types...),
120 embeddingLevel: levels,
121
122 pairTypes: pairTypes,
123 pairValues: pairValues,
124
125 resultTypes: append([]Class(nil), types...),
126 }
127 p.run()
khenaidood948f772021-08-11 17:49:24 -0400128 return p, nil
khenaidooac637102019-01-14 15:44:34 -0500129}
130
131func (p *paragraph) Len() int { return len(p.initialTypes) }
132
133// The algorithm. Does not include line-based processing (Rules L1, L2).
134// These are applied later in the line-based phase of the algorithm.
135func (p *paragraph) run() {
136 p.determineMatchingIsolates()
137
138 // 1) determining the paragraph level
139 // Rule P1 is the requirement for entering this algorithm.
140 // Rules P2, P3.
141 // If no externally supplied paragraph embedding level, use default.
142 if p.embeddingLevel == implicitLevel {
143 p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len())
144 }
145
146 // Initialize result levels to paragraph embedding level.
147 p.resultLevels = make([]level, p.Len())
148 setLevels(p.resultLevels, p.embeddingLevel)
149
150 // 2) Explicit levels and directions
151 // Rules X1-X8.
152 p.determineExplicitEmbeddingLevels()
153
154 // Rule X9.
155 // We do not remove the embeddings, the overrides, the PDFs, and the BNs
156 // from the string explicitly. But they are not copied into isolating run
157 // sequences when they are created, so they are removed for all
158 // practical purposes.
159
160 // Rule X10.
161 // Run remainder of algorithm one isolating run sequence at a time
162 for _, seq := range p.determineIsolatingRunSequences() {
163 // 3) resolving weak types
164 // Rules W1-W7.
165 seq.resolveWeakTypes()
166
167 // 4a) resolving paired brackets
168 // Rule N0
169 resolvePairedBrackets(seq)
170
171 // 4b) resolving neutral types
172 // Rules N1-N3.
173 seq.resolveNeutralTypes()
174
175 // 5) resolving implicit embedding levels
176 // Rules I1, I2.
177 seq.resolveImplicitLevels()
178
179 // Apply the computed levels and types
180 seq.applyLevelsAndTypes()
181 }
182
183 // Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and
184 // BNs. This is for convenience, so the resulting level array will have
185 // a value for every character.
186 p.assignLevelsToCharactersRemovedByX9()
187}
188
189// determineMatchingIsolates determines the matching PDI for each isolate
190// initiator and vice versa.
191//
192// Definition BD9.
193//
194// At the end of this function:
195//
196// - The member variable matchingPDI is set to point to the index of the
197// matching PDI character for each isolate initiator character. If there is
198// no matching PDI, it is set to the length of the input text. For other
199// characters, it is set to -1.
200// - The member variable matchingIsolateInitiator is set to point to the
201// index of the matching isolate initiator character for each PDI character.
202// If there is no matching isolate initiator, or the character is not a PDI,
203// it is set to -1.
204func (p *paragraph) determineMatchingIsolates() {
205 p.matchingPDI = make([]int, p.Len())
206 p.matchingIsolateInitiator = make([]int, p.Len())
207
208 for i := range p.matchingIsolateInitiator {
209 p.matchingIsolateInitiator[i] = -1
210 }
211
212 for i := range p.matchingPDI {
213 p.matchingPDI[i] = -1
214
215 if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) {
216 depthCounter := 1
217 for j := i + 1; j < p.Len(); j++ {
218 if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) {
219 depthCounter++
220 } else if u == PDI {
221 if depthCounter--; depthCounter == 0 {
222 p.matchingPDI[i] = j
223 p.matchingIsolateInitiator[j] = i
224 break
225 }
226 }
227 }
228 if p.matchingPDI[i] == -1 {
229 p.matchingPDI[i] = p.Len()
230 }
231 }
232 }
233}
234
235// determineParagraphEmbeddingLevel reports the resolved paragraph direction of
236// the substring limited by the given range [start, end).
237//
238// Determines the paragraph level based on rules P2, P3. This is also used
239// in rule X5c to find if an FSI should resolve to LRI or RLI.
240func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level {
241 var strongType Class = unknownClass
242
243 // Rule P2.
244 for i := start; i < end; i++ {
245 if t := p.resultTypes[i]; t.in(L, AL, R) {
246 strongType = t
247 break
248 } else if t.in(FSI, LRI, RLI) {
249 i = p.matchingPDI[i] // skip over to the matching PDI
250 if i > end {
251 log.Panic("assert (i <= end)")
252 }
253 }
254 }
255 // Rule P3.
256 switch strongType {
257 case unknownClass: // none found
258 // default embedding level when no strong types found is 0.
259 return 0
260 case L:
261 return 0
262 default: // AL, R
263 return 1
264 }
265}
266
267const maxDepth = 125
268
269// This stack will store the embedding levels and override and isolated
270// statuses
271type directionalStatusStack struct {
272 stackCounter int
273 embeddingLevelStack [maxDepth + 1]level
274 overrideStatusStack [maxDepth + 1]Class
275 isolateStatusStack [maxDepth + 1]bool
276}
277
278func (s *directionalStatusStack) empty() { s.stackCounter = 0 }
279func (s *directionalStatusStack) pop() { s.stackCounter-- }
280func (s *directionalStatusStack) depth() int { return s.stackCounter }
281
282func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) {
283 s.embeddingLevelStack[s.stackCounter] = level
284 s.overrideStatusStack[s.stackCounter] = overrideStatus
285 s.isolateStatusStack[s.stackCounter] = isolateStatus
286 s.stackCounter++
287}
288
289func (s *directionalStatusStack) lastEmbeddingLevel() level {
290 return s.embeddingLevelStack[s.stackCounter-1]
291}
292
293func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class {
294 return s.overrideStatusStack[s.stackCounter-1]
295}
296
297func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool {
298 return s.isolateStatusStack[s.stackCounter-1]
299}
300
301// Determine explicit levels using rules X1 - X8
302func (p *paragraph) determineExplicitEmbeddingLevels() {
303 var stack directionalStatusStack
304 var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int
305
306 // Rule X1.
307 stack.push(p.embeddingLevel, ON, false)
308
309 for i, t := range p.resultTypes {
310 // Rules X2, X3, X4, X5, X5a, X5b, X5c
311 switch t {
312 case RLE, LRE, RLO, LRO, RLI, LRI, FSI:
313 isIsolate := t.in(RLI, LRI, FSI)
314 isRTL := t.in(RLE, RLO, RLI)
315
316 // override if this is an FSI that resolves to RLI
317 if t == FSI {
318 isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1)
319 }
320 if isIsolate {
321 p.resultLevels[i] = stack.lastEmbeddingLevel()
322 if stack.lastDirectionalOverrideStatus() != ON {
323 p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
324 }
325 }
326
327 var newLevel level
328 if isRTL {
329 // least greater odd
330 newLevel = (stack.lastEmbeddingLevel() + 1) | 1
331 } else {
332 // least greater even
333 newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1
334 }
335
336 if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 {
337 if isIsolate {
338 validIsolateCount++
339 }
340 // Push new embedding level, override status, and isolated
341 // status.
342 // No check for valid stack counter, since the level check
343 // suffices.
344 switch t {
345 case LRO:
346 stack.push(newLevel, L, isIsolate)
347 case RLO:
348 stack.push(newLevel, R, isIsolate)
349 default:
350 stack.push(newLevel, ON, isIsolate)
351 }
352 // Not really part of the spec
353 if !isIsolate {
354 p.resultLevels[i] = newLevel
355 }
356 } else {
357 // This is an invalid explicit formatting character,
358 // so apply the "Otherwise" part of rules X2-X5b.
359 if isIsolate {
360 overflowIsolateCount++
361 } else { // !isIsolate
362 if overflowIsolateCount == 0 {
363 overflowEmbeddingCount++
364 }
365 }
366 }
367
368 // Rule X6a
369 case PDI:
370 if overflowIsolateCount > 0 {
371 overflowIsolateCount--
372 } else if validIsolateCount == 0 {
373 // do nothing
374 } else {
375 overflowEmbeddingCount = 0
376 for !stack.lastDirectionalIsolateStatus() {
377 stack.pop()
378 }
379 stack.pop()
380 validIsolateCount--
381 }
382 p.resultLevels[i] = stack.lastEmbeddingLevel()
383
384 // Rule X7
385 case PDF:
386 // Not really part of the spec
387 p.resultLevels[i] = stack.lastEmbeddingLevel()
388
389 if overflowIsolateCount > 0 {
390 // do nothing
391 } else if overflowEmbeddingCount > 0 {
392 overflowEmbeddingCount--
393 } else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 {
394 stack.pop()
395 }
396
397 case B: // paragraph separator.
398 // Rule X8.
399
400 // These values are reset for clarity, in this implementation B
401 // can only occur as the last code in the array.
402 stack.empty()
403 overflowIsolateCount = 0
404 overflowEmbeddingCount = 0
405 validIsolateCount = 0
406 p.resultLevels[i] = p.embeddingLevel
407
408 default:
409 p.resultLevels[i] = stack.lastEmbeddingLevel()
410 if stack.lastDirectionalOverrideStatus() != ON {
411 p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
412 }
413 }
414 }
415}
416
417type isolatingRunSequence struct {
418 p *paragraph
419
420 indexes []int // indexes to the original string
421
422 types []Class // type of each character using the index
423 resolvedLevels []level // resolved levels after application of rules
424 level level
425 sos, eos Class
426}
427
428func (i *isolatingRunSequence) Len() int { return len(i.indexes) }
429
430func maxLevel(a, b level) level {
431 if a > b {
432 return a
433 }
434 return b
435}
436
437// Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types,
438// either L or R, for each isolating run sequence.
439func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence {
440 length := len(indexes)
441 types := make([]Class, length)
442 for i, x := range indexes {
443 types[i] = p.resultTypes[x]
444 }
445
446 // assign level, sos and eos
447 prevChar := indexes[0] - 1
448 for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) {
449 prevChar--
450 }
451 prevLevel := p.embeddingLevel
452 if prevChar >= 0 {
453 prevLevel = p.resultLevels[prevChar]
454 }
455
456 var succLevel level
457 lastType := types[length-1]
458 if lastType.in(LRI, RLI, FSI) {
459 succLevel = p.embeddingLevel
460 } else {
461 // the first character after the end of run sequence
462 limit := indexes[length-1] + 1
463 for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ {
464
465 }
466 succLevel = p.embeddingLevel
467 if limit < p.Len() {
468 succLevel = p.resultLevels[limit]
469 }
470 }
471 level := p.resultLevels[indexes[0]]
472 return &isolatingRunSequence{
473 p: p,
474 indexes: indexes,
475 types: types,
476 level: level,
477 sos: typeForLevel(maxLevel(prevLevel, level)),
478 eos: typeForLevel(maxLevel(succLevel, level)),
479 }
480}
481
482// Resolving weak types Rules W1-W7.
483//
484// Note that some weak types (EN, AN) remain after this processing is
485// complete.
486func (s *isolatingRunSequence) resolveWeakTypes() {
487
488 // on entry, only these types remain
489 s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI)
490
491 // Rule W1.
492 // Changes all NSMs.
Andrea Campanella3614a922021-02-25 12:40:42 +0100493 precedingCharacterType := s.sos
khenaidooac637102019-01-14 15:44:34 -0500494 for i, t := range s.types {
495 if t == NSM {
Andrea Campanella3614a922021-02-25 12:40:42 +0100496 s.types[i] = precedingCharacterType
khenaidooac637102019-01-14 15:44:34 -0500497 } else {
498 if t.in(LRI, RLI, FSI, PDI) {
Andrea Campanella3614a922021-02-25 12:40:42 +0100499 precedingCharacterType = ON
khenaidooac637102019-01-14 15:44:34 -0500500 }
Andrea Campanella3614a922021-02-25 12:40:42 +0100501 precedingCharacterType = t
khenaidooac637102019-01-14 15:44:34 -0500502 }
503 }
504
505 // Rule W2.
506 // EN does not change at the start of the run, because sos != AL.
507 for i, t := range s.types {
508 if t == EN {
509 for j := i - 1; j >= 0; j-- {
510 if t := s.types[j]; t.in(L, R, AL) {
511 if t == AL {
512 s.types[i] = AN
513 }
514 break
515 }
516 }
517 }
518 }
519
520 // Rule W3.
521 for i, t := range s.types {
522 if t == AL {
523 s.types[i] = R
524 }
525 }
526
527 // Rule W4.
528 // Since there must be values on both sides for this rule to have an
529 // effect, the scan skips the first and last value.
530 //
531 // Although the scan proceeds left to right, and changes the type
532 // values in a way that would appear to affect the computations
533 // later in the scan, there is actually no problem. A change in the
534 // current value can only affect the value to its immediate right,
535 // and only affect it if it is ES or CS. But the current value can
536 // only change if the value to its right is not ES or CS. Thus
537 // either the current value will not change, or its change will have
538 // no effect on the remainder of the analysis.
539
540 for i := 1; i < s.Len()-1; i++ {
541 t := s.types[i]
542 if t == ES || t == CS {
543 prevSepType := s.types[i-1]
544 succSepType := s.types[i+1]
545 if prevSepType == EN && succSepType == EN {
546 s.types[i] = EN
547 } else if s.types[i] == CS && prevSepType == AN && succSepType == AN {
548 s.types[i] = AN
549 }
550 }
551 }
552
553 // Rule W5.
554 for i, t := range s.types {
555 if t == ET {
556 // locate end of sequence
557 runStart := i
558 runEnd := s.findRunLimit(runStart, ET)
559
560 // check values at ends of sequence
561 t := s.sos
562 if runStart > 0 {
563 t = s.types[runStart-1]
564 }
565 if t != EN {
566 t = s.eos
567 if runEnd < len(s.types) {
568 t = s.types[runEnd]
569 }
570 }
571 if t == EN {
572 setTypes(s.types[runStart:runEnd], EN)
573 }
574 // continue at end of sequence
575 i = runEnd
576 }
577 }
578
579 // Rule W6.
580 for i, t := range s.types {
581 if t.in(ES, ET, CS) {
582 s.types[i] = ON
583 }
584 }
585
586 // Rule W7.
587 for i, t := range s.types {
588 if t == EN {
589 // set default if we reach start of run
590 prevStrongType := s.sos
591 for j := i - 1; j >= 0; j-- {
592 t = s.types[j]
593 if t == L || t == R { // AL's have been changed to R
594 prevStrongType = t
595 break
596 }
597 }
598 if prevStrongType == L {
599 s.types[i] = L
600 }
601 }
602 }
603}
604
605// 6) resolving neutral types Rules N1-N2.
606func (s *isolatingRunSequence) resolveNeutralTypes() {
607
608 // on entry, only these types can be in resultTypes
609 s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI)
610
611 for i, t := range s.types {
612 switch t {
613 case WS, ON, B, S, RLI, LRI, FSI, PDI:
614 // find bounds of run of neutrals
615 runStart := i
616 runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI)
617
618 // determine effective types at ends of run
619 var leadType, trailType Class
620
621 // Note that the character found can only be L, R, AN, or
622 // EN.
623 if runStart == 0 {
624 leadType = s.sos
625 } else {
626 leadType = s.types[runStart-1]
627 if leadType.in(AN, EN) {
628 leadType = R
629 }
630 }
631 if runEnd == len(s.types) {
632 trailType = s.eos
633 } else {
634 trailType = s.types[runEnd]
635 if trailType.in(AN, EN) {
636 trailType = R
637 }
638 }
639
640 var resolvedType Class
641 if leadType == trailType {
642 // Rule N1.
643 resolvedType = leadType
644 } else {
645 // Rule N2.
646 // Notice the embedding level of the run is used, not
647 // the paragraph embedding level.
648 resolvedType = typeForLevel(s.level)
649 }
650
651 setTypes(s.types[runStart:runEnd], resolvedType)
652
653 // skip over run of (former) neutrals
654 i = runEnd
655 }
656 }
657}
658
659func setLevels(levels []level, newLevel level) {
660 for i := range levels {
661 levels[i] = newLevel
662 }
663}
664
665func setTypes(types []Class, newType Class) {
666 for i := range types {
667 types[i] = newType
668 }
669}
670
671// 7) resolving implicit embedding levels Rules I1, I2.
672func (s *isolatingRunSequence) resolveImplicitLevels() {
673
674 // on entry, only these types can be in resultTypes
675 s.assertOnly(L, R, EN, AN)
676
677 s.resolvedLevels = make([]level, len(s.types))
678 setLevels(s.resolvedLevels, s.level)
679
680 if (s.level & 1) == 0 { // even level
681 for i, t := range s.types {
682 // Rule I1.
683 if t == L {
684 // no change
685 } else if t == R {
686 s.resolvedLevels[i] += 1
687 } else { // t == AN || t == EN
688 s.resolvedLevels[i] += 2
689 }
690 }
691 } else { // odd level
692 for i, t := range s.types {
693 // Rule I2.
694 if t == R {
695 // no change
696 } else { // t == L || t == AN || t == EN
697 s.resolvedLevels[i] += 1
698 }
699 }
700 }
701}
702
703// Applies the levels and types resolved in rules W1-I2 to the
704// resultLevels array.
705func (s *isolatingRunSequence) applyLevelsAndTypes() {
706 for i, x := range s.indexes {
707 s.p.resultTypes[x] = s.types[i]
708 s.p.resultLevels[x] = s.resolvedLevels[i]
709 }
710}
711
712// Return the limit of the run consisting only of the types in validSet
713// starting at index. This checks the value at index, and will return
714// index if that value is not in validSet.
715func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int {
716loop:
717 for ; index < len(s.types); index++ {
718 t := s.types[index]
719 for _, valid := range validSet {
720 if t == valid {
721 continue loop
722 }
723 }
724 return index // didn't find a match in validSet
725 }
726 return len(s.types)
727}
728
729// Algorithm validation. Assert that all values in types are in the
730// provided set.
731func (s *isolatingRunSequence) assertOnly(codes ...Class) {
732loop:
733 for i, t := range s.types {
734 for _, c := range codes {
735 if t == c {
736 continue loop
737 }
738 }
739 log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i])
740 }
741}
742
743// determineLevelRuns returns an array of level runs. Each level run is
744// described as an array of indexes into the input string.
745//
746// Determines the level runs. Rule X9 will be applied in determining the
747// runs, in the way that makes sure the characters that are supposed to be
748// removed are not included in the runs.
749func (p *paragraph) determineLevelRuns() [][]int {
750 run := []int{}
751 allRuns := [][]int{}
752 currentLevel := implicitLevel
753
754 for i := range p.initialTypes {
755 if !isRemovedByX9(p.initialTypes[i]) {
756 if p.resultLevels[i] != currentLevel {
757 // we just encountered a new run; wrap up last run
758 if currentLevel >= 0 { // only wrap it up if there was a run
759 allRuns = append(allRuns, run)
760 run = nil
761 }
762 // Start new run
763 currentLevel = p.resultLevels[i]
764 }
765 run = append(run, i)
766 }
767 }
768 // Wrap up the final run, if any
769 if len(run) > 0 {
770 allRuns = append(allRuns, run)
771 }
772 return allRuns
773}
774
775// Definition BD13. Determine isolating run sequences.
776func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence {
777 levelRuns := p.determineLevelRuns()
778
779 // Compute the run that each character belongs to
780 runForCharacter := make([]int, p.Len())
781 for i, run := range levelRuns {
782 for _, index := range run {
783 runForCharacter[index] = i
784 }
785 }
786
787 sequences := []*isolatingRunSequence{}
788
789 var currentRunSequence []int
790
791 for _, run := range levelRuns {
792 first := run[0]
793 if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 {
794 currentRunSequence = nil
795 // int run = i;
796 for {
797 // Copy this level run into currentRunSequence
798 currentRunSequence = append(currentRunSequence, run...)
799
800 last := currentRunSequence[len(currentRunSequence)-1]
801 lastT := p.initialTypes[last]
802 if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() {
803 run = levelRuns[runForCharacter[p.matchingPDI[last]]]
804 } else {
805 break
806 }
807 }
808 sequences = append(sequences, p.isolatingRunSequence(currentRunSequence))
809 }
810 }
811 return sequences
812}
813
814// Assign level information to characters removed by rule X9. This is for
815// ease of relating the level information to the original input data. Note
816// that the levels assigned to these codes are arbitrary, they're chosen so
817// as to avoid breaking level runs.
818func (p *paragraph) assignLevelsToCharactersRemovedByX9() {
819 for i, t := range p.initialTypes {
820 if t.in(LRE, RLE, LRO, RLO, PDF, BN) {
821 p.resultTypes[i] = t
822 p.resultLevels[i] = -1
823 }
824 }
825 // now propagate forward the levels information (could have
826 // propagated backward, the main thing is not to introduce a level
827 // break where one doesn't already exist).
828
829 if p.resultLevels[0] == -1 {
830 p.resultLevels[0] = p.embeddingLevel
831 }
832 for i := 1; i < len(p.initialTypes); i++ {
833 if p.resultLevels[i] == -1 {
834 p.resultLevels[i] = p.resultLevels[i-1]
835 }
836 }
837 // Embedding information is for informational purposes only so need not be
838 // adjusted.
839}
840
841//
842// Output
843//
844
845// getLevels computes levels array breaking lines at offsets in linebreaks.
846// Rule L1.
847//
848// The linebreaks array must include at least one value. The values must be
849// in strictly increasing order (no duplicates) between 1 and the length of
850// the text, inclusive. The last value must be the length of the text.
851func (p *paragraph) getLevels(linebreaks []int) []level {
852 // Note that since the previous processing has removed all
853 // P, S, and WS values from resultTypes, the values referred to
854 // in these rules are the initial types, before any processing
855 // has been applied (including processing of overrides).
856 //
857 // This example implementation has reinserted explicit format codes
858 // and BN, in order that the levels array correspond to the
859 // initial text. Their final placement is not normative.
860 // These codes are treated like WS in this implementation,
861 // so they don't interrupt sequences of WS.
862
863 validateLineBreaks(linebreaks, p.Len())
864
865 result := append([]level(nil), p.resultLevels...)
866
867 // don't worry about linebreaks since if there is a break within
868 // a series of WS values preceding S, the linebreak itself
869 // causes the reset.
870 for i, t := range p.initialTypes {
871 if t.in(B, S) {
872 // Rule L1, clauses one and two.
873 result[i] = p.embeddingLevel
874
875 // Rule L1, clause three.
876 for j := i - 1; j >= 0; j-- {
877 if isWhitespace(p.initialTypes[j]) { // including format codes
878 result[j] = p.embeddingLevel
879 } else {
880 break
881 }
882 }
883 }
884 }
885
886 // Rule L1, clause four.
887 start := 0
888 for _, limit := range linebreaks {
889 for j := limit - 1; j >= start; j-- {
890 if isWhitespace(p.initialTypes[j]) { // including format codes
891 result[j] = p.embeddingLevel
892 } else {
893 break
894 }
895 }
896 start = limit
897 }
898
899 return result
900}
901
902// getReordering returns the reordering of lines from a visual index to a
903// logical index for line breaks at the given offsets.
904//
905// Lines are concatenated from left to right. So for example, the fifth
906// character from the left on the third line is
907//
908// getReordering(linebreaks)[linebreaks[1] + 4]
909//
910// (linebreaks[1] is the position after the last character of the second
911// line, which is also the index of the first character on the third line,
912// and adding four gets the fifth character from the left).
913//
914// The linebreaks array must include at least one value. The values must be
915// in strictly increasing order (no duplicates) between 1 and the length of
916// the text, inclusive. The last value must be the length of the text.
917func (p *paragraph) getReordering(linebreaks []int) []int {
918 validateLineBreaks(linebreaks, p.Len())
919
920 return computeMultilineReordering(p.getLevels(linebreaks), linebreaks)
921}
922
923// Return multiline reordering array for a given level array. Reordering
924// does not occur across a line break.
925func computeMultilineReordering(levels []level, linebreaks []int) []int {
926 result := make([]int, len(levels))
927
928 start := 0
929 for _, limit := range linebreaks {
930 tempLevels := make([]level, limit-start)
931 copy(tempLevels, levels[start:])
932
933 for j, order := range computeReordering(tempLevels) {
934 result[start+j] = order + start
935 }
936 start = limit
937 }
938 return result
939}
940
941// Return reordering array for a given level array. This reorders a single
942// line. The reordering is a visual to logical map. For example, the
943// leftmost char is string.charAt(order[0]). Rule L2.
944func computeReordering(levels []level) []int {
945 result := make([]int, len(levels))
946 // initialize order
947 for i := range result {
948 result[i] = i
949 }
950
951 // locate highest level found on line.
952 // Note the rules say text, but no reordering across line bounds is
953 // performed, so this is sufficient.
954 highestLevel := level(0)
955 lowestOddLevel := level(maxDepth + 2)
956 for _, level := range levels {
957 if level > highestLevel {
958 highestLevel = level
959 }
960 if level&1 != 0 && level < lowestOddLevel {
961 lowestOddLevel = level
962 }
963 }
964
965 for level := highestLevel; level >= lowestOddLevel; level-- {
966 for i := 0; i < len(levels); i++ {
967 if levels[i] >= level {
968 // find range of text at or above this level
969 start := i
970 limit := i + 1
971 for limit < len(levels) && levels[limit] >= level {
972 limit++
973 }
974
975 for j, k := start, limit-1; j < k; j, k = j+1, k-1 {
976 result[j], result[k] = result[k], result[j]
977 }
978 // skip to end of level run
979 i = limit
980 }
981 }
982 }
983
984 return result
985}
986
987// isWhitespace reports whether the type is considered a whitespace type for the
988// line break rules.
989func isWhitespace(c Class) bool {
990 switch c {
991 case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS:
992 return true
993 }
994 return false
995}
996
997// isRemovedByX9 reports whether the type is one of the types removed in X9.
998func isRemovedByX9(c Class) bool {
999 switch c {
1000 case LRE, RLE, LRO, RLO, PDF, BN:
1001 return true
1002 }
1003 return false
1004}
1005
1006// typeForLevel reports the strong type (L or R) corresponding to the level.
1007func typeForLevel(level level) Class {
1008 if (level & 0x1) == 0 {
1009 return L
1010 }
1011 return R
1012}
1013
khenaidood948f772021-08-11 17:49:24 -04001014func validateTypes(types []Class) error {
khenaidooac637102019-01-14 15:44:34 -05001015 if len(types) == 0 {
khenaidood948f772021-08-11 17:49:24 -04001016 return fmt.Errorf("types is null")
khenaidooac637102019-01-14 15:44:34 -05001017 }
1018 for i, t := range types[:len(types)-1] {
1019 if t == B {
khenaidood948f772021-08-11 17:49:24 -04001020 return fmt.Errorf("B type before end of paragraph at index: %d", i)
khenaidooac637102019-01-14 15:44:34 -05001021 }
1022 }
khenaidood948f772021-08-11 17:49:24 -04001023 return nil
khenaidooac637102019-01-14 15:44:34 -05001024}
1025
khenaidood948f772021-08-11 17:49:24 -04001026func validateParagraphEmbeddingLevel(embeddingLevel level) error {
khenaidooac637102019-01-14 15:44:34 -05001027 if embeddingLevel != implicitLevel &&
1028 embeddingLevel != 0 &&
1029 embeddingLevel != 1 {
khenaidood948f772021-08-11 17:49:24 -04001030 return fmt.Errorf("illegal paragraph embedding level: %d", embeddingLevel)
khenaidooac637102019-01-14 15:44:34 -05001031 }
khenaidood948f772021-08-11 17:49:24 -04001032 return nil
khenaidooac637102019-01-14 15:44:34 -05001033}
1034
khenaidood948f772021-08-11 17:49:24 -04001035func validateLineBreaks(linebreaks []int, textLength int) error {
khenaidooac637102019-01-14 15:44:34 -05001036 prev := 0
1037 for i, next := range linebreaks {
1038 if next <= prev {
khenaidood948f772021-08-11 17:49:24 -04001039 return fmt.Errorf("bad linebreak: %d at index: %d", next, i)
khenaidooac637102019-01-14 15:44:34 -05001040 }
1041 prev = next
1042 }
1043 if prev != textLength {
khenaidood948f772021-08-11 17:49:24 -04001044 return fmt.Errorf("last linebreak was %d, want %d", prev, textLength)
khenaidooac637102019-01-14 15:44:34 -05001045 }
khenaidood948f772021-08-11 17:49:24 -04001046 return nil
khenaidooac637102019-01-14 15:44:34 -05001047}
1048
khenaidood948f772021-08-11 17:49:24 -04001049func validatePbTypes(pairTypes []bracketType) error {
khenaidooac637102019-01-14 15:44:34 -05001050 if len(pairTypes) == 0 {
khenaidood948f772021-08-11 17:49:24 -04001051 return fmt.Errorf("pairTypes is null")
khenaidooac637102019-01-14 15:44:34 -05001052 }
1053 for i, pt := range pairTypes {
1054 switch pt {
1055 case bpNone, bpOpen, bpClose:
1056 default:
khenaidood948f772021-08-11 17:49:24 -04001057 return fmt.Errorf("illegal pairType value at %d: %v", i, pairTypes[i])
khenaidooac637102019-01-14 15:44:34 -05001058 }
1059 }
khenaidood948f772021-08-11 17:49:24 -04001060 return nil
khenaidooac637102019-01-14 15:44:34 -05001061}
1062
khenaidood948f772021-08-11 17:49:24 -04001063func validatePbValues(pairValues []rune, pairTypes []bracketType) error {
khenaidooac637102019-01-14 15:44:34 -05001064 if pairValues == nil {
khenaidood948f772021-08-11 17:49:24 -04001065 return fmt.Errorf("pairValues is null")
khenaidooac637102019-01-14 15:44:34 -05001066 }
1067 if len(pairTypes) != len(pairValues) {
khenaidood948f772021-08-11 17:49:24 -04001068 return fmt.Errorf("pairTypes is different length from pairValues")
khenaidooac637102019-01-14 15:44:34 -05001069 }
khenaidood948f772021-08-11 17:49:24 -04001070 return nil
khenaidooac637102019-01-14 15:44:34 -05001071}