Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 1 | // Copyright 2015 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | //go:generate go run gen.go gen_trieval.go gen_ranges.go |
| 6 | |
| 7 | // Package bidi contains functionality for bidirectional text support. |
| 8 | // |
| 9 | // See https://www.unicode.org/reports/tr9. |
| 10 | // |
| 11 | // NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways |
| 12 | // and without notice. |
| 13 | package bidi // import "golang.org/x/text/unicode/bidi" |
| 14 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 15 | // TODO |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 16 | // - Transformer for reordering? |
| 17 | // - Transformer (validator, really) for Bidi Rule. |
| 18 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 19 | import ( |
| 20 | "bytes" |
| 21 | ) |
| 22 | |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 23 | // This API tries to avoid dealing with embedding levels for now. Under the hood |
| 24 | // these will be computed, but the question is to which extent the user should |
| 25 | // know they exist. We should at some point allow the user to specify an |
| 26 | // embedding hierarchy, though. |
| 27 | |
| 28 | // A Direction indicates the overall flow of text. |
| 29 | type Direction int |
| 30 | |
| 31 | const ( |
| 32 | // LeftToRight indicates the text contains no right-to-left characters and |
| 33 | // that either there are some left-to-right characters or the option |
| 34 | // DefaultDirection(LeftToRight) was passed. |
| 35 | LeftToRight Direction = iota |
| 36 | |
| 37 | // RightToLeft indicates the text contains no left-to-right characters and |
| 38 | // that either there are some right-to-left characters or the option |
| 39 | // DefaultDirection(RightToLeft) was passed. |
| 40 | RightToLeft |
| 41 | |
| 42 | // Mixed indicates text contains both left-to-right and right-to-left |
| 43 | // characters. |
| 44 | Mixed |
| 45 | |
| 46 | // Neutral means that text contains no left-to-right and right-to-left |
| 47 | // characters and that no default direction has been set. |
| 48 | Neutral |
| 49 | ) |
| 50 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 51 | type options struct { |
| 52 | defaultDirection Direction |
| 53 | } |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 54 | |
| 55 | // An Option is an option for Bidi processing. |
| 56 | type Option func(*options) |
| 57 | |
| 58 | // ICU allows the user to define embedding levels. This may be used, for example, |
| 59 | // to use hierarchical structure of markup languages to define embeddings. |
| 60 | // The following option may be a way to expose this functionality in this API. |
| 61 | // // LevelFunc sets a function that associates nesting levels with the given text. |
| 62 | // // The levels function will be called with monotonically increasing values for p. |
| 63 | // func LevelFunc(levels func(p int) int) Option { |
| 64 | // panic("unimplemented") |
| 65 | // } |
| 66 | |
| 67 | // DefaultDirection sets the default direction for a Paragraph. The direction is |
| 68 | // overridden if the text contains directional characters. |
| 69 | func DefaultDirection(d Direction) Option { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 70 | return func(opts *options) { |
| 71 | opts.defaultDirection = d |
| 72 | } |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 73 | } |
| 74 | |
| 75 | // A Paragraph holds a single Paragraph for Bidi processing. |
| 76 | type Paragraph struct { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 77 | p []byte |
| 78 | o Ordering |
| 79 | opts []Option |
| 80 | types []Class |
| 81 | pairTypes []bracketType |
| 82 | pairValues []rune |
| 83 | runes []rune |
| 84 | options options |
| 85 | } |
| 86 | |
| 87 | // Initialize the p.pairTypes, p.pairValues and p.types from the input previously |
| 88 | // set by p.SetBytes() or p.SetString(). Also limit the input up to (and including) a paragraph |
| 89 | // separator (bidi class B). |
| 90 | // |
| 91 | // The function p.Order() needs these values to be set, so this preparation could be postponed. |
| 92 | // But since the SetBytes and SetStrings functions return the length of the input up to the paragraph |
| 93 | // separator, the whole input needs to be processed anyway and should not be done twice. |
| 94 | // |
| 95 | // The function has the same return values as SetBytes() / SetString() |
| 96 | func (p *Paragraph) prepareInput() (n int, err error) { |
| 97 | p.runes = bytes.Runes(p.p) |
| 98 | bytecount := 0 |
| 99 | // clear slices from previous SetString or SetBytes |
| 100 | p.pairTypes = nil |
| 101 | p.pairValues = nil |
| 102 | p.types = nil |
| 103 | |
| 104 | for _, r := range p.runes { |
| 105 | props, i := LookupRune(r) |
| 106 | bytecount += i |
| 107 | cls := props.Class() |
| 108 | if cls == B { |
| 109 | return bytecount, nil |
| 110 | } |
| 111 | p.types = append(p.types, cls) |
| 112 | if props.IsOpeningBracket() { |
| 113 | p.pairTypes = append(p.pairTypes, bpOpen) |
| 114 | p.pairValues = append(p.pairValues, r) |
| 115 | } else if props.IsBracket() { |
| 116 | // this must be a closing bracket, |
| 117 | // since IsOpeningBracket is not true |
| 118 | p.pairTypes = append(p.pairTypes, bpClose) |
| 119 | p.pairValues = append(p.pairValues, r) |
| 120 | } else { |
| 121 | p.pairTypes = append(p.pairTypes, bpNone) |
| 122 | p.pairValues = append(p.pairValues, 0) |
| 123 | } |
| 124 | } |
| 125 | return bytecount, nil |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 126 | } |
| 127 | |
| 128 | // SetBytes configures p for the given paragraph text. It replaces text |
| 129 | // previously set by SetBytes or SetString. If b contains a paragraph separator |
| 130 | // it will only process the first paragraph and report the number of bytes |
| 131 | // consumed from b including this separator. Error may be non-nil if options are |
| 132 | // given. |
| 133 | func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 134 | p.p = b |
| 135 | p.opts = opts |
| 136 | return p.prepareInput() |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 137 | } |
| 138 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 139 | // SetString configures s for the given paragraph text. It replaces text |
| 140 | // previously set by SetBytes or SetString. If s contains a paragraph separator |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 141 | // it will only process the first paragraph and report the number of bytes |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 142 | // consumed from s including this separator. Error may be non-nil if options are |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 143 | // given. |
| 144 | func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 145 | p.p = []byte(s) |
| 146 | p.opts = opts |
| 147 | return p.prepareInput() |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 148 | } |
| 149 | |
| 150 | // IsLeftToRight reports whether the principle direction of rendering for this |
| 151 | // paragraphs is left-to-right. If this returns false, the principle direction |
| 152 | // of rendering is right-to-left. |
| 153 | func (p *Paragraph) IsLeftToRight() bool { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 154 | return p.Direction() == LeftToRight |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 155 | } |
| 156 | |
| 157 | // Direction returns the direction of the text of this paragraph. |
| 158 | // |
| 159 | // The direction may be LeftToRight, RightToLeft, Mixed, or Neutral. |
| 160 | func (p *Paragraph) Direction() Direction { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 161 | return p.o.Direction() |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 162 | } |
| 163 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 164 | // TODO: what happens if the position is > len(input)? This should return an error. |
| 165 | |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 166 | // RunAt reports the Run at the given position of the input text. |
| 167 | // |
| 168 | // This method can be used for computing line breaks on paragraphs. |
| 169 | func (p *Paragraph) RunAt(pos int) Run { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 170 | c := 0 |
| 171 | runNumber := 0 |
| 172 | for i, r := range p.o.runes { |
| 173 | c += len(r) |
| 174 | if pos < c { |
| 175 | runNumber = i |
| 176 | } |
| 177 | } |
| 178 | return p.o.Run(runNumber) |
| 179 | } |
| 180 | |
| 181 | func calculateOrdering(levels []level, runes []rune) Ordering { |
| 182 | var curDir Direction |
| 183 | |
| 184 | prevDir := Neutral |
| 185 | prevI := 0 |
| 186 | |
| 187 | o := Ordering{} |
| 188 | // lvl = 0,2,4,...: left to right |
| 189 | // lvl = 1,3,5,...: right to left |
| 190 | for i, lvl := range levels { |
| 191 | if lvl%2 == 0 { |
| 192 | curDir = LeftToRight |
| 193 | } else { |
| 194 | curDir = RightToLeft |
| 195 | } |
| 196 | if curDir != prevDir { |
| 197 | if i > 0 { |
| 198 | o.runes = append(o.runes, runes[prevI:i]) |
| 199 | o.directions = append(o.directions, prevDir) |
| 200 | o.startpos = append(o.startpos, prevI) |
| 201 | } |
| 202 | prevI = i |
| 203 | prevDir = curDir |
| 204 | } |
| 205 | } |
| 206 | o.runes = append(o.runes, runes[prevI:]) |
| 207 | o.directions = append(o.directions, prevDir) |
| 208 | o.startpos = append(o.startpos, prevI) |
| 209 | return o |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 210 | } |
| 211 | |
| 212 | // Order computes the visual ordering of all the runs in a Paragraph. |
| 213 | func (p *Paragraph) Order() (Ordering, error) { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 214 | if len(p.types) == 0 { |
| 215 | return Ordering{}, nil |
| 216 | } |
| 217 | |
| 218 | for _, fn := range p.opts { |
| 219 | fn(&p.options) |
| 220 | } |
| 221 | lvl := level(-1) |
| 222 | if p.options.defaultDirection == RightToLeft { |
| 223 | lvl = 1 |
| 224 | } |
| 225 | para, err := newParagraph(p.types, p.pairTypes, p.pairValues, lvl) |
| 226 | if err != nil { |
| 227 | return Ordering{}, err |
| 228 | } |
| 229 | |
| 230 | levels := para.getLevels([]int{len(p.types)}) |
| 231 | |
| 232 | p.o = calculateOrdering(levels, p.runes) |
| 233 | return p.o, nil |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 234 | } |
| 235 | |
| 236 | // Line computes the visual ordering of runs for a single line starting and |
| 237 | // ending at the given positions in the original text. |
| 238 | func (p *Paragraph) Line(start, end int) (Ordering, error) { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 239 | lineTypes := p.types[start:end] |
| 240 | para, err := newParagraph(lineTypes, p.pairTypes[start:end], p.pairValues[start:end], -1) |
| 241 | if err != nil { |
| 242 | return Ordering{}, err |
| 243 | } |
| 244 | levels := para.getLevels([]int{len(lineTypes)}) |
| 245 | o := calculateOrdering(levels, p.runes[start:end]) |
| 246 | return o, nil |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 247 | } |
| 248 | |
| 249 | // An Ordering holds the computed visual order of runs of a Paragraph. Calling |
| 250 | // SetBytes or SetString on the originating Paragraph invalidates an Ordering. |
| 251 | // The methods of an Ordering should only be called by one goroutine at a time. |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 252 | type Ordering struct { |
| 253 | runes [][]rune |
| 254 | directions []Direction |
| 255 | startpos []int |
| 256 | } |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 257 | |
| 258 | // Direction reports the directionality of the runs. |
| 259 | // |
| 260 | // The direction may be LeftToRight, RightToLeft, Mixed, or Neutral. |
| 261 | func (o *Ordering) Direction() Direction { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 262 | return o.directions[0] |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 263 | } |
| 264 | |
| 265 | // NumRuns returns the number of runs. |
| 266 | func (o *Ordering) NumRuns() int { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 267 | return len(o.runes) |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 268 | } |
| 269 | |
| 270 | // Run returns the ith run within the ordering. |
| 271 | func (o *Ordering) Run(i int) Run { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 272 | r := Run{ |
| 273 | runes: o.runes[i], |
| 274 | direction: o.directions[i], |
| 275 | startpos: o.startpos[i], |
| 276 | } |
| 277 | return r |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 278 | } |
| 279 | |
| 280 | // TODO: perhaps with options. |
| 281 | // // Reorder creates a reader that reads the runes in visual order per character. |
| 282 | // // Modifiers remain after the runes they modify. |
| 283 | // func (l *Runs) Reorder() io.Reader { |
| 284 | // panic("unimplemented") |
| 285 | // } |
| 286 | |
| 287 | // A Run is a continuous sequence of characters of a single direction. |
| 288 | type Run struct { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 289 | runes []rune |
| 290 | direction Direction |
| 291 | startpos int |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 292 | } |
| 293 | |
| 294 | // String returns the text of the run in its original order. |
| 295 | func (r *Run) String() string { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 296 | return string(r.runes) |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 297 | } |
| 298 | |
| 299 | // Bytes returns the text of the run in its original order. |
| 300 | func (r *Run) Bytes() []byte { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 301 | return []byte(r.String()) |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 302 | } |
| 303 | |
| 304 | // TODO: methods for |
| 305 | // - Display order |
| 306 | // - headers and footers |
| 307 | // - bracket replacement. |
| 308 | |
| 309 | // Direction reports the direction of the run. |
| 310 | func (r *Run) Direction() Direction { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 311 | return r.direction |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 312 | } |
| 313 | |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 314 | // Pos returns the position of the Run within the text passed to SetBytes or SetString of the |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 315 | // originating Paragraph value. |
| 316 | func (r *Run) Pos() (start, end int) { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 317 | return r.startpos, r.startpos + len(r.runes) - 1 |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 318 | } |
| 319 | |
| 320 | // AppendReverse reverses the order of characters of in, appends them to out, |
| 321 | // and returns the result. Modifiers will still follow the runes they modify. |
| 322 | // Brackets are replaced with their counterparts. |
| 323 | func AppendReverse(out, in []byte) []byte { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 324 | ret := make([]byte, len(in)+len(out)) |
| 325 | copy(ret, out) |
| 326 | inRunes := bytes.Runes(in) |
| 327 | |
| 328 | for i, r := range inRunes { |
| 329 | prop, _ := LookupRune(r) |
| 330 | if prop.IsBracket() { |
| 331 | inRunes[i] = prop.reverseBracket(r) |
| 332 | } |
| 333 | } |
| 334 | |
| 335 | for i, j := 0, len(inRunes)-1; i < j; i, j = i+1, j-1 { |
| 336 | inRunes[i], inRunes[j] = inRunes[j], inRunes[i] |
| 337 | } |
| 338 | copy(ret[len(out):], string(inRunes)) |
| 339 | |
| 340 | return ret |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 341 | } |
| 342 | |
| 343 | // ReverseString reverses the order of characters in s and returns a new string. |
| 344 | // Modifiers will still follow the runes they modify. Brackets are replaced with |
| 345 | // their counterparts. |
| 346 | func ReverseString(s string) string { |
khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 347 | input := []rune(s) |
| 348 | li := len(input) |
| 349 | ret := make([]rune, li) |
| 350 | for i, r := range input { |
| 351 | prop, _ := LookupRune(r) |
| 352 | if prop.IsBracket() { |
| 353 | ret[li-i-1] = prop.reverseBracket(r) |
| 354 | } else { |
| 355 | ret[li-i-1] = r |
| 356 | } |
| 357 | } |
| 358 | return string(ret) |
Holger Hildebrandt | fa07499 | 2020-03-27 15:42:06 +0000 | [diff] [blame] | 359 | } |