blob: 520b9ada0e28e8a40a573eeb20e907116d405bb3 [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package transform provides reader and writer wrappers that transform the
6// bytes passing through as well as various transformations. Example
7// transformations provided by other packages include normalization and
8// conversion between character sets.
9package transform // import "golang.org/x/text/transform"
10
11import (
12 "bytes"
13 "errors"
14 "io"
15 "unicode/utf8"
16)
17
18var (
19 // ErrShortDst means that the destination buffer was too short to
20 // receive all of the transformed bytes.
21 ErrShortDst = errors.New("transform: short destination buffer")
22
23 // ErrShortSrc means that the source buffer has insufficient data to
24 // complete the transformation.
25 ErrShortSrc = errors.New("transform: short source buffer")
26
27 // ErrEndOfSpan means that the input and output (the transformed input)
28 // are not identical.
29 ErrEndOfSpan = errors.New("transform: input and output are not identical")
30
31 // errInconsistentByteCount means that Transform returned success (nil
32 // error) but also returned nSrc inconsistent with the src argument.
33 errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
34
35 // errShortInternal means that an internal buffer is not large enough
36 // to make progress and the Transform operation must be aborted.
37 errShortInternal = errors.New("transform: short internal buffer")
38)
39
40// Transformer transforms bytes.
41type Transformer interface {
42 // Transform writes to dst the transformed bytes read from src, and
43 // returns the number of dst bytes written and src bytes read. The
44 // atEOF argument tells whether src represents the last bytes of the
45 // input.
46 //
47 // Callers should always process the nDst bytes produced and account
48 // for the nSrc bytes consumed before considering the error err.
49 //
50 // A nil error means that all of the transformed bytes (whether freshly
51 // transformed from src or left over from previous Transform calls)
52 // were written to dst. A nil error can be returned regardless of
53 // whether atEOF is true. If err is nil then nSrc must equal len(src);
54 // the converse is not necessarily true.
55 //
56 // ErrShortDst means that dst was too short to receive all of the
57 // transformed bytes. ErrShortSrc means that src had insufficient data
58 // to complete the transformation. If both conditions apply, then
59 // either error may be returned. Other than the error conditions listed
60 // here, implementations are free to report other errors that arise.
61 Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
62
63 // Reset resets the state and allows a Transformer to be reused.
64 Reset()
65}
66
67// SpanningTransformer extends the Transformer interface with a Span method
68// that determines how much of the input already conforms to the Transformer.
69type SpanningTransformer interface {
70 Transformer
71
72 // Span returns a position in src such that transforming src[:n] results in
73 // identical output src[:n] for these bytes. It does not necessarily return
74 // the largest such n. The atEOF argument tells whether src represents the
75 // last bytes of the input.
76 //
77 // Callers should always account for the n bytes consumed before
78 // considering the error err.
79 //
80 // A nil error means that all input bytes are known to be identical to the
Don Newton7577f072020-01-06 12:41:11 -050081 // output produced by the Transformer. A nil error can be returned
82 // regardless of whether atEOF is true. If err is nil, then n must
Don Newton98fd8812019-09-23 15:15:02 -040083 // equal len(src); the converse is not necessarily true.
84 //
85 // ErrEndOfSpan means that the Transformer output may differ from the
86 // input after n bytes. Note that n may be len(src), meaning that the output
87 // would contain additional bytes after otherwise identical output.
88 // ErrShortSrc means that src had insufficient data to determine whether the
89 // remaining bytes would change. Other than the error conditions listed
90 // here, implementations are free to report other errors that arise.
91 //
92 // Calling Span can modify the Transformer state as a side effect. In
93 // effect, it does the transformation just as calling Transform would, only
94 // without copying to a destination buffer and only up to a point it can
95 // determine the input and output bytes are the same. This is obviously more
96 // limited than calling Transform, but can be more efficient in terms of
97 // copying and allocating buffers. Calls to Span and Transform may be
98 // interleaved.
99 Span(src []byte, atEOF bool) (n int, err error)
100}
101
102// NopResetter can be embedded by implementations of Transformer to add a nop
103// Reset method.
104type NopResetter struct{}
105
106// Reset implements the Reset method of the Transformer interface.
107func (NopResetter) Reset() {}
108
109// Reader wraps another io.Reader by transforming the bytes read.
110type Reader struct {
111 r io.Reader
112 t Transformer
113 err error
114
115 // dst[dst0:dst1] contains bytes that have been transformed by t but
116 // not yet copied out via Read.
117 dst []byte
118 dst0, dst1 int
119
120 // src[src0:src1] contains bytes that have been read from r but not
121 // yet transformed through t.
122 src []byte
123 src0, src1 int
124
125 // transformComplete is whether the transformation is complete,
126 // regardless of whether or not it was successful.
127 transformComplete bool
128}
129
130const defaultBufSize = 4096
131
132// NewReader returns a new Reader that wraps r by transforming the bytes read
133// via t. It calls Reset on t.
134func NewReader(r io.Reader, t Transformer) *Reader {
135 t.Reset()
136 return &Reader{
137 r: r,
138 t: t,
139 dst: make([]byte, defaultBufSize),
140 src: make([]byte, defaultBufSize),
141 }
142}
143
144// Read implements the io.Reader interface.
145func (r *Reader) Read(p []byte) (int, error) {
146 n, err := 0, error(nil)
147 for {
148 // Copy out any transformed bytes and return the final error if we are done.
149 if r.dst0 != r.dst1 {
150 n = copy(p, r.dst[r.dst0:r.dst1])
151 r.dst0 += n
152 if r.dst0 == r.dst1 && r.transformComplete {
153 return n, r.err
154 }
155 return n, nil
156 } else if r.transformComplete {
157 return 0, r.err
158 }
159
160 // Try to transform some source bytes, or to flush the transformer if we
161 // are out of source bytes. We do this even if r.r.Read returned an error.
162 // As the io.Reader documentation says, "process the n > 0 bytes returned
163 // before considering the error".
164 if r.src0 != r.src1 || r.err != nil {
165 r.dst0 = 0
166 r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
167 r.src0 += n
168
169 switch {
170 case err == nil:
171 if r.src0 != r.src1 {
172 r.err = errInconsistentByteCount
173 }
174 // The Transform call was successful; we are complete if we
175 // cannot read more bytes into src.
176 r.transformComplete = r.err != nil
177 continue
178 case err == ErrShortDst && (r.dst1 != 0 || n != 0):
179 // Make room in dst by copying out, and try again.
180 continue
181 case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
182 // Read more bytes into src via the code below, and try again.
183 default:
184 r.transformComplete = true
185 // The reader error (r.err) takes precedence over the
186 // transformer error (err) unless r.err is nil or io.EOF.
187 if r.err == nil || r.err == io.EOF {
188 r.err = err
189 }
190 continue
191 }
192 }
193
194 // Move any untransformed source bytes to the start of the buffer
195 // and read more bytes.
196 if r.src0 != 0 {
197 r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
198 }
199 n, r.err = r.r.Read(r.src[r.src1:])
200 r.src1 += n
201 }
202}
203
204// TODO: implement ReadByte (and ReadRune??).
205
206// Writer wraps another io.Writer by transforming the bytes read.
207// The user needs to call Close to flush unwritten bytes that may
208// be buffered.
209type Writer struct {
210 w io.Writer
211 t Transformer
212 dst []byte
213
214 // src[:n] contains bytes that have not yet passed through t.
215 src []byte
216 n int
217}
218
219// NewWriter returns a new Writer that wraps w by transforming the bytes written
220// via t. It calls Reset on t.
221func NewWriter(w io.Writer, t Transformer) *Writer {
222 t.Reset()
223 return &Writer{
224 w: w,
225 t: t,
226 dst: make([]byte, defaultBufSize),
227 src: make([]byte, defaultBufSize),
228 }
229}
230
231// Write implements the io.Writer interface. If there are not enough
232// bytes available to complete a Transform, the bytes will be buffered
233// for the next write. Call Close to convert the remaining bytes.
234func (w *Writer) Write(data []byte) (n int, err error) {
235 src := data
236 if w.n > 0 {
237 // Append bytes from data to the last remainder.
238 // TODO: limit the amount copied on first try.
239 n = copy(w.src[w.n:], data)
240 w.n += n
241 src = w.src[:w.n]
242 }
243 for {
244 nDst, nSrc, err := w.t.Transform(w.dst, src, false)
245 if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
246 return n, werr
247 }
248 src = src[nSrc:]
249 if w.n == 0 {
250 n += nSrc
251 } else if len(src) <= n {
252 // Enough bytes from w.src have been consumed. We make src point
253 // to data instead to reduce the copying.
254 w.n = 0
255 n -= len(src)
256 src = data[n:]
257 if n < len(data) && (err == nil || err == ErrShortSrc) {
258 continue
259 }
260 }
261 switch err {
262 case ErrShortDst:
263 // This error is okay as long as we are making progress.
264 if nDst > 0 || nSrc > 0 {
265 continue
266 }
267 case ErrShortSrc:
268 if len(src) < len(w.src) {
269 m := copy(w.src, src)
270 // If w.n > 0, bytes from data were already copied to w.src and n
271 // was already set to the number of bytes consumed.
272 if w.n == 0 {
273 n += m
274 }
275 w.n = m
276 err = nil
277 } else if nDst > 0 || nSrc > 0 {
278 // Not enough buffer to store the remainder. Keep processing as
279 // long as there is progress. Without this case, transforms that
280 // require a lookahead larger than the buffer may result in an
281 // error. This is not something one may expect to be common in
282 // practice, but it may occur when buffers are set to small
283 // sizes during testing.
284 continue
285 }
286 case nil:
287 if w.n > 0 {
288 err = errInconsistentByteCount
289 }
290 }
291 return n, err
292 }
293}
294
295// Close implements the io.Closer interface.
296func (w *Writer) Close() error {
297 src := w.src[:w.n]
298 for {
299 nDst, nSrc, err := w.t.Transform(w.dst, src, true)
300 if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
301 return werr
302 }
303 if err != ErrShortDst {
304 return err
305 }
306 src = src[nSrc:]
307 }
308}
309
310type nop struct{ NopResetter }
311
312func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
313 n := copy(dst, src)
314 if n < len(src) {
315 err = ErrShortDst
316 }
317 return n, n, err
318}
319
320func (nop) Span(src []byte, atEOF bool) (n int, err error) {
321 return len(src), nil
322}
323
324type discard struct{ NopResetter }
325
326func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
327 return 0, len(src), nil
328}
329
330var (
331 // Discard is a Transformer for which all Transform calls succeed
332 // by consuming all bytes and writing nothing.
333 Discard Transformer = discard{}
334
335 // Nop is a SpanningTransformer that copies src to dst.
336 Nop SpanningTransformer = nop{}
337)
338
339// chain is a sequence of links. A chain with N Transformers has N+1 links and
340// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
341// buffers given to chain.Transform and the middle N-1 buffers are intermediate
342// buffers owned by the chain. The i'th link transforms bytes from the i'th
343// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
344// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
345type chain struct {
346 link []link
347 err error
348 // errStart is the index at which the error occurred plus 1. Processing
349 // errStart at this level at the next call to Transform. As long as
350 // errStart > 0, chain will not consume any more source bytes.
351 errStart int
352}
353
354func (c *chain) fatalError(errIndex int, err error) {
355 if i := errIndex + 1; i > c.errStart {
356 c.errStart = i
357 c.err = err
358 }
359}
360
361type link struct {
362 t Transformer
363 // b[p:n] holds the bytes to be transformed by t.
364 b []byte
365 p int
366 n int
367}
368
369func (l *link) src() []byte {
370 return l.b[l.p:l.n]
371}
372
373func (l *link) dst() []byte {
374 return l.b[l.n:]
375}
376
377// Chain returns a Transformer that applies t in sequence.
378func Chain(t ...Transformer) Transformer {
379 if len(t) == 0 {
380 return nop{}
381 }
382 c := &chain{link: make([]link, len(t)+1)}
383 for i, tt := range t {
384 c.link[i].t = tt
385 }
386 // Allocate intermediate buffers.
387 b := make([][defaultBufSize]byte, len(t)-1)
388 for i := range b {
389 c.link[i+1].b = b[i][:]
390 }
391 return c
392}
393
394// Reset resets the state of Chain. It calls Reset on all the Transformers.
395func (c *chain) Reset() {
396 for i, l := range c.link {
397 if l.t != nil {
398 l.t.Reset()
399 }
400 c.link[i].p, c.link[i].n = 0, 0
401 }
402}
403
404// TODO: make chain use Span (is going to be fun to implement!)
405
406// Transform applies the transformers of c in sequence.
407func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
408 // Set up src and dst in the chain.
409 srcL := &c.link[0]
410 dstL := &c.link[len(c.link)-1]
411 srcL.b, srcL.p, srcL.n = src, 0, len(src)
412 dstL.b, dstL.n = dst, 0
413 var lastFull, needProgress bool // for detecting progress
414
415 // i is the index of the next Transformer to apply, for i in [low, high].
416 // low is the lowest index for which c.link[low] may still produce bytes.
417 // high is the highest index for which c.link[high] has a Transformer.
418 // The error returned by Transform determines whether to increase or
419 // decrease i. We try to completely fill a buffer before converting it.
420 for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
421 in, out := &c.link[i], &c.link[i+1]
422 nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
423 out.n += nDst
424 in.p += nSrc
425 if i > 0 && in.p == in.n {
426 in.p, in.n = 0, 0
427 }
428 needProgress, lastFull = lastFull, false
429 switch err0 {
430 case ErrShortDst:
431 // Process the destination buffer next. Return if we are already
432 // at the high index.
433 if i == high {
434 return dstL.n, srcL.p, ErrShortDst
435 }
436 if out.n != 0 {
437 i++
438 // If the Transformer at the next index is not able to process any
439 // source bytes there is nothing that can be done to make progress
440 // and the bytes will remain unprocessed. lastFull is used to
441 // detect this and break out of the loop with a fatal error.
442 lastFull = true
443 continue
444 }
445 // The destination buffer was too small, but is completely empty.
446 // Return a fatal error as this transformation can never complete.
447 c.fatalError(i, errShortInternal)
448 case ErrShortSrc:
449 if i == 0 {
450 // Save ErrShortSrc in err. All other errors take precedence.
451 err = ErrShortSrc
452 break
453 }
454 // Source bytes were depleted before filling up the destination buffer.
455 // Verify we made some progress, move the remaining bytes to the errStart
456 // and try to get more source bytes.
457 if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
458 // There were not enough source bytes to proceed while the source
459 // buffer cannot hold any more bytes. Return a fatal error as this
460 // transformation can never complete.
461 c.fatalError(i, errShortInternal)
462 break
463 }
464 // in.b is an internal buffer and we can make progress.
465 in.p, in.n = 0, copy(in.b, in.src())
466 fallthrough
467 case nil:
468 // if i == low, we have depleted the bytes at index i or any lower levels.
469 // In that case we increase low and i. In all other cases we decrease i to
470 // fetch more bytes before proceeding to the next index.
471 if i > low {
472 i--
473 continue
474 }
475 default:
476 c.fatalError(i, err0)
477 }
478 // Exhausted level low or fatal error: increase low and continue
479 // to process the bytes accepted so far.
480 i++
481 low = i
482 }
483
484 // If c.errStart > 0, this means we found a fatal error. We will clear
485 // all upstream buffers. At this point, no more progress can be made
486 // downstream, as Transform would have bailed while handling ErrShortDst.
487 if c.errStart > 0 {
488 for i := 1; i < c.errStart; i++ {
489 c.link[i].p, c.link[i].n = 0, 0
490 }
491 err, c.errStart, c.err = c.err, 0, nil
492 }
493 return dstL.n, srcL.p, err
494}
495
Don Newton7577f072020-01-06 12:41:11 -0500496// Deprecated: Use runes.Remove instead.
Don Newton98fd8812019-09-23 15:15:02 -0400497func RemoveFunc(f func(r rune) bool) Transformer {
498 return removeF(f)
499}
500
501type removeF func(r rune) bool
502
503func (removeF) Reset() {}
504
505// Transform implements the Transformer interface.
506func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
507 for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
508
509 if r = rune(src[0]); r < utf8.RuneSelf {
510 sz = 1
511 } else {
512 r, sz = utf8.DecodeRune(src)
513
514 if sz == 1 {
515 // Invalid rune.
516 if !atEOF && !utf8.FullRune(src) {
517 err = ErrShortSrc
518 break
519 }
520 // We replace illegal bytes with RuneError. Not doing so might
521 // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
522 // The resulting byte sequence may subsequently contain runes
523 // for which t(r) is true that were passed unnoticed.
524 if !t(r) {
525 if nDst+3 > len(dst) {
526 err = ErrShortDst
527 break
528 }
529 nDst += copy(dst[nDst:], "\uFFFD")
530 }
531 nSrc++
532 continue
533 }
534 }
535
536 if !t(r) {
537 if nDst+sz > len(dst) {
538 err = ErrShortDst
539 break
540 }
541 nDst += copy(dst[nDst:], src[:sz])
542 }
543 nSrc += sz
544 }
545 return
546}
547
548// grow returns a new []byte that is longer than b, and copies the first n bytes
549// of b to the start of the new slice.
550func grow(b []byte, n int) []byte {
551 m := len(b)
552 if m <= 32 {
553 m = 64
554 } else if m <= 256 {
555 m *= 2
556 } else {
557 m += m >> 1
558 }
559 buf := make([]byte, m)
560 copy(buf, b[:n])
561 return buf
562}
563
564const initialBufSize = 128
565
566// String returns a string with the result of converting s[:n] using t, where
567// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
568func String(t Transformer, s string) (result string, n int, err error) {
569 t.Reset()
570 if s == "" {
571 // Fast path for the common case for empty input. Results in about a
572 // 86% reduction of running time for BenchmarkStringLowerEmpty.
573 if _, _, err := t.Transform(nil, nil, true); err == nil {
574 return "", 0, nil
575 }
576 }
577
578 // Allocate only once. Note that both dst and src escape when passed to
579 // Transform.
580 buf := [2 * initialBufSize]byte{}
581 dst := buf[:initialBufSize:initialBufSize]
582 src := buf[initialBufSize : 2*initialBufSize]
583
584 // The input string s is transformed in multiple chunks (starting with a
585 // chunk size of initialBufSize). nDst and nSrc are per-chunk (or
586 // per-Transform-call) indexes, pDst and pSrc are overall indexes.
587 nDst, nSrc := 0, 0
588 pDst, pSrc := 0, 0
589
590 // pPrefix is the length of a common prefix: the first pPrefix bytes of the
591 // result will equal the first pPrefix bytes of s. It is not guaranteed to
592 // be the largest such value, but if pPrefix, len(result) and len(s) are
593 // all equal after the final transform (i.e. calling Transform with atEOF
594 // being true returned nil error) then we don't need to allocate a new
595 // result string.
596 pPrefix := 0
597 for {
598 // Invariant: pDst == pPrefix && pSrc == pPrefix.
599
600 n := copy(src, s[pSrc:])
601 nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
602 pDst += nDst
603 pSrc += nSrc
604
605 // TODO: let transformers implement an optional Spanner interface, akin
606 // to norm's QuickSpan. This would even allow us to avoid any allocation.
607 if !bytes.Equal(dst[:nDst], src[:nSrc]) {
608 break
609 }
610 pPrefix = pSrc
611 if err == ErrShortDst {
612 // A buffer can only be short if a transformer modifies its input.
613 break
614 } else if err == ErrShortSrc {
615 if nSrc == 0 {
616 // No progress was made.
617 break
618 }
619 // Equal so far and !atEOF, so continue checking.
620 } else if err != nil || pPrefix == len(s) {
621 return string(s[:pPrefix]), pPrefix, err
622 }
623 }
624 // Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
625
626 // We have transformed the first pSrc bytes of the input s to become pDst
627 // transformed bytes. Those transformed bytes are discontiguous: the first
628 // pPrefix of them equal s[:pPrefix] and the last nDst of them equal
629 // dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
630 // that they become one contiguous slice: dst[:pDst].
631 if pPrefix != 0 {
632 newDst := dst
633 if pDst > len(newDst) {
634 newDst = make([]byte, len(s)+nDst-nSrc)
635 }
636 copy(newDst[pPrefix:pDst], dst[:nDst])
637 copy(newDst[:pPrefix], s[:pPrefix])
638 dst = newDst
639 }
640
641 // Prevent duplicate Transform calls with atEOF being true at the end of
642 // the input. Also return if we have an unrecoverable error.
643 if (err == nil && pSrc == len(s)) ||
644 (err != nil && err != ErrShortDst && err != ErrShortSrc) {
645 return string(dst[:pDst]), pSrc, err
646 }
647
648 // Transform the remaining input, growing dst and src buffers as necessary.
649 for {
650 n := copy(src, s[pSrc:])
651 nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
652 pDst += nDst
653 pSrc += nSrc
654
655 // If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
656 // make progress. This may avoid excessive allocations.
657 if err == ErrShortDst {
658 if nDst == 0 {
659 dst = grow(dst, pDst)
660 }
661 } else if err == ErrShortSrc {
662 if nSrc == 0 {
663 src = grow(src, 0)
664 }
665 } else if err != nil || pSrc == len(s) {
666 return string(dst[:pDst]), pSrc, err
667 }
668 }
669}
670
671// Bytes returns a new byte slice with the result of converting b[:n] using t,
672// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
673func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
674 return doAppend(t, 0, make([]byte, len(b)), b)
675}
676
677// Append appends the result of converting src[:n] using t to dst, where
678// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
679func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
680 if len(dst) == cap(dst) {
681 n := len(src) + len(dst) // It is okay for this to be 0.
682 b := make([]byte, n)
683 dst = b[:copy(b, dst)]
684 }
685 return doAppend(t, len(dst), dst[:cap(dst)], src)
686}
687
688func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
689 t.Reset()
690 pSrc := 0
691 for {
692 nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
693 pDst += nDst
694 pSrc += nSrc
695 if err != ErrShortDst {
696 return dst[:pDst], pSrc, err
697 }
698
699 // Grow the destination buffer, but do not grow as long as we can make
700 // progress. This may avoid excessive allocations.
701 if nDst == 0 {
702 dst = grow(dst, pDst)
703 }
704 }
705}