blob: adc487ea80483cbc892f0438c3133705cbcb46cd [file] [log] [blame]
sslobodrd046be82019-01-16 10:02:22 -05001package jsoniter
2
3import (
4 "fmt"
5 "unicode/utf16"
6)
7
8// ReadString read string from iterator
9func (iter *Iterator) ReadString() (ret string) {
10 c := iter.nextToken()
11 if c == '"' {
12 for i := iter.head; i < iter.tail; i++ {
13 c := iter.buf[i]
14 if c == '"' {
15 ret = string(iter.buf[iter.head:i])
16 iter.head = i + 1
17 return ret
18 } else if c == '\\' {
19 break
20 } else if c < ' ' {
21 iter.ReportError("ReadString",
22 fmt.Sprintf(`invalid control character found: %d`, c))
23 return
24 }
25 }
26 return iter.readStringSlowPath()
27 } else if c == 'n' {
28 iter.skipThreeBytes('u', 'l', 'l')
29 return ""
30 }
31 iter.ReportError("ReadString", `expects " or n, but found `+string([]byte{c}))
32 return
33}
34
35func (iter *Iterator) readStringSlowPath() (ret string) {
36 var str []byte
37 var c byte
38 for iter.Error == nil {
39 c = iter.readByte()
40 if c == '"' {
41 return string(str)
42 }
43 if c == '\\' {
44 c = iter.readByte()
45 str = iter.readEscapedChar(c, str)
46 } else {
47 str = append(str, c)
48 }
49 }
50 iter.ReportError("readStringSlowPath", "unexpected end of input")
51 return
52}
53
54func (iter *Iterator) readEscapedChar(c byte, str []byte) []byte {
55 switch c {
56 case 'u':
57 r := iter.readU4()
58 if utf16.IsSurrogate(r) {
59 c = iter.readByte()
60 if iter.Error != nil {
61 return nil
62 }
63 if c != '\\' {
64 iter.unreadByte()
65 str = appendRune(str, r)
66 return str
67 }
68 c = iter.readByte()
69 if iter.Error != nil {
70 return nil
71 }
72 if c != 'u' {
73 str = appendRune(str, r)
74 return iter.readEscapedChar(c, str)
75 }
76 r2 := iter.readU4()
77 if iter.Error != nil {
78 return nil
79 }
80 combined := utf16.DecodeRune(r, r2)
81 if combined == '\uFFFD' {
82 str = appendRune(str, r)
83 str = appendRune(str, r2)
84 } else {
85 str = appendRune(str, combined)
86 }
87 } else {
88 str = appendRune(str, r)
89 }
90 case '"':
91 str = append(str, '"')
92 case '\\':
93 str = append(str, '\\')
94 case '/':
95 str = append(str, '/')
96 case 'b':
97 str = append(str, '\b')
98 case 'f':
99 str = append(str, '\f')
100 case 'n':
101 str = append(str, '\n')
102 case 'r':
103 str = append(str, '\r')
104 case 't':
105 str = append(str, '\t')
106 default:
107 iter.ReportError("readEscapedChar",
108 `invalid escape char after \`)
109 return nil
110 }
111 return str
112}
113
114// ReadStringAsSlice read string from iterator without copying into string form.
115// The []byte can not be kept, as it will change after next iterator call.
116func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
117 c := iter.nextToken()
118 if c == '"' {
119 for i := iter.head; i < iter.tail; i++ {
120 // require ascii string and no escape
121 // for: field name, base64, number
122 if iter.buf[i] == '"' {
123 // fast path: reuse the underlying buffer
124 ret = iter.buf[iter.head:i]
125 iter.head = i + 1
126 return ret
127 }
128 }
129 readLen := iter.tail - iter.head
130 copied := make([]byte, readLen, readLen*2)
131 copy(copied, iter.buf[iter.head:iter.tail])
132 iter.head = iter.tail
133 for iter.Error == nil {
134 c := iter.readByte()
135 if c == '"' {
136 return copied
137 }
138 copied = append(copied, c)
139 }
140 return copied
141 }
142 iter.ReportError("ReadStringAsSlice", `expects " or n, but found `+string([]byte{c}))
143 return
144}
145
146func (iter *Iterator) readU4() (ret rune) {
147 for i := 0; i < 4; i++ {
148 c := iter.readByte()
149 if iter.Error != nil {
150 return
151 }
152 if c >= '0' && c <= '9' {
153 ret = ret*16 + rune(c-'0')
154 } else if c >= 'a' && c <= 'f' {
155 ret = ret*16 + rune(c-'a'+10)
156 } else if c >= 'A' && c <= 'F' {
157 ret = ret*16 + rune(c-'A'+10)
158 } else {
159 iter.ReportError("readU4", "expects 0~9 or a~f, but found "+string([]byte{c}))
160 return
161 }
162 }
163 return ret
164}
165
// Bit patterns and limits used by appendRune to produce UTF-8.
const (
	// Lead-byte and continuation-byte tag bits.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Payload masks for continuation and lead bytes.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest code point representable in 1, 2 and 3 bytes respectively.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// Inclusive bounds of the UTF-16 surrogate range.
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF

	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
)

// appendRune appends the UTF-8 encoding of r to p and returns the extended
// slice. Values that are not encodable (negative, above maxRune, or inside the
// surrogate range) are written as runeError.
func appendRune(p []byte, r rune) []byte {
	// Viewed as unsigned, a negative rune becomes a huge value and is caught
	// by the out-of-range check below.
	code := uint32(r)
	if code <= rune1Max {
		return append(p, byte(r))
	}
	if code <= rune2Max {
		return append(p,
			t2|byte(r>>6),
			tx|byte(r)&maskx)
	}
	if code > maxRune || (code >= surrogateMin && code <= surrogateMax) {
		// Substitute the replacement character; it uses the 3-byte form.
		r = runeError
		code = runeError
	}
	if code <= rune3Max {
		return append(p,
			t3|byte(r>>12),
			tx|byte(r>>6)&maskx,
			tx|byte(r)&maskx)
	}
	return append(p,
		t4|byte(r>>18),
		tx|byte(r>>12)&maskx,
		tx|byte(r>>6)&maskx,
		tx|byte(r)&maskx)
}
215}