Zack Williams | e940c7a | 2019-08-21 14:25:39 -0700 | [diff] [blame] | 1 | package jsoniter |
| 2 | |
| 3 | import ( |
| 4 | "fmt" |
| 5 | "unicode/utf16" |
| 6 | ) |
| 7 | |
| 8 | // ReadString read string from iterator |
| 9 | func (iter *Iterator) ReadString() (ret string) { |
| 10 | c := iter.nextToken() |
| 11 | if c == '"' { |
| 12 | for i := iter.head; i < iter.tail; i++ { |
| 13 | c := iter.buf[i] |
| 14 | if c == '"' { |
| 15 | ret = string(iter.buf[iter.head:i]) |
| 16 | iter.head = i + 1 |
| 17 | return ret |
| 18 | } else if c == '\\' { |
| 19 | break |
| 20 | } else if c < ' ' { |
| 21 | iter.ReportError("ReadString", |
| 22 | fmt.Sprintf(`invalid control character found: %d`, c)) |
| 23 | return |
| 24 | } |
| 25 | } |
| 26 | return iter.readStringSlowPath() |
| 27 | } else if c == 'n' { |
| 28 | iter.skipThreeBytes('u', 'l', 'l') |
| 29 | return "" |
| 30 | } |
| 31 | iter.ReportError("ReadString", `expects " or n, but found `+string([]byte{c})) |
| 32 | return |
| 33 | } |
| 34 | |
| 35 | func (iter *Iterator) readStringSlowPath() (ret string) { |
| 36 | var str []byte |
| 37 | var c byte |
| 38 | for iter.Error == nil { |
| 39 | c = iter.readByte() |
| 40 | if c == '"' { |
| 41 | return string(str) |
| 42 | } |
| 43 | if c == '\\' { |
| 44 | c = iter.readByte() |
| 45 | str = iter.readEscapedChar(c, str) |
| 46 | } else { |
| 47 | str = append(str, c) |
| 48 | } |
| 49 | } |
| 50 | iter.ReportError("readStringSlowPath", "unexpected end of input") |
| 51 | return |
| 52 | } |
| 53 | |
| 54 | func (iter *Iterator) readEscapedChar(c byte, str []byte) []byte { |
| 55 | switch c { |
| 56 | case 'u': |
| 57 | r := iter.readU4() |
| 58 | if utf16.IsSurrogate(r) { |
| 59 | c = iter.readByte() |
| 60 | if iter.Error != nil { |
| 61 | return nil |
| 62 | } |
| 63 | if c != '\\' { |
| 64 | iter.unreadByte() |
| 65 | str = appendRune(str, r) |
| 66 | return str |
| 67 | } |
| 68 | c = iter.readByte() |
| 69 | if iter.Error != nil { |
| 70 | return nil |
| 71 | } |
| 72 | if c != 'u' { |
| 73 | str = appendRune(str, r) |
| 74 | return iter.readEscapedChar(c, str) |
| 75 | } |
| 76 | r2 := iter.readU4() |
| 77 | if iter.Error != nil { |
| 78 | return nil |
| 79 | } |
| 80 | combined := utf16.DecodeRune(r, r2) |
| 81 | if combined == '\uFFFD' { |
| 82 | str = appendRune(str, r) |
| 83 | str = appendRune(str, r2) |
| 84 | } else { |
| 85 | str = appendRune(str, combined) |
| 86 | } |
| 87 | } else { |
| 88 | str = appendRune(str, r) |
| 89 | } |
| 90 | case '"': |
| 91 | str = append(str, '"') |
| 92 | case '\\': |
| 93 | str = append(str, '\\') |
| 94 | case '/': |
| 95 | str = append(str, '/') |
| 96 | case 'b': |
| 97 | str = append(str, '\b') |
| 98 | case 'f': |
| 99 | str = append(str, '\f') |
| 100 | case 'n': |
| 101 | str = append(str, '\n') |
| 102 | case 'r': |
| 103 | str = append(str, '\r') |
| 104 | case 't': |
| 105 | str = append(str, '\t') |
| 106 | default: |
| 107 | iter.ReportError("readEscapedChar", |
| 108 | `invalid escape char after \`) |
| 109 | return nil |
| 110 | } |
| 111 | return str |
| 112 | } |
| 113 | |
| 114 | // ReadStringAsSlice read string from iterator without copying into string form. |
| 115 | // The []byte can not be kept, as it will change after next iterator call. |
| 116 | func (iter *Iterator) ReadStringAsSlice() (ret []byte) { |
| 117 | c := iter.nextToken() |
| 118 | if c == '"' { |
| 119 | for i := iter.head; i < iter.tail; i++ { |
| 120 | // require ascii string and no escape |
| 121 | // for: field name, base64, number |
| 122 | if iter.buf[i] == '"' { |
| 123 | // fast path: reuse the underlying buffer |
| 124 | ret = iter.buf[iter.head:i] |
| 125 | iter.head = i + 1 |
| 126 | return ret |
| 127 | } |
| 128 | } |
| 129 | readLen := iter.tail - iter.head |
| 130 | copied := make([]byte, readLen, readLen*2) |
| 131 | copy(copied, iter.buf[iter.head:iter.tail]) |
| 132 | iter.head = iter.tail |
| 133 | for iter.Error == nil { |
| 134 | c := iter.readByte() |
| 135 | if c == '"' { |
| 136 | return copied |
| 137 | } |
| 138 | copied = append(copied, c) |
| 139 | } |
| 140 | return copied |
| 141 | } |
| 142 | iter.ReportError("ReadStringAsSlice", `expects " or n, but found `+string([]byte{c})) |
| 143 | return |
| 144 | } |
| 145 | |
| 146 | func (iter *Iterator) readU4() (ret rune) { |
| 147 | for i := 0; i < 4; i++ { |
| 148 | c := iter.readByte() |
| 149 | if iter.Error != nil { |
| 150 | return |
| 151 | } |
| 152 | if c >= '0' && c <= '9' { |
| 153 | ret = ret*16 + rune(c-'0') |
| 154 | } else if c >= 'a' && c <= 'f' { |
| 155 | ret = ret*16 + rune(c-'a'+10) |
| 156 | } else if c >= 'A' && c <= 'F' { |
| 157 | ret = ret*16 + rune(c-'A'+10) |
| 158 | } else { |
| 159 | iter.ReportError("readU4", "expects 0~9 or a~f, but found "+string([]byte{c})) |
| 160 | return |
| 161 | } |
| 162 | } |
| 163 | return ret |
| 164 | } |
| 165 | |
// Bit patterns and limits used for UTF-8 encoding in appendRune.
const (
	// Lead-byte and continuation-byte markers. t1 and t5 are not referenced
	// by appendRune.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Payload masks. Only maskx is referenced by appendRune.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest code point that fits into a 1-, 2- and 3-byte encoding.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// Range of UTF-16 surrogate code points, which cannot be encoded.
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF

	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
)

// appendRune appends the UTF-8 encoding of r to p and returns the extended
// slice. Values that cannot be encoded (negative, above maxRune, or inside
// the surrogate range) are written as runeError.
func appendRune(p []byte, r rune) []byte {
	// Going through uint32 turns negative runes into values above maxRune, so
	// they are caught by the invalid check below.
	u := uint32(r)
	if u <= rune1Max {
		return append(p, byte(r))
	}
	if u <= rune2Max {
		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
	}

	invalid := u > maxRune || (surrogateMin <= u && u <= surrogateMax)
	if invalid {
		r = runeError
	}
	// runeError itself needs three bytes, so invalid input shares this branch.
	if invalid || u <= rune3Max {
		return append(p,
			t3|byte(r>>12),
			tx|byte(r>>6)&maskx,
			tx|byte(r)&maskx)
	}

	return append(p,
		t4|byte(r>>18),
		tx|byte(r>>12)&maskx,
		tx|byte(r>>6)&maskx,
		tx|byte(r)&maskx)
}