Matteo Scandolo | 9a2772a | 2018-11-19 14:56:26 -0800 | [diff] [blame] | 1 | // Copyright 2016 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build !appengine |
| 6 | // +build gc |
| 7 | // +build !noasm |
| 8 | |
| 9 | #include "textflag.h" |
| 10 | |
| 11 | // The asm code generally follows the pure Go code in decode_other.go, except |
| 12 | // where marked with a "!!!". |
| 13 | |
| 14 | // func decode(dst, src []byte) int |
| 15 | // |
| 16 | // All local variables fit into registers. The non-zero stack size is only to |
| 17 | // spill registers and push args when issuing a CALL. The register allocation: |
| 18 | // - AX scratch |
| 19 | // - BX scratch |
| 20 | // - CX length or x |
| 21 | // - DX offset |
| 22 | // - SI &src[s] |
| 23 | // - DI &dst[d] |
| 24 | // + R8 dst_base |
| 25 | // + R9 dst_len |
| 26 | // + R10 dst_base + dst_len |
| 27 | // + R11 src_base |
| 28 | // + R12 src_len |
| 29 | // + R13 src_base + src_len |
| 30 | // - R14 used by doCopy |
| 31 | // - R15 used by doCopy |
| 32 | // |
| 33 | // The registers R8-R13 (marked with a "+") are set at the start of the |
| 34 | // function, and after a CALL returns, and are not otherwise modified. |
| 35 | // |
| 36 | // The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. |
| 37 | // The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. |
| 38 | TEXT ·decode(SB), NOSPLIT, $48-56 |
| 39 | // Initialize SI, DI and R8-R13. |
| 40 | MOVQ dst_base+0(FP), R8 |
| 41 | MOVQ dst_len+8(FP), R9 |
| 42 | MOVQ R8, DI |
| 43 | MOVQ R8, R10 |
| 44 | ADDQ R9, R10 |
| 45 | MOVQ src_base+24(FP), R11 |
| 46 | MOVQ src_len+32(FP), R12 |
| 47 | MOVQ R11, SI |
| 48 | MOVQ R11, R13 |
| 49 | ADDQ R12, R13 |
| 50 | |
| 51 | loop: |
| 52 | // for s < len(src) |
| 53 | CMPQ SI, R13 |
| 54 | JEQ end |
| 55 | |
| 56 | // CX = uint32(src[s]) |
| 57 | // |
| 58 | // switch src[s] & 0x03 |
| 59 | MOVBLZX (SI), CX |
| 60 | MOVL CX, BX |
| 61 | ANDL $3, BX |
| 62 | CMPL BX, $1 |
| 63 | JAE tagCopy |
| 64 | |
| 65 | // ---------------------------------------- |
| 66 | // The code below handles literal tags. |
| 67 | |
| 68 | // case tagLiteral: |
| 69 | // x := uint32(src[s] >> 2) |
| 70 | // switch |
| 71 | SHRL $2, CX |
| 72 | CMPL CX, $60 |
| 73 | JAE tagLit60Plus |
| 74 | |
| 75 | // case x < 60: |
| 76 | // s++ |
| 77 | INCQ SI |
| 78 | |
| 79 | doLit: |
| 80 | // This is the end of the inner "switch", when we have a literal tag. |
| 81 | // |
| 82 | // We assume that CX == x and x fits in a uint32, where x is the variable |
| 83 | // used in the pure Go decode_other.go code. |
| 84 | |
| 85 | // length = int(x) + 1 |
| 86 | // |
| 87 | // Unlike the pure Go code, we don't need to check if length <= 0 because |
| 88 | // CX can hold 64 bits, so the increment cannot overflow. |
| 89 | INCQ CX |
| 90 | |
| 91 | // Prepare to check if copying length bytes will run past the end of dst or |
| 92 | // src. |
| 93 | // |
| 94 | // AX = len(dst) - d |
| 95 | // BX = len(src) - s |
| 96 | MOVQ R10, AX |
| 97 | SUBQ DI, AX |
| 98 | MOVQ R13, BX |
| 99 | SUBQ SI, BX |
| 100 | |
| 101 | // !!! Try a faster technique for short (16 or fewer bytes) copies. |
| 102 | // |
| 103 | // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { |
| 104 | // goto callMemmove // Fall back on calling runtime·memmove. |
| 105 | // } |
| 106 | // |
| 107 | // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s |
| 108 | // against 21 instead of 16, because it cannot assume that all of its input |
| 109 | // is contiguous in memory and so it needs to leave enough source bytes to |
| 110 | // read the next tag without refilling buffers, but Go's Decode assumes |
| 111 | // contiguousness (the src argument is a []byte). |
| 112 | CMPQ CX, $16 |
| 113 | JGT callMemmove |
| 114 | CMPQ AX, $16 |
| 115 | JLT callMemmove |
| 116 | CMPQ BX, $16 |
| 117 | JLT callMemmove |
| 118 | |
| 119 | // !!! Implement the copy from src to dst as a 16-byte load and store. |
| 120 | // (Decode's documentation says that dst and src must not overlap.) |
| 121 | // |
| 122 | // This always copies 16 bytes, instead of only length bytes, but that's |
| 123 | // OK. If the input is a valid Snappy encoding then subsequent iterations |
| 124 | // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a |
| 125 | // non-nil error), so the overrun will be ignored. |
| 126 | // |
| 127 | // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or |
| 128 | // 16-byte loads and stores. This technique probably wouldn't be as |
| 129 | // effective on architectures that are fussier about alignment. |
| 130 | MOVOU 0(SI), X0 |
| 131 | MOVOU X0, 0(DI) |
| 132 | |
| 133 | // d += length |
| 134 | // s += length |
| 135 | ADDQ CX, DI |
| 136 | ADDQ CX, SI |
| 137 | JMP loop |
| 138 | |
| 139 | callMemmove: |
| 140 | // if length > len(dst)-d || length > len(src)-s { etc } |
| 141 | CMPQ CX, AX |
| 142 | JGT errCorrupt |
| 143 | CMPQ CX, BX |
| 144 | JGT errCorrupt |
| 145 | |
| 146 | // copy(dst[d:], src[s:s+length]) |
| 147 | // |
| 148 | // This means calling runtime·memmove(&dst[d], &src[s], length), so we push |
| 149 | // DI, SI and CX as arguments. Coincidentally, we also need to spill those |
| 150 | // three registers to the stack, to save local variables across the CALL. |
| 151 | MOVQ DI, 0(SP) |
| 152 | MOVQ SI, 8(SP) |
| 153 | MOVQ CX, 16(SP) |
| 154 | MOVQ DI, 24(SP) |
| 155 | MOVQ SI, 32(SP) |
| 156 | MOVQ CX, 40(SP) |
| 157 | CALL runtime·memmove(SB) |
| 158 | |
| 159 | // Restore local variables: unspill registers from the stack and |
| 160 | // re-calculate R8-R13. |
| 161 | MOVQ 24(SP), DI |
| 162 | MOVQ 32(SP), SI |
| 163 | MOVQ 40(SP), CX |
| 164 | MOVQ dst_base+0(FP), R8 |
| 165 | MOVQ dst_len+8(FP), R9 |
| 166 | MOVQ R8, R10 |
| 167 | ADDQ R9, R10 |
| 168 | MOVQ src_base+24(FP), R11 |
| 169 | MOVQ src_len+32(FP), R12 |
| 170 | MOVQ R11, R13 |
| 171 | ADDQ R12, R13 |
| 172 | |
| 173 | // d += length |
| 174 | // s += length |
| 175 | ADDQ CX, DI |
| 176 | ADDQ CX, SI |
| 177 | JMP loop |
| 178 | |
| 179 | tagLit60Plus: |
| 180 | // !!! This fragment does the |
| 181 | // |
| 182 | // s += x - 58; if uint(s) > uint(len(src)) { etc } |
| 183 | // |
| 184 | // checks. In the asm version, we code it once instead of once per switch case. |
| 185 | ADDQ CX, SI |
| 186 | SUBQ $58, SI |
| 187 | MOVQ SI, BX |
| 188 | SUBQ R11, BX |
| 189 | CMPQ BX, R12 |
| 190 | JA errCorrupt |
| 191 | |
| 192 | // case x == 60: |
| 193 | CMPL CX, $61 |
| 194 | JEQ tagLit61 |
| 195 | JA tagLit62Plus |
| 196 | |
| 197 | // x = uint32(src[s-1]) |
| 198 | MOVBLZX -1(SI), CX |
| 199 | JMP doLit |
| 200 | |
| 201 | tagLit61: |
| 202 | // case x == 61: |
| 203 | // x = uint32(src[s-2]) | uint32(src[s-1])<<8 |
| 204 | MOVWLZX -2(SI), CX |
| 205 | JMP doLit |
| 206 | |
| 207 | tagLit62Plus: |
| 208 | CMPL CX, $62 |
| 209 | JA tagLit63 |
| 210 | |
| 211 | // case x == 62: |
| 212 | // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 |
| 213 | MOVWLZX -3(SI), CX |
| 214 | MOVBLZX -1(SI), BX |
| 215 | SHLL $16, BX |
| 216 | ORL BX, CX |
| 217 | JMP doLit |
| 218 | |
| 219 | tagLit63: |
| 220 | // case x == 63: |
| 221 | // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 |
| 222 | MOVL -4(SI), CX |
| 223 | JMP doLit |
| 224 | |
| 225 | // The code above handles literal tags. |
| 226 | // ---------------------------------------- |
| 227 | // The code below handles copy tags. |
| 228 | |
| 229 | tagCopy4: |
| 230 | // case tagCopy4: |
| 231 | // s += 5 |
| 232 | ADDQ $5, SI |
| 233 | |
| 234 | // if uint(s) > uint(len(src)) { etc } |
| 235 | MOVQ SI, BX |
| 236 | SUBQ R11, BX |
| 237 | CMPQ BX, R12 |
| 238 | JA errCorrupt |
| 239 | |
| 240 | // length = 1 + int(src[s-5])>>2 |
| 241 | SHRQ $2, CX |
| 242 | INCQ CX |
| 243 | |
| 244 | // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) |
| 245 | MOVLQZX -4(SI), DX |
| 246 | JMP doCopy |
| 247 | |
| 248 | tagCopy2: |
| 249 | // case tagCopy2: |
| 250 | // s += 3 |
| 251 | ADDQ $3, SI |
| 252 | |
| 253 | // if uint(s) > uint(len(src)) { etc } |
| 254 | MOVQ SI, BX |
| 255 | SUBQ R11, BX |
| 256 | CMPQ BX, R12 |
| 257 | JA errCorrupt |
| 258 | |
| 259 | // length = 1 + int(src[s-3])>>2 |
| 260 | SHRQ $2, CX |
| 261 | INCQ CX |
| 262 | |
| 263 | // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) |
| 264 | MOVWQZX -2(SI), DX |
| 265 | JMP doCopy |
| 266 | |
| 267 | tagCopy: |
| 268 | // We have a copy tag. We assume that: |
| 269 | // - BX == src[s] & 0x03 |
| 270 | // - CX == src[s] |
| 271 | CMPQ BX, $2 |
| 272 | JEQ tagCopy2 |
| 273 | JA tagCopy4 |
| 274 | |
| 275 | // case tagCopy1: |
| 276 | // s += 2 |
| 277 | ADDQ $2, SI |
| 278 | |
| 279 | // if uint(s) > uint(len(src)) { etc } |
| 280 | MOVQ SI, BX |
| 281 | SUBQ R11, BX |
| 282 | CMPQ BX, R12 |
| 283 | JA errCorrupt |
| 284 | |
| 285 | // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) |
| 286 | MOVQ CX, DX |
| 287 | ANDQ $0xe0, DX |
| 288 | SHLQ $3, DX |
| 289 | MOVBQZX -1(SI), BX |
| 290 | ORQ BX, DX |
| 291 | |
| 292 | // length = 4 + int(src[s-2])>>2&0x7 |
| 293 | SHRQ $2, CX |
| 294 | ANDQ $7, CX |
| 295 | ADDQ $4, CX |
| 296 | |
| 297 | doCopy: |
| 298 | // This is the end of the outer "switch", when we have a copy tag. |
| 299 | // |
| 300 | // We assume that: |
| 301 | // - CX == length && CX > 0 |
| 302 | // - DX == offset |
| 303 | |
| 304 | // if offset <= 0 { etc } |
| 305 | CMPQ DX, $0 |
| 306 | JLE errCorrupt |
| 307 | |
| 308 | // if d < offset { etc } |
| 309 | MOVQ DI, BX |
| 310 | SUBQ R8, BX |
| 311 | CMPQ BX, DX |
| 312 | JLT errCorrupt |
| 313 | |
| 314 | // if length > len(dst)-d { etc } |
| 315 | MOVQ R10, BX |
| 316 | SUBQ DI, BX |
| 317 | CMPQ CX, BX |
| 318 | JGT errCorrupt |
| 319 | |
| 320 | // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length |
| 321 | // |
| 322 | // Set: |
| 323 | // - R14 = len(dst)-d |
| 324 | // - R15 = &dst[d-offset] |
| 325 | MOVQ R10, R14 |
| 326 | SUBQ DI, R14 |
| 327 | MOVQ DI, R15 |
| 328 | SUBQ DX, R15 |
| 329 | |
| 330 | // !!! Try a faster technique for short (16 or fewer bytes) forward copies. |
| 331 | // |
| 332 | // First, try using two 8-byte load/stores, similar to the doLit technique |
| 333 | // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is |
| 334 | // still OK if offset >= 8. Note that this has to be two 8-byte load/stores |
| 335 | // and not one 16-byte load/store, and the first store has to be before the |
| 336 | // second load, due to the overlap if offset is in the range [8, 16). |
| 337 | // |
| 338 | // if length > 16 || offset < 8 || len(dst)-d < 16 { |
| 339 | // goto slowForwardCopy |
| 340 | // } |
| 341 | // copy 16 bytes |
| 342 | // d += length |
| 343 | CMPQ CX, $16 |
| 344 | JGT slowForwardCopy |
| 345 | CMPQ DX, $8 |
| 346 | JLT slowForwardCopy |
| 347 | CMPQ R14, $16 |
| 348 | JLT slowForwardCopy |
| 349 | MOVQ 0(R15), AX |
| 350 | MOVQ AX, 0(DI) |
| 351 | MOVQ 8(R15), BX |
| 352 | MOVQ BX, 8(DI) |
| 353 | ADDQ CX, DI |
| 354 | JMP loop |
| 355 | |
| 356 | slowForwardCopy: |
| 357 | // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we |
| 358 | // can still try 8-byte load stores, provided we can overrun up to 10 extra |
| 359 | // bytes. As above, the overrun will be fixed up by subsequent iterations |
| 360 | // of the outermost loop. |
| 361 | // |
| 362 | // The C++ snappy code calls this technique IncrementalCopyFastPath. Its |
| 363 | // commentary says: |
| 364 | // |
| 365 | // ---- |
| 366 | // |
| 367 | // The main part of this loop is a simple copy of eight bytes at a time |
| 368 | // until we've copied (at least) the requested amount of bytes. However, |
| 369 | // if d and d-offset are less than eight bytes apart (indicating a |
| 370 | // repeating pattern of length < 8), we first need to expand the pattern in |
| 371 | // order to get the correct results. For instance, if the buffer looks like |
| 372 | // this, with the eight-byte <d-offset> and <d> patterns marked as |
| 373 | // intervals: |
| 374 | // |
| 375 | // abxxxxxxxxxxxx |
| 376 | // [------] d-offset |
| 377 | // [------] d |
| 378 | // |
| 379 | // a single eight-byte copy from <d-offset> to <d> will repeat the pattern |
| 380 | // once, after which we can move <d> two bytes without moving <d-offset>: |
| 381 | // |
| 382 | // ababxxxxxxxxxx |
| 383 | // [------] d-offset |
| 384 | // [------] d |
| 385 | // |
| 386 | // and repeat the exercise until the two no longer overlap. |
| 387 | // |
| 388 | // This allows us to do very well in the special case of one single byte |
| 389 | // repeated many times, without taking a big hit for more general cases. |
| 390 | // |
| 391 | // The worst case of extra writing past the end of the match occurs when |
| 392 | // offset == 1 and length == 1; the last copy will read from byte positions |
| 393 | // [0..7] and write to [4..11], whereas it was only supposed to write to |
| 394 | // position 1. Thus, ten excess bytes. |
| 395 | // |
| 396 | // ---- |
| 397 | // |
| 398 | // That "10 byte overrun" worst case is confirmed by Go's |
| 399 | // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy |
| 400 | // and finishSlowForwardCopy algorithm. |
| 401 | // |
| 402 | // if length > len(dst)-d-10 { |
| 403 | // goto verySlowForwardCopy |
| 404 | // } |
| 405 | SUBQ $10, R14 |
| 406 | CMPQ CX, R14 |
| 407 | JGT verySlowForwardCopy |
| 408 | |
| 409 | makeOffsetAtLeast8: |
| 410 | // !!! As above, expand the pattern so that offset >= 8 and we can use |
| 411 | // 8-byte load/stores. |
| 412 | // |
| 413 | // for offset < 8 { |
| 414 | // copy 8 bytes from dst[d-offset:] to dst[d:] |
| 415 | // length -= offset |
| 416 | // d += offset |
| 417 | // offset += offset |
| 418 | // // The two previous lines together means that d-offset, and therefore |
| 419 | // // R15, is unchanged. |
| 420 | // } |
| 421 | CMPQ DX, $8 |
| 422 | JGE fixUpSlowForwardCopy |
| 423 | MOVQ (R15), BX |
| 424 | MOVQ BX, (DI) |
| 425 | SUBQ DX, CX |
| 426 | ADDQ DX, DI |
| 427 | ADDQ DX, DX |
| 428 | JMP makeOffsetAtLeast8 |
| 429 | |
| 430 | fixUpSlowForwardCopy: |
| 431 | // !!! Add length (which might be negative now) to d (implied by DI being |
| 432 | // &dst[d]) so that d ends up at the right place when we jump back to the |
| 433 | // top of the loop. Before we do that, though, we save DI to AX so that, if |
| 434 | // length is positive, copying the remaining length bytes will write to the |
| 435 | // right place. |
| 436 | MOVQ DI, AX |
| 437 | ADDQ CX, DI |
| 438 | |
| 439 | finishSlowForwardCopy: |
| 440 | // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative |
| 441 | // length means that we overrun, but as above, that will be fixed up by |
| 442 | // subsequent iterations of the outermost loop. |
| 443 | CMPQ CX, $0 |
| 444 | JLE loop |
| 445 | MOVQ (R15), BX |
| 446 | MOVQ BX, (AX) |
| 447 | ADDQ $8, R15 |
| 448 | ADDQ $8, AX |
| 449 | SUBQ $8, CX |
| 450 | JMP finishSlowForwardCopy |
| 451 | |
| 452 | verySlowForwardCopy: |
| 453 | // verySlowForwardCopy is a simple implementation of forward copy. In C |
| 454 | // parlance, this is a do/while loop instead of a while loop, since we know |
| 455 | // that length > 0. In Go syntax: |
| 456 | // |
| 457 | // for { |
| 458 | // dst[d] = dst[d - offset] |
| 459 | // d++ |
| 460 | // length-- |
| 461 | // if length == 0 { |
| 462 | // break |
| 463 | // } |
| 464 | // } |
| 465 | MOVB (R15), BX |
| 466 | MOVB BX, (DI) |
| 467 | INCQ R15 |
| 468 | INCQ DI |
| 469 | DECQ CX |
| 470 | JNZ verySlowForwardCopy |
| 471 | JMP loop |
| 472 | |
| 473 | // The code above handles copy tags. |
| 474 | // ---------------------------------------- |
| 475 | |
| 476 | end: |
| 477 | // This is the end of the "for s < len(src)". |
| 478 | // |
| 479 | // if d != len(dst) { etc } |
| 480 | CMPQ DI, R10 |
| 481 | JNE errCorrupt |
| 482 | |
| 483 | // return 0 |
| 484 | MOVQ $0, ret+48(FP) |
| 485 | RET |
| 486 | |
| 487 | errCorrupt: |
| 488 | // return decodeErrCodeCorrupt |
| 489 | MOVQ $1, ret+48(FP) |
| 490 | RET |