khenaidoo | 7d3c558 | 2021-08-11 18:09:44 -0400 | [diff] [blame] | 1 | // Copyright 2020 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build !appengine |
| 6 | // +build gc |
| 7 | // +build !noasm |
| 8 | |
| 9 | #include "textflag.h" |
| 10 | |
| 11 | // The asm code generally follows the pure Go code in decode_other.go, except |
| 12 | // where marked with a "!!!". |
| 13 | |
| 14 | // func decode(dst, src []byte) int |
| 15 | // |
| 16 | // All local variables fit into registers. The non-zero stack size is only to |
| 17 | // spill registers and push args when issuing a CALL. The register allocation: |
| 18 | // - R2 scratch |
| 19 | // - R3 scratch |
| 20 | // - R4 length or x |
| 21 | // - R5 offset |
| 22 | // - R6 &src[s] |
| 23 | // - R7 &dst[d] |
| 24 | // + R8 dst_base |
| 25 | // + R9 dst_len |
| 26 | // + R10 dst_base + dst_len |
| 27 | // + R11 src_base |
| 28 | // + R12 src_len |
| 29 | // + R13 src_base + src_len |
| 30 | // - R14 used by doCopy |
| 31 | // - R15 used by doCopy |
| 32 | // |
| 33 | // The registers R8-R13 (marked with a "+") are set at the start of the |
| 34 | // function, and after a CALL returns, and are not otherwise modified. |
| 35 | // |
| 36 | // The d variable is implicitly R7 - R8, and len(dst)-d is R10 - R7. |
| 37 | // The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6. |
| 38 | TEXT ·decode(SB), NOSPLIT, $56-56 |
| 39 | // Initialize R6, R7 and R8-R13. |
| 40 | MOVD dst_base+0(FP), R8 |
| 41 | MOVD dst_len+8(FP), R9 |
| 42 | MOVD R8, R7 |
| 43 | MOVD R8, R10 |
| 44 | ADD R9, R10, R10 |
| 45 | MOVD src_base+24(FP), R11 |
| 46 | MOVD src_len+32(FP), R12 |
| 47 | MOVD R11, R6 |
| 48 | MOVD R11, R13 |
| 49 | ADD R12, R13, R13 |
| 50 | |
| 51 | loop: |
| 52 | // for s < len(src) |
| 53 | CMP R13, R6 |
| 54 | BEQ end |
| 55 | |
| 56 | // R4 = uint32(src[s]) |
| 57 | // |
| 58 | // switch src[s] & 0x03 |
| 59 | MOVBU (R6), R4 |
| 60 | MOVW R4, R3 |
| 61 | ANDW $3, R3 |
| 62 | MOVW $1, R1 |
| 63 | CMPW R1, R3 |
| 64 | BGE tagCopy |
| 65 | |
| 66 | // ---------------------------------------- |
| 67 | // The code below handles literal tags. |
| 68 | |
| 69 | // case tagLiteral: |
| 70 | // x := uint32(src[s] >> 2) |
| 71 | // switch |
| 72 | MOVW $60, R1 |
| 73 | LSRW $2, R4, R4 |
| 74 | CMPW R4, R1 |
| 75 | BLS tagLit60Plus |
| 76 | |
| 77 | // case x < 60: |
| 78 | // s++ |
| 79 | ADD $1, R6, R6 |
| 80 | |
| 81 | doLit: |
| 82 | // This is the end of the inner "switch", when we have a literal tag. |
| 83 | // |
| 84 | // We assume that R4 == x and x fits in a uint32, where x is the variable |
| 85 | // used in the pure Go decode_other.go code. |
| 86 | |
| 87 | // length = int(x) + 1 |
| 88 | // |
| 89 | // Unlike the pure Go code, we don't need to check if length <= 0 because |
| 90 | // R4 can hold 64 bits, so the increment cannot overflow. |
| 91 | ADD $1, R4, R4 |
| 92 | |
| 93 | // Prepare to check if copying length bytes will run past the end of dst or |
| 94 | // src. |
| 95 | // |
| 96 | // R2 = len(dst) - d |
| 97 | // R3 = len(src) - s |
| 98 | MOVD R10, R2 |
| 99 | SUB R7, R2, R2 |
| 100 | MOVD R13, R3 |
| 101 | SUB R6, R3, R3 |
| 102 | |
| 103 | // !!! Try a faster technique for short (16 or fewer bytes) copies. |
| 104 | // |
| 105 | // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { |
| 106 | // goto callMemmove // Fall back on calling runtime·memmove. |
| 107 | // } |
| 108 | // |
| 109 | // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s |
| 110 | // against 21 instead of 16, because it cannot assume that all of its input |
| 111 | // is contiguous in memory and so it needs to leave enough source bytes to |
| 112 | // read the next tag without refilling buffers, but Go's Decode assumes |
| 113 | // contiguousness (the src argument is a []byte). |
| 114 | CMP $16, R4 |
| 115 | BGT callMemmove |
| 116 | CMP $16, R2 |
| 117 | BLT callMemmove |
| 118 | CMP $16, R3 |
| 119 | BLT callMemmove |
| 120 | |
| 121 | // !!! Implement the copy from src to dst as a 16-byte load and store. |
| 122 | // (Decode's documentation says that dst and src must not overlap.) |
| 123 | // |
| 124 | // This always copies 16 bytes, instead of only length bytes, but that's |
| 125 | // OK. If the input is a valid Snappy encoding then subsequent iterations |
| 126 | // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a |
| 127 | // non-nil error), so the overrun will be ignored. |
| 128 | // |
| 129 | // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or |
| 130 | // 16-byte loads and stores. This technique probably wouldn't be as |
| 131 | // effective on architectures that are fussier about alignment. |
| 132 | LDP 0(R6), (R14, R15) |
| 133 | STP (R14, R15), 0(R7) |
| 134 | |
| 135 | // d += length |
| 136 | // s += length |
| 137 | ADD R4, R7, R7 |
| 138 | ADD R4, R6, R6 |
| 139 | B loop |
| 140 | |
| 141 | callMemmove: |
| 142 | // if length > len(dst)-d || length > len(src)-s { etc } |
| 143 | CMP R2, R4 |
| 144 | BGT errCorrupt |
| 145 | CMP R3, R4 |
| 146 | BGT errCorrupt |
| 147 | |
| 148 | // copy(dst[d:], src[s:s+length]) |
| 149 | // |
| 150 | // This means calling runtime·memmove(&dst[d], &src[s], length), so we push |
| 151 | // R7, R6 and R4 as arguments. Coincidentally, we also need to spill those |
| 152 | // three registers to the stack, to save local variables across the CALL. |
| 153 | MOVD R7, 8(RSP) |
| 154 | MOVD R6, 16(RSP) |
| 155 | MOVD R4, 24(RSP) |
| 156 | MOVD R7, 32(RSP) |
| 157 | MOVD R6, 40(RSP) |
| 158 | MOVD R4, 48(RSP) |
| 159 | CALL runtime·memmove(SB) |
| 160 | |
| 161 | // Restore local variables: unspill registers from the stack and |
| 162 | // re-calculate R8-R13. |
| 163 | MOVD 32(RSP), R7 |
| 164 | MOVD 40(RSP), R6 |
| 165 | MOVD 48(RSP), R4 |
| 166 | MOVD dst_base+0(FP), R8 |
| 167 | MOVD dst_len+8(FP), R9 |
| 168 | MOVD R8, R10 |
| 169 | ADD R9, R10, R10 |
| 170 | MOVD src_base+24(FP), R11 |
| 171 | MOVD src_len+32(FP), R12 |
| 172 | MOVD R11, R13 |
| 173 | ADD R12, R13, R13 |
| 174 | |
| 175 | // d += length |
| 176 | // s += length |
| 177 | ADD R4, R7, R7 |
| 178 | ADD R4, R6, R6 |
| 179 | B loop |
| 180 | |
| 181 | tagLit60Plus: |
| 182 | // !!! This fragment does the |
| 183 | // |
| 184 | // s += x - 58; if uint(s) > uint(len(src)) { etc } |
| 185 | // |
| 186 | // checks. In the asm version, we code it once instead of once per switch case. |
| 187 | ADD R4, R6, R6 |
| 188 | SUB $58, R6, R6 |
| 189 | MOVD R6, R3 |
| 190 | SUB R11, R3, R3 |
| 191 | CMP R12, R3 |
| 192 | BGT errCorrupt |
| 193 | |
| 194 | // case x == 60: |
| 195 | MOVW $61, R1 |
| 196 | CMPW R1, R4 |
| 197 | BEQ tagLit61 |
| 198 | BGT tagLit62Plus |
| 199 | |
| 200 | // x = uint32(src[s-1]) |
| 201 | MOVBU -1(R6), R4 |
| 202 | B doLit |
| 203 | |
| 204 | tagLit61: |
| 205 | // case x == 61: |
| 206 | // x = uint32(src[s-2]) | uint32(src[s-1])<<8 |
| 207 | MOVHU -2(R6), R4 |
| 208 | B doLit |
| 209 | |
| 210 | tagLit62Plus: |
| 211 | CMPW $62, R4 |
| 212 | BHI tagLit63 |
| 213 | |
| 214 | // case x == 62: |
| 215 | // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 |
| 216 | MOVHU -3(R6), R4 |
| 217 | MOVBU -1(R6), R3 |
| 218 | ORR R3<<16, R4 |
| 219 | B doLit |
| 220 | |
| 221 | tagLit63: |
| 222 | // case x == 63: |
| 223 | // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 |
| 224 | MOVWU -4(R6), R4 |
| 225 | B doLit |
| 226 | |
| 227 | // The code above handles literal tags. |
| 228 | // ---------------------------------------- |
| 229 | // The code below handles copy tags. |
| 230 | |
| 231 | tagCopy4: |
| 232 | // case tagCopy4: |
| 233 | // s += 5 |
| 234 | ADD $5, R6, R6 |
| 235 | |
| 236 | // if uint(s) > uint(len(src)) { etc } |
| 237 | MOVD R6, R3 |
| 238 | SUB R11, R3, R3 |
| 239 | CMP R12, R3 |
| 240 | BGT errCorrupt |
| 241 | |
| 242 | // length = 1 + int(src[s-5])>>2 |
| 243 | MOVD $1, R1 |
| 244 | ADD R4>>2, R1, R4 |
| 245 | |
| 246 | // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) |
| 247 | MOVWU -4(R6), R5 |
| 248 | B doCopy |
| 249 | |
| 250 | tagCopy2: |
| 251 | // case tagCopy2: |
| 252 | // s += 3 |
| 253 | ADD $3, R6, R6 |
| 254 | |
| 255 | // if uint(s) > uint(len(src)) { etc } |
| 256 | MOVD R6, R3 |
| 257 | SUB R11, R3, R3 |
| 258 | CMP R12, R3 |
| 259 | BGT errCorrupt |
| 260 | |
| 261 | // length = 1 + int(src[s-3])>>2 |
| 262 | MOVD $1, R1 |
| 263 | ADD R4>>2, R1, R4 |
| 264 | |
| 265 | // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) |
| 266 | MOVHU -2(R6), R5 |
| 267 | B doCopy |
| 268 | |
| 269 | tagCopy: |
| 270 | // We have a copy tag. We assume that: |
| 271 | // - R3 == src[s] & 0x03 |
| 272 | // - R4 == src[s] |
| 273 | CMP $2, R3 |
| 274 | BEQ tagCopy2 |
| 275 | BGT tagCopy4 |
| 276 | |
| 277 | // case tagCopy1: |
| 278 | // s += 2 |
| 279 | ADD $2, R6, R6 |
| 280 | |
| 281 | // if uint(s) > uint(len(src)) { etc } |
| 282 | MOVD R6, R3 |
| 283 | SUB R11, R3, R3 |
| 284 | CMP R12, R3 |
| 285 | BGT errCorrupt |
| 286 | |
| 287 | // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) |
| 288 | MOVD R4, R5 |
| 289 | AND $0xe0, R5 |
| 290 | MOVBU -1(R6), R3 |
| 291 | ORR R5<<3, R3, R5 |
| 292 | |
| 293 | // length = 4 + int(src[s-2])>>2&0x7 |
| 294 | MOVD $7, R1 |
| 295 | AND R4>>2, R1, R4 |
| 296 | ADD $4, R4, R4 |
| 297 | |
| 298 | doCopy: |
| 299 | // This is the end of the outer "switch", when we have a copy tag. |
| 300 | // |
| 301 | // We assume that: |
| 302 | // - R4 == length && R4 > 0 |
| 303 | // - R5 == offset |
| 304 | |
| 305 | // if offset <= 0 { etc } |
| 306 | MOVD $0, R1 |
| 307 | CMP R1, R5 |
| 308 | BLE errCorrupt |
| 309 | |
| 310 | // if d < offset { etc } |
| 311 | MOVD R7, R3 |
| 312 | SUB R8, R3, R3 |
| 313 | CMP R5, R3 |
| 314 | BLT errCorrupt |
| 315 | |
| 316 | // if length > len(dst)-d { etc } |
| 317 | MOVD R10, R3 |
| 318 | SUB R7, R3, R3 |
| 319 | CMP R3, R4 |
| 320 | BGT errCorrupt |
| 321 | |
| 322 | // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length |
| 323 | // |
| 324 | // Set: |
| 325 | // - R14 = len(dst)-d |
| 326 | // - R15 = &dst[d-offset] |
| 327 | MOVD R10, R14 |
| 328 | SUB R7, R14, R14 |
| 329 | MOVD R7, R15 |
| 330 | SUB R5, R15, R15 |
| 331 | |
| 332 | // !!! Try a faster technique for short (16 or fewer bytes) forward copies. |
| 333 | // |
| 334 | // First, try using two 8-byte load/stores, similar to the doLit technique |
| 335 | // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is |
| 336 | // still OK if offset >= 8. Note that this has to be two 8-byte load/stores |
| 337 | // and not one 16-byte load/store, and the first store has to be before the |
| 338 | // second load, due to the overlap if offset is in the range [8, 16). |
| 339 | // |
| 340 | // if length > 16 || offset < 8 || len(dst)-d < 16 { |
| 341 | // goto slowForwardCopy |
| 342 | // } |
| 343 | // copy 16 bytes |
| 344 | // d += length |
| 345 | CMP $16, R4 |
| 346 | BGT slowForwardCopy |
| 347 | CMP $8, R5 |
| 348 | BLT slowForwardCopy |
| 349 | CMP $16, R14 |
| 350 | BLT slowForwardCopy |
| 351 | MOVD 0(R15), R2 |
| 352 | MOVD R2, 0(R7) |
| 353 | MOVD 8(R15), R3 |
| 354 | MOVD R3, 8(R7) |
| 355 | ADD R4, R7, R7 |
| 356 | B loop |
| 357 | |
| 358 | slowForwardCopy: |
| 359 | // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we |
| 360 | // can still try 8-byte load stores, provided we can overrun up to 10 extra |
| 361 | // bytes. As above, the overrun will be fixed up by subsequent iterations |
| 362 | // of the outermost loop. |
| 363 | // |
| 364 | // The C++ snappy code calls this technique IncrementalCopyFastPath. Its |
| 365 | // commentary says: |
| 366 | // |
| 367 | // ---- |
| 368 | // |
| 369 | // The main part of this loop is a simple copy of eight bytes at a time |
| 370 | // until we've copied (at least) the requested amount of bytes. However, |
| 371 | // if d and d-offset are less than eight bytes apart (indicating a |
| 372 | // repeating pattern of length < 8), we first need to expand the pattern in |
| 373 | // order to get the correct results. For instance, if the buffer looks like |
| 374 | // this, with the eight-byte <d-offset> and <d> patterns marked as |
| 375 | // intervals: |
| 376 | // |
| 377 | // abxxxxxxxxxxxx |
| 378 | // [------] d-offset |
| 379 | // [------] d |
| 380 | // |
| 381 | // a single eight-byte copy from <d-offset> to <d> will repeat the pattern |
| 382 | // once, after which we can move <d> two bytes without moving <d-offset>: |
| 383 | // |
| 384 | // ababxxxxxxxxxx |
| 385 | // [------] d-offset |
| 386 | // [------] d |
| 387 | // |
| 388 | // and repeat the exercise until the two no longer overlap. |
| 389 | // |
| 390 | // This allows us to do very well in the special case of one single byte |
| 391 | // repeated many times, without taking a big hit for more general cases. |
| 392 | // |
| 393 | // The worst case of extra writing past the end of the match occurs when |
| 394 | // offset == 1 and length == 1; the last copy will read from byte positions |
| 395 | // [0..7] and write to [4..11], whereas it was only supposed to write to |
| 396 | // position 1. Thus, ten excess bytes. |
| 397 | // |
| 398 | // ---- |
| 399 | // |
| 400 | // That "10 byte overrun" worst case is confirmed by Go's |
| 401 | // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy |
| 402 | // and finishSlowForwardCopy algorithm. |
| 403 | // |
| 404 | // if length > len(dst)-d-10 { |
| 405 | // goto verySlowForwardCopy |
| 406 | // } |
| 407 | SUB $10, R14, R14 |
| 408 | CMP R14, R4 |
| 409 | BGT verySlowForwardCopy |
| 410 | |
| 411 | makeOffsetAtLeast8: |
| 412 | // !!! As above, expand the pattern so that offset >= 8 and we can use |
| 413 | // 8-byte load/stores. |
| 414 | // |
| 415 | // for offset < 8 { |
| 416 | // copy 8 bytes from dst[d-offset:] to dst[d:] |
| 417 | // length -= offset |
| 418 | // d += offset |
| 419 | // offset += offset |
| 420 | // // The two previous lines together means that d-offset, and therefore |
| 421 | // // R15, is unchanged. |
| 422 | // } |
| 423 | CMP $8, R5 |
| 424 | BGE fixUpSlowForwardCopy |
| 425 | MOVD (R15), R3 |
| 426 | MOVD R3, (R7) |
| 427 | SUB R5, R4, R4 |
| 428 | ADD R5, R7, R7 |
| 429 | ADD R5, R5, R5 |
| 430 | B makeOffsetAtLeast8 |
| 431 | |
| 432 | fixUpSlowForwardCopy: |
| 433 | // !!! Add length (which might be negative now) to d (implied by R7 being |
| 434 | // &dst[d]) so that d ends up at the right place when we jump back to the |
| 435 | // top of the loop. Before we do that, though, we save R7 to R2 so that, if |
| 436 | // length is positive, copying the remaining length bytes will write to the |
| 437 | // right place. |
| 438 | MOVD R7, R2 |
| 439 | ADD R4, R7, R7 |
| 440 | |
| 441 | finishSlowForwardCopy: |
| 442 | // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative |
| 443 | // length means that we overrun, but as above, that will be fixed up by |
| 444 | // subsequent iterations of the outermost loop. |
| 445 | MOVD $0, R1 |
| 446 | CMP R1, R4 |
| 447 | BLE loop |
| 448 | MOVD (R15), R3 |
| 449 | MOVD R3, (R2) |
| 450 | ADD $8, R15, R15 |
| 451 | ADD $8, R2, R2 |
| 452 | SUB $8, R4, R4 |
| 453 | B finishSlowForwardCopy |
| 454 | |
| 455 | verySlowForwardCopy: |
| 456 | // verySlowForwardCopy is a simple implementation of forward copy. In C |
| 457 | // parlance, this is a do/while loop instead of a while loop, since we know |
| 458 | // that length > 0. In Go syntax: |
| 459 | // |
| 460 | // for { |
| 461 | // dst[d] = dst[d - offset] |
| 462 | // d++ |
| 463 | // length-- |
| 464 | // if length == 0 { |
| 465 | // break |
| 466 | // } |
| 467 | // } |
| 468 | MOVB (R15), R3 |
| 469 | MOVB R3, (R7) |
| 470 | ADD $1, R15, R15 |
| 471 | ADD $1, R7, R7 |
| 472 | SUB $1, R4, R4 |
| 473 | CBNZ R4, verySlowForwardCopy |
| 474 | B loop |
| 475 | |
| 476 | // The code above handles copy tags. |
| 477 | // ---------------------------------------- |
| 478 | |
| 479 | end: |
| 480 | // This is the end of the "for s < len(src)". |
| 481 | // |
| 482 | // if d != len(dst) { etc } |
| 483 | CMP R10, R7 |
| 484 | BNE errCorrupt |
| 485 | |
| 486 | // return 0 |
| 487 | MOVD $0, ret+48(FP) |
| 488 | RET |
| 489 | |
| 490 | errCorrupt: |
| 491 | // return decodeErrCodeCorrupt |
| 492 | MOVD $1, R2 |
| 493 | MOVD R2, ret+48(FP) |
| 494 | RET |