// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
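//
// Throughout, the IMULL $0x1e35a7bd / SHRL CX instruction pairs compute the
// hash function used for the match table. In Go, following encode_other.go
// (a sketch; the exact spelling there is assumed), that is:
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}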

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET
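
// For reference, a Go sketch of the tag encoding implemented above, mirroring
// emitLiteral in encode_other.go (the exact spelling there is assumed; n is
// len(lit)-1 and tagLiteral is 0x00):
//
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral
//		i = 1
//	case n < 1<<8:
//		dst[0] = 60<<2 | tagLiteral // 0xf0, as in twoBytes above.
//		dst[1] = uint8(n)
//		i = 2
//	default:
//		dst[0] = 61<<2 | tagLiteral // 0xf4, as in threeBytes above.
//		dst[1] = uint8(n)
//		dst[2] = uint8(n >> 8)
//		i = 3
//	}
//	return i + copy(dst[i:], lit)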

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
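
// For reference, the byte values and bit packing above, mirroring emitCopy in
// encode_other.go (the exact spelling there is assumed; tagCopy1 is 1 and
// tagCopy2 is 2, so 0xfe is (64-1)<<2 | tagCopy2 and 0xee is
// (60-1)<<2 | tagCopy2):
//
//	// 3-byte copy-2 tag, the general form emitted by step3:
//	dst[i+0] = uint8(length-1)<<2 | tagCopy2
//	dst[i+1] = uint8(offset)
//	dst[i+2] = uint8(offset >> 8)
//
//	// 2-byte copy-1 tag, valid when 4 <= length < 12 and offset < 2048:
//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[i+1] = uint8(offset)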

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
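	//
	// For example, if the two 8-byte values first differ at byte index 3,
	// the XOR is zero in bits 0-23 and non-zero somewhere in bits 24-31, so
	// BSF returns a bit index in [24, 31] and the SHRQ $3 yields 3.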
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET
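
// For reference, a Go sketch of what extendMatch computes, mirroring
// encode_other.go (the exact spelling there is assumed):
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}
//
// The asm above differs only in that it compares 8 bytes at a time while at
// least 8 bytes remain before the end of src, using XORQ+BSFQ to locate the
// first differing byte.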

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during those calls, gives 32768 + 56 + 64 =
// 32888.
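//
// (maxTableSize is 16384 entries, per the CMPQ $16384 in the calcShift loop
// below, and each uint16 entry is 2 bytes, which is where the 32768 comes
// from.)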
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET