| // +build !appengine |
| // +build gc |
| // +build !purego |
| |
| #include "textflag.h" |
| |
| // Register allocation: |
| // AX h |
| // CX pointer to advance through b |
| // DX n |
| // BX loop end |
| // R8 v1, k1 |
| // R9 v2 |
| // R10 v3 |
| // R11 v4 |
| // R12 tmp |
| // R13 prime1v |
| // R14 prime2v |
| // DI prime4v |
| // |
| // R15 is deliberately not used. With -dynlink the amd64 assembler rewrites |
| // each access to a global such as ·prime5v(SB) to go through the GOT, using |
| // R15 as its scratch register, so a value parked in R15 is lost at the next |
| // global access. |
| |
| // round reads from and advances the buffer pointer in CX. |
| // It assumes that R13 has prime1v and R14 has prime2v. |
| // r is updated in place: r = rol(r + word*prime2v, 31) * prime1v. |
| // R12 is clobbered. |
| #define round(r) \ |
| MOVQ (CX), R12 \ |
| ADDQ $8, CX \ |
| IMULQ R14, R12 \ |
| ADDQ R12, r \ |
| ROLQ $31, r \ |
| IMULQ R13, r |
| |
| // mergeRound applies a merge round on the two registers acc and val. |
| // It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. |
| // val is overwritten with rol(val*prime2v, 31) * prime1v, and then |
| // acc = (acc ^ val) * prime1v + prime4v. |
| #define mergeRound(acc, val) \ |
| IMULQ R14, val \ |
| ROLQ $31, val \ |
| IMULQ R13, val \ |
| XORQ val, acc \ |
| IMULQ R13, acc \ |
| ADDQ DI, acc |
| |
| // func Sum64(b []byte) uint64 |
| // |
| // Sum64 hashes b and returns the 64-bit result. |
| // Frame: no locals; b occupies 0..23(FP) and the result is stored at 24(FP). |
| // Inputs of at least 32 bytes go through blockLoop (32 bytes per pass); the |
| // remaining bytes are consumed 8 at a time, then 4, then 1 at a time. |
| // NOTE(review): the pointer comparisons use signed condition codes; BX is |
| // computed as base+len-32 before the length check, so keep them signed |
| // unless that setup changes. |
| TEXT ·Sum64(SB), NOSPLIT, $0-32 |
| // Load fixed primes. |
| // prime4v lives in DI rather than R15; see the register allocation notes. |
| MOVQ ·prime1v(SB), R13 |
| MOVQ ·prime2v(SB), R14 |
| MOVQ ·prime4v(SB), DI |
| |
| // Load slice. |
| MOVQ b_base+0(FP), CX |
| MOVQ b_len+8(FP), DX |
| LEAQ (CX)(DX*1), BX |
| |
| // The first loop limit will be len(b)-32. |
| SUBQ $32, BX |
| |
| // Check whether we have at least one block. |
| CMPQ DX, $32 |
| JLT noBlocks |
| |
| // Set up initial state (v1, v2, v3, v4): |
| // v1 = prime1v + prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v. |
| MOVQ R13, R8 |
| ADDQ R14, R8 |
| MOVQ R14, R9 |
| XORQ R10, R10 |
| XORQ R11, R11 |
| SUBQ R13, R11 |
| |
| // Loop until CX > BX. Each pass consumes 32 bytes (four 8-byte rounds). |
| blockLoop: |
| round(R8) |
| round(R9) |
| round(R10) |
| round(R11) |
| |
| CMPQ CX, BX |
| JLE blockLoop |
| |
| // h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18) |
| MOVQ R8, AX |
| ROLQ $1, AX |
| MOVQ R9, R12 |
| ROLQ $7, R12 |
| ADDQ R12, AX |
| MOVQ R10, R12 |
| ROLQ $12, R12 |
| ADDQ R12, AX |
| MOVQ R11, R12 |
| ROLQ $18, R12 |
| ADDQ R12, AX |
| |
| // Fold each lane into h. No global is accessed on this path, but DI |
| // would survive one anyway. |
| mergeRound(AX, R8) |
| mergeRound(AX, R9) |
| mergeRound(AX, R10) |
| mergeRound(AX, R11) |
| |
| JMP afterBlocks |
| |
| noBlocks: |
| // Short input: h starts from prime5v. Under -dynlink this global load |
| // clobbers R15, which is why prime4v must not be kept there. |
| MOVQ ·prime5v(SB), AX |
| |
| afterBlocks: |
| // h += n |
| ADDQ DX, AX |
| |
| // Right now BX has len(b)-32, and we want to loop until CX > len(b)-8. |
| ADDQ $24, BX |
| |
| CMPQ CX, BX |
| JG fourByte |
| |
| wordLoop: |
| // Calculate k1 = rol(word * prime2v, 31) * prime1v. |
| MOVQ (CX), R8 |
| ADDQ $8, CX |
| IMULQ R14, R8 |
| ROLQ $31, R8 |
| IMULQ R13, R8 |
| |
| // h = rol(h ^ k1, 27) * prime1v + prime4v |
| XORQ R8, AX |
| ROLQ $27, AX |
| IMULQ R13, AX |
| ADDQ DI, AX |
| |
| CMPQ CX, BX |
| JLE wordLoop |
| |
| fourByte: |
| // BX becomes len(b)-4: take one 4-byte word if at least 4 bytes remain. |
| ADDQ $4, BX |
| CMPQ CX, BX |
| JG singles |
| |
| // MOVL zero-extends the 32-bit word into R8. |
| MOVL (CX), R8 |
| ADDQ $4, CX |
| IMULQ R13, R8 |
| XORQ R8, AX |
| |
| // h = rol(h, 23) * prime2v + prime3v |
| ROLQ $23, AX |
| IMULQ R14, AX |
| ADDQ ·prime3v(SB), AX |
| |
| singles: |
| // BX becomes the end of b; handle the remaining bytes one at a time. |
| ADDQ $4, BX |
| CMPQ CX, BX |
| JGE finalize |
| |
| singlesLoop: |
| // h ^= byte * prime5v |
| MOVBQZX (CX), R12 |
| ADDQ $1, CX |
| IMULQ ·prime5v(SB), R12 |
| XORQ R12, AX |
| |
| // h = rol(h, 11) * prime1v |
| ROLQ $11, AX |
| IMULQ R13, AX |
| |
| CMPQ CX, BX |
| JL singlesLoop |
| |
| finalize: |
| // Final mix: |
| // h ^= h >> 33; h *= prime2v; h ^= h >> 29; h *= prime3v; h ^= h >> 32 |
| MOVQ AX, R12 |
| SHRQ $33, R12 |
| XORQ R12, AX |
| IMULQ R14, AX |
| MOVQ AX, R12 |
| SHRQ $29, R12 |
| XORQ R12, AX |
| IMULQ ·prime3v(SB), AX |
| MOVQ AX, R12 |
| SHRQ $32, R12 |
| XORQ R12, AX |
| |
| MOVQ AX, ret+24(FP) |
| RET |
| |
| // writeBlocks shares the register assignment used by Sum64, with one |
| // difference: AX carries the x pointer here. |
| |
| // func writeBlocks(x *xxh, b []byte) []byte |
| // |
| // Runs the four-lane round over every full 32-byte block of b, keeping the |
| // lane state in the first four 8-byte words of x, and returns the part of b |
| // that was not consumed. |
| // Frame: no locals; x at 0(FP), b at 8..31(FP), result slice at 32..55(FP). |
| TEXT ·writeBlocks(SB), NOSPLIT, $0-56 |
| // CX walks through b; BX is the last address a full block may start at |
| // (base + len - 32). |
| MOVQ b_base+8(FP), CX |
| MOVQ b_len+16(FP), DX |
| LEAQ -32(CX)(DX*1), BX |
| |
| // Pre-set the returned base to the input base; it is only replaced |
| // further down when the remainder is non-empty (see NOTE). |
| MOVQ CX, ret_base+32(FP) |
| |
| // Bring the lane state in from x. |
| MOVQ x+0(FP), AX |
| MOVQ 0(AX), R8 // v1 |
| MOVQ 8(AX), R9 // v2 |
| MOVQ 16(AX), R10 // v3 |
| MOVQ 24(AX), R11 // v4 |
| |
| // round expects prime1v in R13 and prime2v in R14. |
| MOVQ ·prime1v(SB), R13 |
| MOVQ ·prime2v(SB), R14 |
| |
| // There is no up-front length check: this function is always called |
| // with at least one block of data to process. |
| nextBlock: |
| round(R8) |
| round(R9) |
| round(R10) |
| round(R11) |
| |
| // Keep going while another full block starts at or before BX. |
| CMPQ CX, BX |
| JLE nextBlock |
| |
| // Write the updated lanes back to x. |
| MOVQ R8, 0(AX) |
| MOVQ R9, 8(AX) |
| MOVQ R10, 16(AX) |
| MOVQ R11, 24(AX) |
| |
| // Build the result slice. |
| // NOTE: A slice whose base pointer sits past the end of the original |
| // slice must never be produced; in Go 1.7+ that causes runtime crashes |
| // (see, for example, https://github.com/golang/go/issues/16772). |
| // So the length/cap is computed first, and when it is zero the old base |
| // stored earlier is left in place. The compiler does the same for |
| // b = b[len(b):] |
| |
| // Remaining length = (BX + 32) - CX. |
| ADDQ $32, BX |
| SUBQ CX, BX |
| JEQ storeLen |
| |
| // Non-empty remainder: the result starts at the current position. |
| MOVQ CX, ret_base+32(FP) |
| |
| storeLen: |
| MOVQ BX, ret_len+40(FP) |
| MOVQ BX, ret_cap+48(FP) // cap is set equal to len |
| |
| RET |