| // +build !appengine |
| // +build gc |
| // +build !purego |
| |
| #include "textflag.h" |
| |
| // Register allocation: |
| // AX h |
| // SI pointer to advance through b |
| // DX n |
| // BX loop end |
| // R8 v1, k1 |
| // R9 v2 |
| // R10 v3 |
| // R11 v4 |
| // R12 tmp |
| // R13 prime1v |
| // R14 prime2v |
| // DI prime4v |
| |
// round(r) folds the next 8 input bytes into the lane register r:
//
//	r = rol64(r + load64(SI)*prime2v, 31) * prime1v
//
// It expects R13 = prime1v and R14 = prime2v, uses R12 as scratch, and
// leaves SI advanced by 8 bytes.
#define round(r) \
	MOVQ  (SI), R12 \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r    \
	ADDQ  $8, SI
| |
// mergeRound(acc, val) mixes the lane register val into the accumulator acc:
//
//	val = rol64(val*prime2v, 31) * prime1v
//	acc = (acc ^ val)*prime1v + prime4v
//
// It expects R13 = prime1v, R14 = prime2v and DI = prime4v. The register
// passed as val is overwritten.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc
| |
// func Sum64(b []byte) uint64
//
// Sum64 hashes all of b in a single pass and stores the 64-bit result in ret.
// The work is done in four phases:
//
//   1. If len(b) >= 32, consume 32-byte blocks through four lanes (v1..v4)
//      and merge the lanes into h; otherwise start from h = prime5v.
//   2. h += len(b).
//   3. Consume what is left 8 bytes at a time, then at most one 4-byte
//      word, then single bytes.
//   4. Final shift/xor/multiply mixing of h.
//
// BX is reused as the pointer limit of each phase and is moved forward
// between phases instead of being recomputed.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Slice header: SI = start of b, DX = len(b).
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX

	// Constants that stay in registers for the whole function.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// BX = start + len(b) - 32, the last address at which a full block can
	// begin.
	LEAQ -32(SI)(DX*1), BX

	// With fewer than 32 bytes there is no block to process.
	CMPQ DX, $32
	JLT  noFullBlock

	// Lane seeds: v1 = prime1v+prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v.
	MOVQ R14, R9
	MOVQ R14, R8
	ADDQ R13, R8
	XORQ R10, R10
	MOVQ R13, R11
	NEGQ R11

loop32:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	// Another full block fits as long as SI <= BX.
	CMPQ SI, BX
	JLE  loop32

	// h = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX

	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX

	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX

	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP tail

noFullBlock:
	// Short input: h starts from prime5v instead of the merged lanes.
	MOVQ ·prime5v(SB), AX

tail:
	// h += len(b)
	ADDQ DX, AX

	// Move the limit from len(b)-32 to len(b)-8; skip the 8-byte loop when
	// fewer than 8 bytes remain (SI > BX).
	ADDQ $24, BX
	CMPQ SI, BX
	JG   try4

loop8:
	// k1 = rol64(load64(SI)*prime2v, 31) * prime1v
	MOVQ  (SI), R8
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol64(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	ADDQ $8, SI
	CMPQ SI, BX
	JLE  loop8

try4:
	// Limit becomes len(b)-4; skip when fewer than 4 bytes remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   try1

	// h = rol64(h ^ load32(SI)*prime1v, 23)*prime2v + prime3v
	// MOVL leaves the upper half of R8 zeroed.
	MOVL  (SI), R8
	IMULQ R13, R8
	XORQ  R8, AX
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX
	ADDQ  $4, SI

try1:
	// Limit becomes the end of b; nothing to do once SI has reached it.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  avalanche

loop1:
	// h = rol64(h ^ byte*prime5v, 11) * prime1v
	MOVBQZX (SI), R12
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX
	ROLQ    $11, AX
	IMULQ   R13, AX

	ADDQ $1, SI
	CMPQ SI, BX
	JL   loop1

avalanche:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX

	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX

	// h ^= h >> 32
	MOVQ AX, R12
	SHRQ $32, R12
	XORQ R12, AX

	MOVQ AX, ret+24(FP)
	RET
| |
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs the 32-byte blocks of b through the four lane values held
// in the first four 8-byte words at d, stores the updated lanes back, and
// returns the number of bytes it consumed. Register use matches Sum64,
// except that AX holds the d pointer rather than h.
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// AX = d; pull the four lanes into registers.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// Slice header: SI = start of b, DX = len(b).
	// BX = start + len(b) - 32, the last address at which a full block can
	// begin.
	MOVQ b_base+8(FP), SI
	MOVQ b_len+16(FP), DX
	LEAQ -32(SI)(DX*1), BX

	// The round macro needs prime1v in R13 and prime2v in R14.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// There is no length check before the first iteration: callers are
	// expected to always pass at least one full block.
loop32:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	// Another full block fits as long as SI <= BX.
	CMPQ SI, BX
	JLE  loop32

	// Store the updated lanes back into d.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Bytes consumed = current pointer minus the original base pointer.
	SUBQ b_base+8(FP), SI
	MOVQ SI, ret+32(FP)

	RET