// +build !appengine
// +build gc
// +build !purego

| 5 | #include "textflag.h" |
| 6 | |
// Register allocation:
// AX	h
// CX	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
//
// R15 is deliberately left unused. In dynamic-linking build modes the Go
// toolchain routes accesses to global variables (such as ·prime3v(SB) and
// ·prime5v(SB) below) through R15, so a value kept there would be overwritten
// by the next global access. prime4v therefore lives in DI.

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
//
// Effect: r = rol64(r + load64(CX)*prime2v, 31) * prime1v; CX += 8.
// Clobbers R12 and flags.
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
//
// Effect: val = rol64(val*prime2v, 31) * prime1v;
//         acc = (acc ^ val)*prime1v + prime4v.
// val is destroyed; flags are clobbered.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 hashes b in four phases: 32-byte blocks (only if len(b) >= 32),
// remaining 8-byte words, at most one remaining 4-byte word, remaining single
// bytes, followed by a final shift/xor/multiply mix.
//
// Arguments (stack-based, offsets from FP):
//   b_base+0   pointer to the first byte of b
//   b_len+8    len(b)
//   ret+24     returned 64-bit hash
//
// All pointer-versus-limit comparisons use signed condition codes. For short
// inputs the limit in BX is below the cursor in CX, and when the base pointer
// is 0 the limit is a negative number; signed comparison orders those cases
// correctly, so no load is performed for an empty slice.
//
// NOTE(review): the constants and rotation counts look like XXH64 with seed 0;
// confirm against the package's pure-Go implementation.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be &b[0]+len(b)-32: the last address at which
	// a full 32-byte block can still be read.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1v + prime2v
	//   v2 = prime2v
	//   v3 = 0
	//   v4 = -prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX. Each iteration consumes 32 bytes, 8 per accumulator.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each accumulator into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: h starts from prime5v instead of the accumulators.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has &b[0]+len(b)-32, and we want to loop until
	// CX > &b[0]+len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1 = rol(load64(CX)*prime2v, 31) * prime1v.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	// Limit becomes &b[0]+len(b)-4; skip unless 4 bytes remain.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles

	// MOVL zero-extends the 32-bit load into R8.
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rol(h ^ (word32*prime1v), 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Limit becomes &b[0]+len(b), i.e. one past the last byte.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
	// h = rol(h ^ (byte*prime5v), 11) * prime1v
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	// h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET
| 171 | |
// writeBlocks folds every complete 32-byte block of b into the four
// accumulators held in the first 32 bytes of *d and reports how many bytes of
// b it consumed.
//
// Registers:
//   AX        d
//   CX        read cursor into b
//   DX        len(b)
//   BX        last address at which a full block can still be read
//   R8..R11   v1..v4
//   R12       scratch (used by round)
//   R13, R14  prime1v, prime2v (required by round)
//
// Arguments (stack-based, offsets from FP):
//   d+0       *Digest
//   b_base+8  pointer to the first byte of b
//   b_len+16  len(b)
//   ret+32    number of bytes consumed

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Fetch the accumulators from d.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// Set up the cursor and the loop limit &b[0]+len(b)-32.
	MOVQ b_base+8(FP), CX
	MOVQ b_len+16(FP), DX
	LEAQ -32(CX)(DX*1), BX

	// Constants consumed by the round macro.
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime1v(SB), R13

	// The body runs before the limit is tested: callers always pass at least
	// one full block, so the first iteration is unconditional.
nextBlock:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  nextBlock

	// Store the updated accumulators back into d.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Bytes consumed = final cursor - original base pointer.
	SUBQ b_base+8(FP), CX
	MOVQ CX, ret+32(FP)

	RET