Akash Reddy Kankanala | c28f0e2 | 2025-06-16 11:00:55 +0530 | [diff] [blame^] | 1 | //go:build !appengine && gc && !purego |
| 2 | // +build !appengine |
| 3 | // +build gc |
| 4 | // +build !purego |
| 5 | |
| 6 | #include "textflag.h" |
| 7 | |
// Symbolic names for the ARM64 general-purpose registers used by the
// macros and functions in this file. Every name below is a plain textual
// alias for one fixed hardware register, so two names never share storage.
| 8 | // Registers: |
// digest holds the *Digest argument of writeBlocks.
| 9 | #define digest R1 |
| 10 | #define h R2 // return value |
| 11 | #define p R3 // input pointer |
| 12 | #define n R4 // input length |
| 13 | #define nblocks R5 // n / 32 |
// prime1..prime5 hold the multiplier constants; each function loads the ones
// it needs from the ·primes table on entry.
| 14 | #define prime1 R7 |
| 15 | #define prime2 R8 |
| 16 | #define prime3 R9 |
| 17 | #define prime4 R10 |
| 18 | #define prime5 R11 |
// v1..v4 are the four running accumulators that blockLoop updates once per
// 32-byte block.
| 19 | #define v1 R12 |
| 20 | #define v2 R13 |
| 21 | #define v3 R14 |
| 22 | #define v4 R15 |
// x1..x4 are scratch registers: they receive the words loaded from the input
// and hold intermediate values.
| 23 | #define x1 R20 |
| 24 | #define x2 R21 |
| 25 | #define x3 R22 |
| 26 | #define x4 R23 |
| 27 | |
// round mixes one 64-bit input word x into the accumulator acc:
//
//	acc = rotl64(acc + x*prime2, 31) * prime1
//
// The rotate-left by 31 is written as a rotate-right by 64-31.
// Only acc is modified; x is left unchanged. Requires prime1 and prime2 to
// be loaded.
| 28 | #define round(acc, x) \ |
| 29 | MADD prime2, acc, x, acc \ |
| 30 | ROR $64-31, acc \ |
| 31 | MUL prime1, acc |
| 32 | |
// With a zero accumulator the add in round disappears, leaving
//
//	x = rotl64(x*prime2, 31) * prime1
//
// The result overwrites x in place. Requires prime1 and prime2 to be loaded.
| 33 | // round0 performs the operation x = round(0, x). |
| 34 | #define round0(x) \ |
| 35 | MUL prime2, x \ |
| 36 | ROR $64-31, x \ |
| 37 | MUL prime1, x |
| 38 | |
// mergeRound folds the value x into the hash accumulator acc:
//
//	x   = round0(x)
//	acc = (acc ^ x)*prime1 + prime4
//
// Note that x is overwritten by round0, so it no longer holds its original
// value afterwards. Requires prime1, prime2 and prime4 to be loaded.
| 39 | #define mergeRound(acc, x) \ |
| 40 | round0(x) \ |
| 41 | EOR x, acc \ |
| 42 | MADD acc, prime4, prime1, acc |
| 43 | |
// Each iteration loads four 64-bit words (x1..x4) with two post-incrementing
// pair loads, so p advances by 32 bytes per block, and applies round to
// v1..v4 with the matching word.
//
// Clobbers nblocks and x1..x4. n itself is not modified. Requires prime1 and
// prime2 to be loaded.
//
// The n >= 32 requirement is essential: the counter nblocks = n >> 5 is
// decremented and tested only after the body has run, so a zero block count
// would still execute the body and then wrap the counter instead of
// terminating.
//
// NOTE(review): the macro defines the label "loop", so it can presumably be
// expanded at most once per TEXT block -- confirm against the assembler's
// label scoping rules before adding a second use in one function.
| 44 | // blockLoop processes as many 32-byte blocks as possible, |
| 45 | // updating v1, v2, v3, and v4. It assumes that n >= 32. |
| 46 | #define blockLoop() \ |
| 47 | LSR $5, n, nblocks \ |
| 48 | PCALIGN $16 \ |
| 49 | loop: \ |
| 50 | LDP.P 16(p), (x1, x2) \ |
| 51 | LDP.P 16(p), (x3, x4) \ |
| 52 | round(v1, x1) \ |
| 53 | round(v2, x2) \ |
| 54 | round(v3, x3) \ |
| 55 | round(v4, x4) \ |
| 56 | SUB $1, nblocks \ |
| 57 | CBNZ nblocks, loop |
| 58 | |
// Sum64 computes a 64-bit hash of the byte slice b and returns it.
//
// Argument layout (32 bytes in total, no local stack frame): the slice's base
// pointer and length are read as a pair starting at offset 0 of the argument
// area, and the result is written at offset 24.
//
// Structure:
//   1. If n >= 32, run the four-lane block loop and merge v1..v4 into h;
//      otherwise start from h = prime5.
//   2. Add the total length n to h.
//   3. Consume the remaining n mod 32 bytes by testing bits 4..0 of n and
//      processing a 16-, 8-, 4-, 2- and 1-byte piece for each bit that is set.
//   4. Apply the final shift/xor/multiply mixing and store the result.
| 59 | // func Sum64(b []byte) uint64 |
| 60 | TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 |
// p = slice base pointer, n = slice length.
| 61 | LDP b_base+0(FP), (p, n) |
| 62 | |
// Load all five constants from the ·primes table (defined outside this file).
| 63 | LDP ·primes+0(SB), (prime1, prime2) |
| 64 | LDP ·primes+16(SB), (prime3, prime4) |
| 65 | MOVD ·primes+32(SB), prime5 |
| 66 | |
// Short inputs skip the block loop entirely. The CMP flags feed both the
// CSEL and the BLT.
| 67 | CMP $32, n |
| 68 | CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } |
| 69 | BLT afterLoop |
| 70 | |
// Initial accumulator state:
// v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
| 71 | ADD prime1, prime2, v1 |
| 72 | MOVD prime2, v2 |
| 73 | MOVD $0, v3 |
| 74 | NEG prime1, v4 |
| 75 | |
// n >= 32 is guaranteed here by the branch above.
| 76 | blockLoop() |
| 77 | |
// h = rotl64(v1, 1) + rotl64(v2, 7) + rotl64(v3, 12) + rotl64(v4, 18),
// with each left rotation written as a right rotation by 64-k. x1..x4 are
// free to use as temporaries now that the loop is done.
| 78 | ROR $64-1, v1, x1 |
| 79 | ROR $64-7, v2, x2 |
| 80 | ADD x1, x2 |
| 81 | ROR $64-12, v3, x3 |
| 82 | ROR $64-18, v4, x4 |
| 83 | ADD x3, x4 |
| 84 | ADD x2, x4, h |
| 85 | |
// Fold every accumulator into h. mergeRound overwrites v1..v4, which are not
// needed afterwards.
| 86 | mergeRound(h, v1) |
| 87 | mergeRound(h, v2) |
| 88 | mergeRound(h, v3) |
| 89 | mergeRound(h, v4) |
| 90 | |
| 91 | afterLoop: |
// Mix in the total input length. blockLoop does not modify n, so n is still
// the full length here, and p points at the first unprocessed byte.
| 92 | ADD n, h |
| 93 | |
// 16-byte piece (bit 4 of n): two 8-byte words, each mixed as
//	h = rotl64(h ^ round0(x), 27)*prime1 + prime4
| 94 | TBZ $4, n, try8 |
| 95 | LDP.P 16(p), (x1, x2) |
| 96 | |
| 97 | round0(x1) |
| 98 | |
| 99 | // NOTE: here and below, sequencing the EOR after the ROR (using a |
| 100 | // rotated register) is worth a small but measurable speedup for small |
| 101 | // inputs. |
// Rotating h and x separately by the same amount and then XORing gives the
// same value as rotating (h ^ x), because rotation distributes over XOR.
| 102 | ROR $64-27, h |
| 103 | EOR x1 @> 64-27, h, h |
| 104 | MADD h, prime4, prime1, h |
| 105 | |
| 106 | round0(x2) |
| 107 | ROR $64-27, h |
| 108 | EOR x2 @> 64-27, h, h |
| 109 | MADD h, prime4, prime1, h |
| 110 | |
| 111 | try8: |
// 8-byte piece (bit 3 of n): same mixing step as above for a single word.
| 112 | TBZ $3, n, try4 |
| 113 | MOVD.P 8(p), x1 |
| 114 | |
| 115 | round0(x1) |
| 116 | ROR $64-27, h |
| 117 | EOR x1 @> 64-27, h, h |
| 118 | MADD h, prime4, prime1, h |
| 119 | |
| 120 | try4: |
// 4-byte piece (bit 2 of n): zero-extended 32-bit load, then
//	h = rotl64(h ^ x*prime1, 23)*prime2 + prime3
| 121 | TBZ $2, n, try2 |
| 122 | MOVWU.P 4(p), x2 |
| 123 | |
| 124 | MUL prime1, x2 |
| 125 | ROR $64-23, h |
| 126 | EOR x2 @> 64-23, h, h |
| 127 | MADD h, prime3, prime2, h |
| 128 | |
| 129 | try2: |
// 2-byte piece (bit 1 of n): one 16-bit load split into its low byte (x1)
// and high byte (x2); the low byte is mixed first. Each byte is mixed as
//	h = rotl64(h ^ byte*prime5, 11)*prime1
| 130 | TBZ $1, n, try1 |
| 131 | MOVHU.P 2(p), x3 |
| 132 | AND $255, x3, x1 |
| 133 | LSR $8, x3, x2 |
| 134 | |
| 135 | MUL prime5, x1 |
| 136 | ROR $64-11, h |
| 137 | EOR x1 @> 64-11, h, h |
| 138 | MUL prime1, h |
| 139 | |
| 140 | MUL prime5, x2 |
| 141 | ROR $64-11, h |
| 142 | EOR x2 @> 64-11, h, h |
| 143 | MUL prime1, h |
| 144 | |
| 145 | try1: |
// 1-byte piece (bit 0 of n): the final byte, mixed with the same per-byte
// step. No post-increment is needed because nothing is read after it.
| 146 | TBZ $0, n, finalize |
| 147 | MOVBU (p), x4 |
| 148 | |
| 149 | MUL prime5, x4 |
| 150 | ROR $64-11, h |
| 151 | EOR x4 @> 64-11, h, h |
| 152 | MUL prime1, h |
| 153 | |
| 154 | finalize: |
// Final mixing:
//	h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3; h ^= h >> 32
| 155 | EOR h >> 33, h |
| 156 | MUL prime2, h |
| 157 | EOR h >> 29, h |
| 158 | MUL prime3, h |
| 159 | EOR h >> 32, h |
| 160 | |
// Store the result in the return slot.
| 161 | MOVD h, ret+24(FP) |
| 162 | RET |
| 163 | |
// writeBlocks runs the block loop over b, updating the four accumulators
// stored in *d, and returns the number of bytes it consumed (the length of b
// rounded down to a multiple of 32).
//
// Argument layout (40 bytes in total, no local stack frame): d at offset 0,
// the slice's base pointer and length as a pair starting at offset 8, and the
// result at offset 32.
//
// This function performs no length check before expanding blockLoop, which
// assumes n >= 32, so the caller must pass at least 32 bytes.
| 164 | // func writeBlocks(d *Digest, b []byte) int |
| 165 | TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 |
// Only prime1 and prime2 are loaded; they are the only constants the round
// macro reads.
| 166 | LDP ·primes+0(SB), (prime1, prime2) |
| 167 | |
| 168 | // Load state. Assume v[1-4] are stored contiguously. |
// The four accumulators are read from the first 32 bytes of *d.
| 169 | MOVD d+0(FP), digest |
| 170 | LDP 0(digest), (v1, v2) |
| 171 | LDP 16(digest), (v3, v4) |
| 172 | |
// p = slice base pointer, n = slice length.
| 173 | LDP b_base+8(FP), (p, n) |
| 174 | |
| 175 | blockLoop() |
| 176 | |
| 177 | // Store updated state. |
| 178 | STP (v1, v2), 0(digest) |
| 179 | STP (v3, v4), 16(digest) |
| 180 | |
// Clearing the low five bits of n gives the byte count covered by whole
// 32-byte blocks, which is the value returned.
| 181 | BIC $31, n |
| 182 | MOVD n, ret+32(FP) |
| 183 | RET |