kesavand | c71914f | 2022-03-25 11:19:03 +0530 | [diff] [blame^] | 1 | // +build gc,!purego,!noasm |
| 2 | |
| 3 | #include "textflag.h" |
| 4 | |
| 5 | // Register allocation. |
// Each name below is a preprocessor alias for one fixed general-purpose
// register. The same aliases are used by both Sum64 and writeBlocks:
//   digest          - pointer loaded from writeBlocks' d argument.
//   len, nblocks    - input length in bytes and the 32-byte block counter.
//   prime1..prime5  - constants loaded from the primes<> table.
//   v1..v4          - the four accumulators updated by blocksLoop.
//   x1..x4          - words loaded from the input; also used as scratch.
| 6 | #define digest R1 |
| 7 | #define h R2 // Return value. |
| 8 | #define p R3 // Input pointer. |
| 9 | #define len R4 |
| 10 | #define nblocks R5 // len / 32. |
| 11 | #define prime1 R7 |
| 12 | #define prime2 R8 |
| 13 | #define prime3 R9 |
| 14 | #define prime4 R10 |
| 15 | #define prime5 R11 |
| 16 | #define v1 R12 |
| 17 | #define v2 R13 |
| 18 | #define v3 R14 |
| 19 | #define v4 R15 |
| 20 | #define x1 R20 |
| 21 | #define x2 R21 |
| 22 | #define x3 R22 |
| 23 | #define x4 R23 |
| 24 | |
// round(acc, x): acc = rotl64(acc + x*prime2, 31) * prime1.
// MADD computes acc + x*prime2 in one instruction; ROR by 64-31 is the same
// as a rotate left by 31. The register passed as x is not modified.
| 25 | #define round(acc, x) \ |
| 26 | MADD prime2, acc, x, acc \ |
| 27 | ROR $64-31, acc \ |
| 28 | MUL prime1, acc \ |
| 29 | |
| 30 | // x = round(0, x). |
// Same arithmetic as round with a zero accumulator, computed in place:
// x = rotl64(x*prime2, 31) * prime1. The argument register is overwritten
// with the result.
| 31 | #define round0(x) \ |
| 32 | MUL prime2, x \ |
| 33 | ROR $64-31, x \ |
| 34 | MUL prime1, x \ |
| 35 | |
// mergeRound(x): h = (h ^ round0(x)) * prime1 + prime4.
// Always operates on the fixed register h. The register passed as x is
// overwritten, because round0 works in place.
| 36 | #define mergeRound(x) \ |
| 37 | round0(x) \ |
| 38 | EOR x, h \ |
| 39 | MADD h, prime4, prime1, h \ |
| 40 | |
| 41 | // Update v[1-4] with 32-byte blocks. Assumes len >= 32. |
// Runs len/32 iterations (nblocks = len >> 5). Each iteration loads four
// 8-byte words into x1..x4 and applies round to v1..v4 with them, in order.
// The first LDP post-increments p by 32, so the second LDP addresses the
// upper 16 bytes of the same block as -16(p). On exit p points just past the
// last full block; len itself is not modified.
// The counter is decremented before it is tested, so a starting value of 0
// (len < 32) would wrap around instead of skipping the loop; callers must
// uphold the len >= 32 precondition.
// The expansion defines the label "loop" and is expanded once in each of the
// two TEXT blocks below.
| 42 | #define blocksLoop() \ |
| 43 | LSR $5, len, nblocks \ |
| 44 | PCALIGN $16 \ |
| 45 | loop: \ |
| 46 | LDP.P 32(p), (x1, x2) \ |
| 47 | round(v1, x1) \ |
| 48 | LDP -16(p), (x3, x4) \ |
| 49 | round(v2, x2) \ |
| 50 | SUB $1, nblocks \ |
| 51 | round(v3, x3) \ |
| 52 | round(v4, x4) \ |
| 53 | CBNZ nblocks, loop \ |
| 54 | |
| 55 | // The primes are repeated here to ensure that they're stored |
| 56 | // in a contiguous array, so we can load them with LDP. |
// Layout: five 8-byte entries at offsets 0, 8, 16, 24 and 32, which Sum64
// loads into prime1, prime2, prime3, prime4 and prime5 respectively
// (writeBlocks loads only the first pair). The symbol is file-local (<>),
// read-only, contains no pointers, and is 5*8 = 40 bytes long.
| 57 | DATA primes<> +0(SB)/8, $11400714785074694791 |
| 58 | DATA primes<> +8(SB)/8, $14029467366897019727 |
| 59 | DATA primes<>+16(SB)/8, $1609587929392839161 |
| 60 | DATA primes<>+24(SB)/8, $9650029242287828579 |
| 61 | DATA primes<>+32(SB)/8, $2870177450012600261 |
| 62 | GLOBL primes<>(SB), NOPTR+RODATA, $40 |
| 63 | |
| 64 | // func Sum64(b []byte) uint64 |
//
// Computes a 64-bit hash of b and stores it in the result slot.
// NOTE(review): the constants, the accumulator initialisation and the mixing
// steps look like XXH64 with seed 0 - confirm against the pure-Go fallback.
//
// Frame: no local stack frame ($0); 32 bytes of arguments and results, i.e.
// the 24-byte slice header of b followed by the 8-byte return value at
// ret+24(FP).
//
// Outline:
//   1. If len >= 32, consume all full 32-byte blocks into v1..v4 and merge
//      them into h; otherwise start from h = prime5.
//   2. Add len to h, then consume the remaining len%32 bytes in chunks of
//      16, 8, 4, 2 and 1 bytes, selected by bits 4..0 of len.
//   3. Apply the final xor-shift/multiply mixing and return h.
| 65 | TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 |
// Load the first two words of the slice header: data pointer and length.
| 66 | LDP b_base+0(FP), (p, len) |
| 67 | |
// Load all five constants from the contiguous primes<> table.
| 68 | LDP primes<> +0(SB), (prime1, prime2) |
| 69 | LDP primes<>+16(SB), (prime3, prime4) |
| 70 | MOVD primes<>+32(SB), prime5 |
| 71 | |
// Short inputs skip the block loop entirely. h is preset here for that path;
// on the long path it is overwritten after the loop.
| 72 | CMP $32, len |
| 73 | CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } |
| 74 | BLO afterLoop |
| 75 | |
// Initial accumulators: v1 = prime1 + prime2, v2 = prime2, v3 = 0,
// v4 = -prime1.
| 76 | ADD prime1, prime2, v1 |
| 77 | MOVD prime2, v2 |
| 78 | MOVD $0, v3 |
| 79 | NEG prime1, v4 |
| 80 | |
// Consume every full 32-byte block; p is advanced past them.
| 81 | blocksLoop() |
| 82 | |
// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
// Each ROR by 64-k is a rotate left by k; x1..x4 are used as temporaries so
// that v1..v4 stay intact for the merge step below.
| 83 | ROR $64-1, v1, x1 |
| 84 | ROR $64-7, v2, x2 |
| 85 | ADD x1, x2 |
| 86 | ROR $64-12, v3, x3 |
| 87 | ROR $64-18, v4, x4 |
| 88 | ADD x3, x4 |
| 89 | ADD x2, x4, h |
| 90 | |
// Fold each accumulator into h. mergeRound overwrites its argument, so
// v1..v4 are no longer valid afterwards.
| 91 | mergeRound(v1) |
| 92 | mergeRound(v2) |
| 93 | mergeRound(v3) |
| 94 | mergeRound(v4) |
| 95 | |
| 96 | afterLoop: |
// Mix the total input length into the hash.
| 97 | ADD len, h |
| 98 | |
// Tail handling: the low five bits of len encode the remaining byte count.
// Each TBZ tests one bit and skips the corresponding chunk when it is clear.
// Bit 4 set: consume 16 bytes as two 8-byte words.
| 99 | TBZ $4, len, try8 |
| 100 | LDP.P 16(p), (x1, x2) |
| 101 | |
// For each word k: h = rotl(h ^ round0(k), 27) * prime1 + prime4.
// Rotation distributes over XOR, so h is rotated first and then XORed with
// the rotated operand ("@>" is a rotate-right applied to the source register).
| 102 | round0(x1) |
| 103 | ROR $64-27, h |
| 104 | EOR x1 @> 64-27, h, h |
| 105 | MADD h, prime4, prime1, h |
| 106 | |
| 107 | round0(x2) |
| 108 | ROR $64-27, h |
| 109 | EOR x2 @> 64-27, h |
| 110 | MADD h, prime4, prime1, h |
| 111 | |
| 112 | try8: |
// Bit 3 set: consume one 8-byte word with the same step as above.
| 113 | TBZ $3, len, try4 |
| 114 | MOVD.P 8(p), x1 |
| 115 | |
| 116 | round0(x1) |
| 117 | ROR $64-27, h |
| 118 | EOR x1 @> 64-27, h |
| 119 | MADD h, prime4, prime1, h |
| 120 | |
| 121 | try4: |
// Bit 2 set: consume one zero-extended 32-bit word:
// h = rotl(h ^ (k * prime1), 23) * prime2 + prime3.
| 122 | TBZ $2, len, try2 |
| 123 | MOVWU.P 4(p), x2 |
| 124 | |
| 125 | MUL prime1, x2 |
| 126 | ROR $64-23, h |
| 127 | EOR x2 @> 64-23, h |
| 128 | MADD h, prime3, prime2, h |
| 129 | |
| 130 | try2: |
// Bit 1 set: load two bytes with one halfword load and split them.
// x1 takes the low 8 bits and x2 the high 8 bits; with a little-endian load
// that is memory order (first byte, then second byte).
| 131 | TBZ $1, len, try1 |
| 132 | MOVHU.P 2(p), x3 |
| 133 | AND $255, x3, x1 |
| 134 | LSR $8, x3, x2 |
| 135 | |
// Per byte k: h = rotl(h ^ (k * prime5), 11) * prime1.
| 136 | MUL prime5, x1 |
| 137 | ROR $64-11, h |
| 138 | EOR x1 @> 64-11, h |
| 139 | MUL prime1, h |
| 140 | |
| 141 | MUL prime5, x2 |
| 142 | ROR $64-11, h |
| 143 | EOR x2 @> 64-11, h |
| 144 | MUL prime1, h |
| 145 | |
| 146 | try1: |
// Bit 0 set: consume the final byte with the same per-byte step. No
// post-increment is needed because nothing is read after it.
| 147 | TBZ $0, len, end |
| 148 | MOVBU (p), x4 |
| 149 | |
| 150 | MUL prime5, x4 |
| 151 | ROR $64-11, h |
| 152 | EOR x4 @> 64-11, h |
| 153 | MUL prime1, h |
| 154 | |
| 155 | end: |
// Final mixing: h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3;
// h ^= h >> 32. The ">>" operand form is a logical shift right of the source.
| 156 | EOR h >> 33, h |
| 157 | MUL prime2, h |
| 158 | EOR h >> 29, h |
| 159 | MUL prime3, h |
| 160 | EOR h >> 32, h |
| 161 | |
// Store the result in the return slot that follows the slice header.
| 162 | MOVD h, ret+24(FP) |
| 163 | RET |
| 164 | |
| 165 | // func writeBlocks(d *Digest, b []byte) int |
| 166 | // |
| 167 | // Assumes len(b) >= 32. |
//
// Loads the four accumulators from the start of *d, feeds every full 32-byte
// block of b through blocksLoop, writes the accumulators back, and returns
// the number of bytes consumed (len(b) rounded down to a multiple of 32).
// Any trailing len(b)%32 bytes are not read.
//
// Frame: no local stack frame ($0); 40 bytes of arguments and results, i.e.
// the 8-byte pointer d, the 24-byte slice header of b, and the 8-byte return
// value at ret+32(FP).
| 168 | TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 |
// Only prime1 and prime2 are needed: they are the only constants that the
// round macro (used by blocksLoop) references.
| 169 | LDP primes<>(SB), (prime1, prime2) |
| 170 | |
| 171 | // Load state. Assume v[1-4] are stored contiguously. |
// The four values are read from byte offsets 0, 8, 16 and 24 of *d.
| 172 | MOVD d+0(FP), digest |
| 173 | LDP 0(digest), (v1, v2) |
| 174 | LDP 16(digest), (v3, v4) |
| 175 | |
// Load the slice data pointer and length (the slice header starts at +8,
// after the d argument).
| 176 | LDP b_base+8(FP), (p, len) |
| 177 | |
| 178 | blocksLoop() |
| 179 | |
| 180 | // Store updated state. |
| 181 | STP (v1, v2), 0(digest) |
| 182 | STP (v3, v4), 16(digest) |
| 183 | |
// Clear the low five bits of len to obtain the number of bytes processed,
// then return it.
| 184 | BIC $31, len |
| 185 | MOVD len, ret+32(FP) |
| 186 | RET |