[VOL-5292] Implementation for fetching the GEM port history Data from the ONT
Change-Id: I4cf22555cbd13bcd5e49e620c8aa8b67cbd2891c
Signed-off-by: Akash Reddy Kankanala <akash.kankanala@radisys.com>
diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s b/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s
new file mode 100644
index 0000000..7e3145a
--- /dev/null
+++ b/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s
@@ -0,0 +1,183 @@
+//go:build !appengine && gc && !purego
+// +build !appengine
+// +build gc
+// +build !purego
+
+#include "textflag.h"
+
+// Registers:
+#define digest R1
+#define h R2 // return value
+#define p R3 // input pointer
+#define n R4 // input length
+#define nblocks R5 // n / 32
+#define prime1 R7
+#define prime2 R8
+#define prime3 R9
+#define prime4 R10
+#define prime5 R11
+#define v1 R12
+#define v2 R13
+#define v3 R14
+#define v4 R15
+#define x1 R20
+#define x2 R21
+#define x3 R22
+#define x4 R23
+
+#define round(acc, x) \
+ MADD prime2, acc, x, acc \
+ ROR $64-31, acc \
+ MUL prime1, acc
+
+// round0 performs the operation x = round(0, x).
+#define round0(x) \
+ MUL prime2, x \
+ ROR $64-31, x \
+ MUL prime1, x
+
+#define mergeRound(acc, x) \
+ round0(x) \
+ EOR x, acc \
+ MADD acc, prime4, prime1, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that n >= 32.
+#define blockLoop() \
+ LSR $5, n, nblocks \
+ PCALIGN $16 \
+ loop: \
+ LDP.P 16(p), (x1, x2) \
+ LDP.P 16(p), (x3, x4) \
+ round(v1, x1) \
+ round(v2, x2) \
+ round(v3, x3) \
+ round(v4, x4) \
+ SUB $1, nblocks \
+ CBNZ nblocks, loop
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
+ LDP b_base+0(FP), (p, n)
+
+ LDP ·primes+0(SB), (prime1, prime2)
+ LDP ·primes+16(SB), (prime3, prime4)
+ MOVD ·primes+32(SB), prime5
+
+ CMP $32, n
+ CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
+ BLT afterLoop
+
+ ADD prime1, prime2, v1
+ MOVD prime2, v2
+ MOVD $0, v3
+ NEG prime1, v4
+
+ blockLoop()
+
+ ROR $64-1, v1, x1
+ ROR $64-7, v2, x2
+ ADD x1, x2
+ ROR $64-12, v3, x3
+ ROR $64-18, v4, x4
+ ADD x3, x4
+ ADD x2, x4, h
+
+ mergeRound(h, v1)
+ mergeRound(h, v2)
+ mergeRound(h, v3)
+ mergeRound(h, v4)
+
+afterLoop:
+ ADD n, h
+
+ TBZ $4, n, try8
+ LDP.P 16(p), (x1, x2)
+
+ round0(x1)
+
+ // NOTE: here and below, sequencing the EOR after the ROR (using a
+ // rotated register) is worth a small but measurable speedup for small
+ // inputs.
+ ROR $64-27, h
+ EOR x1 @> 64-27, h, h
+ MADD h, prime4, prime1, h
+
+ round0(x2)
+ ROR $64-27, h
+ EOR x2 @> 64-27, h, h
+ MADD h, prime4, prime1, h
+
+try8:
+ TBZ $3, n, try4
+ MOVD.P 8(p), x1
+
+ round0(x1)
+ ROR $64-27, h
+ EOR x1 @> 64-27, h, h
+ MADD h, prime4, prime1, h
+
+try4:
+ TBZ $2, n, try2
+ MOVWU.P 4(p), x2
+
+ MUL prime1, x2
+ ROR $64-23, h
+ EOR x2 @> 64-23, h, h
+ MADD h, prime3, prime2, h
+
+try2:
+ TBZ $1, n, try1
+ MOVHU.P 2(p), x3
+ AND $255, x3, x1
+ LSR $8, x3, x2
+
+ MUL prime5, x1
+ ROR $64-11, h
+ EOR x1 @> 64-11, h, h
+ MUL prime1, h
+
+ MUL prime5, x2
+ ROR $64-11, h
+ EOR x2 @> 64-11, h, h
+ MUL prime1, h
+
+try1:
+ TBZ $0, n, finalize
+ MOVBU (p), x4
+
+ MUL prime5, x4
+ ROR $64-11, h
+ EOR x4 @> 64-11, h, h
+ MUL prime1, h
+
+finalize:
+ EOR h >> 33, h
+ MUL prime2, h
+ EOR h >> 29, h
+ MUL prime3, h
+ EOR h >> 32, h
+
+ MOVD h, ret+24(FP)
+ RET
+
+// func writeBlocks(d *Digest, b []byte) int
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+ LDP ·primes+0(SB), (prime1, prime2)
+
+ // Load state. Assume v[1-4] are stored contiguously.
+ MOVD d+0(FP), digest
+ LDP 0(digest), (v1, v2)
+ LDP 16(digest), (v3, v4)
+
+ LDP b_base+8(FP), (p, n)
+
+ blockLoop()
+
+ // Store updated state.
+ STP (v1, v2), 0(digest)
+ STP (v3, v4), 16(digest)
+
+ BIC $31, n
+ MOVD n, ret+32(FP)
+ RET