peridot/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s

//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// Symbolic names for the general-purpose registers shared by Sum64 and
// writeBlocks, so that the code below reads in terms of the hash state
// rather than raw register numbers.

// Pointer to the Digest whose v1..v4 state writeBlocks loads and stores.
#define digest	R1
// Running hash value, input cursor, input length and block counter.
#define h	R2 // return value
#define p	R3 // input pointer
#define n	R4 // input length
#define nblocks	R5 // n / 32
// Constants loaded from the ·primes table.
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11
// The four accumulator lanes updated by blockLoop.
#define v1	R12
#define v2	R13
#define v3	R14
#define v4	R15
// Scratch: input words loaded through p, and temporaries.
#define x1	R20
#define x2	R21
#define x3	R22
#define x4	R23

// round performs one accumulator step on acc using the input word x:
//
//	acc = rotl(acc + x*prime2, 31) * prime1
//
// The rotate-left by 31 is written as a rotate-right by 64-31. x is left
// unchanged; prime1 and prime2 must already be loaded.
#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc         \
	MUL  prime1, acc

// round0 performs the operation x = round(0, x), i.e.
//
//	x = rotl(x*prime2, 31) * prime1
//
// The result overwrites x in place. With a zero accumulator the
// multiply-add of round reduces to a plain multiply.
#define round0(x) \
	MUL prime2, x \
	ROR $64-31, x \
	MUL prime1, x

// mergeRound folds the accumulator lane x into acc:
//
//	x   = round(0, x)
//	acc = (acc ^ x)*prime1 + prime4
//
// Note that x is overwritten by the round0 step.
#define mergeRound(acc, x) \
	round0(x)                     \
	EOR  x, acc                   \
	MADD acc, prime4, prime1, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
//
// On entry p points at the input and n holds its length. Each iteration
// loads four 64-bit words through p (post-incrementing it by 32 bytes in
// total) and feeds one word to each of v1..v4 via round. On exit p points
// just past the last whole block, nblocks is zero and n is unchanged;
// x1..x4 are clobbered.
//
// The loop body runs before the counter is tested, so with n < 32
// (nblocks == 0) the counter would wrap instead of ending the loop; callers
// must uphold the n >= 32 assumption.
//
// The expansion defines the label "loop", so a function that uses this
// macro cannot define another label of that name.
#define blockLoop() \
	LSR     $5, n, nblocks  \
	PCALIGN $16             \
	loop:                   \
	LDP.P   16(p), (x1, x2) \
	LDP.P   16(p), (x3, x4) \
	round(v1, x1)           \
	round(v2, x2)           \
	round(v3, x3)           \
	round(v4, x4)           \
	SUB     $1, nblocks     \
	CBNZ    nblocks, loop

// func Sum64(b []byte) uint64
//
// Sum64 hashes b in one pass and returns the 64-bit result.
//
// Inputs of at least 32 bytes first run through blockLoop and have the four
// lanes merged into h; shorter inputs start from h = prime5. The remaining
// len(b) % 32 bytes are then consumed in chunks of 16, 8, 4, 2 and 1 bytes,
// selected by testing the corresponding bits of n, and the result is
// scrambled by the shift/multiply sequence at finalize.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// p = base pointer of b, n = len(b): the first two words of the slice
	// argument, loaded as a pair.
	LDP b_base+0(FP), (p, n)

	// Load the five constants from the ·primes table.
	LDP  ·primes+0(SB), (prime1, prime2)
	LDP  ·primes+16(SB), (prime3, prime4)
	MOVD ·primes+32(SB), prime5

	// Inputs shorter than 32 bytes skip the block loop and the lane merge.
	CMP  $32, n
	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
	BLT  afterLoop

	// Seed the lanes: v1 = prime1 + prime2, v2 = prime2, v3 = 0,
	// v4 = -prime1.
	ADD  prime1, prime2, v1
	MOVD prime2, v2
	MOVD $0, v3
	NEG  prime1, v4

	// n >= 32 is guaranteed here, as blockLoop requires.
	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18),
	// with each rotate-left written as a rotate-right by 64-k.
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4
	ADD x2, x4, h

	// Fold each lane into h. mergeRound overwrites v1..v4, which are not
	// needed afterwards.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

afterLoop:
	// Mix in the total input length. blockLoop leaves n untouched, so its
	// low five bits still describe the tail that remains at p.
	ADD n, h

	// Bit 4 of n set: consume 16 bytes as two 64-bit words. Each word k
	// is mixed in as h = rotl(h ^ round(0, k), 27)*prime1 + prime4.
	TBZ   $4, n, try8
	LDP.P 16(p), (x1, x2)

	round0(x1)

	// NOTE: here and below, sequencing the EOR after the ROR (using a
	// rotated register) is worth a small but measurable speedup for small
	// inputs.
	//
	// Rotating h first and then XORing in the equally rotated operand is
	// equivalent to rotating (h ^ operand), since rotation distributes
	// over XOR.
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

	round0(x2)
	ROR  $64-27, h
	EOR  x2 @> 64-27, h, h
	MADD h, prime4, prime1, h

try8:
	// Bit 3 of n set: consume one 64-bit word, mixed in as above.
	TBZ    $3, n, try4
	MOVD.P 8(p), x1

	round0(x1)
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

try4:
	// Bit 2 of n set: consume one zero-extended 32-bit word k:
	// h = rotl(h ^ (k*prime1), 23)*prime2 + prime3.
	TBZ     $2, n, try2
	MOVWU.P 4(p), x2

	MUL  prime1, x2
	ROR  $64-23, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h

try2:
	// Bit 1 of n set: load two bytes with a single halfword load and
	// split them; the low byte (x1) is mixed in first, then the high
	// byte (x2). Each byte c is mixed in as
	// h = rotl(h ^ (c*prime5), 11) * prime1.
	TBZ     $1, n, try1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	MUL prime5, x1
	ROR $64-11, h
	EOR x1 @> 64-11, h, h
	MUL prime1, h

	MUL prime5, x2
	ROR $64-11, h
	EOR x2 @> 64-11, h, h
	MUL prime1, h

try1:
	// Bit 0 of n set: consume the final byte, mixed in like the bytes
	// above. p is not advanced because nothing is read after this.
	TBZ   $0, n, finalize
	MOVBU (p), x4

	MUL prime5, x4
	ROR $64-11, h
	EOR x4 @> 64-11, h, h
	MUL prime1, h

finalize:
	// Final scramble:
	// h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3; h ^= h >> 32.
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs blockLoop over every whole 32-byte block of b, updating
// the four accumulators held in the first 32 bytes of *d, and returns the
// number of bytes consumed, i.e. len(b) rounded down to a multiple of 32.
//
// blockLoop executes its body before testing the block counter, so the
// caller must pass len(b) >= 32.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Only prime1 and prime2 are needed by round.
	LDP ·primes+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	// p = base pointer of b, n = len(b).
	LDP b_base+8(FP), (p, n)

	blockLoop()

	// Store updated state.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	// Clear the low five bits of n: the byte count covered by whole
	// 32-byte blocks, which is what the loop consumed.
	BIC  $31, n
	MOVD n, ret+32(FP)
	RET
Major upgrades Upgrade to Go 1.20.5, Hydra v2 SDK, rules-go v0.44.2 (with proper resolves), protobuf v25.3 and mass upgrade of Go dependencies. 2024-02-24 00:34:55 +00:00			`//go:build !appengine && gc && !purego`
			`// +build !appengine`
			`// +build gc`
			`// +build !purego`

			`#include "textflag.h"`

			`// Registers:`
			`#define digest R1`
			`#define h R2 // return value`
			`#define p R3 // input pointer`
			`#define n R4 // input length`
			`#define nblocks R5 // n / 32`
			`#define prime1 R7`
			`#define prime2 R8`
			`#define prime3 R9`
			`#define prime4 R10`
			`#define prime5 R11`
			`#define v1 R12`
			`#define v2 R13`
			`#define v3 R14`
			`#define v4 R15`
			`#define x1 R20`
			`#define x2 R21`
			`#define x3 R22`
			`#define x4 R23`

			`#define round(acc, x) \`
			`MADD prime2, acc, x, acc \`
			`ROR $64-31, acc \`
			`MUL prime1, acc`

			`// round0 performs the operation x = round(0, x).`
			`#define round0(x) \`
			`MUL prime2, x \`
			`ROR $64-31, x \`
			`MUL prime1, x`

			`#define mergeRound(acc, x) \`
			`round0(x) \`
			`EOR x, acc \`
			`MADD acc, prime4, prime1, acc`

			`// blockLoop processes as many 32-byte blocks as possible,`
			`// updating v1, v2, v3, and v4. It assumes that n >= 32.`
			`#define blockLoop() \`
			`LSR $5, n, nblocks \`
			`PCALIGN $16 \`
			`loop: \`
			`LDP.P 16(p), (x1, x2) \`
			`LDP.P 16(p), (x3, x4) \`
			`round(v1, x1) \`
			`round(v2, x2) \`
			`round(v3, x3) \`
			`round(v4, x4) \`
			`SUB $1, nblocks \`
			`CBNZ nblocks, loop`

			`// func Sum64(b []byte) uint64`
			`TEXT ·Sum64(SB), NOSPLIT\|NOFRAME, $0-32`
			`LDP b_base+0(FP), (p, n)`

			`LDP ·primes+0(SB), (prime1, prime2)`
			`LDP ·primes+16(SB), (prime3, prime4)`
			`MOVD ·primes+32(SB), prime5`

			`CMP $32, n`
			`CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }`
			`BLT afterLoop`

			`ADD prime1, prime2, v1`
			`MOVD prime2, v2`
			`MOVD $0, v3`
			`NEG prime1, v4`

			`blockLoop()`

			`ROR $64-1, v1, x1`
			`ROR $64-7, v2, x2`
			`ADD x1, x2`
			`ROR $64-12, v3, x3`
			`ROR $64-18, v4, x4`
			`ADD x3, x4`
			`ADD x2, x4, h`

			`mergeRound(h, v1)`
			`mergeRound(h, v2)`
			`mergeRound(h, v3)`
			`mergeRound(h, v4)`

			`afterLoop:`
			`ADD n, h`

			`TBZ $4, n, try8`
			`LDP.P 16(p), (x1, x2)`

			`round0(x1)`

			`// NOTE: here and below, sequencing the EOR after the ROR (using a`
			`// rotated register) is worth a small but measurable speedup for small`
			`// inputs.`
			`ROR $64-27, h`
			`EOR x1 @> 64-27, h, h`
			`MADD h, prime4, prime1, h`

			`round0(x2)`
			`ROR $64-27, h`
			`EOR x2 @> 64-27, h, h`
			`MADD h, prime4, prime1, h`

			`try8:`
			`TBZ $3, n, try4`
			`MOVD.P 8(p), x1`

			`round0(x1)`
			`ROR $64-27, h`
			`EOR x1 @> 64-27, h, h`
			`MADD h, prime4, prime1, h`

			`try4:`
			`TBZ $2, n, try2`
			`MOVWU.P 4(p), x2`

			`MUL prime1, x2`
			`ROR $64-23, h`
			`EOR x2 @> 64-23, h, h`
			`MADD h, prime3, prime2, h`

			`try2:`
			`TBZ $1, n, try1`
			`MOVHU.P 2(p), x3`
			`AND $255, x3, x1`
			`LSR $8, x3, x2`

			`MUL prime5, x1`
			`ROR $64-11, h`
			`EOR x1 @> 64-11, h, h`
			`MUL prime1, h`

			`MUL prime5, x2`
			`ROR $64-11, h`
			`EOR x2 @> 64-11, h, h`
			`MUL prime1, h`

			`try1:`
			`TBZ $0, n, finalize`
			`MOVBU (p), x4`

			`MUL prime5, x4`
			`ROR $64-11, h`
			`EOR x4 @> 64-11, h, h`
			`MUL prime1, h`

			`finalize:`
			`EOR h >> 33, h`
			`MUL prime2, h`
			`EOR h >> 29, h`
			`MUL prime3, h`
			`EOR h >> 32, h`

			`MOVD h, ret+24(FP)`
			`RET`

			`// func writeBlocks(d *Digest, b []byte) int`
			`TEXT ·writeBlocks(SB), NOSPLIT\|NOFRAME, $0-40`
			`LDP ·primes+0(SB), (prime1, prime2)`

			`// Load state. Assume v[1-4] are stored contiguously.`
			`MOVD d+0(FP), digest`
			`LDP 0(digest), (v1, v2)`
			`LDP 16(digest), (v3, v4)`

			`LDP b_base+8(FP), (p, n)`

			`blockLoop()`

			`// Store updated state.`
			`STP (v1, v2), 0(digest)`
			`STP (v3, v4), 16(digest)`

			`BIC $31, n`
			`MOVD n, ret+32(FP)`
			`RET`