peridot/vendor/github.com/cespare/xxhash/v2/xxhash_arm64.s

//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
#define digest	R1
#define h	R2 // return value
#define p	R3 // input pointer
#define n	R4 // input length
#define nblocks	R5 // n / 32
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11
#define v1	R12
#define v2	R13
#define v3	R14
#define v4	R15
#define x1	R20
#define x2	R21
#define x3	R22
#define x4	R23

#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc         \
	MUL  prime1, acc

// round0 performs the operation x = round(0, x).
#define round0(x) \
	MUL prime2, x \
	ROR $64-31, x \
	MUL prime1, x

#define mergeRound(acc, x) \
	round0(x)                     \
	EOR  x, acc                   \
	MADD acc, prime4, prime1, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
#define blockLoop() \
	LSR     $5, n, nblocks  \
	PCALIGN $16             \
	loop:                   \
	LDP.P   16(p), (x1, x2) \
	LDP.P   16(p), (x3, x4) \
	round(v1, x1)           \
	round(v2, x2)           \
	round(v3, x3)           \
	round(v4, x4)           \
	SUB     $1, nblocks     \
	CBNZ    nblocks, loop

// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	LDP b_base+0(FP), (p, n)

	LDP  ·primes+0(SB), (prime1, prime2)
	LDP  ·primes+16(SB), (prime3, prime4)
	MOVD ·primes+32(SB), prime5

	CMP  $32, n
	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
	BLT  afterLoop

	ADD  prime1, prime2, v1
	MOVD prime2, v2
	MOVD $0, v3
	NEG  prime1, v4

	blockLoop()

	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4
	ADD x2, x4, h

	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

afterLoop:
	ADD n, h

	TBZ   $4, n, try8
	LDP.P 16(p), (x1, x2)

	round0(x1)

	// NOTE: here and below, sequencing the EOR after the ROR (using a
	// rotated register) is worth a small but measurable speedup for small
	// inputs.
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

	round0(x2)
	ROR  $64-27, h
	EOR  x2 @> 64-27, h, h
	MADD h, prime4, prime1, h

try8:
	TBZ    $3, n, try4
	MOVD.P 8(p), x1

	round0(x1)
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

try4:
	TBZ     $2, n, try2
	MOVWU.P 4(p), x2

	MUL  prime1, x2
	ROR  $64-23, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h

try2:
	TBZ     $1, n, try1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	MUL prime5, x1
	ROR $64-11, h
	EOR x1 @> 64-11, h, h
	MUL prime1, h

	MUL prime5, x2
	ROR $64-11, h
	EOR x2 @> 64-11, h, h
	MUL prime1, h

try1:
	TBZ   $0, n, finalize
	MOVBU (p), x4

	MUL prime5, x4
	ROR $64-11, h
	EOR x4 @> 64-11, h, h
	MUL prime1, h

finalize:
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	LDP ·primes+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	LDP b_base+8(FP), (p, n)

	blockLoop()

	// Store updated state.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	BIC  $31, n
	MOVD n, ret+32(FP)
	RET