mirror of
https://github.com/rocky-linux/peridot.git
synced 2024-11-01 04:41:22 +00:00
216 lines
3.5 KiB
ArmAsm
216 lines
3.5 KiB
ArmAsm
|
// +build !appengine
|
||
|
// +build gc
|
||
|
// +build !purego
|
||
|
|
||
|
#include "textflag.h"
|
||
|
|
||
|
// Register allocation:
|
||
|
// AX h
|
||
|
// SI pointer to advance through b
|
||
|
// DX n
|
||
|
// BX loop end
|
||
|
// R8 v1, k1
|
||
|
// R9 v2
|
||
|
// R10 v3
|
||
|
// R11 v4
|
||
|
// R12 tmp
|
||
|
// R13 prime1v
|
||
|
// R14 prime2v
|
||
|
// DI prime4v
|
||
|
|
||
|
// round folds the next 8 input bytes into the lane register acc:
//
//	acc = rotl64(acc + input*R14, 31) * R13
//
// The quadword is loaded from (SI) and SI is advanced by 8.
// Callers must already hold prime1v in R13 and prime2v in R14.
// R12 is used as scratch and is overwritten.
#define round(acc) \
	MOVQ  (SI), R12 \
	IMULQ R14, R12  \
	ADDQ  $8, SI    \
	ADDQ  R12, acc  \
	ROLQ  $31, acc  \
	IMULQ R13, acc
|
||
|
|
||
|
// mergeRound mixes one lane value into the running hash register:
//
//	lane = rotl64(lane*R14, 31) * R13
//	sum  = (sum ^ lane)*R13 + DI
//
// lane is scrambled in place, so its previous contents are lost.
// Callers must already hold prime1v in R13, prime2v in R14 and prime4v in DI.
#define mergeRound(sum, lane) \
	IMULQ R14, lane \
	ROLQ  $31, lane \
	IMULQ R13, lane \
	XORQ  lane, sum \
	IMULQ R13, sum  \
	ADDQ  DI, sum
|
||
|
|
||
|
// Register aliases for Sum64. They name the same registers that the round and
// mergeRound macros hard-code (SI, R12, R13, R14, DI) and are removed again
// with #undef directly after the function body.
#define hash    AX
#define src     SI
#define size    DX
#define stop    BX
#define v1      R8
#define v2      R9
#define v3      R10
#define v4      R11
#define scratch R12
#define p1      R13
#define p2      R14
#define p4      DI

// func Sum64(b []byte) uint64
//
// Sum64 reduces b to a 64-bit value and stores it in ret+24(FP).
// Stages, in order:
//   1. if len(b) >= 32: four lanes (v1..v4) consume 32 bytes per iteration
//      via round, then are combined and merged into hash via mergeRound;
//      otherwise hash starts from prime5v;
//   2. len(b) is added to hash;
//   3. remaining 8-byte words, then at most one 4-byte word, then single
//      bytes are mixed in;
//   4. a final shift/xor/multiply sequence (shifts 33, 29, 32).
//
// All src/stop comparisons are signed. For inputs shorter than the amount
// subtracted from the end pointer, stop lies below the slice base.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Slice header: data pointer at 0(FP), length at 8(FP).
	MOVQ b_base+0(FP), src
	MOVQ b_len+8(FP), size

	// stop = end of b minus 32: the last address at which a full
	// 32-byte block can still start.
	LEAQ (src)(size*1), stop
	SUBQ $32, stop

	// Constants expected by round and mergeRound.
	MOVQ ·prime1v(SB), p1
	MOVQ ·prime2v(SB), p2
	MOVQ ·prime4v(SB), p4

	// Fewer than 32 bytes: skip the four-lane stage entirely.
	CMPQ size, $32
	JLT  shortInput

	// Lane seeds: v3 = 0, v2 = prime2v, v1 = prime1v+prime2v, v4 = -prime1v.
	XORQ v3, v3
	MOVQ p2, v2
	MOVQ p1, v1
	ADDQ p2, v1
	XORQ v4, v4
	SUBQ p1, v4

	// Each iteration consumes 32 bytes; repeat while src <= stop.
laneLoop:
	round(v1)
	round(v2)
	round(v3)
	round(v4)

	CMPQ src, stop
	JLE  laneLoop

	// hash = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// The lanes are copied through scratch so v1..v4 stay intact for the
	// mergeRound calls below.
	MOVQ v1, hash
	ROLQ $1, hash
	MOVQ v2, scratch
	ROLQ $7, scratch
	ADDQ scratch, hash
	MOVQ v3, scratch
	ROLQ $12, scratch
	ADDQ scratch, hash
	MOVQ v4, scratch
	ROLQ $18, scratch
	ADDQ scratch, hash

	mergeRound(hash, v1)
	mergeRound(hash, v2)
	mergeRound(hash, v3)
	mergeRound(hash, v4)

	JMP addLength

shortInput:
	MOVQ ·prime5v(SB), hash

addLength:
	ADDQ size, hash

	// stop is end-32 here; move it to end-8 so the word loop runs
	// while at least 8 bytes remain.
	ADDQ $24, stop

	CMPQ src, stop
	JG   tryDword

wordLoop8:
	// k1 = rotl(word*prime2v, 31) * prime1v, built in v1 (the lanes are
	// no longer needed at this point).
	MOVQ  (src), v1
	ADDQ  $8, src
	IMULQ p2, v1
	ROLQ  $31, v1
	IMULQ p1, v1

	// hash = rotl(hash ^ k1, 27)*prime1v + prime4v.
	XORQ  v1, hash
	ROLQ  $27, hash
	IMULQ p1, hash
	ADDQ  p4, hash

	CMPQ src, stop
	JLE  wordLoop8

tryDword:
	// stop becomes end-4: is there still a 4-byte word left?
	ADDQ $4, stop
	CMPQ src, stop
	JG   tryBytes

	// MOVL into a register clears the upper 32 bits, so the 4-byte
	// value is zero-extended before the 64-bit multiply.
	MOVL  (src), v1
	ADDQ  $4, src
	IMULQ p1, v1
	XORQ  v1, hash

	ROLQ  $23, hash
	IMULQ p2, hash
	ADDQ  ·prime3v(SB), hash

tryBytes:
	// stop becomes the true end of b.
	ADDQ $4, stop
	CMPQ src, stop
	JGE  avalanche

byteLoop:
	MOVBQZX (src), scratch
	ADDQ    $1, src
	IMULQ   ·prime5v(SB), scratch
	XORQ    scratch, hash

	ROLQ  $11, hash
	IMULQ p1, hash

	CMPQ src, stop
	JL   byteLoop

avalanche:
	// hash ^= hash >> 33; hash *= prime2v
	MOVQ  hash, scratch
	SHRQ  $33, scratch
	XORQ  scratch, hash
	IMULQ p2, hash

	// hash ^= hash >> 29; hash *= prime3v
	MOVQ  hash, scratch
	SHRQ  $29, scratch
	XORQ  scratch, hash
	IMULQ ·prime3v(SB), hash

	// hash ^= hash >> 32
	MOVQ hash, scratch
	SHRQ $32, scratch
	XORQ scratch, hash

	MOVQ hash, ret+24(FP)
	RET

#undef hash
#undef src
#undef size
#undef stop
#undef v1
#undef v2
#undef v3
#undef v4
#undef scratch
#undef p1
#undef p2
#undef p4
|
||
|
|
||
|
// Register aliases for writeBlocks. The lane, pointer and prime registers
// match the ones the round macro hard-codes (SI, R13, R14); AX holds the d
// pointer here. The aliases are removed with #undef after the function body.
#define digest AX
#define src    SI
#define size   DX
#define stop   BX
#define v1     R8
#define v2     R9
#define v3     R10
#define v4     R11
#define p1     R13
#define p2     R14

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks loads four 64-bit lane values from offsets 0, 8, 16 and 24 of
// *d, runs round over b in 32-byte blocks, stores the updated lanes back to
// the same offsets, and returns the number of bytes consumed in ret+32(FP).
//
// Precondition: the caller always passes at least one full 32-byte block,
// so the loop body runs once before the limit is checked.
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Lane state from the digest.
	MOVQ d+0(FP), digest
	MOVQ 0(digest), v1
	MOVQ 8(digest), v2
	MOVQ 16(digest), v3
	MOVQ 24(digest), v4

	// Constants expected by round.
	MOVQ ·prime1v(SB), p1
	MOVQ ·prime2v(SB), p2

	// Slice header starts at 8(FP); stop = end of b minus 32, the last
	// address at which a full block can still start.
	MOVQ b_base+8(FP), src
	MOVQ b_len+16(FP), size
	LEAQ (src)(size*1), stop
	SUBQ $32, stop

	// Each iteration consumes 32 bytes; repeat while src <= stop.
laneLoop:
	round(v1)
	round(v2)
	round(v3)
	round(v4)

	CMPQ src, stop
	JLE  laneLoop

	// Persist the lanes.
	MOVQ v1, 0(digest)
	MOVQ v2, 8(digest)
	MOVQ v3, 16(digest)
	MOVQ v4, 24(digest)

	// Bytes consumed = current pointer minus the original base pointer.
	SUBQ b_base+8(FP), src
	MOVQ src, ret+32(FP)

	RET

#undef digest
#undef src
#undef size
#undef stop
#undef v1
#undef v2
#undef v3
#undef v4
#undef p1
#undef p2
|