| //go:build !appengine && gc && !purego |
| // +build !appengine |
| // +build gc |
| // +build !purego |
| |
| #include "textflag.h" |
| |
| // Registers: |
| #define h AX |
| #define d AX |
| #define p SI // pointer to advance through b |
| #define n DX |
| #define end BX // loop end |
| #define v1 R8 |
| #define v2 R9 |
| #define v3 R10 |
| #define v4 R11 |
| #define x R12 |
| #define prime1 R13 |
| #define prime2 R14 |
| #define prime4 DI |
| |
| #define round(acc, x) \ |
| IMULQ prime2, x \ |
| ADDQ x, acc \ |
| ROLQ $31, acc \ |
| IMULQ prime1, acc |
| |
| // round0 performs the operation x = round(0, x). |
| #define round0(x) \ |
| IMULQ prime2, x \ |
| ROLQ $31, x \ |
| IMULQ prime1, x |
| |
| // mergeRound applies a merge round on the two registers acc and x. |
| // It assumes that prime1, prime2, and prime4 have been loaded. |
| #define mergeRound(acc, x) \ |
| round0(x) \ |
| XORQ x, acc \ |
| IMULQ prime1, acc \ |
| ADDQ prime4, acc |
| |
| // blockLoop processes as many 32-byte blocks as possible, |
| // updating v1, v2, v3, and v4. It assumes that there is at least one block |
| // to process. |
| #define blockLoop() \ |
| loop: \ |
| MOVQ +0(p), x \ |
| round(v1, x) \ |
| MOVQ +8(p), x \ |
| round(v2, x) \ |
| MOVQ +16(p), x \ |
| round(v3, x) \ |
| MOVQ +24(p), x \ |
| round(v4, x) \ |
| ADDQ $32, p \ |
| CMPQ p, end \ |
| JLE loop |
| |
| // func Sum64(b []byte) uint64 |
| TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 |
| // Load fixed primes. |
| MOVQ ·primes+0(SB), prime1 |
| MOVQ ·primes+8(SB), prime2 |
| MOVQ ·primes+24(SB), prime4 |
| |
| // Load slice. |
| MOVQ b_base+0(FP), p |
| MOVQ b_len+8(FP), n |
| LEAQ (p)(n*1), end |
| |
| // The first loop limit will be len(b)-32. |
| SUBQ $32, end |
| |
| // Check whether we have at least one block. |
| CMPQ n, $32 |
| JLT noBlocks |
| |
| // Set up initial state (v1, v2, v3, v4). |
| MOVQ prime1, v1 |
| ADDQ prime2, v1 |
| MOVQ prime2, v2 |
| XORQ v3, v3 |
| XORQ v4, v4 |
| SUBQ prime1, v4 |
| |
| blockLoop() |
| |
| MOVQ v1, h |
| ROLQ $1, h |
| MOVQ v2, x |
| ROLQ $7, x |
| ADDQ x, h |
| MOVQ v3, x |
| ROLQ $12, x |
| ADDQ x, h |
| MOVQ v4, x |
| ROLQ $18, x |
| ADDQ x, h |
| |
| mergeRound(h, v1) |
| mergeRound(h, v2) |
| mergeRound(h, v3) |
| mergeRound(h, v4) |
| |
| JMP afterBlocks |
| |
| noBlocks: |
| MOVQ ·primes+32(SB), h |
| |
| afterBlocks: |
| ADDQ n, h |
| |
| ADDQ $24, end |
| CMPQ p, end |
| JG try4 |
| |
| loop8: |
| MOVQ (p), x |
| ADDQ $8, p |
| round0(x) |
| XORQ x, h |
| ROLQ $27, h |
| IMULQ prime1, h |
| ADDQ prime4, h |
| |
| CMPQ p, end |
| JLE loop8 |
| |
| try4: |
| ADDQ $4, end |
| CMPQ p, end |
| JG try1 |
| |
| MOVL (p), x |
| ADDQ $4, p |
| IMULQ prime1, x |
| XORQ x, h |
| |
| ROLQ $23, h |
| IMULQ prime2, h |
| ADDQ ·primes+16(SB), h |
| |
| try1: |
| ADDQ $4, end |
| CMPQ p, end |
| JGE finalize |
| |
| loop1: |
| MOVBQZX (p), x |
| ADDQ $1, p |
| IMULQ ·primes+32(SB), x |
| XORQ x, h |
| ROLQ $11, h |
| IMULQ prime1, h |
| |
| CMPQ p, end |
| JL loop1 |
| |
| finalize: |
| MOVQ h, x |
| SHRQ $33, x |
| XORQ x, h |
| IMULQ prime2, h |
| MOVQ h, x |
| SHRQ $29, x |
| XORQ x, h |
| IMULQ ·primes+16(SB), h |
| MOVQ h, x |
| SHRQ $32, x |
| XORQ x, h |
| |
| MOVQ h, ret+24(FP) |
| RET |
| |
| // func writeBlocks(d *Digest, b []byte) int |
| TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 |
| // Load fixed primes needed for round. |
| MOVQ ·primes+0(SB), prime1 |
| MOVQ ·primes+8(SB), prime2 |
| |
| // Load slice. |
| MOVQ b_base+8(FP), p |
| MOVQ b_len+16(FP), n |
| LEAQ (p)(n*1), end |
| SUBQ $32, end |
| |
| // Load vN from d. |
| MOVQ s+0(FP), d |
| MOVQ 0(d), v1 |
| MOVQ 8(d), v2 |
| MOVQ 16(d), v3 |
| MOVQ 24(d), v4 |
| |
| // We don't need to check the loop condition here; this function is |
| // always called with at least one block of data to process. |
| blockLoop() |
| |
| // Copy vN back to d. |
| MOVQ v1, 0(d) |
| MOVQ v2, 8(d) |
| MOVQ v3, 16(d) |
| MOVQ v4, 24(d) |
| |
| // The number of bytes written is p minus the old base pointer. |
| SUBQ b_base+8(FP), p |
| MOVQ p, ret+32(FP) |
| |
| RET |