blob: 3e8b132579ec2ea32773fd3839f63e2a772b4ad6 [file] [log] [blame]
Abhay Kumarfe505f22025-11-10 14:16:31 +00001//go:build !appengine && gc && !purego
Naveen Sampath04696f72022-06-13 15:19:14 +05302// +build !appengine
3// +build gc
4// +build !purego
5
6#include "textflag.h"
7
Abhay Kumarfe505f22025-11-10 14:16:31 +00008// Registers:
// Symbolic names for the general-purpose registers used by the macros and
// functions in this file. h and d both map to AX: h is only used by Sum64
// (the running hash) and d is only used by writeBlocks (the Digest pointer),
// so the two names are never needed in the same function.
9#define h AX // running hash value (Sum64)
10#define d AX // pointer to the Digest state (writeBlocks)
11#define p SI // pointer to advance through b
12#define n DX // len(b)
13#define end BX // loop end
14#define v1 R8 // v1..v4: the four 64-bit accumulators updated by blockLoop
15#define v2 R9
16#define v3 R10
17#define v4 R11
18#define x R12 // scratch: the input word currently being mixed
19#define prime1 R13 // loaded from ·primes+0(SB)
20#define prime2 R14 // loaded from ·primes+8(SB)
21#define prime4 DI // loaded from ·primes+24(SB) (Sum64 only)
Naveen Sampath04696f72022-06-13 15:19:14 +053022
// round mixes the 64-bit input word x into the accumulator acc:
//   acc = rotl(acc + x*prime2, 31) * prime1   (all arithmetic modulo 2^64)
// x is overwritten with x*prime2. prime1 and prime2 must already be loaded.
Abhay Kumarfe505f22025-11-10 14:16:31 +000023#define round(acc, x) \
24 IMULQ prime2, x \
25 ADDQ x, acc \
26 ROLQ $31, acc \
27 IMULQ prime1, acc
Naveen Sampath04696f72022-06-13 15:19:14 +053028
Abhay Kumarfe505f22025-11-10 14:16:31 +000029// round0 performs the operation x = round(0, x).
// With a zero accumulator the ADDQ step of round is not needed, so the value
// rotl(x*prime2, 31) * prime1 is computed in place and left in x.
// prime1 and prime2 must already be loaded.
30#define round0(x) \
31 IMULQ prime2, x \
32 ROLQ $31, x \
33 IMULQ prime1, x
34
35// mergeRound applies a merge round on the two registers acc and x.
36// It assumes that prime1, prime2, and prime4 have been loaded.
// In effect: acc = (acc ^ round0(x)) * prime1 + prime4.
// x is overwritten with round0(x), so its previous value is lost.
37#define mergeRound(acc, x) \
38 round0(x) \
39 XORQ x, acc \
40 IMULQ prime1, acc \
41 ADDQ prime4, acc
42
43// blockLoop processes as many 32-byte blocks as possible,
44// updating v1, v2, v3, and v4. It assumes that there is at least one block
45// to process.
//
// Each iteration loads the four 8-byte words at p+0, p+8, p+16 and p+24 into
// x, feeds them to round with v1, v2, v3 and v4 respectively, and then
// advances p by 32. The loop repeats while p <= end, so the caller must set
// end to the last address at which a full block can start; both callers in
// this file use base+len-32. On exit p points just past the last block
// consumed. x is clobbered; prime1 and prime2 must already be loaded.
//
// The macro defines the label "loop"; each function in this file expands it
// exactly once.
46#define blockLoop() \
47loop: \
48 MOVQ +0(p), x \
49 round(v1, x) \
50 MOVQ +8(p), x \
51 round(v2, x) \
52 MOVQ +16(p), x \
53 round(v3, x) \
54 MOVQ +24(p), x \
55 round(v4, x) \
56 ADDQ $32, p \
57 CMPQ p, end \
58 JLE loop
Naveen Sampath04696f72022-06-13 15:19:14 +053059
60// func Sum64(b []byte) uint64
//
// Sum64 hashes the byte slice b and returns a 64-bit value. The frame is
// $0-32: only b_base+0(FP) and b_len+8(FP) are read from the slice argument,
// and the result is stored at ret+24(FP).
//
// Outline:
//   1. If len(b) >= 32, seed v1..v4 and fold every full 32-byte block into
//      them with blockLoop, then combine the four accumulators into h.
//      Otherwise h starts from the constant at ·primes+32(SB).
//   2. Add len(b) to h.
//   3. Consume the remaining tail: 8 bytes at a time, then at most one
//      4-byte word, then single bytes.
//   4. Apply the final shift/xor/multiply mixing and return h.
//
// end is reused as a moving limit: it starts at base+len-32 and is bumped to
// base+len-8, base+len-4 and finally base+len as the stages get narrower.
// The constants at ·primes+16(SB) and ·primes+32(SB) have no register assigned
// and are used straight from memory (presumably the third and fifth primes of
// the table declared on the Go side; confirm there).
Abhay Kumarfe505f22025-11-10 14:16:31 +000061TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
Naveen Sampath04696f72022-06-13 15:19:14 +053062 // Load fixed primes.
Abhay Kumarfe505f22025-11-10 14:16:31 +000063 MOVQ ·primes+0(SB), prime1
64 MOVQ ·primes+8(SB), prime2
65 MOVQ ·primes+24(SB), prime4
Naveen Sampath04696f72022-06-13 15:19:14 +053066
67 // Load slice.
Abhay Kumarfe505f22025-11-10 14:16:31 +000068 MOVQ b_base+0(FP), p
69 MOVQ b_len+8(FP), n
70 LEAQ (p)(n*1), end
Naveen Sampath04696f72022-06-13 15:19:14 +053071
72 // The first loop limit will be len(b)-32.
	// (As an address: end = base + len(b) - 32.)
Abhay Kumarfe505f22025-11-10 14:16:31 +000073 SUBQ $32, end
Naveen Sampath04696f72022-06-13 15:19:14 +053074
75 // Check whether we have at least one block.
Abhay Kumarfe505f22025-11-10 14:16:31 +000076 CMPQ n, $32
Naveen Sampath04696f72022-06-13 15:19:14 +053077 JLT noBlocks
78
79 // Set up initial state (v1, v2, v3, v4).
	// v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = 0 - prime1.
Abhay Kumarfe505f22025-11-10 14:16:31 +000080 MOVQ prime1, v1
81 ADDQ prime2, v1
82 MOVQ prime2, v2
83 XORQ v3, v3
84 XORQ v4, v4
85 SUBQ prime1, v4
Naveen Sampath04696f72022-06-13 15:19:14 +053086
Abhay Kumarfe505f22025-11-10 14:16:31 +000087 blockLoop()
Naveen Sampath04696f72022-06-13 15:19:14 +053088
	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// The rotations of v2..v4 are done in x so v1..v4 stay intact for the
	// merge rounds that follow.
Abhay Kumarfe505f22025-11-10 14:16:31 +000089 MOVQ v1, h
90 ROLQ $1, h
91 MOVQ v2, x
92 ROLQ $7, x
93 ADDQ x, h
94 MOVQ v3, x
95 ROLQ $12, x
96 ADDQ x, h
97 MOVQ v4, x
98 ROLQ $18, x
99 ADDQ x, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530100
	// Fold each accumulator into h; mergeRound overwrites v1..v4.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000101 mergeRound(h, v1)
102 mergeRound(h, v2)
103 mergeRound(h, v3)
104 mergeRound(h, v4)
Naveen Sampath04696f72022-06-13 15:19:14 +0530105
106 JMP afterBlocks
107
	// Fewer than 32 bytes: h starts from the constant at ·primes+32(SB).
108noBlocks:
Abhay Kumarfe505f22025-11-10 14:16:31 +0000109 MOVQ ·primes+32(SB), h
Naveen Sampath04696f72022-06-13 15:19:14 +0530110
111afterBlocks:
	// Mix in the total input length.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000112 ADDQ n, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530113
	// end = base+len-8: the last address from which 8 bytes can be read.
	// Skip the 8-byte loop when fewer than 8 bytes remain.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000114 ADDQ $24, end
115 CMPQ p, end
116 JG try4
Naveen Sampath04696f72022-06-13 15:19:14 +0530117
	// Per 8-byte word: h = rotl(h ^ round0(word), 27) * prime1 + prime4.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000118loop8:
119 MOVQ (p), x
120 ADDQ $8, p
121 round0(x)
122 XORQ x, h
123 ROLQ $27, h
124 IMULQ prime1, h
125 ADDQ prime4, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530126
Abhay Kumarfe505f22025-11-10 14:16:31 +0000127 CMPQ p, end
128 JLE loop8
Naveen Sampath04696f72022-06-13 15:19:14 +0530129
	// end = base+len-4. At most one 4-byte word is handled here (no loop):
	//   h = rotl(h ^ (word * prime1), 23) * prime2 + [·primes+16]
	// MOVL loads 32 bits, zero-extended into the 64-bit register x.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000130try4:
131 ADDQ $4, end
132 CMPQ p, end
133 JG try1
Naveen Sampath04696f72022-06-13 15:19:14 +0530134
Abhay Kumarfe505f22025-11-10 14:16:31 +0000135 MOVL (p), x
136 ADDQ $4, p
137 IMULQ prime1, x
138 XORQ x, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530139
Abhay Kumarfe505f22025-11-10 14:16:31 +0000140 ROLQ $23, h
141 IMULQ prime2, h
142 ADDQ ·primes+16(SB), h
Naveen Sampath04696f72022-06-13 15:19:14 +0530143
	// end = base+len (one past the last byte). Remaining bytes are mixed one
	// at a time: h = rotl(h ^ (byte * [·primes+32]), 11) * prime1.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000144try1:
145 ADDQ $4, end
146 CMPQ p, end
Naveen Sampath04696f72022-06-13 15:19:14 +0530147 JGE finalize
148
Abhay Kumarfe505f22025-11-10 14:16:31 +0000149loop1:
150 MOVBQZX (p), x
151 ADDQ $1, p
152 IMULQ ·primes+32(SB), x
153 XORQ x, h
154 ROLQ $11, h
155 IMULQ prime1, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530156
Abhay Kumarfe505f22025-11-10 14:16:31 +0000157 CMPQ p, end
158 JL loop1
Naveen Sampath04696f72022-06-13 15:19:14 +0530159
	// Final mixing:
	//   h ^= h >> 33; h *= prime2
	//   h ^= h >> 29; h *= [·primes+16]
	//   h ^= h >> 32
160finalize:
Abhay Kumarfe505f22025-11-10 14:16:31 +0000161 MOVQ h, x
162 SHRQ $33, x
163 XORQ x, h
164 IMULQ prime2, h
165 MOVQ h, x
166 SHRQ $29, x
167 XORQ x, h
168 IMULQ ·primes+16(SB), h
169 MOVQ h, x
170 SHRQ $32, x
171 XORQ x, h
Naveen Sampath04696f72022-06-13 15:19:14 +0530172
Abhay Kumarfe505f22025-11-10 14:16:31 +0000173 MOVQ h, ret+24(FP)
Naveen Sampath04696f72022-06-13 15:19:14 +0530174 RET
175
Naveen Sampath04696f72022-06-13 15:19:14 +0530176// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks folds the full 32-byte blocks of b into the four 64-bit words
// stored at offsets 0, 8, 16 and 24 of *d, and returns the number of bytes
// consumed. The count is p minus the original base pointer after blockLoop,
// so it is always a multiple of 32. Any trailing bytes that do not fill a
// block are left untouched.
//
// The frame is $0-40: the pointer argument at 0(FP), the slice at 8(FP)
// (only b_base and b_len are read) and the int result at ret+32(FP).
// The caller must pass at least 32 bytes; the loop condition is not checked
// before the first iteration. prime4 is not loaded because round does not
// use it.
//
// NOTE(review): the offsets 0..24 are assumed to be the Digest's v1..v4
// fields; confirm against the Go struct definition.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000177TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
Naveen Sampath04696f72022-06-13 15:19:14 +0530178 // Load fixed primes needed for round.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000179 MOVQ ·primes+0(SB), prime1
180 MOVQ ·primes+8(SB), prime2
Naveen Sampath04696f72022-06-13 15:19:14 +0530181
182 // Load slice.
	// end = base + len(b) - 32, the limit expected by blockLoop.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000183 MOVQ b_base+8(FP), p
184 MOVQ b_len+16(FP), n
185 LEAQ (p)(n*1), end
186 SUBQ $32, end
Naveen Sampath04696f72022-06-13 15:19:14 +0530187
188 // Load vN from d.
	// NOTE(review): the argument is addressed as s+0(FP) although the Go
	// signature above names it d. Presumably this is because d is #defined
	// as AX in this file and would be macro-expanded inside d+0(FP);
	// confirm before renaming.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000189 MOVQ s+0(FP), d
190 MOVQ 0(d), v1
191 MOVQ 8(d), v2
192 MOVQ 16(d), v3
193 MOVQ 24(d), v4
Naveen Sampath04696f72022-06-13 15:19:14 +0530194
195 // We don't need to check the loop condition here; this function is
196 // always called with at least one block of data to process.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000197 blockLoop()
Naveen Sampath04696f72022-06-13 15:19:14 +0530198
199 // Copy vN back to d.
Abhay Kumarfe505f22025-11-10 14:16:31 +0000200 MOVQ v1, 0(d)
201 MOVQ v2, 8(d)
202 MOVQ v3, 16(d)
203 MOVQ v4, 24(d)
Naveen Sampath04696f72022-06-13 15:19:14 +0530204
Abhay Kumarfe505f22025-11-10 14:16:31 +0000205 // The number of bytes written is p minus the old base pointer.
	// The base is re-read from the frame because p was advanced in place.
206 SUBQ b_base+8(FP), p
207 MOVQ p, ret+32(FP)
Naveen Sampath04696f72022-06-13 15:19:14 +0530208
209 RET