//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// These aliases are shared by Sum64 and writeBlocks. h and d both name AX:
// h is used only by Sum64 and d only by writeBlocks, so the two are never
// live in the same function.
//
// Only the ·primes entries at offsets 0, 8 and 24 get a register alias;
// the entries at offsets 16 and 32 are read directly from memory where
// they are needed.
#define h      AX  // running hash value (Sum64)
#define d      AX  // pointer to the digest state (writeBlocks)
#define p      SI  // pointer to advance through b
#define n      DX  // len(b)
#define end    BX  // loop end
#define v1     R8  // v1..v4: the four block accumulators
#define v2     R9
#define v3     R10
#define v4     R11
#define x      R12 // scratch: current input word or temporary
#define prime1 R13 // ·primes+0(SB)
#define prime2 R14 // ·primes+8(SB)
#define prime4 DI  // ·primes+24(SB)

// round folds the input word x into the accumulator acc:
//
//	acc = rotl64(acc + x*prime2, 31) * prime1
//
// x is clobbered (it is left holding x*prime2).
// It assumes that prime1 and prime2 have been loaded.
#define round(acc, x) \
	IMULQ prime2, x   \
	ADDQ  x, acc      \
	ROLQ  $31, acc    \
	IMULQ prime1, acc

// round0 performs the operation x = round(0, x), that is
//
//	x = rotl64(x*prime2, 31) * prime1
//
// computed in place; the addition to a zero accumulator is skipped.
// It assumes that prime1 and prime2 have been loaded.
#define round0(x) \
	IMULQ prime2, x \
	ROLQ  $31, x    \
	IMULQ prime1, x

// mergeRound applies a merge round on the two registers acc and x:
//
//	x   = round0(x)
//	acc = (acc ^ x)*prime1 + prime4
//
// x is overwritten with round0(x).
// It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
	round0(x)          \
	XORQ  x, acc       \
	IMULQ prime1, acc  \
	ADDQ  prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// On entry p points at the first block and end holds the last address at
// which a full block can start (both callers set it to base+len-32). Each
// pass feeds four consecutive 8-byte words to round (one per accumulator),
// advances p by 32, and repeats while p <= end. On exit p points just past
// the last block consumed and x is clobbered.
//
// The expansion defines the label "loop", so use it only once per function.
#define blockLoop() \
loop:  \
	MOVQ +0(p), x  \
	round(v1, x)   \
	MOVQ +8(p), x  \
	round(v2, x)   \
	MOVQ +16(p), x \
	round(v3, x)   \
	MOVQ +24(p), x \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  loop

// func Sum64(b []byte) uint64
//
// Sum64 hashes b and returns the 64-bit result. The input is consumed in
// stages: whole 32-byte blocks (only when len(b) >= 32), then 8-byte words,
// then at most one 4-byte word, then single bytes, followed by a final
// shift/xor/multiply mix of h.
// NOTE(review): the stage structure looks like 64-bit xxHash with a zero
// seed; confirm against the package's Go implementation.
//
// Frame: no locals; 32 bytes of arguments (the slice header at 0..23 and the
// result at 24).
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Load fixed primes.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Load slice.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ (p)(n*1), end

	// The first loop limit will be len(b)-32.
	SUBQ $32, end

	// Check whether we have at least one block.
	CMPQ n, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	// v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	MOVQ prime1, v1
	ADDQ prime2, v1
	MOVQ prime2, v2
	XORQ v3, v3
	XORQ v4, v4
	SUBQ prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// The rotations are done on copies because mergeRound below still
	// needs the unrotated accumulators.
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: start h from the ·primes entry at offset 32.
	MOVQ ·primes+32(SB), h

afterBlocks:
	// Mix in the input length.
	ADDQ n, h

	// end was base+len-32; move it to base+len-8, the last address at which
	// an 8-byte word can start. Skip the word loop if p is already past it.
	ADDQ $24, end
	CMPQ p, end
	JG   try4

loop8:
	// h = rotl(h ^ round0(word), 27)*prime1 + prime4
	MOVQ  (p), x
	ADDQ  $8, p
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	CMPQ p, end
	JLE  loop8

try4:
	// end = base+len-4: consume one 4-byte word if at least 4 bytes remain.
	ADDQ $4, end
	CMPQ p, end
	JG   try1

	// MOVL zero-extends the 32-bit load into x.
	// h = rotl(h ^ word*prime1, 23)*prime2 + ·primes+16
	MOVL  (p), x
	ADDQ  $4, p
	IMULQ prime1, x
	XORQ  x, h

	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h

try1:
	// end = base+len: the true end of the input. Nothing left if p >= end.
	ADDQ $4, end
	CMPQ p, end
	JGE  finalize

loop1:
	// h = rotl(h ^ byte*(·primes+32), 11) * prime1
	MOVBQZX (p), x
	ADDQ    $1, p
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	CMPQ p, end
	JL   loop1

finalize:
	// Final mix:
	// h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= ·primes+16; h ^= h >> 32.
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs the 32-byte block loop over b, starting from the four
// accumulators stored in the first 32 bytes of *d (offsets 0, 8, 16, 24) and
// storing them back afterwards. It returns the number of bytes consumed
// (p minus the original base pointer); any tail shorter than 32 bytes is
// left untouched. The caller must pass at least 32 bytes.
//
// Frame: no locals; 40 bytes of arguments (the pointer at 0, the slice
// header at 8..31, and the result at 32).
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Load fixed primes needed for round.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	// Load slice.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ (p)(n*1), end
	SUBQ $32, end

	// Load vN from d.
	// NOTE(review): the argument is addressed as s+0(FP) although the
	// prototype comment above names it d. The name d is #defined as AX in
	// this file, so it presumably cannot be written here; confirm the name
	// against the Go declaration.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
	blockLoop()

	// Copy vN back to d.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// The number of bytes written is p minus the old base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET