1
// Copyright 2009 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
5
#include "../../../cmd/ld/textflag.h"
7
// This file provides fast assembly versions for the elementary
8
// arithmetic operations on vectors implemented in arith.go.
10
// Literal instruction for MOVQ $0, CX.
11
// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
12
#define ZERO_CX BYTE $0x48; \
20
// func mulWW(x, y Word) (z1, z0 Word)
21
TEXT ·mulWW(SB),NOSPLIT,$0
29
// func divWW(x1, x0, y Word) (q, r Word)
30
TEXT ·divWW(SB),NOSPLIT,$0
39
// func addVV(z, x, y []Word) (c Word)
//
// Register use visible in this routine:
//   SI  - word index i (scaled by 8 in every memory operand)
//   DI  - remaining word count n
//   CX  - carry word c, written to the result slot at E1
//   R8  - base of the vector the words are loaded from
//   R9  - base of the vector added in with ADCQ
//   R10 - base of the vector the sums are stored to
// NOTE(review): R8/R9/R10 look like the x/y/z base pointers - confirm.
40
TEXT ·addVV(SB),NOSPLIT,$0
49
// s/JL/JMP/ below to disable the unrolled loop
51
JL V1 // if n < 0 goto V1
54
// regular loop body unrolled 4x
// All loads, then all add-with-carry steps, then all stores, so the
// carry flag chains through the four ADCQs without being disturbed.
58
MOVQ 16(R8)(SI*8), R13
59
MOVQ 24(R8)(SI*8), R14
62
ADCQ 16(R9)(SI*8), R13
63
ADCQ 24(R9)(SI*8), R14
64
MOVQ R11, 0(R10)(SI*8)
65
MOVQ R12, 8(R10)(SI*8)
66
MOVQ R13, 16(R10)(SI*8)
67
MOVQ R14, 24(R10)(SI*8)
72
JGE U1 // if n >= 0 goto U1
74
// V1: re-add 4 to n; whatever is left is handled one word per pass.
V1: ADDQ $4, DI // n += 4
75
JLE E1 // if n <= 0 goto E1
81
MOVQ R11, 0(R10)(SI*8)
86
JG L1 // if n > 0 goto L1
88
E1: MOVQ CX, c+72(FP) // return c
92
// func subVV(z, x, y []Word) (c Word)
93
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// Register use visible in this routine:
//   SI  - word index i (scaled by 8 in every memory operand)
//   DI  - remaining word count n
//   CX  - borrow word c; bit 0 is moved to/from CF with RCRQ/RCLQ
//   R8  - base of the vector the minuend words are loaded from
//   R9  - base of the vector subtracted with SBBQ
//   R10 - base of the vector the differences are stored to
// NOTE(review): R8/R9/R10 look like the x/y/z base pointers - confirm.
94
TEXT ·subVV(SB),NOSPLIT,$0
103
// s/JL/JMP/ below to disable the unrolled loop
104
SUBQ $4, DI // n -= 4
105
JL V2 // if n < 0 goto V2
108
// regular loop body unrolled 4x
// The borrow is parked in CX between blocks because the ADDQ/SUBQ that
// update i and n overwrite CF; RCRQ restores it on entry and RCLQ saves
// it again before the index arithmetic.
109
RCRQ $1, CX // CF = c
110
MOVQ 0(R8)(SI*8), R11
111
MOVQ 8(R8)(SI*8), R12
112
MOVQ 16(R8)(SI*8), R13
113
MOVQ 24(R8)(SI*8), R14
114
SBBQ 0(R9)(SI*8), R11
115
SBBQ 8(R9)(SI*8), R12
116
SBBQ 16(R9)(SI*8), R13
117
SBBQ 24(R9)(SI*8), R14
118
MOVQ R11, 0(R10)(SI*8)
119
MOVQ R12, 8(R10)(SI*8)
120
MOVQ R13, 16(R10)(SI*8)
121
MOVQ R14, 24(R10)(SI*8)
122
RCLQ $1, CX // c = CF
124
ADDQ $4, SI // i += 4
125
SUBQ $4, DI // n -= 4
126
JGE U2 // if n >= 0 goto U2
128
// V2: undo the bias of 4 so n is the true remaining count (0..3 words).
V2: ADDQ $4, DI // n += 4
129
JLE E2 // if n <= 0 goto E2
132
// Tail loop: one word per pass, same restore/save of the borrow via CX.
RCRQ $1, CX // CF = c
133
MOVQ 0(R8)(SI*8), R11
134
SBBQ 0(R9)(SI*8), R11
135
MOVQ R11, 0(R10)(SI*8)
136
RCLQ $1, CX // c = CF
140
JG L2 // if n > 0 goto L2
142
E2: MOVQ CX, c+72(FP) // return c
146
// func addVW(z, x []Word, y Word) (c Word)
//
// The single word y seeds the carry word in CX, so the same carry
// mechanism that links consecutive words also injects y at the start.
// Register use visible in this routine:
//   SI  - word index i, DI - remaining word count n, CX - carry word c
//   R8  - base of the vector the words are loaded from
//   R10 - base of the vector the results are stored to
// NOTE(review): R8/R10 look like the x/z base pointers - confirm.
147
TEXT ·addVW(SB),NOSPLIT,$0
150
MOVQ y+48(FP), CX // c = y
155
// s/JL/JMP/ below to disable the unrolled loop
156
SUBQ $4, DI // n -= 4
157
JL V3 // if n < 0 goto V3
160
// regular loop body unrolled 4x
161
MOVQ 0(R8)(SI*8), R11
162
MOVQ 8(R8)(SI*8), R12
163
MOVQ 16(R8)(SI*8), R13
164
MOVQ 24(R8)(SI*8), R14
171
MOVQ R11, 0(R10)(SI*8)
172
MOVQ R12, 8(R10)(SI*8)
173
MOVQ R13, 16(R10)(SI*8)
174
MOVQ R14, 24(R10)(SI*8)
176
ADDQ $4, SI // i += 4
177
SUBQ $4, DI // n -= 4
178
JGE U3 // if n >= 0 goto U3
180
// V3: undo the bias of 4 so n is the true remaining count.
V3: ADDQ $4, DI // n += 4
181
JLE E3 // if n <= 0 goto E3
185
MOVQ CX, 0(R10)(SI*8) // store the word held in CX at index i
187
RCLQ $1, CX // c = CF
191
JG L3 // if n > 0 goto L3
193
E3: MOVQ CX, c+56(FP) // return c
197
// func subVW(z, x []Word, y Word) (c Word)
198
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// The single word y seeds the borrow word in CX.
// Register use visible in this routine:
//   SI  - word index i, DI - remaining word count n, CX - borrow word c
//   R8  - base of the vector the words are loaded from
//   R10 - base of the vector the results are stored to
// NOTE(review): R8/R10 look like the x/z base pointers - confirm.
199
TEXT ·subVW(SB),NOSPLIT,$0
202
MOVQ y+48(FP), CX // c = y
207
// s/JL/JMP/ below to disable the unrolled loop
208
SUBQ $4, DI // n -= 4
209
JL V4 // if n < 0 goto V4
212
// regular loop body unrolled 4x
213
MOVQ 0(R8)(SI*8), R11
214
MOVQ 8(R8)(SI*8), R12
215
MOVQ 16(R8)(SI*8), R13
216
MOVQ 24(R8)(SI*8), R14
223
MOVQ R11, 0(R10)(SI*8)
224
MOVQ R12, 8(R10)(SI*8)
225
MOVQ R13, 16(R10)(SI*8)
226
MOVQ R14, 24(R10)(SI*8)
228
ADDQ $4, SI // i += 4
229
SUBQ $4, DI // n -= 4
230
JGE U4 // if n >= 0 goto U4
232
// V4: undo the bias of 4 so n is the true remaining count.
V4: ADDQ $4, DI // n += 4
233
JLE E4 // if n <= 0 goto E4
236
MOVQ 0(R8)(SI*8), R11 // load word i
238
MOVQ R11, 0(R10)(SI*8) // store result word i
240
RCLQ $1, CX // c = CF
244
JG L4 // if n > 0 goto L4
246
E4: MOVQ CX, c+56(FP) // return c
250
// func shlVU(z, x []Word, s uint) (c Word)
//
// Left shift of a word vector, walking from the top word down.
// In the comments below, w and w1 are adjacent source words held in DX
// and AX, s is the shift count in CX, and ŝ is the complementary count
// (the double-width shift fills the vacated low bits of DX from the top
// bits of AX, which is what "w1>>ŝ" denotes).
// NOTE(review): R8/R10 look like the x/z base pointers - confirm.
251
TEXT ·shlVU(SB),NOSPLIT,$0
252
MOVQ z_len+8(FP), BX // i = len(z)
254
JL X8b // i < 0 (n <= 0)
260
MOVQ (R8)(BX*8), AX // w1 = x[n-1]
262
SHLQ CX, DX:AX // w1>>ŝ
269
L8: MOVQ AX, DX // w = w1
270
MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
271
SHLQ CX, DX:AX // w<<s | w1>>ŝ
272
MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
277
// X8a: the lowest word has no lower neighbour, so a plain shift is used.
X8a: SHLQ CX, AX // w1<<s
278
MOVQ AX, (R10) // z[0] = w1<<s
281
// X8b: target of the n <= 0 early exit above; the result c is zero.
X8b: MOVQ $0, c+56(FP)
285
// func shrVU(z, x []Word, s uint) (c Word)
//
// Right shift of a word vector, walking from word 0 upward.
// In the comments below, w and w1 are adjacent source words held in DX
// and AX, s is the shift count in CX, and ŝ is the complementary count
// (the double-width shift fills the vacated high bits of DX from the low
// bits of AX, which is what "w1<<ŝ" denotes).
// NOTE(review): R8/R10 look like the x/z base pointers - confirm.
286
TEXT ·shrVU(SB),NOSPLIT,$0
287
MOVQ z_len+8(FP), R11 // n = len(z)
289
JL X9b // n < 0 (n <= 0)
295
MOVQ (R8), AX // w1 = x[0]
297
SHRQ CX, DX:AX // w1<<ŝ
304
L9: MOVQ AX, DX // w = w1
305
MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
306
SHRQ CX, DX:AX // w>>s | w1<<ŝ
307
MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
314
// X9a: the highest word has no upper neighbour, so a plain shift is used.
X9a: SHRQ CX, AX // w1>>s
315
MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
318
// X9b: target of the n <= 0 early exit above; the result c is zero.
X9b: MOVQ $0, c+56(FP)
322
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Register use visible in this routine:
//   CX  - running carry word c, seeded with r
//   R11 - n = len(z), the loop bound compared against at E5
//   BX  - word index i
//   AX  - current word loaded through R8
// NOTE(review): R8 looks like the x base pointer - confirm.
323
TEXT ·mulAddVWW(SB),NOSPLIT,$0
327
MOVQ r+56(FP), CX // c = r
328
MOVQ z_len+8(FP), R11 // n = len(z)
332
L5: MOVQ (R8)(BX*8), AX // load word i
340
E5: CMPQ BX, R11 // i < n
347
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Register use visible in this routine:
//   R11 - n = len(z), the loop bound compared against at E6
//   BX  - word index i
//   AX  - current word loaded through R8
// NOTE(review): R8 looks like the x base pointer - confirm.
348
TEXT ·addMulVVW(SB),NOSPLIT,$0
352
MOVQ z_len+8(FP), R11 // n = len(z)
357
L6: MOVQ (R8)(BX*8), AX // load word i
366
E6: CMPQ BX, R11 // i < n
373
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Walks the vector from the top word down (i starts at len(z) and is
// decremented at E7). DX carries the running remainder r, seeded with xn.
// NOTE(review): R8 looks like the x base pointer - confirm.
374
TEXT ·divWVW(SB),NOSPLIT,$0
376
MOVQ xn+24(FP), DX // r = xn
379
MOVQ z_len+8(FP), BX // i = len(z)
382
L7: MOVQ (R8)(BX*8), AX // load word i
386
E7: SUBQ $1, BX // i--
392
// func bitLen(x Word) (n int)
393
TEXT ·bitLen(SB),NOSPLIT,$0