5
5
// This file provides fast assembly versions for the elementary
6
6
// arithmetic operations on vectors implemented in arith.go.
8
// TODO(gri) - experiment with unrolled loops for faster execution
8
// Literal instruction for MOVQ $0, CX.
9
// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
10
#define ZERO_CX BYTE $0x48; \
10
18
// func mulWW(x, y Word) (z1, z0 Word)
// Assembly entry point for mulWW. Per the Go-side signature above it takes
// two Words and returns two Words (z1, z0).
// NOTE(review): the body is not visible in this excerpt, and the bare
// numeric lines around it look like leftover diff line numbers rather than
// assembler input - confirm against the canonical file.
11
19
// Declared with flag value 7 and a $0 frame size.
TEXT ·mulWW(SB),7,$0
29
37
// func addVV(z, x, y []Word) (c Word)
// Vector addition with carry. The visible unrolled body loads words through
// R8, adds the matching words addressed through R9 with ADCQ, stores the sums
// through R10, and finally writes CX to the result slot c.
// NOTE(review): R8/R9/R10 are presumably the x, y and z base pointers; their
// loads are not visible here - confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers,
// and labels L1 and E1 each appear twice (once with BX/R11 indexing, once
// with SI/DI indexing), which suggests two revisions of this routine are
// interleaved. Several instructions (including the U1 label targeted below)
// are missing from this excerpt.
30
38
TEXT ·addVV(SB),7,$0
39
// BX-indexed single-word load; appears to belong to the non-unrolled revision.
L1: MOVQ (R8)(BX*8), AX
46
E1: CMPQ BX, R11 // i < n
47
// s/JL/JMP/ below to disable the unrolled loop
49
JL V1 // if n < 0 goto V1
52
// regular loop body unrolled 4x
// (only the loads/adds for word offsets 16 and 24 are visible in this excerpt;
// all four stores are visible)
56
MOVQ 16(R8)(SI*8), R13
57
MOVQ 24(R8)(SI*8), R14
60
// ADCQ adds the memory operand plus the incoming carry flag into the register,
// so the carry chains from one word to the next.
ADCQ 16(R9)(SI*8), R13
61
ADCQ 24(R9)(SI*8), R14
62
// Store the four result words at index SI.
MOVQ R11, 0(R10)(SI*8)
63
MOVQ R12, 8(R10)(SI*8)
64
MOVQ R13, 16(R10)(SI*8)
65
MOVQ R14, 24(R10)(SI*8)
70
JGE U1 // if n >= 0 goto U1
72
// Tail handling: undo the -4 bias on the remaining count.
V1: ADDQ $4, DI // n += 4
73
JLE E1 // if n <= 0 goto E1
79
// Store a single result word (its load/add are not visible in this excerpt).
MOVQ R11, 0(R10)(SI*8)
84
JG L1 // if n > 0 goto L1
86
// Write the carry held in CX to the result slot at c+72(FP).
E1: MOVQ CX, c+72(FP) // return c
53
90
// func subVV(z, x, y []Word) (c Word)
// Vector subtraction with borrow. Words loaded through R8 have the matching
// words addressed through R9 subtracted with SBBQ; the differences are stored
// through R10, and CX is written to the result slot c.
// The borrow is kept in CX between iterations because the ADDQ/SUBQ that
// update the index and count overwrite the carry flag: RCRQ moves the saved
// bit from CX into CF before each SBBQ chain, and RCLQ moves CF back into CX
// afterwards.
// NOTE(review): R8/R9/R10 are presumably the x, y and z base pointers, and
// SI/DI the index and remaining count; their initialisation is not visible
// here - confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers,
// and labels L2 and E2 each appear twice, which suggests two revisions of
// this routine are interleaved. The U2 label targeted below is not visible.
54
// (same as addVV_s except for SBBQ instead of ADCQ and label names)
91
// (same as addVV except for SBBQ instead of ADCQ and label names)
55
92
TEXT ·subVV(SB),7,$0
64
// BX-indexed single-word load; appears to belong to the non-unrolled revision.
L2: MOVQ (R8)(BX*8), AX
71
E2: CMPQ BX, R11 // i < n
101
// s/JL/JMP/ below to disable the unrolled loop
102
// Bias the count by -4 so the sign of DI tells whether a full group of four
// words remains.
SUBQ $4, DI // n -= 4
103
JL V2 // if n < 0 goto V2
106
// regular loop body unrolled 4x
107
// Rotate-through-carry: bit 0 of CX (the saved borrow) becomes CF.
RCRQ $1, CX // CF = c
108
// Load four consecutive words starting at index SI.
MOVQ 0(R8)(SI*8), R11
109
MOVQ 8(R8)(SI*8), R12
110
MOVQ 16(R8)(SI*8), R13
111
MOVQ 24(R8)(SI*8), R14
112
// Subtract with borrow; each SBBQ consumes the CF produced by the previous one.
SBBQ 0(R9)(SI*8), R11
113
SBBQ 8(R9)(SI*8), R12
114
SBBQ 16(R9)(SI*8), R13
115
SBBQ 24(R9)(SI*8), R14
116
// Store the four results (MOVQ leaves CF untouched).
MOVQ R11, 0(R10)(SI*8)
117
MOVQ R12, 8(R10)(SI*8)
118
MOVQ R13, 16(R10)(SI*8)
119
MOVQ R14, 24(R10)(SI*8)
120
// Park the outgoing borrow back in CX before the flag-clobbering ADDQ/SUBQ.
RCLQ $1, CX // c = CF
122
ADDQ $4, SI // i += 4
123
SUBQ $4, DI // n -= 4
124
JGE U2 // if n >= 0 goto U2
126
// Tail handling: undo the -4 bias to get the number of leftover words.
V2: ADDQ $4, DI // n += 4
127
JLE E2 // if n <= 0 goto E2
130
// One-word-at-a-time version of the same restore / subtract / save sequence.
RCRQ $1, CX // CF = c
131
MOVQ 0(R8)(SI*8), R11
132
SBBQ 0(R9)(SI*8), R11
133
MOVQ R11, 0(R10)(SI*8)
134
RCLQ $1, CX // c = CF
138
JG L2 // if n > 0 goto L2
140
// Write the borrow held in CX to the result slot at c+72(FP).
E2: MOVQ CX, c+72(FP) // return c
78
144
// func addVW(z, x []Word, y Word) (c Word)
// Adds a single Word y into a vector: y is loaded as the initial carry
// ("c = y"), words are loaded through R8 and stored through R10, and CX is
// written to the result slot c.
// NOTE(review): R8/R10 are presumably the x and z base pointers, and SI/DI
// the index and remaining count; their initialisation is not visible here -
// confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers.
// y is loaded twice from different frame offsets into different registers
// (+48 into CX, +32 into AX) and label E3 appears twice, which suggests two
// revisions of this routine are interleaved. The add instructions of the
// unrolled body and the U3 label are not visible in this excerpt.
79
145
TEXT ·addVW(SB),7,$0
148
MOVQ y+48(FP), CX // c = y
82
MOVQ y+32(FP), AX // c = y
87
// BX-indexed add into AX; appears to belong to the non-unrolled revision.
L3: ADDQ (R8)(BX*8), AX
93
E3: CMPQ BX, R11 // i < n
153
// s/JL/JMP/ below to disable the unrolled loop
154
// Bias the count by -4 so the sign of DI tells whether a full group of four
// words remains.
SUBQ $4, DI // n -= 4
155
JL V3 // if n < 4 goto V3
158
// regular loop body unrolled 4x
159
// Load four consecutive words starting at index SI.
MOVQ 0(R8)(SI*8), R11
160
MOVQ 8(R8)(SI*8), R12
161
MOVQ 16(R8)(SI*8), R13
162
MOVQ 24(R8)(SI*8), R14
169
// Store the four words (the arithmetic between the loads and these stores is
// not visible in this excerpt).
MOVQ R11, 0(R10)(SI*8)
170
MOVQ R12, 8(R10)(SI*8)
171
MOVQ R13, 16(R10)(SI*8)
172
MOVQ R14, 24(R10)(SI*8)
174
ADDQ $4, SI // i += 4
175
SUBQ $4, DI // n -= 4
176
JGE U3 // if n >= 0 goto U3
178
// Tail handling: undo the -4 bias to get the number of leftover words.
V3: ADDQ $4, DI // n += 4
179
JLE E3 // if n <= 0 goto E3
183
// Store CX as the result word at index SI (the add that produced it is not
// visible in this excerpt).
MOVQ CX, 0(R10)(SI*8)
185
// Rotate the carry flag into bit 0 of CX.
RCLQ $1, CX // c = CF
189
JG L3 // if n > 0 goto L3
191
// Write the carry held in CX to the result slot at c+56(FP).
E3: MOVQ CX, c+56(FP) // return c
100
195
// func subVW(z, x []Word, y Word) (c Word)
// Subtracts a single Word y from a vector: y is loaded as the initial borrow
// ("c = y"), words are loaded through R8 and stored through R10 (the z base
// pointer), and CX is written to the result slot c.
// NOTE(review): R8 is presumably the x base pointer, and SI/DI the index and
// remaining count; their initialisation is not visible here - confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers.
// y is loaded twice from different frame offsets into different registers
// (+48 into CX, +32 into AX) and label E4 appears twice, which suggests two
// revisions of this routine are interleaved. The subtract instructions of the
// unrolled body and the U4 label are not visible in this excerpt.
196
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
101
197
TEXT ·subVW(SB),7,$0
200
MOVQ y+48(FP), CX // c = y
102
201
// R10 = base pointer of z.
MOVQ z+0(FP), R10
104
MOVQ y+32(FP), AX // c = y
109
// BX-indexed load into DX; appears to belong to the non-unrolled revision.
L4: MOVQ (R8)(BX*8), DX // TODO(gri) is there a reverse SUBQ?
116
E4: CMPQ BX, R11 // i < n
205
// s/JL/JMP/ below to disable the unrolled loop
206
// Bias the count by -4 so the sign of DI tells whether a full group of four
// words remains.
SUBQ $4, DI // n -= 4
207
JL V4 // if n < 4 goto V4
210
// regular loop body unrolled 4x
211
// Load four consecutive words starting at index SI.
MOVQ 0(R8)(SI*8), R11
212
MOVQ 8(R8)(SI*8), R12
213
MOVQ 16(R8)(SI*8), R13
214
MOVQ 24(R8)(SI*8), R14
221
// Store the four words (the arithmetic between the loads and these stores is
// not visible in this excerpt).
MOVQ R11, 0(R10)(SI*8)
222
MOVQ R12, 8(R10)(SI*8)
223
MOVQ R13, 16(R10)(SI*8)
224
MOVQ R14, 24(R10)(SI*8)
226
ADDQ $4, SI // i += 4
227
SUBQ $4, DI // n -= 4
228
JGE U4 // if n >= 0 goto U4
230
// Tail handling: undo the -4 bias to get the number of leftover words.
V4: ADDQ $4, DI // n += 4
231
JLE E4 // if n <= 0 goto E4
234
// Single-word load and store (the subtract between them is not visible in
// this excerpt).
MOVQ 0(R8)(SI*8), R11
236
MOVQ R11, 0(R10)(SI*8)
238
// Rotate the carry flag into bit 0 of CX.
RCLQ $1, CX // c = CF
242
JG L4 // if n > 0 goto L4
244
// Write the borrow held in CX to the result slot at c+56(FP).
E4: MOVQ CX, c+56(FP) // return c
123
248
// func shlVU(z, x []Word, s uint) (c Word)
// Left-shift of a vector by s bits. The visible prologue loads the length
// into BX, loads the top word x[n-1], and performs a double-register shift
// on DX:AX.
// NOTE(review): R8 is presumably the x base pointer and CX the shift count;
// their loads, the decrement of BX that the JL/JLE below test, and the main
// loop are not visible in this excerpt - confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers,
// and the length is loaded twice (32-bit MOVL from n+8 and 64-bit MOVQ from
// z_len+8), which suggests two revisions of this routine are interleaved.
124
249
TEXT ·shlVU(SB),7,$0
125
MOVL n+8(FP), BX // i = n
250
MOVQ z_len+8(FP), BX // i = len(z)
127
252
// Early exit for an empty vector.
JL X8b // i < 0 (n <= 0)
130
255
// R10 = base pointer of z.
MOVQ z+0(FP), R10
133
258
MOVQ (R8)(BX*8), AX // w1 = x[n-1]
135
260
// Double-register shift by CX on the DX:AX pair.
// NOTE(review): ŝ presumably denotes the complementary shift count
// (word size - s) - confirm the notation.
SHLQ CX, DX:AX // w1>>ŝ
139
264
JLE X8a // i <= 0
234
359
ADDQ AX, (R10)(BX*8)
239
364
E6: CMPQ BX, R11 // i < n
246
371
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// The visible code loads the z base pointer into R10, xn into DX as the
// initial remainder ("r = xn"), and the length into BX, then walks the index
// downward: each iteration loads a word through R8 into AX and stores AX to
// the same index of z.
// NOTE(review): R8 is presumably the x base pointer; its load, the
// instruction(s) between the load at L7 and the store, and the loop branch
// and return are not visible in this excerpt - confirm.
// NOTE(review): the bare numeric lines look like leftover diff line numbers.
// xn and the length are each loaded twice (xn+16 vs xn+24, MOVL n+8 vs MOVQ
// z_len+8) and label E7 appears twice (SUBL vs SUBQ), which suggests two
// revisions of this routine are interleaved.
247
372
TEXT ·divWVW(SB),7,$0
248
373
// R10 = base pointer of z.
MOVQ z+0(FP), R10
249
MOVQ xn+16(FP), DX // r = xn
252
MOVL n+8(FP), BX // i = n
374
MOVQ xn+24(FP), DX // r = xn
377
MOVQ z_len+8(FP), BX // i = len(z)
255
380
// Load the word at index BX into AX.
L7: MOVQ (R8)(BX*8), AX
257
382
// Store AX to z at index BX.
MOVQ AX, (R10)(BX*8)
259
// 32-bit decrement of the index.
E7: SUBL $1, BX // i--
384
// 64-bit decrement of the index.
E7: SUBQ $1, BX // i--
265
390
// func bitLen(x Word) (n int)
266
391
TEXT ·bitLen(SB),7,$0