#error "This kernel requires SSE3"
#elif defined(ATL_GAS_x8664)
#error "This kernel requires x86 assembly!"
#define PFDIST 256 /* optimized for 32-bit P4E */
#define PFDIST 512 /* opt for Athlon 64 X2 */
#                      %rdi          %rsi           %rdx          %rcx
# byte offset:           4             8              12            16
# void ATL_UAXPY(const int N, const SCALAR alpha, const TYPE *X, const int incX,
#                TYPE *Y, const int incY)
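/*
 * Reference sketch (not ATLAS code) of the operation this kernel computes,
 * assuming SCALAR is const float* pointing at {ralpha, ialpha}, X and Y hold
 * N interleaved single-precision {re, im} pairs, and strides are unit
 * (the vector loops below index by 8-byte complex elements):
 *
 *    static void caxpy_ref(const int N, const float *alpha,
 *                          const float *X, float *Y)
 *    {
 *       const float ra = alpha[0], ia = alpha[1];
 *       for (int i = 0; i < N; i++)
 *       {
 *          const float rx = X[2*i], ix = X[2*i+1];
 *          Y[2*i]   += ra*rx - ia*ix;   // real part of y += alpha*x
 *          Y[2*i+1] += ra*ix + ia*rx;   // imag part of y += alpha*x
 *       }
 *    }
 */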
.global ATL_asmdecor(ATL_UAXPY)
ATL_asmdecor(ATL_UAXPY):
# Put hi{1.0,-1.0}lo in rX0
movlps (%esp), rX0 # rX0 = {XXX, XXX, 1.0, -1.0}
# Store regs to stack and load parameters
movl OFF+8(%esp), Nr # address of alpha
movlps (%rsi), alp1 # Load alpha
# Put hi{1.0,-1.0}lo in rX0
movlps -8(%rsp), rX0 # rX0 = {XXX, XXX, 1.0, -1.0}
movlhps alp1, alp1 # alp1 = {ialpha, ralpha, ialpha, ralpha}
movlhps rX0, rX0 # rX0 = {1.0 , -1.0 , 1.0 , -1.0 }
pshufd $0x11,alp1,alp2 # alp2 = {ralpha, ialpha, ralpha, ialpha}
movaps alp2, salp # salp = {ralpha, ialpha, ralpha, ialpha}
mulps rX0, alp2 # alp2 = {ralpha, -ialph, ralpha, -ialph}
mulss rX0, salp # salp = {ralpha, ialpha, ralpha, -ialph}
pshufd $0xE1,salp,salp # salp = {ralpha, ialpha, -ialph, ralpha}
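/*
 * Hedged intrinsics sketch (illustrative, not ATLAS code) of the three alpha
 * vectors built above, with lanes written {hi,...,lo} as in the comments;
 * _mm_set_ps takes its arguments highest lane first:
 *
 *    #include <immintrin.h>
 *    __m128 alp1 = _mm_set_ps(ia,  ra,  ia,  ra);  // {ia, ra, ia, ra}
 *    __m128 alp2 = _mm_set_ps(ra, -ia,  ra, -ia);  // {ra, -ia, ra, -ia}
 *    __m128 salp = _mm_set_ps(ra,  ia, -ia,  ra);  // {ra, ia, -ia, ra}
 *
 * The kernel instead derives them from two 64-bit loads plus
 * MOVLHPS/PSHUFD/MULPS/MULSS, so nothing is rebuilt inside the loops.
 */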
# If X is only 4-byte aligned, its alignment cannot be fixed,
# Force X to 16-byte boundary so we can use MOVSxDUP
# One peeled iteration to force X to 16-byte alignment
# salp = { ra, ia, -ia, ra}
movlps (X), rX0 # rX0 = { XX, XX, ix, rx}
xorps rY0, rY0 # get rid of junk in top 64 bits
movlhps rX0, rX0 # rX0 = { ix, rx, ix, rx}
movlps (Y), rY0 # rY0 = { 0, 0, iy, ry}
mulps salp, rX0 # rX0 = {ra*ix, ia*rx, -ia*ix, ra*rx}
haddps rX0, rX0 # rX0 = {XX,XX, ra*ix+ia*rx, ra*rx-ia*ix}
addps rX0, rY0 # rY0 = {XX,XX, iyN, ryN}
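/*
 * Worked scalar equivalent (hedged) of the peeled element above: with
 * salp = {ra, ia, -ia, ra}, the MULPS/HADDPS pair yields exactly one
 * complex multiply-add,
 *
 *    ryN = ry + (ra*rx - ia*ix)   // low HADDPS lane
 *    iyN = iy + (ra*ix + ia*rx)   // high HADDPS lane
 *
 * after which X and Y advance by one 8-byte complex element, leaving X on a
 * 16-byte boundary for the vector loop.
 */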
# alp1 = {ia, ra, ia, ra}
# alp2 = {ra, -ia, ra, -ia}
movsldup (X,N,8), rX0 # rX0 = {rx1, rx1, rx0, rx0}
movshdup (X,N,8), rX1 # rX1 = {ix1, ix1, ix0, ix0}
movaps (Y,N,8), rY0 # rY0 = {iy1, ry1, iy0, ry0}
mulps alp1, rX0 # rX0 = {ia*rx1,ra*rx1,ia*rx0,ra*rx0}
prefetchw PFDIST(Y,N,8)
addps rX0, rY0 # rY0 gets 1st part of results
mulps alp2, rX1 # rX1 = {ra*ix1,-ia*ix1,ra*ix0,-ia*ix0}
prefetcht0 PFDIST(X,N,8)
addps rX1, rY0 # rY0 gets last part of results
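/*
 * Hedged intrinsics sketch of one pass of the aligned loop above (two
 * complex elements per iteration; x, y, alp1, alp2 are illustrative names):
 *
 *    #include <immintrin.h>
 *    __m128 rX0 = _mm_moveldup_ps(_mm_load_ps(x));  // {rx1, rx1, rx0, rx0}
 *    __m128 rX1 = _mm_movehdup_ps(_mm_load_ps(x));  // {ix1, ix1, ix0, ix0}
 *    __m128 rY0 = _mm_load_ps(y);                   // {iy1, ry1, iy0, ry0}
 *    rY0 = _mm_add_ps(rY0, _mm_mul_ps(alp1, rX0));  // += {ia*rx, ra*rx}
 *    rY0 = _mm_add_ps(rY0, _mm_mul_ps(alp2, rX1));  // += {ra*ix, -ia*ix}
 *    _mm_store_ps(y, rY0);
 *
 * PREFETCHW pulls the Y line for writing and PREFETCHT0 pulls the X line for
 * reading, each PFDIST bytes ahead of the current index; the read side maps
 * to _mm_prefetch((const char*)x + PFDIST, _MM_HINT_T0).
 */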
# Do one more scalar iteration if there's a remainder
# salp = { ra, ia, -ia, ra}
movlps (X), rX0 # rX0 = { XX, XX, ix, rx}
xorps rY0, rY0 # get rid of junk in top 64 bits
movlhps rX0, rX0 # rX0 = { ix, rx, ix, rx}
movlps (Y), rY0 # rY0 = { 0, 0, iy, ry}
mulps salp, rX0 # rX0 = {ra*ix, ia*rx, -ia*ix, ra*rx}
haddps rX0, rX0 # rX0 = {XX,XX, ra*ix+ia*rx, ra*rx-ia*ix}
addps rX0, rY0 # rY0 = {XX,XX, iyN, ryN}
# This code assumes aligned X, but unaligned Y
# alp1 = {ia, ra, ia, ra}
# alp2 = {ra, -ia, ra, -ia}
movsldup (X,N,8), rX0 # rX0 = {rx1, rx1, rx0, rx0}
movshdup (X,N,8), rX1 # rX1 = {ix1, ix1, ix0, ix0}
movups (Y,N,8), rY0 # rY0 = {iy1, ry1, iy0, ry0}
mulps alp1, rX0 # rX0 = {ia*rx1,ra*rx1,ia*rx0,ra*rx0}
prefetchw PFDIST(Y,N,8)
addps rX0, rY0 # rY0 gets 1st part of results
mulps alp2, rX1 # rX1 = {ra*ix1,-ia*ix1,ra*ix0,-ia*ix0}
prefetcht0 PFDIST(X,N,8)
addps rX1, rY0 # rY0 gets last part of results
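/*
 * In intrinsics terms (hedged), the only change from the aligned-Y sketch
 * above is the Y access: _mm_load_ps/_mm_store_ps become
 * _mm_loadu_ps/_mm_storeu_ps, mirroring the MOVAPS -> MOVUPS swap here.
 */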
# X is not aligned even to an 8-byte boundary, so it cannot be aligned at all.
# This shouldn't happen often, so rather than adding a special case we reuse
# the unaligned approach here, i.e. this code assumes neither vector is aligned.
cmp $0, N # alp1 = { ia, ra, ia, ra}
je CLEANUP # salp = { ra, ia, -ia, ra}
movsldup alp1, alp1 # alp1 = { ra, ra, ra, ra}
pshufd $0x99,salp,alp2 # alp2 = { ia, -ia, ia, -ia}
movups (X,N,8), rX0 # rX0 = {ix1, rx1, ix0, rx0}
pshufd $0xB1,rX0,rX1 # rX1 = {rx1, ix1, rx0, ix0}
movups (Y,N,8), rY0 # rY0 = {iy1, ry1, iy0, ry0}
mulps alp1, rX0 # rX0 = {ix1*ra, rx1*ra, ix0*ra, rx0*ra}
prefetchw PFDIST(Y,N,8)
mulps alp2, rX1 # rX1 = {rx1*ia,-ix1*ia,rx0*ia,-ix0*ia}
prefetcht0 PFDIST(X,N,8)
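/*
 * Hedged intrinsics sketch of the neither-aligned loop body above; the
 * PSHUFD $0xB1 real/imag swap lets a single unaligned load of X feed both
 * products (names illustrative, not ATLAS API):
 *
 *    #include <immintrin.h>
 *    __m128 rX0 = _mm_loadu_ps(x);                  // {ix1, rx1, ix0, rx0}
 *    __m128 rX1 = _mm_castsi128_ps(                 // {rx1, ix1, rx0, ix0}
 *                    _mm_shuffle_epi32(_mm_castps_si128(rX0), 0xB1));
 *    __m128 rY0 = _mm_loadu_ps(y);                  // {iy1, ry1, iy0, ry0}
 *    rY0 = _mm_add_ps(rY0, _mm_mul_ps(alp1, rX0));  // alp1 = {ra,ra,ra,ra}
 *    rY0 = _mm_add_ps(rY0, _mm_mul_ps(alp2, rX1));  // alp2 = {ia,-ia,ia,-ia}
 *    _mm_storeu_ps(y, rY0);
 */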