2
* This kernel does GEMV by performing 28 simultaneous dot-products along
3
* the rows of the no-transpose matrix A. It assumes M is a multiple of 28.
7
#error "This kernel requires x86-64 assembly!"
10
* Integer register usage
22
* Floating point vector register usage
44
*void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda,
45
* const TYPE *X, TYPE *Y)
48
* y = beta*y + A*x with beta either 0 or 1; A is MxN, len(X) = N, len(Y) = M
51
/*
 * AddYtoDot(addr_, reg_): optionally fold the values already stored at addr_
 * into the accumulator register reg_.
 * This first definition expands to nothing, so memory at addr_ is not read.
 * NOTE(review): the preprocessor conditional that selects between the two
 * definitions is not visible here -- presumably the beta=0 vs. beta=1 case
 * of "y = [0,1]*y + A*x"; confirm against the full file.
 */
#define AddYtoDot(addr_, reg_)
53
/*
 * This definition emits addpd (AT&T operand order: source, destination), so
 * the two packed doubles at addr_ are added into reg_.  The legacy SSE2
 * encoding of addpd faults on a memory operand that is not 16-byte aligned.
 */
#define AddYtoDot(addr_, reg_) addpd addr_, reg_
56
/*
 * prefY(addr_): issue a prefetch hint for the cache line containing addr_
 * (by its name, intended for the Y vector).
 * This definition uses prefetchw, the prefetch-with-intent-to-write hint.
 * NOTE(review): the preprocessor conditional that selects between the two
 * definitions is not visible here -- presumably keyed on whether the target
 * supports prefetchw; confirm against the full file.
 */
#define prefY(addr_) prefetchw addr_
58
/*
 * Alternative definition using prefetcht0, the SSE prefetch hint that
 * requests the line in all cache levels.
 */
#define prefY(addr_) prefetcht0 addr_
61
.global ATL_asmdecor(ATL_UGEMV)
63
ATL_asmdecor(ATL_UGEMV):
66
neg NN /* make N negative for update X */
67
mov NN, N0 /* backup of N so we can restore at end of M loop */
69
sub $-128, pA0 /* pA0 += 128 bytes */
70
sub $-128, pY /* pY += 128 bytes */
71
shl $3, lda /* lda *= sizeof */
119
add $8, pX /* pX++ */
121
movapd -128(pA0), rA0
124
movapd -112(pA0), rA0
167
AddYtoDot(-128(pY), rY0)
169
AddYtoDot(-112(pY), rY2)
171
AddYtoDot(-96(pY), rY4)
173
AddYtoDot(-80(pY), rY6)
175
AddYtoDot(-64(pY), rY8)
177
AddYtoDot(-48(pY), rY10)
179
AddYtoDot(-32(pY), rY12)
181
AddYtoDot(-16(pY), rY14)
183
AddYtoDot((pY), rY16)
185
AddYtoDot(16(pY), rY18)
187
AddYtoDot(32(pY), rY20)
189
AddYtoDot(48(pY), rY22)
191
AddYtoDot(64(pY), rY24)
193
AddYtoDot(80(pY), rY26)
196
lea (pX, N0, 8), pX /* pX -= N*sizeof */