#error "This kernel requires x86-64 assembly!"
#define M %rdi /* already in */
#define N %rsi /* already in */
#define itmp %rax /* tmp used for byte load/use pipeline */
#define pX %rdx /* already in */
#define pA0 %rcx /* 8(%rsp) */
#define II %rbx /* loaded in loop */
#define pY %rbp /* moved from %r8 */
#define incAn %r8 /* 16(%rsp) */
(ATL_CINT M, ATL_CINT N, const TYPE alpha0, const TYPE *X,
 ATL_CINT incX, const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
          %rcx           %r8            %r9      8(%rsp)    16(%rsp)
.global ATL_asmdecor(ATL_UGERK)
ATL_asmdecor(ATL_UGERK):
* Save callee-saved iregs
* Load & compute all integer variables
movslq 16(%rsp), incAn /* incAn = lda */
sub M, incAn /* incAn = lda - M */
shl $3, incAn /* incAn = (lda-M)*sizeof */
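/*
 * Column-step arithmetic: after the M-loop walks one column of the
 * column-major A, pA0 sits M doubles past the column top, so adding
 * incAn = (lda-M)*sizeof lands it on top of the next column.
 * E.g., lda=1000, M=800: incAn = 200*8 = 1600 bytes.
 */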
mov M, incX /* incX = M */
shl $3, incX /* incX = M*sizeof */
lea (pY, N, 8), pY /* pY += N */
movb (pA0), itb /* forced fetch of pA0 */
movb itb, (pA0) /* force cache write coherence message */
movddup (pY,N,8), y0 /* y0 = {y0, y0} */
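/*
 * movddup (SSE3) broadcasts the low double, so y0 = {y0, y0} feeds one
 * mulpd that scales two x values by the same y element.  The (pY,N,8)
 * addressing implies N is negated elsewhere after pY was advanced past
 * the end of Y, letting one register serve as both index and counter.
 */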
* Align data on CL boundary
test mask, pA0 /* (pA0 & 0x3F) == 0 --> 64-byte aligned */
jz CLALIGNED /* start aligned loop */
LOOPALIGNCL: /* loop until aligned or out of ops */
movsd (pX), x0 /* x0 = {xx, x0} */
mulsd y0, x0 /* x0 = {xx, x0*y0} */
addsd (pA0), x0 /* x0 = {xx, a00+x0*y0} */
jz MLOOPDONE /* finish MLOOP if out of M */
test mask, pA0 /* (pA0 & 0x3F) != 0 --> still unaligned */
jnz LOOPALIGNCL /* continue until aligned */
mov II, Mr /* Mr = remaining iterations */
and mask7, II /* II = ((remaining iter)/8)*8 */
jz ROLLED_DO_MR /* if nothing left, goto cleanup loop */
sub II, Mr /* Mr = # of iter left after II (mul8) done */
sub $8, II /* stop 1 iter early; if not enough iters left */
jbe ROLLED_ADD8 /* handle everything in rolled loop */
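/*
 * Trip-count split, e.g. 27 rows left after the peel: II = 24 (the
 * multiple-of-8 part, run unrolled), Mr = 3 (left for the rolled
 * cleanup loop), and II loses 8 so the last unrolled pass can run as
 * the pipeline drain below.
 */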
* This loop starts at a 64-byte cache line boundary
movapd (pX), x0 /* x0 = {x1, x0}, pipelined out of loop */
movb 64(pA0), itb /* forced fetch of next cache line of pA0 */
movb itb, 64(pA0) /* force cache write coherence message */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd (pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
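/*
 * Same byte-touch trick, one line ahead: each pass covers 8 doubles =
 * 64 bytes of A, so touching 64(pA0) fetches and dirties the next
 * cache line before its stores arrive.
 */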
movapd 16(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 16(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd 32(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 32(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd 48(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 48(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd (pX),x0 /* x0 = {x1, x0}, SW pipelined */
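/*
 * Software pipelining: the (pX) load above fetches the next pass's
 * first x pair before the loop branches back, hiding its latency
 * behind the current pass's arithmetic.
 */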
* Drain preloading pipe
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd (pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd 16(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 16(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd 32(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 32(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
movapd 48(pX),x0 /* x0 = {x1, x0} */
mulpd y0, x0 /* x0 = {x1*y0, x0*y0} */
addpd 48(pA0), x0 /* x0 = {a10+x1*y0, a00+x0*y0} */
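/*
 * This drain replays the four unrolled stages for the final iteration
 * reserved by "sub $8, II", consuming the preloaded x pair without
 * reloading (pX) for a pass that will never run.
 */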
* EPILOGUE: restore registers and return
movq -32(%rsp), %r13
movq -40(%rsp), %r14
movq -48(%rsp), %r15
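/*
 * These restores mirror prologue saves made at negative %rsp offsets,
 * i.e., in the SysV red zone (the 128 bytes below %rsp that leaf code
 * may use without adjusting the stack pointer).
 */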
movsd (pX), x0 /* x0 = {xx, x0} */
mulsd y0, x0 /* x0 = {xx, x0*y0} */
addsd (pA0), x0 /* x0 = {xx, a00+x0*y0} */
add $8, pA0 /* pA0++ */
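/*
 * Rolled cleanup: the Mr leftover rows go one double at a time, with
 * pA0 stepping sizeof(double) = 8 bytes per pass.
 */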