2
* Automatically Tuned Linear Algebra Software v3.10.1
4
* Redistribution and use in source and binary forms, with or without
5
* modification, are permitted provided that the following conditions
7
* 1. Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* 2. Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions, and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* 3. The name of the ATLAS group or the names of its contributors may
13
* not be used to endorse or promote products derived from this
14
* software without specific written permission.
16
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
20
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
* POSSIBILITY OF SUCH DAMAGE.
29
/*
 * Preamble for the ATLAS complex rank-2 update kernel (GER2K),
 * x86-64 AT&T/GAS syntax, preprocessed via atlas_asm.h.
 * NOTE(review): the stray bare-number lines below look like original
 * file line numbers fused in by a broken extraction — confirm against
 * the upstream ATLAS 3.10.1 source before assembling.
 */
#include "atlas_asm.h"
31
/*
 * NOTE(review): the #if/#ifdef guards that should make these #error
 * directives conditional are not visible in this chunk — confirm they
 * exist in the full file; as shown, preprocessing would always fail.
 */
#error "This kernel requires x86-64 assembly!"
34
#error "This routine requires SSE3!"
39
/* Register map: symbolic names for the integer registers used below. */
#define M %rdi /* already in */
40
#define N %rsi /* already in */
41
#define II %rax /* loaded in loop */
42
#define pX %rdx /* already in */
43
#define pA0 %r11 /* 56(%rsp) */
44
#define pA1 %rbx /* computed */
45
#define pY %rcx /* already in */
46
#define pZ %r9 /* already in */
47
#define lda %r10 /* 16(%rsp) */
48
#define pW %r8 /* already in */
63
#define vposneg %xmm15
69
/*
 * C prototype of the routine implemented below.  The register/stack
 * annotations on the middle line belong to the trailing arguments
 * (SysV AMD64: W in %r8, Z in %r9, A at 8(%rsp), lda at 16(%rsp)).
 */
/* (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y, */
70
/* %r8 %r9 8(%rsp) 16(%rsp) */
71
/* const TYPE *W, const TYPE *Z, TYPE *A, ATL_CINT lda); */
74
/*
 * ATL_UGER2K: double-complex rank-2 update, A += x*y^T + w*z^T,
 * computed two columns of A at a time with SSE2/SSE3 (movddup).
 * NOTE(review): this chunk is a fragment — the loop labels, the II
 * counter initialization, the callee-saved register spills, and the
 * code that writes the {1.0,-1.0} constant to -24(%rsp) are not
 * visible here.  The xmm aliases (x0, revx0, a0, A0, y0r, y0i, z0r,
 * z1i, ...) are presumably #defined in a part of the file not shown —
 * confirm against the full source.
 */
.global ATL_asmdecor(ATL_UGER2K)
76
ATL_asmdecor(ATL_UGER2K):
79
/* construct vector that has -1.0 in low word, and 1.0 in high word */
85
/* NOTE(review): -24(%rsp) (red-zone scratch) is presumed initialized
 * with {1.0,-1.0} by code not shown in this chunk — confirm. */
movapd -24(%rsp), vposneg /* vposneg = {1.0, -1.0} */
87
/* Save callee-saved regs */
91
/* Load & compute all integer variables */
94
shl $4, lda /* lda *= sizeof(double complex) == 16 */
96
lea (pA0, lda), pA1 /* pA1 = pA0 + lda */
98
lea -2(M,M), M /* M = 2(M-1) */
99
// add M, M /* M = 2*M */
100
lea (pX, M, 8), pX /* pX += 2*M */
101
lea (pW, M, 8), pW /* pW += 2*M */
102
lea (pA0, M, 8), pA0 /* pA0 += 2*M */
103
lea (pA0, lda), pA1 /* pA1 next column over */
106
/* We assume N is a multiple of 2 for this loop */
109
/* Broadcast real/imag parts of y[0] and y[1]; vposneg negates the
 * low lane so a single mulpd/addpd pair does the complex multiply. */
movddup (pY), y0r /* y0r = {y0r, y0r} */
110
movddup 8(pY), y0i /* y0i = {y0i, y0i} */
111
mulpd vposneg, y0i /* y0i = {y0i,-y0i} */
112
movddup 16(pY), y1r /* y1r = {y1r, y1r} */
113
movddup 24(pY), y1i /* y1i = {y1i, y1i} */
114
mulpd vposneg, y1i /* y1i = {y1i,-y1i} */
115
movapd (pX,M,8), x0 /* x0 = {x0i, x0r} */
118
movddup (pZ), z0r /* z0r = {z0r, z0r} */
119
movddup 8(pZ), z0i /* z0i = {z0i, z0i} */
120
mulpd vposneg, z0i /* z0i = {z0i,-z0i} */
121
movddup 16(pZ), z1r /* z1r = {z1r, z1r} */
122
movddup 24(pZ), z1i /* z1i = {z1i, z1i} */
123
mulpd vposneg, z1i /* z1i = {z1i,-z1i} */
125
/* 0x4E swaps the two 64-bit halves of the register. */
pshufd $0x4E, x0, revx0 /* revx0 = {x0r, x0i} */
131
/* Reuse X to compute rank-1 update x*y for 2 columns of A */
133
movapd x0, a0 /* a0 = {x0i, x0r} */
134
mulpd y0r, a0 /* a0 = {x0i*y0r, x0r*y0r} */
135
addpd (pA0,II,8), a0 /* a0 = {a0i+x0i*y0r, a0r+x0r*y0r} */
137
movapd revx0, A0 /* A0 = {x0r, x0i} */
138
mulpd y0i, A0 /* A0 = {x0r*y0i, -x0i*y0i} */
139
addpd A0, a0; /* a0 = {a0i+x0i*y0r+x0r*y0i,a0r+x0r*y0r-x0i*y0i} */
141
movapd x0, a1 /* a1 = {x0i, x0r} */
142
movapd (pW,II,8), x0 /* x0 = {w0i, w0r} */
143
mulpd y1r, a1 /* a1 = {x0i*y1r, x0r*y1r} */
144
addpd (pA1,II,8), a1 /* a1 = {A1i+x0i*y1r, A1r+x0r*y1r} */
145
mulpd y1i, revx0 /* rx0= {x0r*y1i, -x0i*y1i} */
146
addpd revx0, a1 /* a1={A1i+x0i*y1r+x0r*y1i, A1r+x0r*y1r-x0i*y1i} */
147
pshufd $0x4E, x0, revx0 /* revx0 = {w0r, w0i} */
149
/* Reuse W to compute rank-1 update w*z for 2 columns of A to complete */
150
/* the rank-2 update of these two column elements */
153
movapd x0, A0 /* A0 = {w0i, w0r} */
154
mulpd z0r, A0 /* A0 = {w0i*z0r, w0r*z0r} */
155
addpd A0, a0 /* a0 = {a0i+w0i*z0r, a0r+w0r*z0r} */
157
movapd revx0, A0 /* A0 = {w0r, w0i} */
158
mulpd z0i, A0 /* A0 = {w0r*z0i, -w0i*z0i} */
159
addpd A0, a0 /* a0 = completed rank-2 update */
160
movapd a0, (pA0,II,8) /* store completed rank-2 update */
162
mulpd z1r, x0 /* x0 = {w0i*z1r, w0r*z1r} */
163
addpd x0, a1 /* a1 = {a1i+w0i*z1r, a1r+w0r*z1r} */
164
movapd 16(pX,II,8), x0 /* x0 = {x0i, x0r} */
166
mulpd z1i, revx0 /* revx0={w0r*z1i, -w0i*z1i} */
167
addpd revx0, a1 /* completed rank-2 update for a1 */
168
pshufd $0x4E, x0, revx0 /* revx0 = {x0r, x0i} */
169
movapd a1, (pA1,II,8) /* store column-1 rank-2 result */
179
/* Second unrolled iteration: identical pattern on the next element.
 * Reuse X to compute rank-1 update x*y for 2 columns of A */
181
movapd x0, a0 /* a0 = {x0i, x0r} */
182
mulpd y0r, a0 /* a0 = {x0i*y0r, x0r*y0r} */
183
addpd (pA0,II,8), a0 /* a0 = {a0i+x0i*y0r, a0r+x0r*y0r} */
185
movapd revx0, A0 /* A0 = {x0r, x0i} */
186
mulpd y0i, A0 /* A0 = {x0r*y0i, -x0i*y0i} */
187
addpd A0, a0; /* a0 = {a0i+x0i*y0r+x0r*y0i,a0r+x0r*y0r-x0i*y0i} */
189
movapd x0, a1 /* a1 = {x0i, x0r} */
190
movapd (pW,II,8), x0 /* x0 = {w0i, w0r} */
191
mulpd y1r, a1 /* a1 = {x0i*y1r, x0r*y1r} */
192
addpd (pA1,II,8), a1 /* a1 = {A1i+x0i*y1r, A1r+x0r*y1r} */
193
mulpd y1i, revx0 /* rx0= {x0r*y1i, -x0i*y1i} */
194
addpd revx0, a1 /* a1={A1i+x0i*y1r+x0r*y1i, A1r+x0r*y1r-x0i*y1i} */
195
pshufd $0x4E, x0, revx0 /* revx0 = {w0r, w0i} */
197
/* Reuse W to compute rank-1 update w*z for 2 columns of A to complete */
198
/* the rank-2 update of these two column elements */
201
movapd x0, A0 /* A0 = {w0i, w0r} */
202
mulpd z0r, A0 /* A0 = {w0i*z0r, w0r*z0r} */
203
addpd A0, a0 /* a0 = {a0i+w0i*z0r, a0r+w0r*z0r} */
205
movapd revx0, A0 /* A0 = {w0r, w0i} */
206
mulpd z0i, A0 /* A0 = {w0r*z0i, -w0i*z0i} */
207
addpd A0, a0 /* a0 = completed rank-2 update */
208
movapd a0, (pA0,II,8) /* store completed rank-2 update */
210
mulpd z1r, x0 /* x0 = {w0i*z1r, w0r*z1r} */
211
addpd x0, a1 /* a1 = {a1i+w0i*z1r, a1r+w0r*z1r} */
213
mulpd z1i, revx0 /* revx0={w0r*z1i, -w0i*z1i} */
214
addpd revx0, a1 /* completed rank-2 update for a1 */
215
movapd a1, (pA1,II,8) /* store column-1 rank-2 result */
216
/* Advance both column pointers by two columns for the next pair of
 * y/z elements (lda already scaled to bytes above). */
lea (pA0, lda, 2), pA0
217
lea (pA1, lda, 2), pA1
223
/*
 * EPILOGUE: restore callee-saved registers and return.
 * Fix: the original lines were written as three-operand moves
 * ("movq %r13, -32(%rsp), %r13"), which is invalid AT&T syntax and
 * cannot assemble; a restore must be memory -> register.
 * NOTE(review): assumes the prologue (not visible in this chunk)
 * spilled %r13/%r14/%r15 to the red-zone slots -32/-40/-48(%rsp) —
 * confirm the offsets against the prologue.  The final ret is
 * presumably just past the end of this chunk.
 */
movq -32(%rsp), %r13 /* restore callee-saved %r13 */
movq -40(%rsp), %r14 /* restore callee-saved %r14 */
movq -48(%rsp), %r15 /* restore callee-saved %r15 */