* Automatically Tuned Linear Algebra Software v3.10.1
* Copyright (C) 2012, 2010 R. Clint Whaley
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the ATLAS group or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
#error "This routine requires SSE3!"
#include "atlas_asm.h"
* This file does a 1x4 unrolled mvt_sse with these params:
#error "This kernel requires x86-64 assembly!"
* Integer register assignment
* SSE register assignment
#define NONEPONEOFF -72
#define NONEPONE %xmm15
* Define macros controlling prefetch
#define PFADIST PFDIST
#define PFIY prefetchw
#define PFIY prefetchnta
#define PFIX prefetcht0
#define PFIA prefetchnta
#if PFADIST == 0 /* flag for no prefetch */
#define prefA(mem) PFIA mem
#if PFYDIST == 0 /* flag for no prefetch */
#define prefY(mem) PFIY mem
#if PFXDIST == 0 /* flag for no prefetch */
#define prefX(mem) PFIX mem
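/*
 * Usage sketch for the prefetch macros above (illustrative; the PF*DIST
 * values and the #else/#endif branches of these conditionals live in lines
 * not shown here).  With PFADIST nonzero and PFIA defined as prefetchnta,
 * a call such as
 *    prefA(PFADIST+0(pA0))
 * expands to
 *    prefetchnta PFADIST+0(pA0)
 * i.e. a non-temporal prefetch PFADIST bytes ahead in the current column
 * of A, while a PF*DIST of 0 flags "no prefetch" and the corresponding
 * macro expands to nothing.
 */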
*                %rdi        %rsi         %rdx          %rcx
* void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda,
*                   %r8            %r9
*                const TYPE *X, TYPE *Y)
.global ATL_asmdecor(ATL_UGEMV)
ATL_asmdecor(ATL_UGEMV):
* Save callee-saved iregs
* Compute M = (M/MU)*MU, Mr = M - (M/MU)*MU
* NOTE: Mr is %rcx reg, so we can use jrcxz to go to cleanup loop
mov %rcx, lda /* move lda to assigned register, rax */
mov $1, Mr /* setup assignment to peel */
xor Ma, Ma /* default to no peel */
test $0xF, pA0 /* 0 if 16-byte aligned */
cmovnz Mr, Ma /* if nonzero, say need 1 iteration peel */
mov M, Mr /* Mr = M */
shr $3, M /* M = M / MU */
shl $3, M /* M = (M/MU)*MU */
sub M, Mr /* Mr = M - (M/MU)*MU */
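/*
 * C-level sketch of the block/remainder split above (a sketch only; MU is
 * assumed to be 8, matching the shift-by-3, and Ma/M/Mr are the names this
 * file assigns to those registers):
 *
 *    Ma = (((size_t)A) & 0xF) ? 1 : 0;   -- peel 1 element if A not 16-byte
 *                                           aligned
 *    Mr = M;
 *    M  = (M >> 3) << 3;                 -- M rounded down to a multiple of MU
 *    Mr = Mr - M;                        -- leftover iterations for cleanup
 */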
* Construct NONEPONE = {-1.0,1.0,-1.0,1.0}
fldz /* ST = 0.0 1.0 */
fsub %st(1), %st /* ST = -1.0 1.0 */
fsts NONEPONEOFF+4(%rsp)
fstps NONEPONEOFF+12(%rsp) /* ST = 1.0 */
fsts NONEPONEOFF(%rsp)
fstps NONEPONEOFF+8(%rsp) /* ST=NULL, mem=-1,1,-1,1*/
movapd NONEPONEOFF(%rsp), NONEPONE
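/*
 * What the x87 sequence above leaves in memory, low address first
 * (single-precision values at NONEPONEOFF(%rsp) .. +12):
 *    { 1.0f, -1.0f, 1.0f, -1.0f }
 * so NONEPONE reads {-1,1,-1,1} when lanes are listed high-to-low, the
 * convention the comments in this file use.  It is later used in the
 * reduction (mulps NONEPONE, ...) to fix up the alternating signs in the
 * real-part accumulators before the horizontal adds.
 */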
mov lda, incAn /* incAn = lda */
sub M, incAn /* incAn = lda - (M/MU)*MU */
sub Mr, incAn /* incAn = lda - M */
shl $3, incAn /* incAn = (lda-M)*sizeof */
shl $3, lda /* lda *= sizeof */
sub $-128, pA0 /* code compaction by using signed 1-byte offsets */
sub $-128, pX /* code compaction by using signed 1-byte offsets */
mov pX, pX0 /* save for restore after M loops */
mov $-64, incAXm /* code comp: use reg rather than constant */
lea (lda, lda,2), lda3 /* lda3 = 3*lda */
lea (incAn, lda3), incAn /* incAn = (4*lda-M)*sizeof */
mov $8*1, incII /* code comp: use reg rather than constant */
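/*
 * Quick check of the column-advance arithmetic above, with sizeof = 8
 * bytes per element (the shl $3 scaling):
 *    incAn = (lda - M)*8 + 3*lda*8 = (4*lda - M)*8
 * so after the M loop has walked M elements down a group of 4 columns,
 * adding incAn to pA0 lands on the top of the next group of 4 columns.
 */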
* If no peeled iteration, start M-loop, else do peeled iteration
movlps -128(pX), rX0 /* rX0 = {0, 0, Xi, Xr} */
pshufd $0xB1, rX0, rx0 /* rx0 = {0, 0, Xr, Xi} */
movlps -128(pA0), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda,2), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda3), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
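/*
 * Scalar sketch of what each column picks up from one X element (the
 * accumulator adds happen in code not shown in this excerpt; variable
 * names here are purely illustrative):
 *    t_rr = A_r*X_r;   t_ii = A_i*X_i;   -- the rt0 products
 *    t_ir = A_i*X_r;   t_ri = A_r*X_i;   -- the rA0 (shuffled-X) products
 *    Y_r += t_rr - t_ii;                 -- real part, formed in the reduce
 *    Y_i += t_ir + t_ri;                 -- imaginary part
 * i.e. an ordinary complex multiply-accumulate per (row, column) pair.
 */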
movaps 0-128(pX), rX0 /* rX0 = Xi, Xr */
pshufd $0xB1, rX0, rx0 /* rx0 = Xr, Xi */
MOVA 0-128(pA0), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
prefA(PFADIST+0(pA0))
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 0-128(pA0,lda), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
prefA(PFADIST+0(pA0,lda))
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 0-128(pA0,lda,2), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
prefA(PFADIST+0(pA0,lda,2))
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 0-128(pA0,lda3), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
prefA(PFADIST+0(pA0,lda3))
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
movaps 16-128(pX), rX0 /* rX0 = Xi, Xr */
pshufd $0xB1, rX0, rx0 /* rx0 = Xr, Xi */
MOVA 16-128(pA0), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 16-128(pA0,lda), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 16-128(pA0,lda,2), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 16-128(pA0,lda3), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
movaps 32-128(pX), rX0 /* rX0 = Xi, Xr */
pshufd $0xB1, rX0, rx0 /* rx0 = Xr, Xi */
MOVA 32-128(pA0), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 32-128(pA0,lda), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 32-128(pA0,lda,2), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 32-128(pA0,lda3), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
movaps 48-128(pX), rX0 /* rX0 = Xi, Xr */
pshufd $0xB1, rX0, rx0 /* rx0 = Xr, Xi */
MOVA 48-128(pA0), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 48-128(pA0,lda), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 48-128(pA0,lda,2), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
MOVA 48-128(pA0,lda3), rA0 /* rA0 = Ai, Ar */
movaps rA0, rt0 /* rt0 = Ai, Ar */
mulps rx0, rA0 /* rA0 = Ai*Xr, Ar*Xi */
mulps rX0, rt0 /* rt0 = Ai*Xi, Ar*Xr */
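/*
 * Loop-structure sketch for one pass of the code above (illustrative C;
 * loop indices invented for the sketch): the four 16-byte groups at X
 * offsets 0,16,32,48 from the -128 bias cover MU=8 elements against the
 * same rows of the 4 columns pA0, pA0+lda, pA0+2*lda, pA0+3*lda:
 *
 *    for (i = 0; i < 8; i++)               -- MU = 8 elements per pass
 *       for (j = 0; j < 4; j++)            -- the 4 unrolled columns
 *          acc[j] += A[i + j*lda] * X[i];  -- complex MAC, as sketched earlier
 */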
movlps -128(pX), rX0 /* rX0 = {0, 0, Xi, Xr} */
pshufd $0xB1, rX0, rx0 /* rx0 = {0, 0, Xr, Xi} */
movlps -128(pA0), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda,2), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
movlps -128(pA0,lda3), rA0 /* rA0 = {0, 0, Ai, Ar} */
movaps rA0, rt0 /* rt0 = {0, 0, Ai, Ar} */
mulps rx0, rA0 /* rA0 = {0, 0, Xr*Ai, Xi*Ar} */
mulps rX0, rt0 /* rt0 = {0, 0, Xi*Ai, Xr*Ar} */
/* rYr0 = {-rY0d, rY0c, -rY0b, rY0a} */
/* rYi0 = { iY0d, iY0c, iY0b, iY0a} */
mulps NONEPONE, rY0r /* rYr = {rY0d, rY0c, rY0b, rY0a} */
mulps NONEPONE, rY1r /* rYr = {rY1d, rY1c, rY1b, rY1a} */
haddps rY0i, rY0r /* rYr = {iY0cd, iY0ab, rY0cd, rY0ab} */
haddps rY1i, rY1r /* rYr = {iY1cd, iY1ab, rY1cd, rY1ab} */
haddps rY1r, rY0r /* rYr = {iY1abcd,rY1abcd,iY0abcd,rY0abcd} */
mulps NONEPONE, rY2r /* rYr = {rY2d, rY2c, rY2b, rY2a} */
mulps NONEPONE, rY3r /* rYr = {rY3d, rY3c, rY3b, rY3a} */
haddps rY2i, rY2r /* rYr = {iY2cd, iY2ab, rY2cd, rY2ab} */
haddps rY3i, rY3r /* rYr = {iY3cd, iY3ab, rY3cd, rY3ab} */
haddps rY3r, rY2r /* rYr = {iY3abcd,rY3abcd,iY2abcd,rY2abcd} */
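/*
 * Reduction sketch: after the NONEPONE sign fix-up, each haddps sums
 * adjacent float lanes, so (listing lanes high-to-low as above)
 *    haddps rY0i, rY0r -> { iY0cd, iY0ab, rY0cd, rY0ab }
 *    haddps rY1r, rY0r -> { Y1.imag, Y1.real, Y0.imag, Y0.real }
 * leaving the accumulated column dot products in interleaved real/imag
 * order, ready for the scale-and-store to Y that follows in code not
 * shown in this excerpt.
 */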
prefY(4*8+PFYDIST(pY))
* EPILOGUE: restore registers and return