2
* Automatically Tuned Linear Algebra Software v3.8.3
3
* (C) Copyright 2004 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributors may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_asm.h"
34
*Efficeon-optimized 4x1x60 SGEMM. Pipelined to 4 (4 accumulators).
35
*Prefetches the next col of B, and a col from the next block of A in the M-loop
36
*Purposely kept small so it is retained in cache, and easy to translate when
41
#error "This kernel requires gas x86-32 assembler!"
45
#error "KB must be 60!"
48
#if !defined(KB) || (KB == 0)
49
#error "KB must be a compile-time constant!"
58
#error "MB must be multiple of 4!"
63
#define CMUL(i_) (2*(i_))
69
* Integer register usage shown be these defines
95
#define NB2so (NBso+NBso)
96
#define NB3so (NBso+NBso+NBso)
97
#define NB4so (NBso+NBso+NBso+NBso)
98
#define NB5so (NBso+NBso+NBso+NBso+NBso)
99
#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)
100
#define NB7so (NB6so+NBso)
101
#define NB8so (NB6so+NB2so)
102
#define NB9so (NB6so+NB3so)
103
#define NB10so (NB6so+NB4so)
104
#define NB11so (NB6so+NB5so)
106
#define MBKBso (MB*KB*4)
113
#define pref2(mem) prefetcht0 mem
114
#define prefB(mem) prefetcht0 mem
115
#define prefC(mem) prefetcht0 mem
122
*void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
124
* const TYPE *A, const int lda, const TYPE *B, const int ldb,
126
* const TYPE beta, TYPE *C, const int ldc)
129
.global ATL_asmdecor(ATL_USERMM)
130
ATL_asmdecor(ATL_USERMM):
132
* Save callee-saved iregs; Save old stack pointer in eax,
133
* so we can adjust for BETA alignment
136
#define BETAOFF FSIZE+36
144
* Initialize pA = A; pB = B; pC = C;
147
movl FSIZE+4(%esp), %ebx
148
movl %ebx, COFF+4(%esp)
150
movl %ebx, COFF+8(%esp)
152
movl FSIZE+20(%esp), pA
153
movl FSIZE+28(%esp), pB
154
movl FSIZE+40(%esp), pC
156
movb FSIZE+8(%esp), stN
161
* Set incCn = (ldc - MB)*sizeof
163
movl FSIZE+44(%esp), incCn
165
subl COFF+4(%esp), incCn
174
/* movl incCn, COFF(%esp) */
177
addl COFF+8(%esp), pfA
185
movb COFF+4(%esp), stM
189
lea 120+NBso(pB0), pfB
198
movss CMUL(4)(pC), rC1
199
movss CMUL(8)(pC), rC2
200
movss CMUL(12)(pC), rC3
202
movss BETAOFF(%esp), m0
209
movaps 0-120(pB0), m3
210
movaps 0-120(pA0), m0
211
movaps NBso+0-120(pA0), m1
212
movaps NB2so+0-120(pA0), m2
217
* Unrolled & pipelined K-loop
219
mulps NB3so+0-120(pA0), m3
221
movaps 16-120(pB0), m0
223
movaps NB0so+16-120(pA0), m1
226
movaps NB1so+16-120(pA0), m2
229
movaps NB2so+16-120(pA0), m3
232
mulps NB3so+16-120(pA0), m0
234
movaps 32-120(pB0), m1
236
movaps NB0so+32-120(pA0), m2
239
movaps NB1so+32-120(pA0), m3
242
movaps NB2so+32-120(pA0), m0
245
mulps NB3so+32-120(pA0), m1
247
movaps 48-120(pB0), m2
249
movaps NB0so+48-120(pA0), m3
252
movaps NB1so+48-120(pA0), m0
255
movaps NB2so+48-120(pA0), m1
258
mulps NB3so+48-120(pA0), m2
260
movaps 64-120(pB0), m3
262
movaps NB0so+64-120(pA0), m0
265
movaps NB1so+64-120(pA0), m1
268
movaps NB2so+64-120(pA0), m2
271
mulps NB3so+64-120(pA0), m3
273
movaps 80-120(pB0), m0
275
movaps NB0so+80-120(pA0), m1
278
movaps NB1so+80-120(pA0), m2
281
movaps NB2so+80-120(pA0), m3
284
mulps NB3so+80-120(pA0), m0
286
movaps 96-120(pB0), m1
288
movaps NB0so+96-120(pA0), m2
291
movaps NB1so+96-120(pA0), m3
294
movaps NB2so+96-120(pA0), m0
297
mulps NB3so+96-120(pA0), m1
299
movaps 112-120(pB0), m2
301
movaps NB0so+112-120(pA0), m3
304
movaps NB1so+112-120(pA0), m0
307
movaps NB2so+112-120(pA0), m1
310
mulps NB3so+112-120(pA0), m2
312
movaps 128-120(pB0), m3
314
movaps NB0so+128-120(pA0), m0
317
movaps NB1so+128-120(pA0), m1
320
movaps NB2so+128-120(pA0), m2
323
mulps NB3so+128-120(pA0), m3
325
movaps 144-120(pB0), m0
327
movaps NB0so+144-120(pA0), m1
330
movaps NB1so+144-120(pA0), m2
333
movaps NB2so+144-120(pA0), m3
336
mulps NB3so+144-120(pA0), m0
338
movaps 160-120(pB0), m1
340
movaps NB0so+160-120(pA0), m2
343
movaps NB1so+160-120(pA0), m3
346
movaps NB2so+160-120(pA0), m0
349
mulps NB3so+160-120(pA0), m1
351
movaps 176-120(pB0), m2
353
movaps NB0so+176-120(pA0), m3
356
movaps NB1so+176-120(pA0), m0
359
movaps NB2so+176-120(pA0), m1
362
mulps NB3so+176-120(pA0), m2
364
movaps 192-120(pB0), m3
366
movaps NB0so+192-120(pA0), m0
369
movaps NB1so+192-120(pA0), m1
372
movaps NB2so+192-120(pA0), m2
375
mulps NB3so+192-120(pA0), m3
377
movaps 208-120(pB0), m0
379
movaps NB0so+208-120(pA0), m1
382
movaps NB1so+208-120(pA0), m2
385
movaps NB2so+208-120(pA0), m3
388
mulps NB3so+208-120(pA0), m0
390
movaps 224-120(pB0), m1
392
movaps NB0so+224-120(pA0), m2
395
movaps NB1so+224-120(pA0), m3
398
movaps NB2so+224-120(pA0), m0
401
mulps NB3so+224-120(pA0), m1
408
* Get these bastard things summed up correctly
409
* Note this summation is Camm's, as his sequence was faster
410
* than the piece of crap I came up with
412
movaps rC0, m0 /* m0 = c0d c0c c0b c0a */
413
unpcklps rC1, rC0 /* rC0 = c1b c0b c1a c0a */
414
movaps rC2, m1 /* m1 = c2d c2c c2b c2a */
415
unpckhps rC1, m0 /* m0 = c1d c0d c1c c0c */
417
unpcklps rC3, rC2 /* rC2 = c3b c2b c3a c2a */
419
addps m0, rC0 /* rC0 = c1bd c0bd c1ac c0ac */
420
unpckhps rC3, m1 /* m1 = c3d c2d c3c c2c */
422
movaps rC0, m0 /* m0 = c1bd c0bd c1ac c0ac */
423
addps m1, rC2 /* rC2 = c3bd c2bd c3ac c2ac */
424
shufps $0x44,rC2,rC0 /* rC0 = c3ac c2ac c1ac c0ac */
426
shufps $0xEE,rC2,m0 /* m0 = c3bd c2bd c1bd c0bd */
428
addps m0, rC0 /* rC0 = c3abcd c2abcd c1abcd c0abcd */
432
* Write results back to C
435
/* rC0 = c3 c2 c1 c0 */
436
pshufd $0xB1, rC0, rC1 /* rC1 = c2 c3 c0 c1 */
437
movhlps rC0, rC2 /* rC2 = X X c3 c2 */
438
movhlps rC1, rC3 /* rC3 = X X c2 c3 */
447
* pC += 6; pA += 2*NB
456
* pC += incCn; pA -= NBNB; pB += NB;
459
/* addl COFF(%esp), pC */
461
subl COFF+8(%esp), pA0
473
* Restore callee-saved iregs