* Automatically Tuned Linear Algebra Software v3.8.3
* (C) Copyright 2008 R. Clint Whaley
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the ATLAS group or the names of its contributors may
* not be used to endorse or promote products derived from this
* software without specific written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
#include "atlas_asm.h"
#if !defined(ATL_GAS_x8664) && !defined(ATL_GAS_x8632)
#error "This kernel requires x86 assembly!"
#endif
#ifdef DCPLX
#define CMUL(arg_) 2*arg_
#else
#define CMUL(arg_) arg_
#endif
#define FSIZE 16*4+BETASZ
#define iAmOFF ldcOFF-4
#define PFAOFF iAmOFF-4
#define PFBOFF PFAOFF-4
#define iBnOFF iAnOFF-4
#define iCnOFF iBnOFF-4
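*
* A sketch of the x8632 frame these offsets imply (an assumption -- the
* defines for ldcOFF/iAnOFF/MOFF/KOFF/BETASZ/BETAOFF are elided here):
* BETA occupies a 16-byte aligned slot at the bottom, the 4-byte scalars
* (ldc, the M/K counters, loop increments, prefetch ptrs) are packed above
* it at descending 4-byte offsets, and the saved integer registers live
* at BETASZ(%esp) through BETASZ+16(%esp), as the prologue below shows.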
* Integer register usage shown by these defines
#define incAm iAmOFF(%esp)
#define incAn iAnOFF(%esp)
#define incBn iBnOFF(%esp)
#define incCn iCnOFF(%esp)
#define MM0 MOFF(%esp)
#define PFA PFAOFF(%esp)
#define PFB PFBOFF(%esp)
#define ldc ldcOFF(%esp)
#define KK0 KOFF(%esp)
#define incCn iCOFF(%rsp)
#define MM0 iIOFF(%rsp)
* Define some macros for instruction selection
* VZERO: xorpd, xorps, pxor
* MOVAB: movapd/movaps or movupd/movups
#define VZERO(reg_) xorps reg_, reg_
#define MOVAPD movaps
#define MOVUPD movups
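*
* movaps/movups are preferred to their pd forms here since these are
* bitwise moves and the ps forms encode one byte shorter. A minimal
* sketch of how MOVAB is presumably selected (an assumption; the guard
* name ATL_NOALIGN is hypothetical and the actual #if lines are elided):
*    #ifdef ATL_NOALIGN
*       #define MOVAB movups  -- A/B not known to be 16-byte aligned
*    #else
*       #define MOVAB movaps
*    #endif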
#define pref2(mem) prefetcht1 mem
#define prefB(mem) prefetcht0 mem
#define prefC(mem) prefetcht0 mem
146
%rdi/4 %rsi/8 %rdx/12 %xmm0/16
147
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
148
%rcx/24 %r8/28 %r9/32 8/36
149
const TYPE *A, const int lda, const TYPE *B, const int ldb,
151
const TYPE beta, TYPE *C, const int ldc)
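* Annotations above each argument line give the x8664 register (or %rsp
* offset) and the x8632 offset from the incoming stack ptr that carry it.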
.global ATL_asmdecor(ATL_USERMM)
ATL_asmdecor(ATL_USERMM):
* Save callee-saved iregs
movl %esp, %eax /* save original stack ptr */
sub $FSIZE, %esp /* allocate stack space */
andw $0xFFF0, %sp /* SP now 16-byte aligned */
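* (The 16-bit andw suffices: 16-byte alignment only requires bits 0-3 of
* %esp to be cleared, and andw leaves the upper half of %esp intact.)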
movl %ebp, BETASZ(%esp)
movl %ebx, BETASZ+4(%esp)
movl %esi, BETASZ+8(%esp)
movl %edi, BETASZ+12(%esp)
movl %eax, BETASZ+16(%esp) /* original SP saved to new stack */
* Setup input parameters
* For x8632, %eax has the old stack ptr; %eax is also pA1, so set pA1 up last
movl 12(%eax), KK /* load K */
movl 36(%eax), itmp /* itmp = ldb */
lea (pB0, itmp, 8), pB1 /* pB1 = pB0 + ldb*sizeof */
shl $4, itmp /* itmp = 2*sizeof*ldb */
movl itmp, incBn /* incBn = 2*sizeof*ldb */
movsd 40(%eax), rB0 /* load beta */
unpcklpd rB0, rB0 /* rB0 = {beta, beta} */
MOVAPD rB0, BETAOFF(%esp) /* store BETA to BETAOFF */
movl 52(%eax), itmp /* itmp = ldc */
shl $CSH, itmp /* itmp = ldc*sizeof */
movl itmp, ldcOFF(%esp) /* ldc = ldc*sizeof */
shr $CSH-1, itmp /* itmp = 2*ldc */
sub MM0, itmp /* itmp = 2*ldc - M */
shl $CSH, itmp /* itmp = (2*ldc-M)*sizeof */
movl itmp, incCn /* incCn = (2*ldc-M)*sizeof */
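*
* Why (2*ldc-M)*sizeof: each trip through the M-loop writes two columns
* of C and leaves pC0 advanced by M elements, so adding 2*ldc-M elements
* lands pC0 at the top of the next column pair.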
movl 28(%eax), itmp /* itmp = lda */
lea (pA0, itmp,8), pA1 /* just overwrote old SP in EAX */
shl $4, itmp /* itmp = 2*sizeof*lda */
movl itmp, incAm /* incAm = 2*sizeof*lda */
* pfA = A + 2*lda*M + PFAINC; incAn = lda*M
movl MM0, itmp /* itmp = M */
imull incAm, itmp /* itmp = 2*lda*M */
lea PFAINC(pA0, itmp), itmp /* itmp = pA0 + 2*lda*M + PFAINC */
movl itmp, PFA /* pfA = pA0 + 2*lda*M + PFAINC */
sub pA0, itmp /* itmp = 2*lda*M + PFAINC */
sub $PFAINC, itmp /* itmp = 2*lda*M */
shr $1, itmp /* itmp = lda*M */
movl itmp, incAn /* incAn = lda*M */
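* (incAn = lda*M, i.e. lda*M*sizeof in bytes, is presumably what the
* elided N-loop tail subtracts from pA0/pA1 to rewind A after each full
* sweep of M.)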
* x8664: move parameters to correct registers
movq %r8, pA1 /* pA1 = lda */
movq %r9, pB0 /* pB0 = B */
movslq 8(%rsp), pB1 /* pB1 = ldb */
unpcklpd %xmm1, %xmm1 /* xmm1 = {beta, beta} */
movq 16(%rsp), pC0 /* pC0 = C */
movslq 24(%rsp), ldc /* ldc = ldc */
* ===================================================
* Compute rest of needed variables using these inputs
* ===================================================
shl $3, pB1 /* pB1 = ldb*sizeof */
lea (pB1, pB1), incBn /* incBn = 2*ldb*sizeof */
add pB0, pB1 /* pB1 = pB0 + ldb*sizeof */
lea (ldc,ldc), PFA /* PFA = 2*ldc */
sub MM, PFA /* PFA = 2*ldc - M */
shl $CSH, PFA /* PFA = (2*ldc-M)*sizeof */
movq PFA, incCn /* incCn = (2*ldc-M)*sizeof */
shl $CSH, ldc /* ldc *= sizeof */
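* (CSH is presumably log2 of C's element size -- 3 for real double and 4
* for double complex -- matching the CMUL addressing macro above.)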
shl $3, pA1 /* pA1 = lda * sizeof */
lea (pA1, pA1), incAm /* incAm = 2*lda*sizeof */
mov MM, PFA /* PFA = M */
imulq pA1, PFA /* PFA = M * lda*sizeof */
movq PFA, incAn /* incAn = M*lda*sizeof */
lea PFAINC(pA0,PFA,2),PFA /* PFA = pA0 + 2*M*lda*sizeof + PFAINC */
add pA0, pA1 /* pA1 = pA0 + lda*sizeof */
mov pB0, PFB /* PFB = pB0 */
add incBn, PFB /* PFB = pB0 + 2*ldb*sizeof */
movq MM, MM0 /* MM0 = MM */
sub $2, KK /* must stop K-loop 2 iterations early to drain advance loads */
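*
* The K-loop software-pipelines its loads: each iteration fetches the A/B
* operands for the next one, so KK is shortened by 2 here and the final
* iterations are peeled below to drain the loads already in flight.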
* Have pA/B point to end of column, so we can run the loop backwards
lea (pA0, KK, 8), pA0 /* pA0 += K */
lea (pB0, KK, 8), pB0 /* pB0 += K */
lea (pA1, KK, 8), pA1 /* pA1 += K */
lea (pB1, KK, 8), pB1 /* pB1 += K */
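*
* With the pointers at the end of their columns, the loop indexes as
* -16(pA0,KK,8) and counts KK down, so decrementing KK doubles as the
* loop-exit test and no separate compare is needed.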
* Peel 1st iteration of K to avoid need to zero rCxx
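*
* In C terms, the peeling amounts to (a loose sketch; rCxx are the
* accumulator registers named above):
*    rC00 = pA0[KK-1]*pB0[KK-1];   -- peeled iteration initializes rC00
*    for (k=KK-2; k >= 0; k--)
*       rC00 += pA0[k]*pB0[k];     -- no instructions spent zeroing rCxx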
MOVAB -16(pB0,KK,8), rA0
MOVAB -16(pA0,KK,8), rC00
MOVAB -16(pA1,KK,8), rC10
movhpd 16(pC0,ldc), rC1
MOVAPD (pC0,ldc), rC1
MOVAB -16(pB1,KK,8), rA0
MOVAB (pB0,KK,8), rB0
#if !defined(ATL_GAS_x8632) && !defined(BETA0)
MOVAB (pA0,KK,8), rA0
MOVAB (pA1,KK,8), rA1
MOVAB (pB1,KK,8), rB0
MOVAB 16(pB0,KK,8), rB0
* Peel last iteration to stop forward fetch of B
#elif !defined(BETA0)
MOVAPD rC00, rA0 /* rA0 = c00a c00b */
MOVAPD rC01, rB0 /* rB0 = c01a c01b */
unpcklpd rC10, rC00 /* rC00 = c00a c10a */
unpcklpd rC11, rC01 /* rC01 = c01a c11a */
unpckhpd rC10, rA0 /* rA0 = c00b c10b */
unpckhpd rC11, rB0 /* rB0 = c01b c11b */
addpd rA0, rC00 /* rC00 = c00ab c10ab */
addpd rB0, rC01 /* rC01 = c01ab c11ab */
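*
* Each accumulator held two partial sums, one per vector lane; the unpack
* pairs above perform a 2x2 transpose so matching lanes line up, and the
* adds collapse them, leaving rC00 = {c00,c10} and rC01 = {c01,c11} in
* the storage order of a C column pair.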
movl ldcOFF(%esp), itmp
MOVAPD BETAOFF(%esp), rB0
MOVAPD (pC0,itmp), rA1
addpd (pC0,itmp), rC01
addq incAm, pA0 /* pA0 += lda*sizeof*2 */
addq incAm, pA1 /* pA1 += lda*sizeof*2 */
movlpd rC00, -32(pC0)
movhpd rC00, -16(pC0)
movlpd rC01, -32(pC0,itmp)
movhpd rC01, -16(pC0,itmp)
MOVAPD rC00, -2*CMUL(8)(pC0)
MOVAPD rC01, -2*CMUL(8)(pC0,itmp)
movl BETASZ(%esp), %ebp
movl BETASZ+4(%esp), %ebx
movl BETASZ+8(%esp), %esi
movl BETASZ+12(%esp), %edi
movl BETASZ+16(%esp), %esp /* restore saved original SP */
#endif /* end of ifndef DCPLX -- CPLX must use unaligned loads to C */
* Code specialized for when C or ldc is not aligned to a 16-byte boundary, so
* we must use unaligned loads. This is a big cost on Core2 systems.
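* (Some paths below therefore split the 16-byte C accesses into
* movsd/movhpd pairs, which can be cheaper than movupd on Core2.)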
* Peel 1st iteration of K to avoid need to zero rCxx
MOVAB -16(pB0,KK,8), rA0
MOVAB -16(pA0,KK,8), rC00
MOVAB -16(pA1,KK,8), rC10
movhpd 16(pC0,ldc), rC1
MOVUPD (pC0,ldc), rC1
MOVAB -16(pB1,KK,8), rA0
MOVAB (pB0,KK,8), rB0
#if !defined(ATL_GAS_x8632) && !defined(BETA0)
MOVAB (pA0,KK,8), rA0
MOVAB (pA1,KK,8), rA1
MOVAB (pB1,KK,8), rB0
MOVAB 16(pB0,KK,8), rB0
* Peel last iteration to stop forward fetch of B
#elif !defined(BETA0)
MOVAPD rC00, rA0 /* rA0 = c00a c00b */
MOVAPD rC01, rB0 /* rB0 = c01a c01b */
unpcklpd rC10, rC00 /* rC00 = c00a c10a */
unpcklpd rC11, rC01 /* rC01 = c01a c11a */
unpckhpd rC10, rA0 /* rA0 = c00b c10b */
unpckhpd rC11, rB0 /* rB0 = c01b c11b */
addpd rA0, rC00 /* rC00 = c00ab c10ab */
addpd rB0, rC01 /* rC01 = c01ab c11ab */
movl ldcOFF(%esp), itmp
MOVAPD BETAOFF(%esp), rB0
movsd (pC0,itmp), rA1
movhpd 16(pC0,itmp), rA1
MOVAPD BETAOFF(%esp), rB0
MOVUPD (pC0,itmp), rA1
MOVUPD (pC0, itmp), rA1
addq incAm, pA0 /* pA0 += lda*sizeof*2 */
addq incAm, pA1 /* pA1 += lda*sizeof*2 */
movlpd rC00, -32(pC0)
movhpd rC00, -16(pC0)
movlpd rC01, -32(pC0,itmp)
movhpd rC01, -16(pC0,itmp)
MOVUPD rC00, -2*CMUL(8)(pC0)
MOVUPD rC01, -2*CMUL(8)(pC0,itmp)
movl BETASZ(%esp), %ebp
movl BETASZ+4(%esp), %ebx
movl BETASZ+8(%esp), %esi
movl BETASZ+12(%esp), %edi
movl BETASZ+16(%esp), %esp /* restore saved original SP */
* Code specialized for K == 2; pA0 & pB0 point to start of arrays
* Assume C is unaligned so we don't have to write this cleanup case twice.
* This assumption costs you major performance if you care about this case
* (since load/store of C is the dominant cost when K=2). This code is more
* for correctness than performance as presently written.
movq MM0, KK /* KK is now M-loop counter */
movsd CMUL(8)(pC0), rC10
movsd (pC0,itmp), rC01
movsd CMUL(8)(pC0,itmp), rC11
MOVAPD BETAOFF(%esp), rA1
MOVAPD rC00, rA0 /* rA0 = c00a c00b */
MOVAPD rC01, rB0 /* rB0 = c01a c01b */
unpcklpd rC10, rC00 /* rC00 = c00a c10a */
unpcklpd rC11, rC01 /* rC01 = c01a c11a */
unpckhpd rC10, rA0 /* rA0 = c00b c10b */
unpckhpd rC11, rB0 /* rB0 = c01b c11b */
addpd rA0, rC00 /* rC00 = c00ab c10ab */
addpd rB0, rC01 /* rC01 = c01ab c11ab */
movlpd rC00, -32(pC0)
movhpd rC00, -16(pC0)
movlpd rC01, -32(pC0,itmp)
movhpd rC01, -16(pC0,itmp)
MOVUPD rC00, -2*CMUL(8)(pC0)
MOVUPD rC01, -2*CMUL(8)(pC0,itmp)
jnz MNLOOP_K2 /* end of M-loop */
movl BETASZ(%esp), %ebp
movl BETASZ+4(%esp), %ebx
movl BETASZ+8(%esp), %esi
movl BETASZ+12(%esp), %edi
movl BETASZ+16(%esp), %esp /* restore saved original SP */
* Code specialized for K == 4; pA0 & pB0 point to start of arrays + 16
* Assume C is unaligned so we don't have to write this cleanup case twice.
* This assumption costs you major performance if you care about this case
* (since load/store of C is the dominant cost when K=4). This code is more
* for correctness than performance as presently written.
movq MM0, KK /* KK is now M-loop counter */
movsd CMUL(8)(pC0), rC10
movsd (pC0,itmp), rC01
movsd CMUL(8)(pC0,itmp), rC11
MOVAPD BETAOFF(%esp), rA1
MOVAPD rC00, rA0 /* rA0 = c00a c00b */
MOVAPD rC01, rB0 /* rB0 = c01a c01b */
unpcklpd rC10, rC00 /* rC00 = c00a c10a */
unpcklpd rC11, rC01 /* rC01 = c01a c11a */
unpckhpd rC10, rA0 /* rA0 = c00b c10b */
unpckhpd rC11, rB0 /* rB0 = c01b c11b */
addpd rA0, rC00 /* rC00 = c00ab c10ab */
addpd rB0, rC01 /* rC01 = c01ab c11ab */
movlpd rC00, -32(pC0)
movhpd rC00, -16(pC0)
movlpd rC01, -32(pC0,itmp)
movhpd rC01, -16(pC0,itmp)
MOVUPD rC00, -2*CMUL(8)(pC0)
MOVUPD rC01, -2*CMUL(8)(pC0,itmp)
jnz MNLOOP_K4 /* end of M-loop */
movl BETASZ(%esp), %ebp
movl BETASZ+4(%esp), %ebx
movl BETASZ+8(%esp), %esi
movl BETASZ+12(%esp), %edi
movl BETASZ+16(%esp), %esp /* restore saved original SP */