2
* Automatically Tuned Linear Algebra Software v3.8.3
3
* (C) Copyright 2007 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_lvl3.h"
34
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
35
const int M0, const int N, const int K,
36
const SCALAR alpha, const TYPE *A, const int lda,
37
const TYPE *B, const int ldb, const SCALAR beta,
38
TYPE *C, const int ldc)
40
* Copy matmul algorithm, copies A and B on-the-fly
41
* If M0 < 0, allocates only (MB+NB)*KB workspace (M is then taken as -M0)
47
MAT2BLK2 A2blk, B2blk;
48
NBMM0 NBmm0, NBmm1, pNBmm0;
49
const int M = (M0 >= 0) ? M0 : -M0;
50
int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
51
int incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;
54
* If both M and N <= NB, and one of them is not full, call BPP, which
55
* can sometimes avoid doing cleanup for all cases
57
if (M <= MB && N <= NB && (M != MB || N != NB))
58
return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
61
* If these workspace increments are 0, we do JIT NBxNB copies instead of
62
* copying entire array/panel. Don't copy mat if you can't reuse it.
66
incAW = (N > NB) ? KB*MB : 0;
67
incBW = (M > NB) ? KB*NB : 0;
69
else /* allocate in minimal space */
78
* K-loop is special, in that we don't call user cleanup, must explicitly zero,
79
* and K-cleanup is typically slower even for generated kernels. Therefore,
80
* allow extra leeway for doing extra flops. Note error is unaffected by
81
* any of these extra flops: K-loop has elts zeroed, and multiplying zeros
82
* and adding in zeros doesn't add to error
84
KR = (kr && kr+4 >= KB) ? KB : kr;
108
if (i <= ATL_MaxMalloc || !(incAW | incBW))
109
v = malloc(ATL_Cachelen+i);
111
pA = ATL_AlignPtr(v);
112
pB0 = pA + (incAW ? bigK*MB : KB*MB);
113
if (TA == AtlasNoTrans)
115
A2blk = Mjoin(PATL,gemoveT);
121
A2blk = Mjoin(PATL,gemove);
125
if (TB == AtlasNoTrans)
127
B2blk = Mjoin(PATL,gemove);
133
B2blk = Mjoin(PATL,gemoveT);
138
* See what kernel we're calling
140
if ( SCALAR_IS_ONE(beta) )
143
pNBmm0 = Mjoin(PATL,pNBmm_b1);
145
else if ( SCALAR_IS_ZERO(beta) )
148
pNBmm0 = Mjoin(PATL,pNBmm_b0);
153
pNBmm0 = Mjoin(PATL,pNBmm_bX);
155
KR = (KR == KB) ? KB : 0;
156
ZEROC = !KR && SCALAR_IS_ZERO(beta);
158
for (i=0; i < nmblks; i++)
161
pB = pB0; /* foreach row-panel of A, start at B's copy space */
162
for (j=nnblks; j; j--)
164
Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
165
a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBW,
166
C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
167
B += incBn; /* copy next col panel of B */
168
pB += incW; /* to next col panel of pB */
169
a = (incAW ? NULL : a); /* reuse row-panel of A if copied */
175
Mjoin(PATL,gezero)(MB, nr, C, ldc);
176
Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha, beta,
177
a, lda, incAk, pA, incAW, B, ldb, incBk, pB, incBWp,
178
C, ldc, A2blk, B2blk, pNBmm0, Mjoin(PATL,pNBmm_b1));
180
C += MB - nnblks*ldc*NB;
183
B = NULL; /* finished copying B */
191
a = A + nmblks*incAm;
193
if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
194
else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
195
else NBmm0 = Mjoin(PATL,pMBmm_bX);
196
for (j=nnblks; j; j--)
198
Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha, beta,
199
a, lda, incAk, pA, incAWp, B, ldb, incBk, pB, incBW,
200
C, ldc, A2blk, B2blk, NBmm0, Mjoin(PATL,pMBmm_b1));
201
B += incBn; /* copy next col panel of B */
202
pB += incW; /* to next col panel of pB */
203
a = (incAW ? NULL : a); /* reuse row-panel of A if copied */
208
if ( SCALAR_IS_ZERO(beta) )
209
Mjoin(PATL,gezero)(mr, nr, C, ldc);
210
Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0,
211
ATL_rone, alpha, beta, a, lda, incAk, pA, incAWp,
212
B, ldb, incBk, pB, incBWp, C, ldc, A2blk, B2blk,
213
Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm));