2
* Automatically Tuned Linear Algebra Software v3.8.3
3
* (C) Copyright 2007 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributors may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_lvl3.h"
34
/*
 * Mjoin(PATL,mmBPP): matrix-multiply driver for the block/panel/panel shape,
 * i.e. M and N each fit in one block (M <= MB, N <= NB) while K is large.
 * A and B are copied on-the-fly through the A2blk/B2blk routines, the
 * product is accumulated in an aligned private workspace pC, and the result
 * is merged into the caller's C by the final geadd call.
 *
 * Parameters (as used by the visible statements):
 *   TA, TB   : transpose selectors; each one chooses between gemove and
 *              gemoveT as the copy routine for A and B respectively.
 *   M, N     : problem dimensions; the M > MB || N > NB test guards the
 *              single-block assumption.
 *   K         : inner dimension; presumably the source of nblk and kr, but
 *              that computation is not visible here -- TODO confirm.
 *   alpha     : scalar handed to geadd together with pC.
 *   A, lda    : input matrix A and its leading dimension, forwarded to mmK.
 *   B, ldb    : input matrix B and its leading dimension, forwarded to mmK.
 *   beta      : scalar handed to geadd together with C.
 *   C, ldc0   : output matrix and its leading dimension, forwarded to geadd.
 *
 * Returns an int; the actual return statements are not visible in this copy.
 *
 * NOTE(review): this copy of the routine is missing statements (braces, the
 * declarations of pA, pB, pC, vC, NBmm0 and NBmm1, the bodies of several
 * branches, the returns and the release of vC).  Confirm every detail
 * against the complete source before relying on these comments.
 */
int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
35
const int M, const int N, const int K,
36
const SCALAR alpha, const TYPE *A, const int lda,
37
const TYPE *B, const int ldb, const SCALAR beta,
38
TYPE *C, const int ldc0)
40
* Copy algorithm, assuming M <= MB && N <= NB, K large (shape: block, panel,
41
* panel); copies A and B on-the-fly
46
void (*A2blk)(int N, int M, const SCALAR alpha, const TYPE *A, int lda,
48
void (*B2blk)(int N, int M, const SCALAR alpha, const TYPE *A, int lda,
51
int ldc, m, n, nblk, k, kr, incA, incB;
53
if (M > MB || N > NB) /* don't handle multiple M/N blocks */
55
/* M is short of a full block by at most ATL_mmMU rows; presumably m is */
/* padded to MB here (branch body not visible) -- TODO confirm */
if (M < MB && M+ATL_mmMU >= MB)
59
/* same test for the N dimension, using ATL_mmNU and NB */
if (N < NB && N+ATL_mmNU >= NB)
63
/* workspace leading dimension: starts from m*sizeof(TYPE) rounded up to a */
/* multiple of ATL_Cachelen; the end of this expression is not visible here */
ldc = (((m*sizeof(TYPE)+ATL_Cachelen-1)/ATL_Cachelen)*ATL_Cachelen)
65
/* one allocation sized for ldc*n+KB*(m+n) elements plus ATL_Cachelen extra */
/* bytes, so that ATL_AlignPtr below can move the start to an aligned address */
vC = malloc(ATL_Cachelen+ATL_MulBySize(ldc*n+KB*(m+n)));
67
pC = ATL_AlignPtr(vC);
70
/* choose the copy routine for A from TA */
if (TA == AtlasNoTrans)
72
A2blk = Mjoin(PATL,gemoveT);
77
A2blk = Mjoin(PATL,gemove);
80
/* choose the copy routine for B from TB (opposite pairing to A) */
if (TB == AtlasNoTrans)
82
B2blk = Mjoin(PATL,gemove);
87
B2blk = Mjoin(PATL,gemoveT);
91
* If we are going to multiply zeros to avoid cleanup, zero workspace
94
Mjoin(PATL,zero)(ldc*n+KB*(m+n), pC, 1);
96
* See what kernel we're calling
100
if (n == NB) /* no cleanup */
105
else /* need to call N-cleanup kernel */
107
NBmm0 = Mjoin(PATL,pNBmm_b0);
108
NBmm1 = Mjoin(PATL,pNBmm_b1);
111
else if (n == NB) /* call M-cleanup kernel */
113
NBmm0 = Mjoin(PATL,pMBmm_b0);
114
NBmm1 = Mjoin(PATL,pMBmm_b1);
116
else /* both N & M are cleanup, call general K clean */
118
NBmm0 = Mjoin(PATL,pKBmm);
119
NBmm1 = Mjoin(PATL,pKBmm);
120
if (m == M && n == N) /* must zero pC if not done above */
121
Mjoin(PATL,zero)(ldc*n, pC, 1);
127
Mjoin(PATL,zero)(ldc*n, pC, 1);
128
Mjoin(PATL,mmK)(M, m, N, n, nblk, kr, (kr && kr+4 >= KB) ? KB : 0,
129
ATL_rone, ATL_rone, ATL_rzero, A, lda, incA, pA, 0,
130
B, ldb, incB, pB, 0, pC, ldc, A2blk, B2blk, NBmm0, NBmm1);
131
Mjoin(PATL,geadd)(M, N, alpha, pC, ldc, beta, C, ldc0);