1
#ifndef ATLAS_QRRMETH_H
2
#define ATLAS_QRRMETH_H
5
/* PCA does not work reliably on ARMv7. */
10
#if !defined(ATL_USEPTHREADS) || defined(FORCE_NO_PCA)
12
/*
 * If PCA is not an option, we use unblocked if either dimension is too small
 * to allow us to reliably register block the L3BLAS, or if we can fit the
 * entire problem into the L1 cache.  All other problems recur.
 */
16
#define METHOD(METH_, M_, N_, LDA_) /* 0,1,2,3=Recur,Unblkd,Cp,NoC */ \
17
(METH_)=( (N_ < 8) || (M_ < 8) || (((size_t)M_)*N <= ATL_L1elts) ) ? 1:0;
19
/*
 * MyCacheEdge: the "collective edge" across all threads, as a size_t.
 *
 * When CacheEdge is undefined, non-positive, or at least 8*1024*1024, a
 * per-thread value of 256*1024 is used; otherwise CacheEdge itself is used.
 * Either way the per-thread value is multiplied by ATL_NTHREADS.
 * (Presumably a byte count, since METHOD compares it against
 * ATL_MulBySize(M_)*N_ -- confirm.)
 *
 * The two definitions are alternatives, so they are separated by #else and
 * the conditional is closed with #endif.
 *
 * NOTE(review): the #else that should separate the non-PCA METHOD above from
 * this PCA-only section does not appear in this view of the file -- confirm
 * it is present in the full header.
 */
#if !defined(CacheEdge) || CacheEdge <= 0 || CacheEdge >= 8*1024*1024
   #define MyCacheEdge (ATL_NTHREADS*(size_t)256*1024) /* collective edge */
#else
   #define MyCacheEdge (ATL_NTHREADS*(size_t)CacheEdge) /* collective edge */
#endif
25
/*
 * BIGM is the # of elts we need to force no-copy-PCA even when we don't
 * fit into the collective cache and N is very small.  In this rule of thumb,
 * we set it to roughly the page size.
 */
29
/*
 * ATL_PCA_BIGM: ATL_DivBySize applied to 4096 (4 * 1024).  METHOD shifts this
 * left by Mmin(3, ATL_NTHRPOW2) and treats any M_ above the result as "huge",
 * choosing no-copy PCA for it when N_ is too small to register block.
 * (ATL_DivBySize presumably converts a byte count to an element count --
 * confirm against its definition.)
 */
#define ATL_PCA_BIGM (ATL_DivBySize((4 * 1024)))
31
/*
 * MINM is an estimate of the min local M we can have that will amortize
 * the parallel sync costs.  These numbers will need to be refined.
 */
35
/*
 * ATL_PCA_MINM: row-count floor used by METHOD; when M_ is below it the
 * unblocked method (code 1) is selected instead of PCA.
 *
 * Four alternative values appear here (256, 128, 96, 64).
 * NOTE(review): the #if/#elif/#else/#endif lines that must select exactly one
 * of them are not visible in this view, and the bare numerals between the
 * definitions look like extraction residue -- restore both from the full
 * header before building.
 */
#define ATL_PCA_MINM 256
37
#define ATL_PCA_MINM 128
39
#define ATL_PCA_MINM 96
41
#define ATL_PCA_MINM 64
44
/* Note: expects M_ to be the static dimension and N_ the recursing dimension. */
46
/*
 * METHOD(METH_, M_, N_, LDA_) -- method selection when PCA is available.
 *
 * Assigns one of these codes to METH_:
 *    0 = recursion, 1 = unblocked, 2 = copy PCA, 3 = no-copy PCA.
 * M_ is expected to be the static dimension and N_ the recursing one.
 * LDA_ is accepted for signature compatibility and is not used.
 * Arguments are expanded more than once, so they must be free of side
 * effects.  The expansion is a brace-enclosed block that declares the locals
 * zrows and rrows.  It relies on Mmin, Mmax, ATL_NTHRPOW2, ATL_MulBySize,
 * ATL_L1elts, MyCacheEdge, ATL_PCA_BIGM and ATL_PCA_MINM being defined at the
 * point of use.
 *
 * The byte count compared against MyCacheEdge is computed in size_t (matching
 * the L1 test below) so that large M_*N_ cannot overflow int.
 *
 * NOTE(review): in the copy this was rebuilt from, the brace lines, the
 * comment delimiters and the guard in front of the first assignment were
 * missing.  The guard is reconstructed as (N_) < 8, the register-blocking
 * cutoff used by the non-PCA METHOD in this header, and the cache-edge test
 * is kept as a plain `if` exactly as it appeared -- confirm both against the
 * upstream file.
 */
#define METHOD(METH_, M_, N_, LDA_) /* 0,1,2,3=Recur,unblck,Cp,NoC */ \
{ \
   (METH_) = 0;                                /* default to recursion */ \
   /* \
    * If N_ is too small to get advantage from register blocking, stop \
    * recursing: use NoCopy-PCA for huge M, otherwise use unblocked. \
    */ \
   if ((N_) < 8) \
   { \
      (METH_) = ((M_) > (ATL_PCA_BIGM<<Mmin(3,ATL_NTHRPOW2))) ? 3 : 1; \
   } \
   /* \
    * Don't consider PCA or unblocked unless problem fits in collective cache. \
    */ \
   if (MyCacheEdge >= ATL_MulBySize((size_t)(M_))*(N_)) \
   { \
      const int zrows = Mmax(((M_)>>ATL_NTHRPOW2),Mmin(M_, N_)); \
                                       /* # of rows zero must take */ \
      const int rrows = (M_) - zrows;  /* # of rows for non-0 to take */ \
      (METH_) = 2;             /* default to Cp-PCA if it fits in cache */ \
      /* \
       * If it fits in the L1, or if the number of local rows is too small \
       * to bear the cost of synchronization, use normal unblocked algorithm. \
       */ \
      if ( (((size_t)(M_))*(N_) <= ATL_L1elts) ||  /* Fits in L1, */ \
           (M_) < ATL_PCA_MINM ||          /* Total rows very small, */ \
           ((M_)>>ATL_NTHRPOW2) < 3 ||     /* Too few per core, */ \
           (rrows<<3) < zrows )            /* Rmndr too small % to help */ \
         (METH_) = 1;                      /* Use unblocked. */ \
   } \
} /* END METHOD MACRO */
77
#endif /* end if on threading or not */
79
#endif /* end multiple inclusion guard */