~ubuntu-branches/ubuntu/trusty/atlas/trusty

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#ifndef ATLAS_QRRMETH_H
   #define ATLAS_QRRMETH_H
#include "atlas_pca.h"
/*
 * PCA does not work reliably on ARMv7
 */
#ifndef ATL_USEPCA
   #define FORCE_NO_PCA 1
#endif
#if !defined(ATL_USEPTHREADS) || defined(FORCE_NO_PCA)
/*
 * If PCA is not an option, we use unblocked if either diminsion is too small
 * to allow us to reliably register block the L3BLAS, or if we can fit the
 * entire problem into the L1 cache.  All other problems recur.
 */
   #define METHOD(METH_, M_, N_, LDA_) /* 0,1,2,3=Recur,Unblkd,Cp,NoC      */ \
      (METH_)=( (N_ < 8) || (M_ < 8) || (((size_t)M_)*N <= ATL_L1elts) ) ? 1:0;
#else
   #if !defined(CacheEdge) || CacheEdge <=0 || CacheEdge >= 8*1024*1024
      #define MyCacheEdge (ATL_NTHREADS*(size_t)256*1024) /* collective edge */
   #else
      #define MyCacheEdge (ATL_NTHREADS*(size_t)CacheEdge) /* collective edge */
   #endif
/*
 * BIGM is the # of elts we need to force no-copy-PCA even in when we don't
 * fit into the collective cache and N is very small.  In this rule of thumb,
 * we set it to roughly the page size
 */
   #define ATL_PCA_BIGM (ATL_DivBySize(4096))
/*
 * MINM is an estimate of the min local M we can have that will amortize
 * the parallel sync costs.  These numbers will need to be refined.
 */
   #ifdef SREAL
      #define ATL_PCA_MINM 256
   #elif defined(DREAL)
      #define ATL_PCA_MINM 128
   #elif defined(SCPLX)
      #define ATL_PCA_MINM 96
   #else
      #define ATL_PCA_MINM 64
   #endif
/*
 * Note: Expect M_ to be static dimension, N_ to be recursing dimension.
 */
   #define METHOD(METH_, M_, N_, LDA_) /* 0,1,2,3=Recur,unblck,Cp,NoC      */ \
   {                                                                          \
      METH_ = 0;                       /* default to recursion */ \
/* \
 *    If N_ is too small to get advantage from register blocking, stop recurs \
 *    and use NoCopy-PCA for huge M otherwise use unblocked \
 */ \
      if (N_ < 4) \
      { \
         METH_ = ((M_) > (ATL_PCA_BIGM<<Mmin(3,ATL_NTHRPOW2))) ? 3 : 1; \
      } \
/* \
 *    Don't consider PCA or unblocked unless problem fits in collective cache
 */ \
      if (MyCacheEdge >= ATL_MulBySize(M_)*N_) \
      { \
         const int zrows = Mmax(((M_)>>ATL_NTHRPOW2),Mmin(M_, N_));  \
                                            /* # of rows zero must take */ \
         const int rrows = (M_) - zrows;    /* # of rows for non-0 to take */ \
         METH_ = 2;   /* default to Cp-PCA if it fits in cache */ \
/* \
 *       If it fits in the L1, or if the number of local rows is too small
 *       to bear the cost of synchronization, use normal unblocked algorithm
 */ \
         if ( (((size_t)M_)*N_ <= ATL_L1elts) ||  /* Fits in L1, */ \
              (M_) < ATL_PCA_MINM ||           /* Total rows very small, */ \
              ((M_)>>ATL_NTHRPOW2) < 3 ||      /* Too few per core, */ \
              (rrows<<3) < zrows )             /* Rmndr too small % to help */ \
            METH_ = 1;                         /* Use unblocked. */ \
      } \
   } /* END METHOD MACRO */
#endif                          /* end if on threading or not */

#endif /* end multiple inclusion guard */