15
15
// You should have received a copy of the GNU Lesser General Public License
16
16
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
18
// Structures representing coprocessors (e.g. GPUs);
19
// used in both client and server.
23
// 1) The use of "CUDA" is misleading; it really means "NVIDIA GPU".
24
// 2) The design treats each resource type as a pool of identical devices;
25
// for example, there is a single "CUDA long-term debt" per project,
26
// and a scheduler request contains a request (#instances, instance-seconds)
28
// In reality, the instances of a resource type can have different properties:
29
// In the case of CUDA, "compute capability", driver version, RAM, speed, etc.
30
// How to resolve this discrepancy?
32
// Prior to 21 Apr 09 we identified the fastest instance
33
// and pretended that the others were identical to it.
34
// This approach has a serious flaw:
35
// suppose that the fastest instance has characteristics
36
// (version, RAM etc.) that satisfy the project's requirements,
37
// but other instances do not.
38
// Then BOINC executes jobs on GPUs that can't handle them,
39
// the jobs fail, the host is punished, etc.
41
// We could treat each GPU as a separate resource,
42
// with its own set of debts, backoffs, etc.
43
// However, this would imply tying jobs to instances,
44
// which is undesirable from a scheduling viewpoint.
45
// It would also be a big code change in both client and server.
47
// Instead, (as of 21 Apr 09) our approach is to identify a
48
// "most capable" instance, which in the case of CUDA is based on
49
// a) compute capability
53
// (in decreasing priority).
54
// We ignore and don't use any instances that are less capable
55
// on any of these axes.
57
// This design avoids running coprocessor apps on instances
58
// that are incapable of handling them, and it involves no server changes.
59
// Its drawback is that, on systems with multiple and differing GPUs,
60
// it may not use some GPUs that actually could be used.
29
73
#include "miofile.h"
31
#define MAX_COPROC_INSTANCES 8
74
#include "cal_boinc.h"
76
#define MAX_COPROC_INSTANCES 64
78
// represents a requirement for a coproc.
79
// This is a parsed version of the <coproc> elements in an <app_version>
80
// (used in client only)
83
char type[256]; // must be unique
88
// represents a coproc on a particular computer.
90
// objects will always be a derived class (COPROC_CUDA, COPROC_ATI)
91
// Used in both client and server.
34
94
char type[256]; // must be unique
35
95
int count; // how many are present
36
int used; // how many are in use (used by client)
37
void* owner[MAX_COPROC_INSTANCES];
38
// which ACTIVE_TASK each one is allocated to
96
double used; // how many are in use (used by client)
98
// the following are used in both client and server for work-fetch info
101
// how many instance-seconds of work requested
102
double req_instances;
103
// client is requesting enough jobs to use this many instances
104
double estimated_delay;
105
// resource will be saturated for this long
107
// temps used in client (enforce_schedule())
108
// to keep track of what fraction of each instance is in use
109
// during instance assignment
111
double usage[MAX_COPROC_INSTANCES];
112
double pending_usage[MAX_COPROC_INSTANCES];
114
// the device number of each instance
115
// These are not sequential if we omit instances (see above)
117
int device_nums[MAX_COPROC_INSTANCES];
118
int device_num; // temp used in scan process
119
bool running_graphics_app[MAX_COPROC_INSTANCES];
120
// is this GPU running a graphics app (NVIDIA only)
121
double available_ram[MAX_COPROC_INSTANCES];
122
bool available_ram_unknown[MAX_COPROC_INSTANCES];
123
// couldn't get available RAM; don't start new apps on this instance
124
double available_ram_fake[MAX_COPROC_INSTANCES];
126
double last_print_time;
40
128
#ifndef _USING_FCGI_
41
129
virtual void write_xml(MIOFILE&);
130
void write_request(MIOFILE&);
132
inline void clear() {
133
// can't just memset() the whole object - that would trash the vtable pointer
140
for (int i=0; i<MAX_COPROC_INSTANCES; i++) {
142
running_graphics_app[i] = true;
143
available_ram[i] = 0;
144
available_ram_fake[i] = 0;
145
available_ram_unknown[i] = true;
148
inline void clear_usage() {
149
for (int i=0; i<count; i++) {
151
pending_usage[i] = 0;
43
154
COPROC(const char* t){
47
memset(&owner, 0, sizeof(owner));
49
virtual void description(char*){};
50
161
virtual ~COPROC(){}
51
162
int parse(MIOFILE&);
55
std::vector<COPROC*> coprocs; // not deleted in destructor
56
// so any structure that includes this needs to do it manually
60
void delete_coprocs(){
61
for (unsigned int i=0; i<coprocs.size(); i++) {
66
void write_xml(MIOFILE& out) {
67
for (unsigned int i=0; i<coprocs.size(); i++) {
68
coprocs[i]->write_xml(out);
72
std::vector<std::string> get();
74
COPROC* lookup(char*);
75
bool sufficient_coprocs(COPROCS&, bool log_flag, const char* prefix);
76
void reserve_coprocs(COPROCS&, void*, bool log_flag, const char* prefix);
77
void free_coprocs(COPROCS&, void*, bool log_flag, const char* prefix);
79
for (unsigned int i=0; i<coprocs.size(); i++) {
80
COPROC* cp = coprocs[i];
81
if (cp->used < cp->count) return false;
86
// Copy a coproc set, setting usage to zero.
87
// used in round-robin simulator and CPU scheduler,
88
// to avoid messing w/ master copy
90
void clone(COPROCS& c, bool copy_used) {
91
for (unsigned int i=0; i<c.coprocs.size(); i++) {
92
COPROC* cp = c.coprocs[i];
93
COPROC* cp2 = new COPROC(cp->type);
94
cp2->count = cp->count;
95
if (copy_used) cp2->used = cp->used;
96
coprocs.push_back(cp2);
101
// the following copied from /usr/local/cuda/include/driver_types.h
163
void print_available_ram();
166
// based on cudaDeviceProp from /usr/local/cuda/include/driver_types.h
167
// doesn't have to match exactly since we get the attributes one at a time.
103
struct cudaDeviceProp {
169
struct CUDA_DEVICE_PROP {
105
size_t totalGlobalMem;
106
size_t sharedMemPerBlock;
171
unsigned int totalGlobalMem;
172
// not used on the server; dtotalGlobalMem is used instead
173
// (since some boards have >= 4GB)
174
int sharedMemPerBlock;
107
175
int regsPerBlock;
110
178
int maxThreadsPerBlock;
111
179
int maxThreadsDim[3];
112
180
int maxGridSize[3];
114
size_t totalConstMem;
183
int major; // compute capability
117
size_t textureAlignment;
185
int textureAlignment;
118
186
int deviceOverlap;
119
187
int multiProcessorCount;
120
int __cudaReserved[40];
188
double dtotalGlobalMem; // not defined in client
123
191
struct COPROC_CUDA : public COPROC {
192
int cuda_version; // CUDA runtime version
193
int display_driver_version;
194
CUDA_DEVICE_PROP prop;
126
196
#ifndef _USING_FCGI_
127
virtual void write_xml(MIOFILE&);
197
virtual void write_xml(MIOFILE&, bool include_request);
129
199
COPROC_CUDA(): COPROC("CUDA"){}
130
200
virtual ~COPROC_CUDA(){}
131
static const char* get(COPROCS&);
132
virtual void description(char*);
138
struct COPROC_CELL_SPE : public COPROC {
139
static const char* get(COPROCS&);
140
COPROC_CELL_SPE() : COPROC("Cell SPE"){}
141
virtual void description(char*);
142
virtual ~COPROC_CELL_SPE(){}
145
void fake_cuda(COPROCS&, int);
203
std::vector<std::string>&, std::vector<std::string>&,
204
std::vector<int>& ignore_devs
206
void description(char*);
210
// Estimate of peak FLOPS.
211
// FLOPS for a given app may be much less;
212
// e.g. for SETI@home it's about 0.18 of the peak
214
inline double peak_flops() {
215
// clock rate is scaled down by 1000;
216
// each processor has 8 or 32 cores;
217
// each core can do 2 ops per clock
219
int cores_per_proc = (prop.major>=2)?32:8;
220
double x = (1000.*prop.clockRate) * prop.multiProcessorCount * cores_per_proc * 2.;
223
void get_available_ram();
225
bool check_running_graphics_app();
226
void fake(int driver_version, double ram, int count);
230
struct COPROC_ATI : public COPROC {
236
CALdeviceattribs attribs;
239
virtual void write_xml(MIOFILE&, bool include_request);
241
COPROC_ATI(): COPROC("ATI"){}
242
virtual ~COPROC_ATI(){}
245
std::vector<std::string>&, std::vector<std::string>&,
246
std::vector<int>& ignore_devs
248
void description(char*);
251
inline double peak_flops() {
252
double x = attribs.numberOfSIMD * attribs.wavefrontSize * 2.5 * attribs.engineClock * 1.e6;
256
void get_available_ram();
257
void fake(double, int);
265
~COPROCS(){} // don't delete coprocs; else crash in APP_INIT_DATA logic
266
void write_xml(MIOFILE& out, bool include_request);
268
bool use_all, std::vector<std::string> &descs,
269
std::vector<std::string> &warnings,
270
std::vector<int>& ignore_cuda_dev,
271
std::vector<int>& ignore_ati_dev
274
void summary_string(char*, int);
276
// Copy a coproc set, possibly setting usage to zero.
277
// used in round-robin simulator and CPU scheduler,
278
// to avoid messing w/ master copy
280
void clone(COPROCS& c, bool copy_used) {
288
inline void clear() {
292
inline void clear_usage() {
297
return (cuda.count==0) && (ati.count==0);
300
return cuda.count + ati.count;