77
const int MIN_SECONDS_TO_SEND = 0;  // lower clamp bound for requested work duration — presumably paired with MAX_SECONDS_TO_SEND below; clamp site not visible in this chunk, confirm
78
const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);  // upper clamp bound for requested work duration: 28 days, in seconds
80
// return a number that
81
// - is the # of CPUs in EDF simulation
82
// - scales the daily result quota
83
// - scales max_wus_in_progress
85
inline int effective_ncpus(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
86
int ncpus = reply.host.p_ncpus;
87
if (ncpus > config.max_ncpus) ncpus = config.max_ncpus;
88
if (ncpus < 1) ncpus = 1;
89
if (config.have_cuda_apps) {
90
COPROC* cp = sreq.coprocs.lookup("cuda");
91
if (cp && cp->count > ncpus) {
98
const double DEFAULT_RAM_SIZE = 64000000;  // ~64 MB fallback, substituted when the host reports an impossible RAM size
99
// if host sends us an impossible RAM size, use this instead
101
bool SCHEDULER_REQUEST::has_version(APP& app) {
104
for (i=0; i<client_app_versions.size(); i++) {
105
CLIENT_APP_VERSION& cav = client_app_versions[i];
106
if (!strcmp(cav.app_name, app.name) && cav.version_num >= app.min_version) {
113
// return BEST_APP_VERSION for the given host, or NULL if none
116
BEST_APP_VERSION* get_app_version(
117
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, WORKUNIT& wu
122
BEST_APP_VERSION* bavp;
126
// see if app is already in memoized array
128
for (i=0; i<reply.wreq.best_app_versions.size(); i++) {
129
bavp = reply.wreq.best_app_versions[i];
130
if (bavp->appid == wu.appid) {
131
if (!bavp->avp) return NULL;
136
APP* app = ssp->lookup_app(wu.appid);
138
log_messages.printf(MSG_CRITICAL, "WU refers to nonexistent app: %d\n", wu.appid);
142
bavp = new BEST_APP_VERSION;
143
bavp->appid = wu.appid;
144
if (anonymous(sreq.platforms.list[0])) {
145
found = sreq.has_version(*app);
147
if (config.debug_send) {
148
log_messages.printf(MSG_DEBUG,
149
"Didn't find anonymous platform app for %s\n", app->name
152
"Your app_info.xml file doesn't have a version of %s.",
153
app->user_friendly_name
155
USER_MESSAGE um(message, "high");
156
reply.wreq.insert_no_work_message(um);
157
reply.wreq.no_app_version = true;
161
if (config.debug_send) {
162
log_messages.printf(MSG_DEBUG,
163
"Found anonymous platform app for %s\n", app->name
166
// TODO: anonymous platform apps should be able to tell us
167
// how fast they are and how many CPUs and coprocs they use.
168
// For now, assume they use 1 CPU
170
bavp->host_usage.sequential_app(reply.host.p_fpops);
171
bavp->avp = (APP_VERSION*)1; // arbitrary nonzero value;
172
// means the client already has the app version
174
reply.wreq.best_app_versions.push_back(bavp);
175
if (!bavp->avp) return NULL;
179
// Go through the client's platforms.
180
// Scan the app versions for each platform.
181
// Find the one with highest expected FLOPS
183
bavp->host_usage.flops = 0;
185
for (i=0; i<sreq.platforms.list.size(); i++) {
186
PLATFORM* p = sreq.platforms.list[i];
187
for (j=0; j<ssp->napp_versions; j++) {
188
HOST_USAGE host_usage;
189
APP_VERSION& av = ssp->app_versions[j];
190
if (av.appid != wu.appid) continue;
191
if (av.platformid != p->id) continue;
192
if (sreq.core_client_version < av.min_core_version) {
193
log_messages.printf(MSG_NORMAL,
194
"outdated client version %d < min core version %d\n",
195
sreq.core_client_version, av.min_core_version
197
reply.wreq.outdated_core = true;
200
if (strlen(av.plan_class)) {
201
if (!sreq.client_cap_plan_class) continue;
202
if (!app_plan(sreq, av.plan_class, host_usage)) {
206
host_usage.sequential_app(reply.host.p_fpops);
208
if (host_usage.flops > bavp->host_usage.flops) {
209
bavp->host_usage = host_usage;
214
reply.wreq.best_app_versions.push_back(bavp);
216
if (config.debug_version_select) {
217
log_messages.printf(MSG_DEBUG,
218
"Best version of app %s is %d (%.2f GFLOPS)\n",
219
app->name, bavp->avp->id, bavp->host_usage.flops/1e9
83
const double MIN_REQ_SECS = 0;  // lower bound applied by clamp_req_sec() to client-requested seconds of work
84
const double MAX_REQ_SECS = (28*SECONDS_IN_DAY);  // upper bound applied by clamp_req_sec(): 28 days, in seconds
86
const int MAX_GPUS = 8;  // cap on the reported GPU count (cuda + ati) used in WORK_REQ::get_job_limits()
87
// don't believe clients who claim they have more GPUs than this
94
void WORK_REQ::get_job_limits() {
96
n = g_reply->host.p_ncpus;
97
if (g_request->global_prefs.max_ncpus_pct && g_request->global_prefs.max_ncpus_pct < 100) {
98
n = (int)((n*g_request->global_prefs.max_ncpus_pct)/100.);
100
if (n > config.max_ncpus) n = config.max_ncpus;
104
n = g_request->coprocs.cuda.count + g_request->coprocs.ati.count;
105
if (n > MAX_GPUS) n = MAX_GPUS;
108
int mult = effective_ncpus + config.gpu_multiplier * effective_ngpus;
109
if (config.non_cpu_intensive) {
112
if (effective_ngpus) effective_ngpus = 1;
115
if (config.max_wus_to_send) {
116
g_wreq->max_jobs_per_rpc = mult * config.max_wus_to_send;
223
// here if no app version exists
225
if (config.debug_version_select) {
226
log_messages.printf(MSG_DEBUG,
227
"no app version available: APP#%d PLATFORM#%d min_version %d\n",
228
app->id, sreq.platforms.list[0]->id, app->min_version
232
"%s is not available for your type of computer.",
233
app->user_friendly_name
118
g_wreq->max_jobs_per_rpc = 999999;
121
config.max_jobs_in_progress.reset(g_reply->host, g_request->coprocs);
123
if (config.debug_quota) {
124
log_messages.printf(MSG_NORMAL,
125
"[quota] max jobs per RPC: %d\n",
126
g_wreq->max_jobs_per_rpc
235
USER_MESSAGE um(message, "high");
236
reply.wreq.insert_no_work_message(um);
237
reply.wreq.no_app_version = true;
128
config.max_jobs_in_progress.print_log();
243
132
static const char* find_user_friendly_name(int appid) {
244
APP* app = ssp->lookup_app(appid);
245
if (app) return app->user_friendly_name;
133
APP* app = ssp->lookup_app(appid);
134
if (app) return app->user_friendly_name;
246
135
return "deprecated application";
304
193
// We can only honor the min_free pref.
306
195
x = host.d_free - prefs.disk_min_free_gb*GIGA; // may be negative
307
reply.disk_limits.min_free = x;
196
g_reply->disk_limits.min_free = x;
308
197
x1 = x2 = x3 = 0;
312
201
if (config.debug_send) {
313
log_messages.printf(MSG_DEBUG,
314
"Insufficient disk: disk_max_used_gb %f disk_max_used_pct %f disk_min_free_gb %f\n",
315
prefs.disk_max_used_gb, prefs.disk_max_used_pct,
316
prefs.disk_min_free_gb
318
log_messages.printf(MSG_DEBUG,
319
"Insufficient disk: host.d_total %f host.d_free %f host.d_boinc_used_total %f\n",
320
host.d_total, host.d_free, host.d_boinc_used_total
322
log_messages.printf(MSG_DEBUG,
323
"Insufficient disk: x1 %f x2 %f x3 %f x %f\n",
202
log_messages.printf(MSG_NORMAL,
203
"[send] No disk space available: disk_max_used_gb %.2fGB disk_max_used_pct %.2f disk_min_free_gb %.2fGB\n",
204
prefs.disk_max_used_gb/GIGA,
205
prefs.disk_max_used_pct,
206
prefs.disk_min_free_gb/GIGA
208
log_messages.printf(MSG_NORMAL,
209
"[send] No disk space available: host.d_total %.2fGB host.d_free %.2fGB host.d_boinc_used_total %.2fGB\n",
212
host.d_boinc_used_total/GIGA
214
log_messages.printf(MSG_NORMAL,
215
"[send] No disk space available: x1 %.2fGB x2 %.2fGB x3 %.2fGB x %.2fGB\n",
216
x1/GIGA, x2/GIGA, x3/GIGA, x/GIGA
327
reply.wreq.disk.set_insufficient(-x);
219
g_wreq->disk.set_insufficient(-x);
332
// if a host has active_frac < 0.1, assume 0.1 so we don't deprive it of work.
334
const double HOST_ACTIVE_FRAC_MIN = 0.1;  // floor for running_frac so hosts reporting active_frac < 0.1 still get work
336
// estimate the number of CPU seconds that a workunit requires
337
// running on this host.
339
double estimate_cpu_duration(WORKUNIT& wu, SCHEDULER_REPLY& reply) {
340
double p_fpops = reply.host.p_fpops;
341
if (p_fpops <= 0) p_fpops = 1e9;
225
static double estimate_duration_unscaled(WORKUNIT& wu, BEST_APP_VERSION& bav) {
342
226
double rsc_fpops_est = wu.rsc_fpops_est;
343
227
if (rsc_fpops_est <= 0) rsc_fpops_est = 1e12;
344
return rsc_fpops_est/p_fpops;
228
return rsc_fpops_est/bav.host_usage.projected_flops;
231
static inline void get_running_frac() {
233
if (g_request->core_client_version<=41900) {
234
rf = g_reply->host.on_frac;
236
rf = g_reply->host.active_frac * g_reply->host.on_frac;
239
// clamp running_frac to a reasonable range
242
if (config.debug_send) {
243
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 1\n", rf);
246
} else if (rf < .1) {
247
if (config.debug_send) {
248
log_messages.printf(MSG_NORMAL, "[send] running_frac=%f; setting to 0.1\n", rf);
252
g_wreq->running_frac = rf;
347
255
// estimate the amount of real time to complete this WU,
348
256
// taking into account active_frac etc.
349
257
// Note: don't factor in resource_share_fraction.
350
// The core client no longer necessarily does round-robin
351
// across all projects.
258
// The core client doesn't necessarily round-robin across all projects.
353
static double estimate_wallclock_duration(
354
WORKUNIT& wu, SCHEDULER_REQUEST&, SCHEDULER_REPLY& reply
356
double ecd = estimate_cpu_duration(wu, reply);
357
double ewd = ecd/reply.wreq.running_frac;
358
if (reply.host.duration_correction_factor) {
359
ewd *= reply.host.duration_correction_factor;
260
double estimate_duration(WORKUNIT& wu, BEST_APP_VERSION& bav) {
261
double edu = estimate_duration_unscaled(wu, bav);
262
double ed = edu/g_wreq->running_frac;
361
263
if (config.debug_send) {
362
log_messages.printf(MSG_DEBUG,
363
"est cpu dur %f; est wall dur %f\n", ecd, ewd
264
log_messages.printf(MSG_NORMAL,
265
"[send] est. duration for WU %d: unscaled %.2f scaled %.2f\n",
369
// Find or compute various info about the host;
370
// this info affects which jobs are sent to the host.
372
static int get_host_info(SCHEDULER_REPLY& reply) {
272
static void get_prefs_info() {
375
275
unsigned int pos = 0;
379
extract_venue(reply.user.project_prefs, reply.host.venue, buf);
279
extract_venue(g_reply->user.project_prefs, g_reply->host.venue, buf);
382
282
// scan user's project prefs for elements of the form <app_id>N</app_id>,
383
283
// indicating the apps they want to run.
385
reply.wreq.host_info.preferred_apps.clear();
285
g_wreq->preferred_apps.clear();
386
286
while (parse_int(str.substr(pos,str.length()-pos).c_str(), "<app_id>", temp_int)) {
388
288
ai.appid = temp_int;
389
289
ai.work_available = false;
390
reply.wreq.host_info.preferred_apps.push_back(ai);
290
g_wreq->preferred_apps.push_back(ai);
392
292
pos = str.find("<app_id>", pos) + 1;
394
if (parse_bool(buf,"allow_non_preferred_apps", flag)) {
395
reply.wreq.host_info.allow_non_preferred_apps = flag;
397
if (parse_bool(buf,"allow_beta_work", flag)) {
398
reply.wreq.host_info.allow_beta_work = flag;
401
// Decide whether or not this computer is 'reliable'
402
// A computer is reliable if the following conditions are true
403
// (for those that are set in the config file)
404
// 1) The host average turnaround is less than the config
405
// max average turnaround
406
// 2) The host error rate is less then the config max error rate
407
// 3) The host results per day is equal to the config file value
409
double expavg_credit = reply.host.expavg_credit;
410
double expavg_time = reply.host.expavg_time;
411
update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
413
// Platforms other then Windows, Linux and Intel Macs need a
294
if (parse_bool(buf,"allow_non_preferred_apps", flag)) {
295
g_wreq->allow_non_preferred_apps = flag;
297
if (parse_bool(buf,"allow_beta_work", flag)) {
298
g_wreq->allow_beta_work = flag;
300
if (parse_bool(buf,"no_gpus", flag)) {
301
// deprecated, but need to handle
303
g_wreq->no_cuda = true;
304
g_wreq->no_ati = true;
307
if (parse_bool(buf,"no_cpu", flag)) {
308
g_wreq->no_cpu = flag;
310
if (parse_bool(buf,"no_cuda", flag)) {
311
g_wreq->no_cuda = flag;
313
if (parse_bool(buf,"no_ati", flag)) {
314
g_wreq->no_ati = flag;
318
// Decide whether or not this app version is 'reliable'
319
// An app version is reliable if the following conditions are true
320
// (for those that are set in the config file)
321
// 1) The host average turnaround is less than a threshold
322
// 2) consecutive_valid is above a threshold
323
// 3) The host results per day is equal to the max value
325
void get_reliability_version(HOST_APP_VERSION& hav, double multiplier) {
326
if (hav.turnaround.n > MIN_HOST_SAMPLES && config.reliable_max_avg_turnaround) {
328
if (hav.turnaround.get_avg() > config.reliable_max_avg_turnaround*multiplier) {
329
if (config.debug_send) {
330
log_messages.printf(MSG_NORMAL,
331
"[send] [AV#%d] not reliable; avg turnaround: %.3f > %.3f hrs\n",
333
hav.turnaround.get_avg()/3600,
334
config.reliable_max_avg_turnaround*multiplier/3600
337
hav.reliable = false;
341
if (hav.consecutive_valid < CONS_VALID_RELIABLE) {
342
if (config.debug_send) {
343
log_messages.printf(MSG_NORMAL,
344
"[send] [AV#%d] not reliable; cons valid %d < %d\n",
346
hav.consecutive_valid, CONS_VALID_RELIABLE
349
hav.reliable = false;
352
if (config.daily_result_quota) {
353
if (hav.max_jobs_per_day < config.daily_result_quota) {
354
if (config.debug_send) {
355
log_messages.printf(MSG_NORMAL,
356
"[send] [AV#%d] not reliable; max_jobs_per_day %d>%d\n",
358
hav.max_jobs_per_day,
359
config.daily_result_quota
362
hav.reliable = false;
367
if (config.debug_send) {
368
log_messages.printf(MSG_NORMAL,
369
"[send] [HOST#%d] app version %d is reliable\n",
370
g_reply->host.id, hav.app_version_id
373
g_wreq->has_reliable_version = true;
376
// decide whether do unreplicated jobs with this app version
378
static void set_trust(DB_HOST_APP_VERSION& hav) {
380
if (hav.consecutive_valid < CONS_VALID_UNREPLICATED) {
381
if (config.debug_send) {
382
log_messages.printf(MSG_NORMAL,
383
"[send] set_trust: cons valid %d < %d, don't use single replication\n",
384
hav.consecutive_valid, CONS_VALID_UNREPLICATED
389
double x = 1./hav.consecutive_valid;
390
if (drand() > x) hav.trusted = true;
391
if (config.debug_send) {
392
log_messages.printf(MSG_NORMAL,
393
"[send] set_trust: random choice for cons valid %d: %s\n",
394
hav.consecutive_valid, hav.trusted?"yes":"no"
399
static void update_quota(DB_HOST_APP_VERSION& hav) {
400
if (config.daily_result_quota) {
401
if (hav.max_jobs_per_day == 0) {
402
hav.max_jobs_per_day = config.daily_result_quota;
403
if (config.debug_quota) {
404
log_messages.printf(MSG_NORMAL,
405
"[quota] [HAV#%d] Initializing max_results_day to %d\n",
407
config.daily_result_quota
413
if (g_request->last_rpc_dayofyear != g_request->current_rpc_dayofyear) {
414
if (config.debug_quota) {
415
log_messages.printf(MSG_NORMAL,
416
"[quota] [HOST#%d] [HAV#%d] Resetting n_jobs_today\n",
417
g_reply->host.id, hav.app_version_id
420
hav.n_jobs_today = 0;
424
void update_n_jobs_today() {
425
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
426
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
431
static void get_reliability_and_trust() {
432
// Platforms other than Windows, Linux and Intel Macs need a
414
433
// larger set of computers to be marked reliable
416
435
double multiplier = 1.0;
417
if (strstr(reply.host.os_name,"Windows")
418
|| strstr(reply.host.os_name,"Linux")
419
|| (strstr(reply.host.os_name,"Darwin")
420
&& !(strstr(reply.host.p_vendor,"Power Macintosh"))
436
if (strstr(g_reply->host.os_name,"Windows")
437
|| strstr(g_reply->host.os_name,"Linux")
438
|| (strstr(g_reply->host.os_name,"Darwin")
439
&& !(strstr(g_reply->host.p_vendor,"Power Macintosh"))
427
if ((config.reliable_max_avg_turnaround == 0 || reply.host.avg_turnaround < config.reliable_max_avg_turnaround*multiplier)
428
&& (config.reliable_max_error_rate == 0 || reply.host.error_rate < config.reliable_max_error_rate*multiplier)
429
&& (config.daily_result_quota == 0 || reply.host.max_results_day >= config.daily_result_quota)
431
reply.wreq.host_info.reliable = true;
433
if (config.debug_send) {
434
log_messages.printf(MSG_DEBUG,
435
"[HOST#%d] is%s reliable (OS = %s) error_rate = %.6f avg_turn_hrs = %.3f \n",
437
reply.wreq.host_info.reliable?"":" not",
438
reply.host.os_name, reply.host.error_rate,
439
reply.host.avg_turnaround/3600
446
for (unsigned int i=0; i<g_wreq->host_app_versions.size(); i++) {
447
DB_HOST_APP_VERSION& hav = g_wreq->host_app_versions[i];
448
get_reliability_version(hav, multiplier);
445
453
// Return true if the user has set application preferences,
446
454
// and this job is not for a selected app
448
bool app_not_selected(
449
WORKUNIT& wu, SCHEDULER_REQUEST& , SCHEDULER_REPLY& reply
456
bool app_not_selected(WORKUNIT& wu) {
453
if (reply.wreq.host_info.preferred_apps.size() == 0) return false;
454
for (i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
455
if (wu.appid == reply.wreq.host_info.preferred_apps[i].appid) {
456
reply.wreq.host_info.preferred_apps[i].work_available = true;
459
if (g_wreq->preferred_apps.size() == 0) return false;
460
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
461
if (wu.appid == g_wreq->preferred_apps[i].appid) {
462
g_wreq->preferred_apps[i].work_available = true;
841
957
// and we haven't exceeded result per RPC limit,
842
958
// and we haven't exceeded results per day limit
845
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, bool locality_sched
960
bool work_needed(bool locality_sched) {
847
961
if (locality_sched) {
848
962
// if we've failed to send a result because of a transient condition,
849
963
// return false to preserve invariant
851
if (reply.wreq.disk.insufficient || reply.wreq.speed.insufficient || reply.wreq.mem.insufficient || reply.wreq.no_allowed_apps_available) {
855
if (reply.wreq.seconds_to_fill <= 0) return false;
856
if (reply.wreq.disk_available <= 0) {
859
if (reply.wreq.nresults >= config.max_wus_to_send) return false;
861
int ncpus = effective_ncpus(sreq, reply);
863
// host.max_results_day is between 1 and config.daily_result_quota inclusive
864
// wreq.daily_result_quota is between ncpus
865
// and ncpus*host.max_results_day inclusive
867
if (config.daily_result_quota) {
868
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {
869
reply.host.max_results_day = config.daily_result_quota;
871
reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day;
872
if (reply.host.nresults_today >= reply.wreq.daily_result_quota) {
873
reply.wreq.daily_result_quota_exceeded = true;
878
if (config.max_wus_in_progress) {
879
if (reply.wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
965
if (g_wreq->disk.insufficient || g_wreq->speed.insufficient || g_wreq->mem.insufficient || g_wreq->no_allowed_apps_available) {
880
966
if (config.debug_send) {
881
log_messages.printf(MSG_DEBUG,
882
"in-progress job limit exceeded; %d > %d*%d\n",
883
reply.wreq.nresults_on_host, config.max_wus_in_progress, ncpus
967
log_messages.printf(MSG_NORMAL,
968
"[send] stopping work search - locality condition\n"
886
reply.wreq.cache_size_exceeded = true;
893
void SCHEDULER_REPLY::got_good_result() {
894
host.max_results_day *= 2;
895
if (host.max_results_day > config.daily_result_quota) {
896
host.max_results_day = config.daily_result_quota;
900
void SCHEDULER_REPLY::got_bad_result() {
901
host.max_results_day -= 1;
902
if (host.max_results_day < 1) {
903
host.max_results_day = 1;
975
// see if we've reached limits on in-progress jobs
977
bool some_type_allowed = false;
978
if (config.max_jobs_in_progress.exceeded(NULL, true)) {
979
if (config.debug_quota) {
980
log_messages.printf(MSG_NORMAL,
981
"[quota] reached limit on GPU jobs in progress\n"
984
g_wreq->clear_gpu_req();
985
if (g_wreq->effective_ngpus) {
986
g_wreq->max_jobs_on_host_gpu_exceeded = true;
989
some_type_allowed = true;
991
if (config.max_jobs_in_progress.exceeded(NULL, false)) {
992
if (config.debug_quota) {
993
log_messages.printf(MSG_NORMAL,
994
"[quota] reached limit on CPU jobs in progress\n"
997
g_wreq->clear_cpu_req();
998
g_wreq->max_jobs_on_host_cpu_exceeded = true;
1000
some_type_allowed = true;
1002
if (!some_type_allowed) {
1003
if (config.debug_send) {
1004
log_messages.printf(MSG_NORMAL,
1005
"[send] in-progress job limit exceeded\n"
1008
g_wreq->max_jobs_on_host_exceeded = true;
1012
// see if we've reached max jobs per RPC
1014
if (g_wreq->njobs_sent >= g_wreq->max_jobs_per_rpc) {
1015
if (config.debug_quota) {
1016
log_messages.printf(MSG_NORMAL,
1017
"[quota] stopping work search - njobs %d >= max_jobs_per_rpc %d\n",
1018
g_wreq->njobs_sent, g_wreq->max_jobs_per_rpc
1025
log_messages.printf(MSG_NORMAL,
1026
"work_needed: spec req %d sec to fill %.2f; CPU (%.2f, %.2f) CUDA (%.2f, %.2f) ATI(%.2f, %.2f)\n",
1027
g_wreq->rsc_spec_request,
1028
g_wreq->seconds_to_fill,
1029
g_wreq->cpu_req_secs, g_wreq->cpu_req_instances,
1030
g_wreq->cuda_req_secs, g_wreq->cuda_req_instances,
1031
g_wreq->ati_req_secs, g_wreq->ati_req_instances
1034
if (g_wreq->rsc_spec_request) {
1035
if (g_wreq->need_cpu()) {
1038
if (g_wreq->need_cuda()) {
1041
if (g_wreq->need_ati()) {
1045
if (g_wreq->seconds_to_fill > 0) {
1049
if (config.debug_send) {
1050
log_messages.printf(MSG_NORMAL, "[send] don't need more work\n");
1055
// return the app version ID, or -2/-3/-4 if anonymous platform
1057
inline static int get_app_version_id(BEST_APP_VERSION* bavp) {
1059
return bavp->avp->id;
1061
return bavp->cavp->host_usage.resource_type();
907
1065
int add_result_to_reply(
908
DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REQUEST& request,
909
SCHEDULER_REPLY& reply, BEST_APP_VERSION* bavp
1066
DB_RESULT& result, WORKUNIT& wu, BEST_APP_VERSION* bavp,
1067
bool locality_scheduling
912
double wu_seconds_filled;
913
1070
bool resent_result = false;
914
1071
APP* app = ssp->lookup_app(wu.appid);
916
retval = add_wu_to_reply(wu, reply, app, bavp);
1073
retval = add_wu_to_reply(wu, *g_reply, app, bavp);
917
1074
if (retval) return retval;
919
// in the scheduling locality case,
920
// reduce the available space by LESS than the workunit rsc_disk_bound,
921
// IF the host already has the file OR the file was not already sent.
1076
// Adjust available disk space.
1077
// In the scheduling locality case,
1078
// reduce the available space by less than the workunit rsc_disk_bound,
1079
// if the host already has the file or the file was not already sent.
923
if (!config.locality_scheduling ||
924
decrement_disk_space_locality(wu, request, reply)
926
reply.wreq.disk_available -= wu.rsc_disk_bound;
1081
if (!locality_scheduling || decrement_disk_space_locality(wu)) {
1082
g_wreq->disk_available -= wu.rsc_disk_bound;
929
1085
// update the result in DB
931
result.hostid = reply.host.id;
932
result.userid = reply.user.id;
1087
result.hostid = g_reply->host.id;
1088
result.userid = g_reply->user.id;
933
1089
result.sent_time = time(0);
1090
result.report_deadline = result.sent_time + wu.delay_bound;
1091
result.flops_estimate = bavp->host_usage.peak_flops;
1092
result.app_version_id = get_app_version_id(bavp);
934
1093
int old_server_state = result.server_state;
936
int delay_bound = wu.delay_bound;
937
1095
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
938
// We are sending this result for the first time
940
// If the workunit needs reliable and is being sent to a reliable host,
941
// then shorten the delay bound by the percent specified
943
if (config.reliable_on_priority && result.priority >= config.reliable_on_priority && config.reliable_reduced_delay_bound > 0.01
945
double reduced_delay_bound = delay_bound*config.reliable_reduced_delay_bound;
946
double est_wallclock_duration = estimate_wallclock_duration(wu, request, reply);
947
// Check to see how reasonable this reduced time is.
948
// Increase it to twice the estimated delay bound
949
// if all the following apply:
951
// 1) Twice the estimate is longer then the reduced delay bound
952
// 2) Twice the estimate is less then the original delay bound
953
// 3) Twice the estimate is less then the twice the reduced delay bound
954
if (est_wallclock_duration*2 > reduced_delay_bound && est_wallclock_duration*2 < delay_bound && est_wallclock_duration*2 < delay_bound*config.reliable_reduced_delay_bound*2 ) {
955
reduced_delay_bound = est_wallclock_duration*2;
957
delay_bound = (int) reduced_delay_bound;
960
result.report_deadline = result.sent_time + delay_bound;
1096
// We're sending this result for the first time
961
1098
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
963
1100
// Result was already sent to this host but was lost,
964
// so we are resending it.
1101
// so we're resending it.
966
1103
resent_result = true;
968
// TODO: explain the following
970
if (result.report_deadline < result.sent_time) {
971
result.report_deadline = result.sent_time + 10;
973
if (result.report_deadline > result.sent_time + delay_bound) {
974
result.report_deadline = result.sent_time + delay_bound;
977
1105
if (config.debug_send) {
978
log_messages.printf(MSG_DEBUG,
979
"[RESULT#%d] [HOST#%d] (resend lost work)\n",
980
result.id, reply.host.id
1106
log_messages.printf(MSG_NORMAL,
1107
"[send] [RESULT#%d] [HOST#%d] (resend lost work)\n",
1108
result.id, g_reply->host.id
1138
1374
// Tell the user about applications they didn't qualify for
1140
1376
for (j=0; j<preferred_app_message_index; j++){
1141
reply.insert_message(reply.wreq.no_work_messages.at(j));
1377
g_reply->insert_message(g_wreq->no_work_messages.at(j));
1144
"You have selected to receive work from other applications if no work is available for the applications you selected",
1147
reply.insert_message(um1);
1148
USER_MESSAGE um2("Sending work from other applications", "high");
1149
reply.insert_message(um2);
1379
g_reply->insert_message(
1380
"Your preferences allow work from applications other than those selected",
1383
g_reply->insert_message(
1384
"Sending work from other applications", "low"
1153
1389
// if client asked for work and we're not sending any, explain why
1155
if (reply.wreq.nresults == 0) {
1156
reply.set_delay(DELAY_NO_WORK_TEMP);
1157
USER_MESSAGE um2("No work sent", "high");
1158
reply.insert_message(um2);
1159
// Inform the user about applications with no work
1160
for (i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
1161
if (!reply.wreq.host_info.preferred_apps[i].work_available) {
1162
APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid);
1163
// don't write message if the app is deprecated
1165
char explanation[256];
1166
sprintf(explanation, "No work is available for %s",
1167
find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid)
1391
if (g_wreq->njobs_sent == 0) {
1392
g_reply->set_delay(DELAY_NO_WORK_TEMP);
1393
g_reply->insert_message("No work sent", "low");
1395
// Tell the user about applications with no work
1397
for (i=0; i<g_wreq->preferred_apps.size(); i++) {
1398
if (!g_wreq->preferred_apps[i].work_available) {
1399
APP* app = ssp->lookup_app(g_wreq->preferred_apps[i].appid);
1400
// don't write message if the app is deprecated
1402
sprintf(buf, "No work is available for %s",
1403
find_user_friendly_name(
1404
g_wreq->preferred_apps[i].appid
1169
USER_MESSAGE um(explanation, "high");
1170
reply.insert_message(um);
1174
// Inform the user about applications they didn't qualify for
1175
for (i=0; i<reply.wreq.no_work_messages.size(); i++){
1176
reply.insert_message(reply.wreq.no_work_messages.at(i));
1178
if (reply.wreq.no_app_version) {
1179
reply.set_delay(DELAY_NO_WORK_PERM);
1181
if (reply.wreq.no_allowed_apps_available) {
1183
"No work available for the applications you have selected. Please check your settings on the web site.",
1407
g_reply->insert_message(buf, "low");
1412
// Tell the user about applications they didn't qualify for
1414
for (i=0; i<g_wreq->no_work_messages.size(); i++){
1415
g_reply->insert_message(g_wreq->no_work_messages.at(i));
1417
if (g_wreq->no_allowed_apps_available) {
1418
g_reply->insert_message(
1419
_("No work available for the applications you have selected. Please check your project preferences on the web site."),
1186
reply.insert_message(um);
1188
if (reply.wreq.speed.insufficient) {
1189
if (sreq.core_client_version>419) {
1191
"(won't finish in time) "
1192
"BOINC runs %.1f%% of time, computation enabled %.1f%% of that",
1193
100.0*reply.host.on_frac, 100.0*reply.host.active_frac
1423
if (g_wreq->speed.insufficient) {
1424
if (g_request->core_client_version>41900) {
1426
"Tasks won't finish in time: BOINC runs %.1f%% of the time; computation is enabled %.1f%% of that",
1427
100*g_reply->host.on_frac, 100*g_reply->host.active_frac
1197
"(won't finish in time) "
1198
"Computer available %.1f%% of time",
1199
100.0*reply.host.on_frac
1202
USER_MESSAGE um(helpful, "high");
1203
reply.insert_message(um);
1205
if (reply.wreq.hr_reject_temp) {
1207
"(there was work but it was committed to other platforms)",
1210
reply.insert_message(um);
1212
if (reply.wreq.hr_reject_perm) {
1214
"(your platform is not supported by this project)",
1217
reply.insert_message(um);
1219
if (reply.wreq.outdated_core) {
1221
" (your BOINC client is old - please install current version)",
1224
reply.insert_message(um);
1225
reply.set_delay(DELAY_NO_WORK_PERM);
1226
log_messages.printf(MSG_NORMAL,
1227
"Not sending work because client is outdated\n"
1230
if (reply.wreq.excessive_work_buf) {
1232
"(Your network connection interval is longer than WU deadline)",
1235
reply.insert_message(um);
1237
if (reply.wreq.no_jobs_available) {
1239
"(Project has no jobs available)",
1242
reply.insert_message(um);
1244
if (reply.wreq.daily_result_quota_exceeded) {
1245
struct tm *rpc_time_tm;
1248
sprintf(helpful, "(reached daily quota of %d results)", reply.wreq.daily_result_quota);
1249
USER_MESSAGE um(helpful, "high");
1250
reply.insert_message(um);
1251
log_messages.printf(MSG_NORMAL,
1252
"Daily result quota exceeded for host %d\n",
1256
// set delay so host won't return until a random time in
1257
// the first hour of the next day.
1258
// This is to prevent a lot of hosts from flooding the scheduler
1259
// with requests at the same time of day.
1261
time_t t = reply.host.rpc_time;
1262
rpc_time_tm = localtime(&t);
1263
delay_time = (23 - rpc_time_tm->tm_hour) * 3600
1264
+ (59 - rpc_time_tm->tm_min) * 60
1265
+ (60 - rpc_time_tm->tm_sec)
1266
+ (int)(3600*(double)rand()/(double)RAND_MAX);
1267
reply.set_delay(delay_time);
1269
if (reply.wreq.cache_size_exceeded) {
1270
sprintf(helpful, "(reached per-CPU limit of %d tasks)",
1271
config.max_wus_in_progress
1273
USER_MESSAGE um(helpful, "high");
1274
reply.insert_message(um);
1275
reply.set_delay(DELAY_NO_WORK_CACHE);
1276
log_messages.printf(MSG_NORMAL,
1277
"host %d already has %d result(s) in progress\n",
1278
reply.host.id, reply.wreq.nresults_on_host
1284
static void get_running_frac(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1285
if (sreq.core_client_version<=419) {
1286
reply.wreq.running_frac = reply.host.on_frac;
1431
"Tasks won't finish in time: Computer available %.1f%% of the time",
1432
100*g_reply->host.on_frac
1435
g_reply->insert_message(buf, "low");
1437
if (g_wreq->hr_reject_temp) {
1438
g_reply->insert_message(
1439
"Tasks are committed to other platforms",
1443
if (g_wreq->hr_reject_perm) {
1444
g_reply->insert_message(
1445
_("Your computer type is not supported by this project"),
1449
if (g_wreq->outdated_client) {
1450
g_reply->insert_message(
1451
_("Newer BOINC version required; please install current version"),
1454
g_reply->set_delay(DELAY_NO_WORK_PERM);
1455
log_messages.printf(MSG_NORMAL,
1456
"Not sending work because newer client version required\n"
1459
if (g_wreq->no_cuda_prefs) {
1460
g_reply->insert_message(
1461
_("Tasks for NVIDIA GPU are available, but your preferences are set to not accept them"),
1465
if (g_wreq->no_ati_prefs) {
1466
g_reply->insert_message(
1467
_("Tasks for ATI GPU are available, but your preferences are set to not accept them"),
1471
if (g_wreq->no_cpu_prefs) {
1472
g_reply->insert_message(
1473
_("Tasks for CPU are available, but your preferences are set to not accept them"),
1477
DB_HOST_APP_VERSION* havp = quota_exceeded_version();
1479
sprintf(buf, "This computer has finished a daily quota of %d tasks)",
1480
havp->max_jobs_per_day
1482
g_reply->insert_message(buf, "low");
1483
if (config.debug_quota) {
1484
log_messages.printf(MSG_NORMAL,
1485
"[quota] Daily quota %d exceeded for app version %d\n",
1486
havp->max_jobs_per_day, havp->app_version_id
1489
g_reply->set_delay(DELAY_NO_WORK_CACHE);
1491
if (g_wreq->max_jobs_on_host_exceeded
1492
|| g_wreq->max_jobs_on_host_cpu_exceeded
1493
|| g_wreq->max_jobs_on_host_gpu_exceeded
1495
sprintf(buf, "This computer has reached a limit on tasks in progress");
1496
g_reply->insert_message(buf, "low");
1497
g_reply->set_delay(DELAY_NO_WORK_CACHE);
1502
static double clamp_req_sec(double x) {
1503
if (x < MIN_REQ_SECS) return MIN_REQ_SECS;
1504
if (x > MAX_REQ_SECS) return MAX_REQ_SECS;
1508
// prepare to send jobs, both resent and new;
1509
// decipher request type, fill in WORK_REQ
1511
void send_work_setup() {
1514
g_wreq->seconds_to_fill = clamp_req_sec(g_request->work_req_seconds);
1515
g_wreq->cpu_req_secs = clamp_req_sec(g_request->cpu_req_secs);
1516
g_wreq->cpu_req_instances = g_request->cpu_req_instances;
1517
g_wreq->anonymous_platform = is_anonymous(g_request->platforms.list[0]);
1519
if (g_wreq->anonymous_platform) {
1520
estimate_flops_anon_platform();
1522
cuda_requirements.clear();
1523
ati_requirements.clear();
1525
g_wreq->disk_available = max_allowable_disk();
1528
g_wreq->get_job_limits();
1530
if (g_request->coprocs.cuda.count) {
1531
g_wreq->cuda_req_secs = clamp_req_sec(g_request->coprocs.cuda.req_secs);
1532
g_wreq->cuda_req_instances = g_request->coprocs.cuda.req_instances;
1533
if (g_request->coprocs.cuda.estimated_delay < 0) {
1534
g_request->coprocs.cuda.estimated_delay = g_request->cpu_estimated_delay;
1537
if (g_request->coprocs.ati.count) {
1538
g_wreq->ati_req_secs = clamp_req_sec(g_request->coprocs.ati.req_secs);
1539
g_wreq->ati_req_instances = g_request->coprocs.ati.req_instances;
1540
if (g_request->coprocs.ati.estimated_delay < 0) {
1541
g_request->coprocs.ati.estimated_delay = g_request->cpu_estimated_delay;
1544
if (g_wreq->cpu_req_secs || g_wreq->cuda_req_secs || g_wreq->ati_req_secs) {
1545
g_wreq->rsc_spec_request = true;
1288
reply.wreq.running_frac = reply.host.active_frac * reply.host.on_frac;
1290
if (reply.wreq.running_frac < HOST_ACTIVE_FRAC_MIN) {
1291
reply.wreq.running_frac = HOST_ACTIVE_FRAC_MIN;
1293
if (reply.wreq.running_frac > 1) reply.wreq.running_frac = 1;
1296
static void send_work_old(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1297
reply.wreq.beta_only = false;
1298
reply.wreq.user_apps_only = true;
1300
// give top priority to results that require a 'reliable host'
1302
if (reply.wreq.host_info.reliable) {
1303
reply.wreq.reliable_only = true;
1304
reply.wreq.infeasible_only = false;
1305
scan_work_array(sreq, reply);
1307
reply.wreq.reliable_only = false;
1309
// give 2nd priority to results for a beta app
1310
// (projects should load beta work with care,
1311
// otherwise your users won't get production work done!
1313
if (reply.wreq.host_info.allow_beta_work) {
1314
reply.wreq.beta_only = true;
1315
if (config.debug_send) {
1316
log_messages.printf(MSG_DEBUG,
1317
"[HOST#%d] will accept beta work. Scanning for beta work.\n",
1321
scan_work_array(sreq, reply);
1323
reply.wreq.beta_only = false;
1325
// give next priority to results that were infeasible for some other host
1327
reply.wreq.infeasible_only = true;
1328
scan_work_array(sreq, reply);
1330
reply.wreq.infeasible_only = false;
1331
scan_work_array(sreq, reply);
1333
// If user has selected apps but will accept any,
1334
// and we haven't found any jobs for selected apps, try others
1336
if (!reply.wreq.nresults && reply.wreq.host_info.allow_non_preferred_apps ) {
1337
reply.wreq.user_apps_only = false;
1338
preferred_app_message_index = reply.wreq.no_work_messages.size();
1339
if (config.debug_send) {
1340
log_messages.printf(MSG_DEBUG,
1341
"[HOST#%d] is looking for work from a non-preferred application\n",
1345
scan_work_array(sreq, reply);
1350
// decide whether to unreplicated jobs to this host
1352
void set_trust(SCHEDULER_REPLY& reply) {
1353
reply.wreq.trust = false;
1354
if (reply.host.error_rate > ER_MAX) {
1355
if (config.debug_send) {
1356
log_messages.printf(MSG_DEBUG,
1357
"set_trust: error rate %f > %f, don't trust\n",
1358
reply.host.error_rate, ER_MAX
1363
double x = sqrt(reply.host.error_rate/ER_MAX);
1364
if (drand() > x) reply.wreq.trust = true;
1365
if (config.debug_send) {
1366
log_messages.printf(MSG_DEBUG,
1367
"set_trust: random choice for error rate %f: %s\n",
1368
reply.host.error_rate, reply.wreq.trust?"yes":"no"
1373
void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1374
if (sreq.work_req_seconds <= 0) return;
1376
reply.wreq.disk_available = max_allowable_disk(sreq, reply);
1378
if (hr_unknown_platform(sreq.host)) {
1379
reply.wreq.hr_reject_perm = true;
1383
get_host_info(reply); // parse project prefs for app details
1387
get_running_frac(sreq, reply);
1389
if (config.debug_send) {
1390
log_messages.printf(MSG_DEBUG,
1391
"%s matchmaker scheduling; %s EDF sim\n",
1547
g_wreq->rsc_spec_request = false;
1550
for (i=0; i<g_request->other_results.size(); i++) {
1551
OTHER_RESULT& r = g_request->other_results[i];
1553
bool uses_gpu = false;
1554
bool have_cav = false;
1555
if (r.app_version >= 0
1556
&& r.app_version < (int)g_request->client_app_versions.size()
1558
CLIENT_APP_VERSION& cav = g_request->client_app_versions[r.app_version];
1562
uses_gpu = cav.host_usage.uses_gpu();
1566
if (r.have_plan_class && app_plan_uses_gpu(r.plan_class)) {
1570
config.max_jobs_in_progress.register_job(app, uses_gpu);
1573
// print details of request to log
1575
if (config.debug_send) {
1576
log_messages.printf(MSG_NORMAL,
1577
"[send] %s matchmaker scheduling; %s EDF sim\n",
1392
1578
config.matchmaker?"Using":"Not using",
1393
1579
config.workload_sim?"Using":"Not using"
1395
log_messages.printf(MSG_DEBUG,
1396
"available disk %f GB, work_buf_min %d\n",
1397
reply.wreq.disk_available/GIGA,
1398
(int)sreq.global_prefs.work_buf_min()
1400
log_messages.printf(MSG_DEBUG,
1401
"running frac %f DCF %f est delay %d\n",
1402
reply.wreq.running_frac,
1403
reply.host.duration_correction_factor,
1404
(int)sreq.estimated_delay
1408
reply.wreq.seconds_to_fill = sreq.work_req_seconds;
1409
if (reply.wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
1410
reply.wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
1412
if (reply.wreq.seconds_to_fill < MIN_SECONDS_TO_SEND) {
1413
reply.wreq.seconds_to_fill = MIN_SECONDS_TO_SEND;
1581
log_messages.printf(MSG_NORMAL,
1582
"[send] CPU: req %.2f sec, %.2f instances; est delay %.2f\n",
1583
g_wreq->cpu_req_secs, g_wreq->cpu_req_instances,
1584
g_request->cpu_estimated_delay
1586
if (g_request->coprocs.cuda.count) {
1587
log_messages.printf(MSG_NORMAL,
1588
"[send] CUDA: req %.2f sec, %.2f instances; est delay %.2f\n",
1589
g_wreq->cuda_req_secs, g_wreq->cuda_req_instances,
1590
g_request->coprocs.cuda.estimated_delay
1593
if (g_request->coprocs.ati.count) {
1594
log_messages.printf(MSG_NORMAL,
1595
"[send] ATI: req %.2f sec, %.2f instances; est delay %.2f\n",
1596
g_wreq->ati_req_secs, g_wreq->ati_req_instances,
1597
g_request->coprocs.ati.estimated_delay
1600
log_messages.printf(MSG_NORMAL,
1601
"[send] work_req_seconds: %.2f secs\n",
1602
g_wreq->seconds_to_fill
1604
log_messages.printf(MSG_NORMAL,
1605
"[send] available disk %.2f GB, work_buf_min %d\n",
1606
g_wreq->disk_available/GIGA,
1607
(int)g_request->global_prefs.work_buf_min()
1609
log_messages.printf(MSG_NORMAL,
1610
"[send] active_frac %f on_frac %f\n",
1611
g_reply->host.active_frac,
1612
g_reply->host.on_frac
1614
if (g_wreq->anonymous_platform) {
1615
log_messages.printf(MSG_NORMAL,
1616
"Anonymous platform app versions:\n"
1618
for (i=0; i<g_request->client_app_versions.size(); i++) {
1619
CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
1620
log_messages.printf(MSG_NORMAL,
1621
" app: %s version %d cpus %.2f cudas %.2f atis %.2f flops %fG\n",
1624
cav.host_usage.avg_ncpus,
1625
cav.host_usage.ncudas,
1626
cav.host_usage.natis,
1627
cav.host_usage.projected_flops/1e9
1634
// If a record is not in DB, create it.
1636
int update_host_app_versions(vector<RESULT>& results, int hostid) {
1637
vector<DB_HOST_APP_VERSION> new_havs;
1641
for (i=0; i<results.size(); i++) {
1642
RESULT& r = results[i];
1643
int gavid = generalized_app_version_id(r.app_version_id, r.appid);
1644
DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
1647
for (j=0; j<new_havs.size(); j++) {
1648
DB_HOST_APP_VERSION& hav = new_havs[j];
1649
if (hav.app_version_id == gavid) {
1654
DB_HOST_APP_VERSION hav;
1656
hav.host_id = hostid;
1657
hav.app_version_id = gavid;
1658
new_havs.push_back(hav);
1663
// create new records
1665
for (i=0; i<new_havs.size(); i++) {
1666
DB_HOST_APP_VERSION& hav = new_havs[i];
1668
retval = hav.insert();
1670
log_messages.printf(MSG_CRITICAL,
1671
"hav.insert(): %d\n", retval
1674
if (config.debug_credit) {
1675
log_messages.printf(MSG_NORMAL,
1676
"[credit] created host_app_version record (%d, %d)\n",
1677
hav.host_id, hav.app_version_id
1688
if (!work_needed(false)) {
1689
send_user_messages();
1692
g_wreq->no_jobs_available = true;
1694
if (!g_wreq->rsc_spec_request && g_wreq->seconds_to_fill == 0) {
1698
if (all_apps_use_hr && hr_unknown_platform(g_request->host)) {
1699
log_messages.printf(MSG_NORMAL,
1700
"Not sending work because unknown HR class\n"
1702
g_wreq->hr_reject_perm = true;
1706
// decide on attributes of HOST_APP_VERSIONS
1708
get_reliability_and_trust();
1416
1712
if (config.enable_assignment) {
1417
if (send_assigned_jobs(sreq, reply)) {
1713
if (send_assigned_jobs()) {
1418
1714
if (config.debug_assignment) {
1419
log_messages.printf(MSG_DEBUG,
1420
"[HOST#%d] sent assigned jobs\n", reply.host.id
1715
log_messages.printf(MSG_NORMAL,
1716
"[assign] [HOST#%d] sent assigned jobs\n", g_reply->host.id
1427
if (config.workload_sim && sreq.have_other_results_list) {
1723
if (config.workload_sim && g_request->have_other_results_list) {
1428
1724
init_ip_results(
1429
sreq.global_prefs.work_buf_min(), effective_ncpus(sreq, reply), sreq.ip_results
1725
g_request->global_prefs.work_buf_min(),
1726
g_wreq->effective_ncpus, g_request->ip_results
1433
if (config.locality_scheduling) {
1434
reply.wreq.infeasible_only = false;
1435
send_work_locality(sreq, reply);
1730
if (config.locality_scheduler_fraction > 0) {
1731
if (drand() < config.locality_scheduler_fraction) {
1732
if (config.debug_locality) {
1733
log_messages.printf(MSG_NORMAL,
1734
"[mixed] sending locality work first\n"
1737
send_work_locality();
1738
if (config.debug_locality) {
1739
log_messages.printf(MSG_NORMAL,
1740
"[mixed] sending non-locality work second\n"
1745
if (config.debug_locality) {
1746
log_messages.printf(MSG_NORMAL,
1747
"[mixed] sending non-locality work first\n"
1751
if (config.debug_locality) {
1752
log_messages.printf(MSG_NORMAL,
1753
"[mixed] sending locality work second\n"
1756
send_work_locality();
1758
} else if (config.locality_scheduling) {
1759
send_work_locality();
1436
1760
} else if (config.matchmaker) {
1437
send_work_matchmaker(sreq, reply);
1439
send_work_old(sreq, reply);
1442
explain_to_user(sreq, reply);
1445
// Matchmaker scheduling code follows
1453
BEST_APP_VERSION* bavp;
1455
bool get_score(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
1464
std::list<JOB> jobs; // sorted high to low
1466
JOB_SET(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1467
work_req = sreq.work_req_seconds;
1470
disk_limit = reply.wreq.disk_available;
1471
max_jobs = config.max_wus_to_send;
1472
int ncpus = effective_ncpus(sreq, reply), n;
1474
if (config.daily_result_quota) {
1475
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {
1476
reply.host.max_results_day = config.daily_result_quota;
1478
reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day;
1479
n = reply.wreq.daily_result_quota - reply.host.nresults_today;
1481
if (n < max_jobs) max_jobs = n;
1484
if (config.max_wus_in_progress) {
1485
n = config.max_wus_in_progress*ncpus - reply.wreq.nresults_on_host;
1487
if (n < max_jobs) max_jobs = n;
1491
double higher_score_disk_usage(double);
1492
double lowest_score();
1493
inline bool request_satisfied() {
1494
return est_time >= work_req;
1496
void send(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
1499
// reread result from DB, make sure it's still unsent
1500
// TODO: from here to add_result_to_reply()
1501
// (which updates the DB record) should be a transaction
1503
int read_sendable_result(DB_RESULT& result) {
1504
int retval = result.lookup_id(result.id);
1506
log_messages.printf(MSG_CRITICAL,
1507
"[RESULT#%d] result.lookup_id() failed %d\n",
1510
return ERR_NOT_FOUND;
1512
if (result.server_state != RESULT_SERVER_STATE_UNSENT) {
1513
log_messages.printf(MSG_NORMAL,
1514
"[RESULT#%d] expected to be unsent; instead, state is %d\n",
1515
result.id, result.server_state
1517
return ERR_BAD_RESULT_STATE;
1522
// compute a "score" for sending this job to this host.
1523
// Return false if the WU is infeasible.
1524
// Otherwise set est_time and disk_usage.
1526
bool JOB::get_score(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1530
WU_RESULT& wu_result = ssp->wu_results[index];
1531
wu = wu_result.workunit;
1532
app = ssp->lookup_app(wu.appid);
1536
// Find the app_version for the client's platform.
1538
bavp = get_app_version(sreq, reply, wu);
1539
if (!bavp) return false;
1541
retval = wu_is_infeasible_fast(wu, sreq, reply, *app);
1543
if (config.debug_send) {
1544
log_messages.printf(MSG_DEBUG,
1545
"[HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
1546
reply.host.id, wu.id, wu.name, infeasible_string(retval)
1554
// check if user has selected apps,
1555
// and send beta work to beta users
1557
if (app->beta && !config.distinct_beta_apps) {
1558
if (reply.wreq.host_info.allow_beta_work) {
1564
if (app_not_selected(wu, sreq, reply)) {
1565
if (!reply.wreq.host_info.allow_non_preferred_apps) {
1568
// Allow work to be sent, but it will not get a bump in its score
1575
// if job needs to get done fast, send to fast/reliable host
1577
if (reply.wreq.host_info.reliable && (wu_result.need_reliable)) {
1581
// if job already committed to an HR class,
1582
// try to send to host in that class
1584
if (wu_result.infeasible_count) {
1588
// Favor jobs that will run fast
1590
score += bavp->host_usage.flops/1e9;
1592
// match large jobs to fast hosts
1594
if (config.job_size_matching) {
1595
double host_stdev = (reply.host.p_fpops - ssp->perf_info.host_fpops_mean)/ ssp->perf_info.host_fpops_stdev;
1596
double diff = host_stdev - wu_result.fpops_size;
1600
// TODO: If user has selected some apps but will accept jobs from others,
1601
// try to send them jobs from the selected apps
1604
est_time = estimate_wallclock_duration(wu, sreq, reply);
1605
disk_usage = wu.rsc_disk_bound;
1609
bool wu_is_infeasible_slow(
1610
WU_RESULT& wu_result, SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
1617
// Don't send if we've already sent a result of this WU to this user.
1619
if (config.one_result_per_user_per_wu) {
1621
"where workunitid=%d and userid=%d",
1622
wu_result.workunit.id, reply.user.id
1624
retval = result.count(n, buf);
1626
log_messages.printf(MSG_CRITICAL,
1627
"send_work: can't get result count (%d)\n", retval
1632
if (config.debug_send) {
1633
log_messages.printf(MSG_DEBUG,
1634
"send_work: user %d already has %d result(s) for WU %d\n",
1635
reply.user.id, n, wu_result.workunit.id
1641
} else if (config.one_result_per_host_per_wu) {
1642
// Don't send if we've already sent a result
1643
// of this WU to this host.
1644
// We only have to check this
1645
// if we don't send one result per user.
1648
"where workunitid=%d and hostid=%d",
1649
wu_result.workunit.id, reply.host.id
1651
retval = result.count(n, buf);
1653
log_messages.printf(MSG_CRITICAL,
1654
"send_work: can't get result count (%d)\n", retval
1659
if (config.debug_send) {
1660
log_messages.printf(MSG_DEBUG,
1661
"send_work: host %d already has %d result(s) for WU %d\n",
1662
reply.host.id, n, wu_result.workunit.id
1670
APP* app = ssp->lookup_app(wu_result.workunit.appid);
1671
WORKUNIT wu = wu_result.workunit;
1672
if (app_hr_type(*app)) {
1673
if (already_sent_to_different_platform_careful(
1674
sreq, reply.wreq, wu, *app
1676
if (config.debug_send) {
1677
log_messages.printf(MSG_DEBUG,
1678
"[HOST#%d] [WU#%d %s] WU is infeasible (assigned to different platform)\n",
1679
reply.host.id, wu.id, wu.name
1682
// Mark the workunit as infeasible.
1683
// This ensures that jobs already assigned to a platform
1684
// are processed first.
1686
wu_result.infeasible_count++;
1693
double JOB_SET::lowest_score() {
1694
if (jobs.empty()) return 0;
1695
return jobs.back().score;
1698
// add the given job, and remove lowest-score jobs that
1699
// - are in excess of work request
1700
// - are in excess of per-request or per-day limits
1701
// - cause the disk limit to be exceeded
1703
void JOB_SET::add_job(JOB& job) {
1704
while (!jobs.empty()) {
1705
JOB& worst_job = jobs.back();
1706
if (est_time + job.est_time - worst_job.est_time > work_req) {
1707
est_time -= worst_job.est_time;
1708
disk_usage -= worst_job.disk_usage;
1710
ssp->wu_results[worst_job.index].state = WR_STATE_PRESENT;
1715
while (!jobs.empty()) {
1716
JOB& worst_job = jobs.back();
1717
if (disk_usage + job.disk_usage > disk_limit) {
1718
est_time -= worst_job.est_time;
1719
disk_usage -= worst_job.disk_usage;
1721
ssp->wu_results[worst_job.index].state = WR_STATE_PRESENT;
1727
if (jobs.size() == max_jobs) {
1728
JOB& worst_job = jobs.back();
1730
ssp->wu_results[worst_job.index].state = WR_STATE_PRESENT;
1733
list<JOB>::iterator i = jobs.begin();
1734
while (i != jobs.end()) {
1735
if (i->score < job.score) {
1736
jobs.insert(i, job);
1741
if (i == jobs.end()) {
1742
jobs.push_back(job);
1744
est_time += job.est_time;
1745
disk_usage += job.disk_usage;
1746
if (config.debug_send) {
1747
log_messages.printf(MSG_DEBUG,
1748
"added job to set. est_time %f disk_usage %f\n",
1749
est_time, disk_usage
1754
// return the disk usage of jobs above the given score
1756
double JOB_SET::higher_score_disk_usage(double v) {
1758
list<JOB>::iterator i = jobs.begin();
1759
while (i != jobs.end()) {
1760
if (i->score < v) break;
1761
sum += i->disk_usage;
1767
void JOB_SET::send(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1772
list<JOB>::iterator i = jobs.begin();
1773
while (i != jobs.end()) {
1775
WU_RESULT wu_result = ssp->wu_results[job.index];
1776
ssp->wu_results[job.index].state = WR_STATE_EMPTY;
1777
wu = wu_result.workunit;
1778
result.id = wu_result.resultid;
1779
retval = read_sendable_result(result);
1781
add_result_to_reply(result, wu, sreq, reply, job.bavp);
1786
void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
1787
int i, slots_locked=0, slots_nonempty=0;
1788
JOB_SET jobs (sreq, reply);
1789
int min_slots = config.mm_min_slots;
1790
if (!min_slots) min_slots = ssp->max_wu_results/2;
1791
int max_slots = config.mm_max_slots;
1792
if (!max_slots) max_slots = ssp->max_wu_results;
1793
int max_locked = 10;
1796
i = rand() % ssp->max_wu_results;
1798
// scan through the job cache, maintaining a JOB_SET of jobs
1799
// that we can send to this client, ordered by score.
1801
for (int slots_scanned=0; slots_scanned<max_slots; slots_scanned++) {
1802
i = (i+1) % ssp->max_wu_results;
1803
WU_RESULT& wu_result = ssp->wu_results[i];
1804
switch (wu_result.state) {
1805
case WR_STATE_EMPTY:
1807
case WR_STATE_PRESENT:
1812
if (wu_result.state == g_pid) break;
1820
// get score for this job, and skip it if it fails quick check.
1821
// NOTE: the EDF check done in get_score()
1822
// includes only in-progress jobs.
1824
if (!job.get_score(sreq, reply)) {
1827
if (config.debug_send) {
1828
log_messages.printf(MSG_DEBUG,
1829
"score for %s: %f\n", wu_result.workunit.name, job.score
1833
if (job.score > jobs.lowest_score() || !jobs.request_satisfied()) {
1834
ssp->wu_results[i].state = g_pid;
1836
if (wu_is_infeasible_slow(wu_result, sreq, reply)) {
1837
// if we can't use this job, put it back in pool
1840
ssp->wu_results[i].state = WR_STATE_PRESENT;
1847
if (jobs.request_satisfied() && slots_scanned>=min_slots) break;
1850
if (!slots_nonempty) {
1851
log_messages.printf(MSG_CRITICAL,
1852
"Job cache is empty - check feeder\n"
1854
reply.wreq.no_jobs_available = true;
1857
// TODO: trim jobs from tail of list until we pass the EDF check
1859
jobs.send(sreq, reply);
1861
if (slots_locked > max_locked) {
1862
log_messages.printf(MSG_CRITICAL,
1863
"Found too many locked slots (%d>%d) - increase array size",
1864
slots_locked, max_locked
1869
const char *BOINC_RCSID_32dcd335e7 = "$Id: sched_send.cpp 16611 2008-12-03 20:55:22Z romw $";
1761
send_work_matchmaker();
1767
retval = update_host_app_versions(g_reply->results, g_reply->host.id);
1769
log_messages.printf(MSG_CRITICAL,
1770
"update_host_app_versions() failed: %d\n", retval
1773
send_user_messages();
1776
const char *BOINC_RCSID_32dcd335e7 = "$Id: sched_send.cpp 22651 2010-11-08 17:57:13Z romw $";