113
105
needs_shmem = false;
114
106
want_network = 0;
115
107
premature_exit_count = 0;
116
coprocs_reserved = false;
118
109
memset(&procinfo, 0, sizeof(procinfo));
111
process_handle = NULL;
123
114
premature_exit_count = 0;
126
static const char* task_state_name(int val) {
128
case PROCESS_UNINITIALIZED: return "UNINITIALIZED";
129
case PROCESS_EXECUTING: return "EXECUTING";
130
case PROCESS_SUSPENDED: return "SUSPENDED";
131
case PROCESS_ABORT_PENDING: return "ABORT_PENDING";
132
case PROCESS_EXITED: return "EXITED";
133
case PROCESS_WAS_SIGNALED: return "WAS_SIGNALED";
134
case PROCESS_EXIT_UNKNOWN: return "EXIT_UNKNOWN";
135
case PROCESS_ABORTED: return "ABORTED";
136
case PROCESS_COULDNT_START: return "COULDNT_START";
137
case PROCESS_QUIT_PENDING: return "QUIT_PENDING";
142
void ACTIVE_TASK::set_task_state(int val, const char* where) {
144
if (log_flags.task_debug) {
145
msg_printf(result->project, MSG_INFO,
146
"[task_debug] task_state=%s for %s from %s",
147
task_state_name(val), result->name, where
117
// preempt this task;
118
// called from CLIENT_STATE::enforce_schedule()
119
// and ACTIVE_TASK_SET::suspend_all()
121
int ACTIVE_TASK::preempt(int preempt_type) {
125
switch (preempt_type) {
129
case REMOVE_MAYBE_USER:
130
case REMOVE_MAYBE_SCHED:
131
// GPU jobs: always remove from mem, since it's tying up GPU RAM
133
if (result->uses_coprocs()) {
137
// if it's never checkpointed, leave in mem
139
if (checkpoint_elapsed_time == 0) {
143
// otherwise obey user prefs
145
remove = !gstate.global_prefs.leave_apps_in_memory;
153
if (log_flags.cpu_sched) {
154
msg_printf(result->project, MSG_INFO,
155
"[cpu_sched] Preempting %s (removed from memory)",
159
set_task_state(PROCESS_QUIT_PENDING, "preempt");
160
retval = request_exit();
162
if (log_flags.cpu_sched) {
163
msg_printf(result->project, MSG_INFO,
164
"[cpu_sched] Preempting %s (left in memory)",
152
173
// called when a process has exited or we've killed it
154
175
void ACTIVE_TASK::cleanup_task() {
157
CloseHandle(pid_handle);
177
if (process_handle) {
178
CloseHandle(process_handle);
179
process_handle = NULL;
160
181
// detach from shared mem.
161
182
// This will destroy shmem seg since we're the last attachment
230
251
bool app_running(vector<PROCINFO>& piv, const char* p) {
231
252
for (unsigned int i=0; i<piv.size(); i++) {
232
253
PROCINFO& pi = piv[i];
233
if (!strcmp(pi.command, p)) {
254
//msg_printf(0, MSG_INFO, "running: [%s]", pi.command);
255
if (!strcasecmp(pi.command, p)) {
263
void procinfo_show(PROCINFO& pi, vector<PROCINFO>& piv) {
265
memset(&pi, 0, sizeof(pi));
266
for (i=0; i<piv.size(); i++) {
267
PROCINFO& p = piv[i];
269
pi.kernel_time += p.kernel_time;
270
pi.user_time += p.user_time;
271
msg_printf(NULL, MSG_INFO, "%d %s: boinc %d low %d (%f %f) total (%f %f)",
272
p.id, p.command, p.is_boinc_app, p.is_low_priority, p.kernel_time, p.user_time, pi.kernel_time, pi.user_time
240
278
void ACTIVE_TASK_SET::get_memory_usage() {
241
279
static double last_mem_time=0;
282
static bool first = true;
283
static double last_cpu_time;
245
285
double diff = gstate.now - last_mem_time;
246
286
if (diff < 10) return;
250
290
retval = procinfo_setup(piv);
252
292
if (log_flags.mem_usage_debug) {
253
msg_printf(0, MSG_INTERNAL_ERROR,
254
"[mem_usage_debug] procinfo_setup() returned %d", retval
293
msg_printf(NULL, MSG_INTERNAL_ERROR,
294
"[mem_usage] procinfo_setup() returned %d", retval
259
299
for (i=0; i<active_tasks.size(); i++) {
260
300
ACTIVE_TASK* atp = active_tasks[i];
261
if (atp->scheduler_state == CPU_SCHED_SCHEDULED) {
262
PROCINFO& pi = atp->procinfo;
263
unsigned long last_page_fault_count = pi.page_fault_count;
264
memset(&pi, 0, sizeof(pi));
266
procinfo_app(pi, piv);
267
pi.working_set_size_smoothed = .5*pi.working_set_size_smoothed + pi.working_set_size;
269
int pf = pi.page_fault_count - last_page_fault_count;
270
pi.page_fault_rate = pf/diff;
271
if (log_flags.mem_usage_debug) {
272
msg_printf(atp->result->project, MSG_INFO,
273
"[mem_usage_debug] %s: RAM %.2fMB, page %.2fMB, %.2f page faults/sec, user CPU %.3f, kernel CPU %.3f",
275
pi.working_set_size/MEGA, pi.swap_size/MEGA,
277
pi.user_time, pi.kernel_time
301
if (atp->task_state() == PROCESS_UNINITIALIZED) continue;
302
if (atp->pid ==0) continue;
304
// scan all active tasks with a process, even if not scheduled, because
305
// 1) we might have recently suspended a task,
306
// and we still need to count its time
307
// 2) preempted tasks might not actually suspend themselves
308
// (and we'd count that as non-BOINC CPU usage
309
// and suspend everything).
311
PROCINFO& pi = atp->procinfo;
312
unsigned long last_page_fault_count = pi.page_fault_count;
313
memset(&pi, 0, sizeof(pi));
315
procinfo_app(pi, piv, atp->app_version->graphics_exec_file);
316
pi.working_set_size_smoothed = .5*pi.working_set_size_smoothed + pi.working_set_size;
318
int pf = pi.page_fault_count - last_page_fault_count;
319
pi.page_fault_rate = pf/diff;
320
if (log_flags.mem_usage_debug) {
321
msg_printf(atp->result->project, MSG_INFO,
322
"[mem_usage] %s: RAM %.2fMB, page %.2fMB, %.2f page faults/sec, user CPU %.3f, kernel CPU %.3f",
324
pi.working_set_size/MEGA, pi.swap_size/MEGA,
326
pi.user_time, pi.kernel_time
283
331
exclusive_app_running = false;
332
bool old_egar = exclusive_gpu_app_running;
333
exclusive_gpu_app_running = false;
284
334
for (i=0; i<config.exclusive_apps.size(); i++) {
285
335
if (app_running(piv, config.exclusive_apps[i].c_str())) {
286
336
exclusive_app_running = true;
340
for (i=0; i<config.exclusive_gpu_apps.size(); i++) {
341
if (app_running(piv, config.exclusive_gpu_apps[i].c_str())) {
342
exclusive_gpu_app_running = true;
346
if (old_egar != exclusive_gpu_app_running) {
347
gstate.request_schedule_cpus("Exclusive GPU app status changed");
292
// the following is not useful because most OSs don't
293
// move idle processes out of RAM, so physical memory is always full
350
// get info on non-BOINC processes.
351
// mem usage info is not useful because most OSs don't
352
// move idle processes out of RAM, so physical memory is always full.
353
// Also (at least on Win) page faults are used for various things,
354
// not all of them generate disk I/O,
355
// so they're not useful for detecting paging/thrashing.
358
//procinfo_show(pi, piv);
295
359
procinfo_other(pi, piv);
296
msg_printf(NULL, MSG_INFO, "All others: RAM %.2fMB, page %.2fMB, user %.3f, kernel %.3f",
297
pi.working_set_size/MEGA, pi.swap_size/MEGA,
298
pi.user_time, pi.kernel_time
360
if (log_flags.mem_usage_debug) {
361
msg_printf(NULL, MSG_INFO,
362
"[mem_usage] All others: RAM %.2fMB, page %.2fMB, user %.3f, kernel %.3f",
363
pi.working_set_size/MEGA, pi.swap_size/MEGA,
364
pi.user_time, pi.kernel_time
367
double new_cpu_time = pi.user_time + pi.kernel_time;
371
non_boinc_cpu_usage = (new_cpu_time - last_cpu_time)/(diff*gstate.host_info.p_ncpus);
372
// processes might have exited in the last 10 sec,
373
// causing this to be negative.
374
if (non_boinc_cpu_usage < 0) non_boinc_cpu_usage = 0;
375
if (log_flags.mem_usage_debug) {
376
msg_printf(NULL, MSG_INFO,
377
"[mem_usage] non-BOINC CPU usage: %.2f%%", non_boinc_cpu_usage*100
381
last_cpu_time = new_cpu_time;
303
// Do periodic checks on running apps:
304
// - get latest CPU time and % done info
305
// - check if any has exited, and clean up
306
// - see if any has exceeded its CPU or disk space limits, and abort it
308
bool ACTIVE_TASK_SET::poll() {
311
static double last_time = 0;
312
if (gstate.now - last_time < 1.0) return false;
313
last_time = gstate.now;
315
action = check_app_exited();
317
send_trickle_downs();
319
process_control_poll();
321
action |= check_rsc_limits_exceeded();
322
action |= get_msgs();
323
for (i=0; i<active_tasks.size(); i++) {
324
ACTIVE_TASK* atp = active_tasks[i];
325
if (atp->task_state() == PROCESS_ABORT_PENDING) {
326
if (gstate.now > atp->abort_time + ABORT_TIMEOUT) {
327
atp->kill_task(false);
330
if (atp->task_state() == PROCESS_QUIT_PENDING) {
331
if (gstate.now > atp->quit_time + QUIT_TIMEOUT) {
332
atp->kill_task(true);
338
gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
344
386
// There's a new trickle file.
345
387
// Move it from slot dir to project dir
608
660
else if (parse_str(buf, "<project_master_url>", project_master_url, sizeof(project_master_url))) continue;
609
661
else if (parse_int(buf, "<slot>", slot)) continue;
610
662
else if (parse_int(buf, "<active_task_state>", dummy)) continue;
611
else if (parse_double(buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) {
612
current_cpu_time = checkpoint_cpu_time;
663
else if (parse_double(buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) continue;
664
else if (parse_bool(buf, "once_ran_edf", once_ran_edf)) continue;
615
665
else if (parse_double(buf, "<fraction_done>", fraction_done)) continue;
616
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) {
617
elapsed_time = checkpoint_elapsed_time;
666
else if (parse_double(buf, "<checkpoint_elapsed_time>", checkpoint_elapsed_time)) continue;
620
667
else if (parse_int(buf, "<app_version_num>", n)) continue;
621
668
else if (parse_double(buf, "<swap_size>", procinfo.swap_size)) continue;
622
669
else if (parse_double(buf, "<working_set_size>", procinfo.working_set_size)) continue;
623
670
else if (parse_double(buf, "<working_set_size_smoothed>", procinfo.working_set_size_smoothed)) continue;
624
671
else if (parse_double(buf, "<page_fault_rate>", procinfo.page_fault_rate)) continue;
672
else if (parse_double(buf, "<current_cpu_time>", x)) continue;
626
674
if (log_flags.unparsed_xml) {
627
msg_printf(0, MSG_INFO,
675
msg_printf(project, MSG_INFO,
628
676
"[unparsed_xml] ACTIVE_TASK::parse(): unrecognized %s\n", buf
876
911
void ACTIVE_TASK_SET::init() {
877
912
for (unsigned int i=0; i<active_tasks.size(); i++) {
878
913
ACTIVE_TASK* atp = active_tasks[i];
879
914
atp->init(atp->result);
880
915
atp->scheduler_state = CPU_SCHED_PREEMPTED;
916
atp->read_task_state_file();
917
atp->current_cpu_time = atp->checkpoint_cpu_time;
918
atp->elapsed_time = atp->checkpoint_elapsed_time;
886
const char *BOINC_RCSID_778b61195e = "$Id: app.cpp 16622 2008-12-04 18:13:52Z romw $";
924
static const char* task_state_name(int val) {
926
case PROCESS_UNINITIALIZED: return "UNINITIALIZED";
927
case PROCESS_EXECUTING: return "EXECUTING";
928
case PROCESS_SUSPENDED: return "SUSPENDED";
929
case PROCESS_ABORT_PENDING: return "ABORT_PENDING";
930
case PROCESS_EXITED: return "EXITED";
931
case PROCESS_WAS_SIGNALED: return "WAS_SIGNALED";
932
case PROCESS_EXIT_UNKNOWN: return "EXIT_UNKNOWN";
933
case PROCESS_ABORTED: return "ABORTED";
934
case PROCESS_COULDNT_START: return "COULDNT_START";
935
case PROCESS_QUIT_PENDING: return "QUIT_PENDING";
940
void ACTIVE_TASK::set_task_state(int val, const char* where) {
942
if (log_flags.task_debug) {
943
msg_printf(result->project, MSG_INFO,
944
"[task] task_state=%s for %s from %s",
945
task_state_name(val), result->name, where