77
// Do periodic checks on running apps:
78
// - get latest CPU time and % done info
79
// - check if any has exited, and clean up
80
// - see if any has exceeded its CPU or disk space limits, and abort it
82
bool ACTIVE_TASK_SET::poll() {
85
static double last_time = 0;
86
if (gstate.now - last_time < TASK_POLL_PERIOD) return false;
87
last_time = gstate.now;
89
action = check_app_exited();
93
process_control_poll();
94
action |= check_rsc_limits_exceeded();
96
for (i=0; i<active_tasks.size(); i++) {
97
ACTIVE_TASK* atp = active_tasks[i];
98
if (atp->task_state() == PROCESS_ABORT_PENDING) {
99
if (gstate.now > atp->abort_time + ABORT_TIMEOUT) {
100
atp->kill_task(false);
103
if (atp->task_state() == PROCESS_QUIT_PENDING) {
104
if (gstate.now > atp->quit_time + QUIT_TIMEOUT) {
105
atp->kill_task(true);
111
gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
118
// deprecated; TerminateProcessById() doesn't work if
119
// the process is running as a different user
76
122
bool ACTIVE_TASK::kill_all_children() {
315
388
handle_premature_exit(will_restart);
318
if (log_flags.task_debug) {
319
msg_printf(result->project, MSG_INFO,
320
"[task_debug] exit status %d\n",
392
if (temporary_exit_file_present(x)) {
393
if (log_flags.task_debug) {
394
msg_printf(result->project, MSG_INFO,
395
"[task] task called temporary_exit(%f)", x
398
set_task_state(PROCESS_UNINITIALIZED, "temporary exit");
400
result->schedule_backoff = gstate.now + x;
402
if (log_flags.task_debug) {
403
msg_printf(result->project, MSG_INFO,
404
"[task] process exited with status %d\n",
324
409
} else if (WIFSIGNALED(stat)) {
325
410
int got_signal = WTERMSIG(stat);
327
412
if (log_flags.task_debug) {
328
413
msg_printf(result->project, MSG_INFO,
329
"[task_debug] process got signal %d", signal
414
"[task] process got signal %d", signal
570
674
for (i=0; i<active_tasks.size(); i++) {
571
675
atp = active_tasks[i];
572
676
if (atp->task_state() != PROCESS_EXECUTING) continue;
573
if (atp->current_cpu_time > atp->max_cpu_time) {
677
if (!atp->result->project->non_cpu_intensive && (atp->elapsed_time > atp->max_elapsed_time)) {
574
678
msg_printf(atp->result->project, MSG_INFO,
575
"Aborting task %s: exceeded CPU time limit %f\n",
576
atp->result->name, atp->max_cpu_time
679
"Aborting task %s: exceeded elapsed time limit %.2f (%.2fG/%.2fG)",
680
atp->result->name, atp->max_elapsed_time,
681
atp->result->wup->rsc_fpops_bound/1e9,
682
atp->result->avp->flops/1e9
578
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum CPU time exceeded");
684
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
579
685
did_anything = true;
774
881
// called only from CLIENT_STATE::suspend_tasks(),
775
882
// e.g. because on batteries, time of day, benchmarking, CPU throttle, etc.
777
void ACTIVE_TASK_SET::suspend_all(bool cpu_throttle) {
784
leave_in_mem = gstate.global_prefs.leave_apps_in_memory;
786
for (i=0; i<active_tasks.size(); i++) {
884
void ACTIVE_TASK_SET::suspend_all(int reason) {
885
for (unsigned int i=0; i<active_tasks.size(); i++) {
787
886
ACTIVE_TASK* atp = active_tasks[i];
788
887
if (atp->task_state() != PROCESS_EXECUTING) continue;
889
case SUSPEND_REASON_CPU_THROTTLE:
790
890
// if we're doing CPU throttling, don't bother suspending apps
791
891
// that don't use a full CPU
793
893
if (atp->result->project->non_cpu_intensive) continue;
794
894
if (atp->app_version->avg_ncpus < 1) continue;
796
atp->preempt(!leave_in_mem);
895
atp->preempt(REMOVE_NEVER);
897
case SUSPEND_REASON_BENCHMARKS:
898
atp->preempt(REMOVE_NEVER);
900
case SUSPEND_REASON_CPU_USAGE:
901
// If we're suspending because of non-BOINC CPU load,
902
// don't remove from memory.
903
// Some systems do a security check when apps are launched,
904
// which uses a lot of CPU.
905
// Avoid going into a preemption loop.
907
if (atp->result->project->non_cpu_intensive) break;
908
atp->preempt(REMOVE_NEVER);
911
atp->preempt(REMOVE_MAYBE_USER);
992
1110
// check for msgs from active tasks.
993
// Return true if any of them has changed its checkpoint_cpu_time
994
// (since in that case we need to write state file)
996
bool ACTIVE_TASK_SET::get_msgs() {
1112
void ACTIVE_TASK_SET::get_msgs() {
998
1114
ACTIVE_TASK *atp;
999
1115
double old_time;
1000
bool action = false;
1001
1116
static double last_time=0;
1002
1117
double delta_t;
1003
1118
if (last_time) {
1004
1119
delta_t = gstate.now - last_time;
1121
// Normally this is called every second.
1122
// If delta_t is > 10, we'll assume that a period of hibernation
1123
// or suspension happened, and treat it as zero.
1124
// If negative, must be clock reset. Ignore.
1126
if (delta_t > 10 || delta_t < 0) {
1008
1132
last_time = gstate.now;
1011
1134
for (i=0; i<active_tasks.size(); i++) {
1012
1135
atp = active_tasks[i];
1013
1136
if (!atp->process_exists()) continue;
1014
1137
old_time = atp->checkpoint_cpu_time;
1015
if (atp->scheduler_state == CPU_SCHED_SCHEDULED) {
1138
if (atp->task_state() == PROCESS_EXECUTING) {
1016
1139
atp->elapsed_time += delta_t;
1018
1141
if (atp->get_app_status_msg()) {
1019
1142
if (old_time != atp->checkpoint_cpu_time) {
1020
gstate.request_enforce_schedule("Checkpoint reached");
1144
sprintf(buf, "%s checkpointed", atp->result->name);
1145
gstate.request_schedule_cpus(buf);
1021
1146
atp->checkpoint_wall_time = gstate.now;
1022
1147
atp->premature_exit_count = 0;
1023
1148
atp->checkpoint_elapsed_time = atp->elapsed_time;
1025
1149
if (log_flags.task_debug) {
1026
1150
msg_printf(atp->wup->project, MSG_INFO,
1027
"[task_debug] result %s checkpointed",
1151
"[task] result %s checkpointed",
1028
1152
atp->result->name
1030
1154
} else if (log_flags.checkpoint_debug) {
1031
1155
msg_printf(atp->wup->project, MSG_INFO,
1032
"[checkpoint_debug] result %s checkpointed",
1156
"[checkpoint] result %s checkpointed",
1033
1157
atp->result->name
1160
atp->write_task_state_file();
1038
1163
atp->get_trickle_up_msg();
1043
const char *BOINC_RCSID_10ca137461 = "$Id: app_control.cpp 16608 2008-12-03 18:35:17Z romw $";
1167
// write checkpoint state to a file in the slot dir
1168
// (this avoids rewriting the state file on each checkpoint)
1170
void ACTIVE_TASK::write_task_state_file() {
1172
sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1173
FILE* f = fopen(path, "w");
1177
" <project_master_url>%s</project_master_url>\n"
1178
" <result_name>%s</result_name>\n"
1179
" <checkpoint_cpu_time>%f</checkpoint_cpu_time>\n"
1180
" <checkpoint_elapsed_time>%f</checkpoint_elapsed_time>\n"
1181
" <fraction_done>%f</fraction_done>\n"
1183
result->project->master_url,
1185
checkpoint_cpu_time,
1186
checkpoint_elapsed_time,
1192
// called on startup; read the task state file in case it's more recent
1193
// than the main state file
1195
void ACTIVE_TASK::read_task_state_file() {
1196
char buf[4096], path[1024], s[1024];
1197
sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1198
FILE* f = fopen(path, "r");
1201
fread(buf, 1, 4096, f);
1205
// sanity checks - project and result name must match
1207
if (!parse_str(buf, "<project_master_url>", s, sizeof(s))) {
1208
msg_printf(wup->project, MSG_INTERNAL_ERROR,
1209
"no project URL in task state file"
1213
if (strcmp(s, result->project->master_url)) {
1214
msg_printf(wup->project, MSG_INTERNAL_ERROR,
1215
"wrong project URL in task state file"
1219
if (!parse_str(buf, "<result_name>", s, sizeof(s))) {
1220
msg_printf(wup->project, MSG_INTERNAL_ERROR,
1221
"no task name in task state file"
1225
if (strcmp(s, result->name)) {
1226
msg_printf(wup->project, MSG_INTERNAL_ERROR,
1227
"wrong task name in task state file"
1231
if (parse_double(buf, "<checkpoint_cpu_time>", x)) {
1232
if (x > checkpoint_cpu_time) {
1233
checkpoint_cpu_time = x;
1236
if (parse_double(buf, "<checkpoint_elapsed_time>", x)) {
1237
if (x > checkpoint_elapsed_time) {
1238
checkpoint_elapsed_time = x;