131
131
tps.push_back(pi);
133
for (i=0; i < tps.size(); i++) {
134
PROCINFO tp = tps[i];
135
for (j=0; j < ps.size(); j++) {
133
for (i=0; i < tps.size(); i++) {
134
PROCINFO tp = tps[i];
135
for (j=0; j < ps.size(); j++) {
137
137
if (tp.id == p.parentid) {
138
138
if (TerminateProcessById(p.id)) {
139
139
tps.push_back(p);
151
151
int ACTIVE_TASK::request_exit() {
152
152
if (!app_client_shm.shm) return 1;
153
153
process_control_queue.msg_queue_send(
155
155
app_client_shm.shm->process_control_request
157
set_task_state(PROCESS_QUIT_PENDING, "request_exit()");
157
158
quit_time = gstate.now;
160
get_descendants(pid, descendants);
189
static inline void kill_processes(vector<int> pids) {
190
for (unsigned int i=0; i<pids.size(); i++) {
191
kill_app_process(pids[i]);
186
195
// Kill the task (and descendants) by OS-specific means.
188
197
int ACTIVE_TASK::kill_task(bool restart) {
193
202
// all we can do is terminate the main process,
194
203
// using the handle we got when we created it.
196
TerminateProcess(process_handle, 1);
206
TerminateProcess(process_handle, 1);
198
210
get_descendants(pid, pids);
199
211
pids.push_back(pid);
200
for (unsigned int i=0; i<pids.size(); i++) {
201
kill_app_process(pids[i]);
212
kill_processes(pids);
205
set_task_state(PROCESS_UNINITIALIZED, "kill_task");
215
set_task_state(PROCESS_UNINITIALIZED, "kill_task");
207
217
sprintf(buf, "restarting %s", result->name);
208
gstate.request_schedule_cpus(buf);
210
set_task_state(PROCESS_ABORTED, "kill_task");
218
gstate.request_schedule_cpus(buf);
220
set_task_state(PROCESS_ABORTED, "kill_task");
279
// handle a task that exited prematurely (i.e. the job isn't done)
289
// handle a task that exited prematurely (i.e. no finish file)
281
291
void ACTIVE_TASK::handle_premature_exit(bool& will_restart) {
282
// if it exited because we sent it a quit message, don't count
284
if (task_state() == PROCESS_QUIT_PENDING) {
292
switch (task_state()) {
293
case PROCESS_QUIT_PENDING:
285
294
set_task_state(PROCESS_UNINITIALIZED, "handle_premature_exit");
286
295
will_restart = true;
296
kill_processes(descendants);
298
case PROCESS_ABORT_PENDING:
299
set_task_state(PROCESS_UNINITIALIZED, "handle_premature_exit");
300
will_restart = false;
301
kill_processes(descendants);
526
541
void ACTIVE_TASK_SET::send_heartbeats() {
528
543
ACTIVE_TASK* atp;
530
double ar = gstate.available_ram();
545
double ar = gstate.available_ram();
532
547
for (i=0; i<active_tasks.size(); i++) {
533
548
atp = active_tasks[i];
534
549
if (!atp->process_exists()) continue;
535
550
if (!atp->app_client_shm.shm) continue;
536
snprintf(buf, sizeof(buf), "<heartbeat/>"
538
"<max_wss>%e</max_wss>",
539
atp->procinfo.working_set_size, ar
551
snprintf(buf, sizeof(buf), "<heartbeat/>"
553
"<max_wss>%e</max_wss>",
554
atp->procinfo.working_set_size, ar
541
556
if (gstate.network_suspended) {
542
557
strcat(buf, "<network_suspended/>");
567
582
if (!atp->process_exists()) continue;
568
583
if (!atp->app_client_shm.shm) continue;
570
// if app has had the same message in its send buffer for 180 sec,
571
// assume it's hung and restart it
573
if (atp->process_control_queue.timeout(180)) {
585
// if app has had the same message in its send buffer for 180 sec,
586
// assume it's hung and restart it
588
if (atp->process_control_queue.timeout(180)) {
574
589
if (log_flags.task_debug) {
575
590
msg_printf(atp->result->project, MSG_INFO,
576
591
"Restarting %s - message timeout", atp->result->name
579
atp->kill_task(true);
581
atp->process_control_queue.msg_queue_poll(
582
atp->app_client_shm.shm->process_control_request
594
atp->kill_task(true);
596
atp->process_control_queue.msg_queue_poll(
597
atp->app_client_shm.shm->process_control_request
681
696
bool do_disk_check = false;
682
697
bool did_anything = false;
684
double ram_left = gstate.available_ram();
685
double max_ram = gstate.max_available_ram();
699
double ram_left = gstate.available_ram();
700
double max_ram = gstate.max_available_ram();
687
702
// Some slot dirs have lots of files,
688
703
// so only check every min(disk_interval, 300) secs
701
716
atp->result->name, atp->max_elapsed_time,
702
717
atp->result->wup->rsc_fpops_bound/1e9,
703
718
atp->result->avp->flops/1e9
705
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
709
if (atp->procinfo.working_set_size_smoothed > max_ram) {
710
msg_printf(atp->result->project, MSG_INFO,
711
"Aborting task %s: exceeded memory limit %.2fMB > %.2fMB\n",
713
atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA
715
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum memory exceeded");
720
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
724
if (atp->procinfo.working_set_size_smoothed > max_ram) {
725
msg_printf(atp->result->project, MSG_INFO,
726
"Aborting task %s: exceeded memory limit %.2fMB > %.2fMB\n",
728
atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA
730
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum memory exceeded");
719
734
if (do_disk_check && atp->check_max_disk_exceeded()) {
720
735
did_anything = true;
723
ram_left -= atp->procinfo.working_set_size_smoothed;
726
gstate.request_schedule_cpus("RAM usage limit exceeded");
738
ram_left -= atp->procinfo.working_set_size_smoothed;
741
gstate.request_schedule_cpus("RAM usage limit exceeded");
728
743
if (do_disk_check) {
729
744
last_disk_check_time = gstate.now;
741
756
if (task_state() == PROCESS_EXECUTING || task_state() == PROCESS_SUSPENDED) {
742
757
set_task_state(PROCESS_ABORT_PENDING, "abort_task");
743
758
abort_time = gstate.now;
761
get_descendants(pid, descendants);
746
763
set_task_state(PROCESS_ABORTED, "abort_task");
831
// send quit signal to all tasks in the project
848
// send quit message to all tasks in the project
832
849
// (or all tasks, if proj==0).
833
850
// If they don't exit in 5 seconds,
834
851
// send them a kill signal and wait up to 5 more seconds to exit.
935
952
// if we're doing CPU throttling, don't bother suspending apps
936
953
// that don't use a full CPU
938
if (atp->result->non_cpu_intensive()) continue;
955
if (atp->result->dont_throttle()) continue;
939
956
if (atp->app_version->avg_ncpus < 1) continue;
940
957
atp->preempt(REMOVE_NEVER);
1029
1046
int ACTIVE_TASK::suspend() {
1030
1047
if (!app_client_shm.shm) return 0;
1031
if (task_state() != PROCESS_EXECUTING) {
1032
msg_printf(result->project, MSG_INFO,
1033
"Internal error: expected process %s to be executing", result->name
1036
int n = process_control_queue.msg_queue_purge("<resume/>");
1038
process_control_queue.msg_queue_send(
1040
app_client_shm.shm->process_control_request
1048
if (task_state() != PROCESS_EXECUTING) {
1049
msg_printf(result->project, MSG_INFO,
1050
"Internal error: expected process %s to be executing", result->name
1053
int n = process_control_queue.msg_queue_purge("<resume/>");
1055
process_control_queue.msg_queue_send(
1057
app_client_shm.shm->process_control_request
1043
1060
set_task_state(PROCESS_SUSPENDED, "suspend");
1049
1066
int ACTIVE_TASK::unsuspend() {
1050
1067
if (!app_client_shm.shm) return 0;
1051
if (task_state() != PROCESS_SUSPENDED) {
1052
msg_printf(result->project, MSG_INFO,
1053
"Internal error: expected process %s to be suspended", result->name
1068
if (task_state() != PROCESS_SUSPENDED) {
1069
msg_printf(result->project, MSG_INFO,
1070
"Internal error: expected process %s to be suspended", result->name
1056
1073
if (log_flags.cpu_sched) {
1057
1074
msg_printf(result->project, MSG_INFO,
1058
1075
"[cpu_sched] Resuming %s", result->name
1061
int n = process_control_queue.msg_queue_purge("<suspend/>");
1063
process_control_queue.msg_queue_send(
1065
app_client_shm.shm->process_control_request
1078
int n = process_control_queue.msg_queue_purge("<suspend/>");
1080
process_control_queue.msg_queue_send(
1082
app_client_shm.shm->process_control_request
1068
1085
set_task_state(PROCESS_EXECUTING, "unsuspend");