78
81
// update result with new report time and sent time
81
SCHED_MSG_LOG::MSG_DEBUG,
83
log_messages.printf(MSG_DEBUG,
82
84
"[RESULT#%d] [HOST#%d] %s report_deadline (resend lost work)\n",
83
85
result.id, reply.host.id,
84
86
result_report_deadline==result.report_deadline?"NO update to":"Updated"
86
result.sent_time = result_sent_time;
88
result.sent_time = now;
87
89
result.report_deadline = result_report_deadline;
91
bool resend_lost_work(
92
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply,
93
PLATFORM_LIST& platforms, SCHED_SHMEM& ss
93
// resend any jobs that:
94
// 1) we already sent to this host;
95
// 2) are still in progress (i.e. haven't timed out) and
96
// 3) aren't present on the host
97
// Return true if there were any such jobs
99
bool resend_lost_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
96
101
std::vector<DB_RESULT>results;
104
char warning_msg[256];
99
105
bool did_any = false;
106
int num_eligible_to_resend=0;
101
107
int num_resent=0;
108
BEST_APP_VERSION* bavp;
107
// print list of results on host
109
for (i=0; i<sreq.other_results.size(); i++) {
110
OTHER_RESULT& orp=sreq.other_results[i];
111
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
112
"Result is on [HOST#%d]: %s\n",
113
reply.host.id, orp.name.c_str()
117
111
sprintf(buf, " where hostid=%d and server_state=%d ",
118
112
reply.host.id, RESULT_SERVER_STATE_IN_PROGRESS
120
114
while (!result.enumerate(buf)) {
121
115
bool found = false;
123
116
for (i=0; i<sreq.other_results.size(); i++) {
124
117
OTHER_RESULT& orp = sreq.other_results[i];
125
118
if (!strcmp(orp.name.c_str(), result.name)) {
132
SCHED_MSG_LOG::MSG_DEBUG,
133
"[HOST#%d] found lost [RESULT#%d]: %s\n",
134
reply.host.id, result.id, result.name
138
retval = wu.lookup_id(result.workunitid);
140
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
141
"[HOST#%d] WU not found for [RESULT#%d]\n",
142
reply.host.id, result.id
147
reply.wreq.core_client_version =
148
sreq.core_client_major_version*100 + sreq.core_client_minor_version;
150
retval = get_app_version(
151
wu, app, avp, sreq, reply, platforms, ss
154
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
155
"[HOST#%d] no app version [RESULT#%d]\n",
156
reply.host.id, result.id
161
// If time is too close to the deadline,
162
// or we already have a canonical result,
163
// or WU error flag is set,
164
// then don't bother to resend this result.
165
// Instead make it time out right away
166
// so that the transitioner does 'the right thing'.
168
char warning_msg[256];
171
wu.canonical_resultid ||
172
possibly_give_result_new_deadline(result, wu, reply)
174
result.report_deadline = time(0);
175
retval = result.mark_as_sent(result.server_state);
176
if (retval==ERR_DB_NOT_FOUND) {
178
SCHED_MSG_LOG::MSG_CRITICAL,
179
"[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
180
result.id, reply.host.id
184
SCHED_MSG_LOG::MSG_CRITICAL,
185
"resend_lost_result: can't update result deadline: %d\n", retval
188
if (retval) continue;
190
retval = update_wu_transition_time(wu, result.report_deadline);
193
SCHED_MSG_LOG::MSG_CRITICAL,
194
"resend_lost_result: can't update WU transition time: %d\n", retval
199
SCHED_MSG_LOG::MSG_DEBUG,
200
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
201
reply.host.id, result.id
203
sprintf(warning_msg, "Didn't resend lost result %s (expired)", result.name);
204
USER_MESSAGE um(warning_msg, "high");
205
reply.insert_message(um);
209
retval = add_result_to_reply(
210
result, wu, sreq, reply, platforms, app, avp
213
log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
125
num_eligible_to_resend++;
126
log_messages.printf(MSG_DEBUG,
127
"[HOST#%d] found lost [RESULT#%d]: %s\n",
128
reply.host.id, result.id, result.name
132
retval = wu.lookup_id(result.workunitid);
134
log_messages.printf(MSG_CRITICAL,
135
"[HOST#%d] WU not found for [RESULT#%d]\n",
136
reply.host.id, result.id
141
bavp = get_app_version(sreq, reply, wu);
143
log_messages.printf(MSG_CRITICAL,
144
"[HOST#%d] no app version [RESULT#%d]\n",
145
reply.host.id, result.id
150
// If time is too close to the deadline,
151
// or we already have a canonical result,
152
// or WU error flag is set,
153
// then don't bother to resend this result.
154
// Instead make it time out right away
155
// so that the transitioner does 'the right thing'.
159
wu.canonical_resultid ||
160
possibly_give_result_new_deadline(result, wu, reply)
162
log_messages.printf(MSG_DEBUG,
163
"[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
164
reply.host.id, result.id
166
result.report_deadline = time(0)-1;
167
retval = result.mark_as_sent(result.server_state);
169
log_messages.printf(MSG_CRITICAL,
170
"resend_lost_work: can't update result deadline: %d\n", retval
175
retval = update_wu_transition_time(wu, result.report_deadline);
177
log_messages.printf(MSG_CRITICAL,
178
"resend_lost_result: can't update WU transition time: %d\n", retval
183
"Didn't resend lost result %s (expired)", result.name
185
USER_MESSAGE um(warning_msg, "high");
186
reply.insert_message(um);
188
retval = add_result_to_reply(result, wu, sreq, reply, bavp);
190
log_messages.printf(MSG_CRITICAL,
214
191
"[HOST#%d] failed to send [RESULT#%d]\n",
215
192
reply.host.id, result.id