1
/*___INFO__MARK_BEGIN__*/
2
/*************************************************************************
4
* The Contents of this file are made available subject to the terms of
5
* the Sun Industry Standards Source License Version 1.2
7
* Sun Microsystems Inc., March, 2001
10
* Sun Industry Standards Source License Version 1.2
11
* =================================================
12
* The contents of this file are subject to the Sun Industry Standards
13
* Source License Version 1.2 (the "License"); You may not use this file
14
* except in compliance with the License. You may obtain a copy of the
15
* License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
17
* Software provided under this License is provided on an "AS IS" basis,
18
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
19
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
20
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
21
* See the License for the specific provisions governing your rights and
22
* obligations concerning the Software.
24
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
26
* Copyright: 2001 by Sun Microsystems, Inc.
28
* All Rights Reserved.
30
************************************************************************/
31
/*___INFO__MARK_END__*/
38
#include "sge_usageL.h"
40
#include "execution_states.h"
41
#include "sge_mailrec.h"
42
#include "admin_mail.h"
46
#include "sge_feature.h"
47
#include "sge_report.h"
49
#include "uti/sge_unistd.h"
50
#include "uti/sge_stdio.h"
52
#include "msg_common.h"
53
#include "msg_daemons_common.h"
55
/*
 * Admin-mail policy per job failure state, indexed by the failure code
 * that job_related_adminmail() reads from the job report (JR_failed).
 * Each entry is a bit mask:
 *   0                 - no throttling flag set
 *   BIT_ADM_NEVER     - job_related_adminmail() logs "NEVER SENDING" for it
 *   BIT_ADM_NEW_CONF,
 *   BIT_ADM_QCHANGE   - a repeat is suppressed while admail_times[state]
 *                       is nonzero
 * The number in each label comment is the state code the entry is meant
 * to line up with.
 */
int admail_states[MAX_SSTATE + 1] = {
58
/* 2 SSTATE_FAILURE_BEFORE_JOB */ 0,
59
/* 3 ESSTATE_NO_SHEPHERD */ 0,
60
/* 4 ESSTATE_NO_CONFIG */ 0,
61
/* 5 ESSTATE_NO_PID */ 0,
62
/* 6 SSTATE_READ_CONFIG */ 0,
63
/* 7 SSTATE_BEFORE_PROLOG */ BIT_ADM_NEW_CONF | BIT_ADM_QCHANGE,
64
/* 8 SSTATE_PROLOG_FAILED */ 0,
65
/* 9 SSTATE_BEFORE_PESTART */ 0,
66
/* 10 SSTATE_PESTART_FAILED */ 0,
67
/* 11 SSTATE_BEFORE_JOB */ 0,
68
/* 12 SSTATE_BEFORE_PESTOP */ 0,
69
/* 13 SSTATE_PESTOP_FAILED */ 0,
70
/* 14 SSTATE_BEFORE_EPILOG */ BIT_ADM_NEW_CONF | BIT_ADM_QCHANGE,
71
/* 15 SSTATE_EPILOG_FAILED */ 0,
72
/* NOTE(review): the label below repeats SSTATE_EPILOG_FAILED from entry 15;
 * confirm the real name of state 16 against execution_states.h. */
/* 16 SSTATE_EPILOG_FAILED */ 0,
73
/* 17 ESSTATE_DIED_THRU_SIGNAL */ 0,
74
/* 18 ESSTATE_SHEPHERD_EXIT */ 0,
75
/* 19 ESSTATE_NO_EXITSTATUS */ 0,
76
/* 20 ESSTATE_UNEXP_ERRORFILE */ 0,
77
/* 21 ESSTATE_UNKNOWN_JOB */ 0,
78
/* 22 ESSTATE_EXECD_LOST_RUNNING */ 0,
79
/* 23 ESSTATE_PTF_CANT_GET_PIDS */ 0,
80
/* 24 SSTATE_MIGRATE */ BIT_ADM_NEVER,
81
/* 25 SSTATE_AGAIN */ BIT_ADM_NEVER,
82
/* 26 SSTATE_OPEN_OUTPUT */ 0,
83
/* 27 SSTATE_NO_SHELL */ 0,
84
/* 28 SSTATE_NO_CWD */ 0,
85
/* 29 SSTATE_AFS_PROBLEM */ 0,
86
/* 30 SSTATE_APPERROR */ 0,
87
/* 31 SSTATE_PASSWD_FILE_ERROR */ 0,
88
/* 32 SSTATE_PASSWD_MISSING */ 0,
89
/* 33 SSTATE_PASSWD_WRONG */ 0,
90
/* 34 SSTATE_HELPER_SERVICE_ERROR */ 0,
91
/* 35 SSTATE_HELPER_SERVICE_BEFORE_JOB */ 0,
92
/* 36 SSTATE_CHECK_DAEMON_CONFIG */ 0 };
94
/*
 * Per-state time of the most recent admin mail, indexed like admail_states.
 * job_related_adminmail() stores 'now' here when it goes ahead with a mail;
 * a nonzero entry suppresses repeats for BIT_ADM_NEW_CONF / BIT_ADM_QCHANGE
 * states.
 */
u_long32 admail_times[MAX_SSTATE + 1];
97
** this function reports job failures to the admin
98
** it might not be apt to report on errors that
99
** have nothing to do with a particular job
101
/*
 * job_related_adminmail() - mail the administrator about a failed job.
 *
 * Reads queue name, start/end time, job and array task number, failure
 * state, general failure class and error string from the job report,
 * applies the per-state throttling in admail_states[] / admail_times[],
 * builds subject and body, appends the non-empty shepherd files ("trace",
 * "error", "pe_hostfile") found under ACTIVE_DIR/<jobid>.<jataskid>/ and
 * hands the result to cull_mail() as MSG_MAIL_TYPE_ADMIN.
 *
 * progid     passed through to cull_mail()
 * jr         job report element the JR_* fields are read from
 * is_array   NOTE(review): not referenced in the statements visible to this
 *            review; presumably selects the job/task (..._UU / ..._SUU)
 *            versus job-only message variants - confirm
 * job_owner  owner name inserted into the mail body
 *
 * The recipient list comes from mconf_get_administrator_mail(); the
 * returned string is released with FREE() on every exit path shown. The
 * value "none" (compared case-insensitively) is special-cased right after
 * the lookup.
 */
void job_related_adminmail(u_long32 progid, lListElem *jr, int is_array, const char *job_owner)
103
static int first = 1;
104
char sge_mail_subj[1024];
105
char sge_mail_body[2048];
106
char sge_mail_start[128];
107
char sge_mail_end[128];
108
char str_general[512] = "";
109
u_long32 jobid, jataskid, failed, general;
112
lList *lp_mail = NULL;
115
char *shepherd_filenames[] = { "trace", "error", "pe_hostfile" };
119
SGE_STRUCT_STAT statbuf;
120
char filepath[SGE_PATH_MAX];
123
char *sge_mail_body_total = NULL;
124
int sge_mail_body_total_size = 0;
129
char* administrator_mail = NULL;
131
DENTER(TOP_LAYER, "job_related_adminmail");
133
sge_dstring_init(&ds, buffer, sizeof(buffer));
135
DPRINTF(("sizeof(admail_times) : %d\n", sizeof(admail_times)));
137
/* memset() takes (dest, fill value, byte count). The former call passed the
 * size as the fill value and 0 as the count, so nothing was cleared. */
memset(admail_times, 0, sizeof(admail_times));
141
administrator_mail = mconf_get_administrator_mail();
143
if (administrator_mail == NULL) {
148
if (!strcasecmp(administrator_mail, "none")) {
149
FREE(administrator_mail);
154
if (!(q=lGetString(jr, JR_queue_name)))
155
q = MSG_MAIL_UNKNOWN_NAME;
156
if ((ep=lGetSubStr(jr, UA_name, "start_time", JR_usage)))
157
strcpy(sge_mail_start, sge_ctime((time_t)lGetDouble(ep, UA_value), &ds));
159
strcpy(sge_mail_start, MSG_MAIL_UNKNOWN_NAME);
160
if ((ep=lGetSubStr(jr, UA_name, "end_time", JR_usage)))
161
strcpy(sge_mail_end, sge_ctime((time_t)lGetDouble(ep, UA_value), &ds));
163
strcpy(sge_mail_end, MSG_MAIL_UNKNOWN_NAME);
165
jobid = lGetUlong(jr, JR_job_number);
166
jataskid = lGetUlong(jr, JR_ja_task_number);
168
failed = lGetUlong(jr, JR_failed);
169
general = lGetUlong(jr, JR_general_failure);
176
sge_dstring_init(&ds, buffer, sizeof(buffer));
178
if (failed <= MAX_SSTATE) {
180
** a state might have more than one bit set
182
if ((admail_states[failed] & BIT_ADM_NEVER)) {
183
DPRINTF(("NEVER SENDING ADMIN MAIL for state %d\n", failed));
184
FREE(administrator_mail);
188
if ((admail_states[failed] & BIT_ADM_NEW_CONF)) {
189
if (admail_times[failed]) {
190
DPRINTF(("NOT SENDING ADMIN MAIL AGAIN for state %d, again on conf\n", failed));
191
FREE(administrator_mail);
196
if ((admail_states[failed] & BIT_ADM_QCHANGE)) {
197
if (admail_times[failed]) {
198
DPRINTF(("NOT SENDING ADMIN MAIL AGAIN for state %d, again on qchange\n", failed));
199
FREE(administrator_mail);
204
if ((admail_states[failed] & BIT_ADM_HOUR)) {
205
/* NOTE(review): unlike the two checks above, this 'if' has no braces, so
 * only the DPRINTF is conditional and the FREE below (plus whatever follows
 * it) runs whenever BIT_ADM_HOUR is set - confirm and brace the body. */
if ((now - admail_times[failed] < 3600))
206
DPRINTF(("NOT SENDING ADMIN MAIL AGAIN for state %d, again next hour\n", failed));
207
FREE(administrator_mail);
211
/* remember when this state was last reported; the checks above read it */
admail_times[failed] = now;
213
if (!(err_str=lGetString(jr, JR_err_str)))
214
err_str = MSG_MAIL_UNKNOWN_REASON;
216
ret = mailrec_parse(&lp_mail, administrator_mail);
218
ERROR((SGE_EVENT, MSG_MAIL_PARSE_S,
219
(administrator_mail ? administrator_mail : MSG_NULL)));
220
FREE(administrator_mail);
225
if (lGetString(jr, JR_pe_task_id_str) == NULL) {
226
/* This is a regular job */
227
if (general == GFSTATE_QUEUE) {
228
sprintf(str_general, MSG_GFSTATE_QUEUE_S, q);
230
else if (general == GFSTATE_HOST) {
231
const char *s = strchr(q, '@');
234
sprintf(str_general, MSG_GFSTATE_HOST_S, s);
236
sprintf(str_general, MSG_GFSTATE_HOST_S, MSG_MAIL_UNKNOWN_NAME);
239
else if (general == GFSTATE_JOB) {
241
sprintf(str_general, MSG_GFSTATE_JOB_UU, sge_u32c(jobid), sge_u32c(jataskid));
243
sprintf(str_general, MSG_GFSTATE_JOB_U, sge_u32c(jobid));
246
sprintf(str_general, MSG_NONE);
249
/* This is a pe task */
250
sprintf(str_general, MSG_GFSTATE_PEJOB_U, sge_u32c(jobid));
254
sprintf(sge_mail_subj, MSG_MAIL_SUBJECT_SUU,
255
feature_get_product_name(FS_SHORT_VERSION, &ds), sge_u32c(jobid), sge_u32c(jataskid));
257
sprintf(sge_mail_subj, MSG_MAIL_SUBJECT_SU,
258
feature_get_product_name(FS_SHORT_VERSION, &ds), sge_u32c(jobid));
259
sprintf(sge_mail_body,
260
MSG_MAIL_BODY_USSSSSSS,
263
job_owner, q, sge_mail_start, sge_mail_end,
264
get_sstate_description(failed),
267
** attach the trace and error file to admin mail if it is present
269
/* budget: fixed body + 1000 bytes of slack for the per-file headers and the
 * terminator; each file's size is added in the stat loop below */
sge_mail_body_total_size = strlen(sge_mail_body) + 1000;
271
for (i=0; i<num_files; i++) {
272
shepherd_files[i].exists = 0;
274
for (i=0; i<num_files; i++) {
275
/* JG: TODO (254): use function creating path */
276
sprintf(shepherd_files[i].filepath, "%s/" sge_u32"."sge_u32"/%s", ACTIVE_DIR,
277
jobid, jataskid, shepherd_filenames[i]);
278
if (!SGE_STAT(shepherd_files[i].filepath, &shepherd_files[i].statbuf)
279
&& (shepherd_files[i].statbuf.st_size > 0)) {
280
sge_mail_body_total_size += shepherd_files[i].statbuf.st_size;
281
shepherd_files[i].exists = 1;
285
** allocate enough space for trace and error file
287
/* NOTE(review): the malloc result is used by strcpy below without a NULL
 * check in the statements visible here - confirm and handle failure. */
sge_mail_body_total = (char*) malloc(sizeof(char) *
288
sge_mail_body_total_size);
290
strcpy(sge_mail_body_total, sge_mail_body);
293
for (i=0; i<num_files; i++) {
294
if (shepherd_files[i].exists) {
295
/* Append the header at the current end of the buffer. The former call
 * passed the destination as a "%s" source as well, which is undefined
 * behaviour for sprintf (overlapping objects). */
sprintf(sge_mail_body_total + strlen(sge_mail_body_total), "\nShepherd %s:\n",
296
shepherd_filenames[i]);
297
start = strlen(sge_mail_body_total);
298
if ((fp = fopen(shepherd_files[i].filepath, "r"))) {
301
/* Read at most the size that was budgeted for this file when it was
 * stat'ed. The former bound (all remaining space) let a file that grew in
 * the meantime fill the buffer completely, so the terminator below was
 * written one byte past the allocation. */
n=fread(sge_mail_body_total+start, 1,
302
(size_t)shepherd_files[i].statbuf.st_size, fp);
304
sge_mail_body_total[start + n] = '\0';
309
cull_mail(progid, lp_mail, sge_mail_subj, sge_mail_body_total,
310
MSG_MAIL_TYPE_ADMIN);
312
if (sge_mail_body_total)
313
free((char*)sge_mail_body_total);
316
FREE(administrator_mail);
320
DPRINTF((MSG_FILE_ERRORCLOSEINGXY_SS, shepherd_files[i].filepath, strerror(errno)));
330
DENTER(TOP_LAYER, "adm_mail_reset");
333
** let 0 be a reset all
336
memset(admail_times, sizeof(admail_times), 0);
340
DPRINTF(("resetting admin mail for state %d\n", state));
341
for (i = 0; i < MAX_SSTATE + 1; i++) {
342
if ((admail_states[i] & state)) {