20
20
* are permitted provided that all of the following conditions are met.
21
21
* After December 31, 2001, only conditions 3-6 must be met:
23
* 1. Commercial and/or non-commercial use of the Software is permitted
24
* provided a current software registration is on file at www.OpenPBS.org.
25
* If use of this software contributes to a publication, product, or
26
* service, proper attribution must be given; see www.OpenPBS.org/credit.html
28
* 2. Redistribution in any form is only permitted for non-commercial,
29
* non-profit purposes. There can be no charge for the Software or any
30
* software incorporating the Software. Further, there can be no
31
* expectation of revenue generated as a consequence of redistributing
34
23
* 3. Any Redistribution of source code must retain the above copyright notice
35
24
* and the acknowledgment contained in paragraph 6, this list of conditions
36
25
* and the disclaimer contained in paragraph 7.
174
166
extern char *path_spool;
175
167
extern char *path_svrdb;
176
168
extern char *path_svrdb_new;
169
extern char *path_svrlog;
177
170
extern char *path_track;
178
171
extern char *path_nodes;
179
172
extern char *path_nodes_new;
180
173
extern char *path_nodestate;
181
174
extern char *path_nodenote;
182
175
extern char *path_nodenote_new;
176
extern char *path_checkpoint;
184
178
extern int queue_rank;
185
179
extern char server_name[];
233
229
#define CHANGE_STATE 1
234
230
#define KEEP_STATE 0
233
* dynamic array, with utility functions for easy appending
236
typedef struct darray_t {
245
* Initialize a dynamic array to a specific size
246
* @param Array (O) Assumed to be uninitialized struct
247
* @param InitialSize (I) raised to 0 if less than 0
252
darray_t *Array, /* I */
253
int InitialSize) /* I */
256
if (InitialSize <= 0)
263
Array->Length = InitialSize;
264
Array->Data = (void **)malloc(sizeof(Array->Data[0]) * InitialSize);
266
if (Array->Data == NULL)
270
Array->AppendIndex = 0;
272
} /*END DArrayInit */
277
* Free the resources associated with Array
278
* It does NOT free any data stored in the array, just the array structure itself.
284
darray_t *Array) /* I */
290
Array->AppendIndex = 0;
292
} /*END DArrayFree */
297
* Append Item onto the end of Array, resizing it if necessary
304
darray_t *Array, /* I/O */
309
if(Array->AppendIndex >= Array->Length)
311
int newLength = Array->Length * 2;
316
Array->Length = newLength;
317
Array->Data = realloc(Array->Data,sizeof(Array->Data[0]) * Array->Length);
319
if(Array->Data == NULL)
322
Array->AppendIndex = 0;
328
assert(Array->AppendIndex >= 0);
329
assert(Array->AppendIndex < Array->Length);
331
Array->Data[Array->AppendIndex++] = Item;
333
} /* END DArrayAppend */
338
* Sort two job structs by their priority in ascending order
343
static int SortPrioAscend(
345
const void *A, /* I */
346
const void *B) /* I */
349
job *pjob1 = *((job **)A);
350
job *pjob2 = *((job **)B);
351
int prio1 = pjob1->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
352
int prio2 = pjob2->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;
353
return(prio1 - prio2);
354
} /*END SortPrioAscend */
357
void update_default_np()
359
struct pbsnode *pnode;
364
default_np = server.sv_attr[(int)SRV_ATR_NPDefault].at_val.at_long;
369
for(i = 0; i < svr_totnodes; i++)
371
pnode = pbsndlist[i];
373
npfreediff = pnode->nd_nsn - pnode->nd_nsnfree;
374
pnode->nd_nsn = default_np;
375
pnode->nd_nsnfree = default_np - npfreediff;
236
383
/* Add the server names from /var/spool/torque/server_name to the trusted hosts list. */
238
add_server_names_to_acl_hosts(void)
385
void add_server_names_to_acl_hosts(void)
241
388
int n, list_len, rc;
493
645
/* 2. set up the various paths and other global variables we need */
495
path_priv = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash);
647
if(path_priv == NULL)
649
path_priv = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash);
497
652
path_arrays = build_path(path_priv, PBS_ARRAYDIR, suffix_slash);
505
660
path_acct = build_path(path_priv, PBS_ACCT, suffix_slash);
507
path_svrdb = build_path(path_priv, PBS_SERVERDB, NULL);
662
if(path_svrdb == NULL)
664
path_svrdb = build_path(path_priv, PBS_SERVERDB, NULL);
509
667
path_svrdb_new = build_path(path_priv, PBS_SERVERDB, new_tag);
669
path_svrlog = build_path(path_home, PBS_LOGFILES, suffix_slash);
511
671
path_track = build_path(path_priv, PBS_TRACKING, NULL);
513
673
path_nodes = build_path(path_priv, NODE_DESCRIP, NULL);
521
681
path_nodenote_new = build_path(path_priv, NODE_NOTE, new_tag);
683
path_checkpoint = build_path(path_home, PBS_CHKPTDIR, suffix_slash);
685
if (svr_resc_def == NULL)
525
690
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
564
729
/* 3. Set default server attribute values */
566
if (server.sv_attr[(int)SRV_ATR_scheduling].at_flags & ATR_VFLAG_SET)
568
a_opt = server.sv_attr[(int)SRV_ATR_scheduling].at_val.at_long;
571
731
for (i = 0;i < SRV_ATR_LAST;i++)
572
732
clear_attr(&server.sv_attr[i], &svr_attr_def[i]);
625
785
/* Open the server database (save file) and read it in */
627
if ((rc != 0) || ((rc = svr_recov(path_svrdb)) == -1))
787
if ((rc != 0) || ((rc = svr_recov(path_svrdb, FALSE)) == -1))
629
789
log_err(rc, "pbsd_init", msg_init_baddb);
675
835
/* 7. Set up other server and global variables */
837
if (a_opt_init != -1)
679
839
/* a_option was set, overrides saved value of scheduling attr */
681
server.sv_attr[(int)SRV_ATR_scheduling].at_val.at_long = a_opt;
841
server.sv_attr[(int)SRV_ATR_scheduling].at_val.at_long = a_opt_init;
682
842
server.sv_attr[(int)SRV_ATR_scheduling].at_flags |=
792
953
while ((pdirent = readdir(dir)) != NULL)
794
956
if (chk_save_file(pdirent->d_name) == 0)
796
958
/* if not create or clean recovery, recover arrays */
798
960
if ((type != RECOV_CREATE) && (type != RECOV_COLD))
800
963
/* skip files without the proper suffix */
801
964
baselen = strlen(pdirent->d_name) - array_suf_len;
805
968
if (strcmp(psuffix, ARRAY_FILE_SUFFIX))
808
972
pa = array_recov(pdirent->d_name);
815
979
"could not recover array-struct from file %s--skipping",
816
980
pdirent->d_name);
818
982
log_err(errno, "pbsd_init", log_buffer);
823
988
pa->jobs_recovered = 0;
827
993
unlink(pdirent->d_name);
907
1077
if ((pjob = job_recov(pdirent->d_name)) != NULL)
909
if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_COMPLETE)
911
/* ignore/remove completed job */
913
/* for some reason, if a completed job is recovered, and it is
914
* forcibly purged with 'qdel -p', it will get deleted a
915
* second time resulting in a segfault */
922
if (pbsd_init_job(pjob, type) == FAILURE)
925
PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
933
if ((type != RECOV_COLD) &&
934
(type != RECOV_CREATE) &&
935
(!(pjob->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)) &&
936
(pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT))
938
strcpy(basen, pdirent->d_name);
940
psuffix = basen + baselen;
942
strcpy(psuffix, JOB_SCRIPT_SUFFIX);
944
if (chk_save_file(basen) != 0)
947
PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
949
pjob->ji_qs.ji_jobid,
1080
if (DArrayAppend(&Array,pjob) == FAILURE)
1082
log_err(ENOMEM,"main","out of memory reloading jobs");
1112
} /* END while ((pdirent = readdir(dir)) != NULL) */
1115
qsort(Array.Data,Array.AppendIndex,sizeof(Array.Data[0]),SortPrioAscend);
1117
for (Index = 0; Index < Array.AppendIndex; Index++)
1119
job *pjob = (job *)Array.Data[Index];
1121
if (pbsd_init_job(pjob, type) == FAILURE)
1124
PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
1126
pjob->ji_qs.ji_jobid,
1132
if ((type != RECOV_COLD) &&
1133
(type != RECOV_CREATE) &&
1134
(!(pjob->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)) &&
1135
(pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT))
1137
strcpy(basen, pjob->ji_qs.ji_jobid);
1139
strcat(basen, JOB_SCRIPT_SUFFIX);
1141
if (chk_save_file(basen) != 0)
1144
PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
1146
pjob->ji_qs.ji_jobid,
985
1156
if ((had != server.sv_qs.sv_numjobs) &&
986
1157
(type != RECOV_CREATE) &&
1037
1207
if (pjob == NULL)
1039
/* TODO, we need to do something here, we can't finish cloning the array! */
1209
/* TODO, we need to do something here, we can't finish cloning the
1044
1215
/* TODO if num_cloned != num_recovered then something strange happened
1045
it is possible num_recovered == num_cloned+1. That means that the server
1046
terminated after cloning a job but before updating the saved array_info struct.
1047
we probably should delete that last job and start the cloning process off at
1216
it is possible num_recovered == num_cloned+1. That means that the
1217
server terminated after cloning a job but before updating the saved
1218
array_info struct. we probably should delete that last job and start
1219
the cloning process off at num_cloned. Someone must have been
1220
naughty and did a kill -9 on pbs_server */
1049
1221
wt = set_task(WORK_Timed, time_now + 1, job_clone_wt, (void*)pjob);
1092
1264
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
1094
if (chk_file_sec(path_track,0,0,S_IWGRP|S_IWOTH,0,EMsg) != 0)
1266
if (chk_file_sec(path_track, 0, 0, S_IWGRP | S_IWOTH, 0, EMsg) != 0)
1099
1271
#endif /* not DEBUG and not NO_SECURITY_CHECK */
1101
if (fstat(fd,&statbuf) < 0)
1273
if (fstat(fd, &statbuf) < 0)
1103
log_err(errno,"pbs_init","unable to stat tracking file");
1275
log_err(errno, "pbs_init", "unable to stat tracking file");
1127
1299
/* NOTE: tracking file records are optional */
1129
i = read(fd,(char *)server.sv_track,server.sv_tracksize * sizeof(struct tracking));
1301
i = read(fd, (char *)server.sv_track, server.sv_tracksize * sizeof(struct tracking));
1133
log_err(errno,"pbs_init","unable to read tracksize from tracking file");
1305
log_err(errno, "pbs_init", "unable to read tracksize from tracking file");
1140
1312
/* set work task to periodically save the tracking records */
1142
set_task(WORK_Timed,(long)(time_now + PBS_SAVE_TRACK_TM),track_save,0);
1314
set_task(WORK_Timed, (long)(time_now + PBS_SAVE_TRACK_TM), track_save, 0);
1400
1576
case JOB_SUBSTATE_COMPLETE:
1402
/* NOOP - completed jobs are already purged above */
1403
/* for some reason, this doesn't actually work */
1578
/* Completed jobs are no longer purged on startup */
1405
1580
pwt = set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);
1700
1875
log_err(-1, "catch_child", log_buffer);
1702
DBPRT(("catch_child no work task found for pid %d\n", pid));
1877
DBPRT(("catch_child no work task found for pid %d\n", (int)pid));
1705
1880
} /* END while (1) */
2234
* This just reads in the server attributes from the server db.
2239
int type) /* type of initialization */
2242
static char id[] = "get_svr_attr";
2244
char *suffix_slash = "/";
2246
if (type != RECOV_CREATE)
2248
/* Open the server database (save file) and read it in */
2250
if(path_priv == NULL)
2252
path_priv = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash);
2254
if(path_svrdb == NULL)
2256
path_svrdb = build_path(path_priv, PBS_SERVERDB, NULL);
2259
if (svr_resc_def == NULL)
2264
if (((rc = chk_save_file(path_svrdb))!= 0) || ((rc = svr_recov(path_svrdb, TRUE)) == -1))
2266
log_err(rc, id ,msg_init_baddb);
2274
} /* END get_svr_attr() */