1002
/* if we are not running with real and effective uid of 0, forget it */
1004
if ((getuid() != 0) || (geteuid() != 0))
1006
fprintf(stderr, "%s: must be run by root\n",
1013
/* make sure no other server is running with this home directory */
1015
sprintf(lockfile, "%s/%s/server.lock",
1019
if ((lockfds = open(lockfile, O_CREAT | O_TRUNC | O_WRONLY, 0600)) < 0)
1021
sprintf(log_buffer, "%s: unable to open lock file '%s'",
1025
fprintf(stderr, "%s\n",
1028
log_err(errno, msg_daemonname, log_buffer);
1034
if (high_availability_mode)
1036
/* This will allow multiple instance of the pbs_server to be
1037
* running. This must be done before setting up the client
1038
* sockets interface, reading the config file, and contacting
1039
* the compute nodes.
1042
if (TDoBackground == 1)
1046
/* parent goes away */
1051
while (try_lock_out(lockfds, F_WRLCK))
1052
sleep(TSERVER_HA_CHECK_TIME); /* Relinquish */
1056
lock_out(lockfds, F_WRLCK);
922
* next_task - look for the next work task to perform:
923
* 1. If svr_delay_entry is set, then a delayed task is ready so
924
* find and process it.
925
* 2. All items on the immediate list, then
926
* 3. All items on the timed task list which have expired times
928
* Returns: amount of time till next task
931
static time_t next_task()
936
struct work_task *nxt;
938
struct work_task *ptask;
939
time_t tilwhen = server.sv_attr[(int)SRV_ATR_schedule_iteration].at_val.at_long;
941
time_now = time((time_t *)0);
945
ptask = (struct work_task *)GET_NEXT(task_list_event);
947
while (ptask != NULL)
949
nxt = (struct work_task *)GET_NEXT(ptask->wt_linkall);
951
if (ptask->wt_type == WORK_Deferred_Cmp)
952
dispatch_task(ptask);
960
while ((ptask = (struct work_task *)GET_NEXT(task_list_immed)) != NULL)
961
dispatch_task(ptask);
963
while ((ptask = (struct work_task *)GET_NEXT(task_list_timed)) != NULL)
965
if ((delay = ptask->wt_event - time_now) > 0)
974
dispatch_task(ptask); /* will delete link */
978
/* should the scheduler be run? If so, adjust the delay time */
980
if ((delay = server.sv_next_schedule - time_now) <= 0)
981
svr_do_schedule = SCH_SCHEDULE_TIME;
982
else if (delay < tilwhen)
986
} /* END next_task() */
991
* start_hot_jobs - place any job which is state QUEUED and has the
992
* HOT start flag set into execution.
994
* Returns the number of jobs to be hot started.
997
static int start_hot_jobs(void)
1003
for (pjob = (job *)GET_NEXT(svr_alljobs);
1005
pjob = (job *)GET_NEXT(pjob->ji_alljobs))
1007
if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_QUEUED) &&
1008
(pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART))
1013
pjob->ji_qs.ji_jobid,
1014
"attempting to hot start job");
1016
svr_startjob(pjob, NULL, NULL, NULL);
1023
} /* END start_hot_jobs() */
1039
time_t last_jobstat_time;
1042
void ping_nodes A_((struct work_task *));
1043
void check_nodes A_((struct work_task *));
1044
void check_log A_((struct work_task *));
1045
void check_acct_log A_((struct work_task *));
1047
extern char *msg_startup2; /* log message */
1049
last_jobstat_time = time_now;
1059
1050
server.sv_started = time(&time_now); /* time server started */
1062
* Open the log file so we can start recording events
1064
* set log_event_mask to point to the log_event attribute value so
1065
* it controls which events are logged.
1068
log_event_mask = &server.sv_attr[SRV_ATR_log_events].at_val.at_long;
1070
sprintf(path_log, "%s/%s",
1074
log_open(log_file, path_log);
1076
sprintf(log_buffer, msg_startup1, server_name, server_init_type);
1079
PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_FORCE,
1080
PBS_EVENTCLASS_SERVER,
1084
/* initialize the server objects and perform specified recovery */
1085
/* will be left in the server's private directory */
1087
if ((plogenv = getenv("PBSLOGLEVEL")) != NULL)
1089
LOGLEVEL = (int)strtol(plogenv, NULL, 10);
1092
if ((pc = getenv("PBSDEBUG")) != NULL)
1098
/* NOTE: env cleared in pbsd_init() */
1100
if (pbsd_init(server_init_type) != 0)
1102
log_err(-1, msg_daemonname, "pbsd_init failed");
1107
/* initialize the network interface */
1109
sprintf(log_buffer, "Using ports Server:%d Scheduler:%d MOM:%d",
1110
pbs_server_port_dis,
1115
PBSEVENT_SYSTEM | PBSEVENT_ADMIN,
1116
PBS_EVENTCLASS_SERVER,
1120
if (init_network(pbs_server_port_dis, process_request) != 0)
1122
perror("pbs_server: network");
1124
log_err(-1, msg_daemonname, "init_network failed dis");
1129
if (init_network(0, process_request) != 0)
1131
perror("pbs_server: unix domain socket");
1133
log_err(-1, msg_daemonname, "init_network failed unix domain socket");
1138
if (TDoBackground == 1)
1140
/* go into the background and become own session/process group */
1142
lock_out(lockfds, F_UNLCK);
1146
/* parent goes away */
1151
if ((sid = setsid()) == -1)
1153
log_err(errno, msg_daemonname, "setsid failed");
1158
lock_out(lockfds, F_WRLCK);
1164
dummyfile = fopen("/dev/null", "r");
1165
assert((dummyfile != 0) && (fileno(dummyfile) == 0));
1167
dummyfile = fopen("/dev/null", "w");
1168
assert((dummyfile != 0) && (fileno(dummyfile) == 1));
1170
dummyfile = fopen("/dev/null", "w");
1171
assert((dummyfile != 0) && (fileno(dummyfile) == 2));
1172
} /* END if (TDoBackground == 1) */
1175
if ((plogenv != NULL) && isdigit(plogenv[0]))
1176
LOGLEVEL = (int)strtol(plogenv, NULL, 0);
1180
setvbuf(stdout, NULL, _IOLBF, 0);
1182
setvbuf(stderr, NULL, _IOLBF, 0);
1185
sprintf(log_buffer, "%ld\n",
1189
if (write(lockfds, log_buffer, strlen(log_buffer)) !=
1190
(ssize_t)strlen(log_buffer))
1192
log_err(errno, msg_daemonname, "failed to write pid to lockfile");
1197
#if (PLOCK_DAEMONS & 1)
1202
if ((rppfd = rpp_bind(pbs_server_port_dis)) == -1)
1204
log_err(errno, msg_daemonname, "rpp_bind");
1209
rpp_fd = -1; /* force rpp_bind() to get another socket */
1211
tryport = IPPORT_RESERVED;
1215
while (--tryport > 0)
1217
if ((privfd = rpp_bind(tryport)) != -1)
1220
if ((errno != EADDRINUSE) && (errno != EADDRNOTAVAIL))
1226
log_err(errno, msg_daemonname, "no privileged ports");
1234
PBSEVENT_SYSTEM | PBSEVENT_FORCE,
1235
PBS_EVENTCLASS_SERVER,
1237
"creating rpp and private interfaces");
1240
add_conn(rppfd, Primary, (pbs_net_t)0, 0, PBS_SOCK_INET, rpp_request);
1242
add_conn(privfd, Primary, (pbs_net_t)0, 0, PBS_SOCK_INET, rpp_request);
1244
1053
/* record the fact that we are up and running */
1457
1267
update_nodes_file();
1273
* initialize_globals
1275
* Set the initial state of global variables.
1279
initialize_globals(void)
1281
strcpy(pbs_current_user, "PBS_Server");
1283
msg_daemonname = strdup(pbs_current_user);
1289
* set_globals_from_environment
1291
* Set the initial state of global variables based on
1292
* the program environment variables.
1296
set_globals_from_environment(void)
1301
/* initialize service port numbers for self, Scheduler, and MOM */
1303
if ((ptr = getenv("PBS_BATCH_SERVICE_PORT")) != NULL)
1305
pbs_server_port_dis = (int)strtol(ptr, NULL, 10);
1308
if ((ptr = getenv("PBS_SCHEDULER_SERVICE_PORT")) != NULL)
1310
pbs_scheduler_port = (int)strtol(ptr, NULL, 10);
1313
if ((ptr = getenv("PBS_MOM_SERVICE_PORT")) != NULL)
1315
pbs_mom_port = (int)strtol(ptr, NULL, 10);
1318
if ((ptr = getenv("PBS_MANAGER_SERVICE_PORT")) != NULL)
1320
pbs_rm_port = (int)strtol(ptr, NULL, 10);
1323
if ((plogenv = getenv("PBSLOGLEVEL")) != NULL)
1325
/* Note the plogenv is global and is tested in main_loop */
1326
LOGLEVEL = (int)strtol(plogenv, NULL, 10);
1329
if ((ptr = getenv("PBSDEBUG")) != NULL)
1336
} /* END set_globals_from_environment() */
1343
* main - the initialization and main loop of pbs_daemon
1349
char *argv[]) /* I */
1354
int rppfd; /* fd to receive is HELLO's */
1355
int privfd; /* fd to send is messages */
1357
char lockfile[MAXPATHLEN + 1];
1359
char *pathPtr = NULL;
1360
char EMsg[MAX_LINE];
1361
char tmpLine[MAX_LINE];
1363
extern char *msg_svrdown; /* log message */
1364
extern char *msg_startup1; /* log message */
1368
initialize_globals();
1369
time_now = time((time_t *)0);
1370
set_globals_from_environment();
1372
/* set standard umask */
1376
/* save argv and the path for later use */
1377
for (i = 0;i < argc;i++)
1379
ArgV[i] = (char *)malloc(sizeof(char) * (strlen(argv[i])+1));
1381
if (ArgV[i] == NULL)
1383
printf("ERROR: failed to allocate memory to save argv, shutting down\n");
1388
strcpy(ArgV[i],argv[i]);
1391
/* save the path before we go into the background. If we don't do this
1392
* we can't restart the server because the path will change */
1393
pathPtr = getenv("PATH");
1394
snprintf(OriginalPath,sizeof(OriginalPath),"%s",pathPtr);
1396
/* close files for security purposes */
1397
/* do the following before getuid() and geteuid() can trigger
1398
nss_ldap into opening a socket to the LDAP server */
1400
i = sysconf(_SC_OPEN_MAX);
1403
close(i); /* close any file desc left open by parent */
1405
/* find out the name of this machine (hostname) */
1409
if ((gethostname(server_host, PBS_MAXHOSTNAME) == -1) ||
1410
(get_fullhostname(server_host, server_host, PBS_MAXHOSTNAME, EMsg) == -1))
1412
snprintf(tmpLine, sizeof(tmpLine), "unable to determine local server hostname %c %s",
1413
EMsg[0] ? '-' : ' ',
1416
log_err(-1, "pbsd_main", tmpLine);
1418
exit(1); /* FAILURE - shutdown */
1421
strcpy(server_name, server_host); /* by default server = host */
1423
pbs_server_addr = get_hostaddr(server_host);
1424
pbs_mom_addr = pbs_server_addr; /* assume on same host */
1425
pbs_scheduler_addr = pbs_server_addr; /* assume on same host */
1427
/* The following port numbers might have been initialized in set_globals_from_environment() above. */
1429
if (pbs_server_port_dis <= 0)
1430
pbs_server_port_dis = get_svrport(PBS_BATCH_SERVICE_NAME, "tcp", PBS_BATCH_SERVICE_PORT_DIS);
1432
if (pbs_scheduler_port <= 0)
1433
pbs_scheduler_port = get_svrport(PBS_SCHEDULER_SERVICE_NAME, "tcp", PBS_SCHEDULER_SERVICE_PORT);
1435
if (pbs_mom_port <= 0)
1436
pbs_mom_port = get_svrport(PBS_MOM_SERVICE_NAME, "tcp", PBS_MOM_SERVICE_PORT);
1438
if (pbs_rm_port <= 0)
1439
pbs_rm_port = get_svrport(PBS_MANAGER_SERVICE_NAME, "tcp", PBS_MANAGER_SERVICE_PORT);
1441
parse_command_line(argc, argv);
1443
/* if we are not running with real and effective uid of 0, forget it */
1445
if ((getuid() != 0) || (geteuid() != 0))
1447
fprintf(stderr, "%s: must be run by root\n",
1454
* Read in server attributes so they are available to be used
1455
* Attributes will not be read in on a pbs_server -t create
1458
if (get_svr_attr(server_init_type) == -1)
1460
fprintf(stderr,"%s: failed to get server attributes\n",
1467
* make sure no other server is running with this home directory.
1468
* If server lockfile attribute has been set use it.
1469
* If not use default location for it
1472
if ((server.sv_attr[(int)SRV_ATR_lockfile].at_flags & ATR_VFLAG_SET) &&
1473
(server.sv_attr[(int)SRV_ATR_lockfile].at_val.at_str))
1475
char *LockfilePtr = server.sv_attr[(int)SRV_ATR_lockfile].at_val.at_str;
1477
/* check if an absolute path is specified or not */
1479
if (LockfilePtr[0] == '/')
1481
snprintf(lockfile,sizeof(lockfile),"%s",LockfilePtr);
1485
snprintf(lockfile,sizeof(lockfile),"%s/%s/%s",
1493
sprintf(lockfile,"%s/%s/server.lock",
1498
#ifndef USE_HA_THREADS
1499
if ((lockfds = open(lockfile, O_CREAT | O_TRUNC | O_WRONLY, 0600)) < 0)
1501
sprintf(log_buffer, "%s: unable to open lock file '%s'",
1505
fprintf(stderr, "%s\n",
1508
log_err(errno, msg_daemonname, log_buffer);
1512
#endif /* !USE_HA_THREADS */
1514
/* HA EVENTS MUST HAPPEN HERE */
1516
strcpy(HALockFile,lockfile);
1517
HALockCheckTime = server.sv_attr[(int)SRV_ATR_LockfileCheckTime].at_val.at_long;
1518
HALockUpdateTime = server.sv_attr[(int)SRV_ATR_LockfileUpdateTime].at_val.at_long;
1520
/* apply HA defaults */
1522
if (HALockCheckTime == 0)
1523
HALockCheckTime = PBS_LOCKFILE_CHECK_TIME;
1525
if (HALockUpdateTime == 0)
1526
HALockUpdateTime = PBS_LOCKFILE_UPDATE_TIME;
1528
if ((pc = getenv("PBSDEBUG")) != NULL)
1534
/* handle running in the background or not if we're debugging */
1536
if(high_availability_mode)
1538
if (daemonize_server(TDoBackground, &sid) == FAILURE)
1544
#ifdef OS_LOSES_FD_OVER_FORK
1545
/* NOTE: file descriptors may be lost across forks in SLES 10 SP1 */
1547
#ifndef USE_HA_THREADS
1550
if ((lockfds = open(lockfile, O_CREAT | O_TRUNC | O_WRONLY, 0600)) < 0)
1552
sprintf(log_buffer, "%s: unable to open lock file '%s'",
1556
fprintf(stderr, "%s\n",log_buffer);
1558
log_err(errno, msg_daemonname, log_buffer);
1563
#endif /* !USE_HA_THREADS */
1564
/* no file descriptor was held if we're using ha threads */
1565
#endif /* OS_LOSES_FD_OVER_FORK */
1567
#ifdef USE_HA_THREADS
1568
if (high_availability_mode)
1574
if ((lockfds = open(lockfile,O_CREAT|O_TRUNC|O_WRONLY,0600)) < 0)
1576
sprintf(log_buffer,"%s: unable to open lock file '%s'",
1580
fprintf(stderr,"%s\n",log_buffer);
1582
log_err(errno,msg_daemonname,log_buffer);
1588
if (high_availability_mode)
1590
/* This will allow multiple instance of the pbs_server to be
1591
* running. This must be done before setting up the client
1592
* sockets interface, reading the config file, and contacting
1593
* the compute nodes.
1596
while (try_lock_out(lockfds, F_WRLCK))
1597
sleep(TSERVER_HA_CHECK_TIME); /* Relinquish */
1598
} /* END if (high_availability_mode) */
1601
lock_out(lockfds, F_WRLCK);
1606
* Open the log file so we can start recording events
1608
* set log_event_mask to point to the log_event attribute value so
1609
* it controls which events are logged.
1612
log_event_mask = &server.sv_attr[SRV_ATR_log_events].at_val.at_long;
1614
sprintf(path_log, "%s/%s",
1618
log_open(log_file, path_log);
1620
sprintf(log_buffer, msg_startup1, server_name, server_init_type);
1623
PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_FORCE,
1624
PBS_EVENTCLASS_SERVER,
1628
/* initialize the server objects and perform specified recovery */
1629
/* will be left in the server's private directory */
1630
/* NOTE: env cleared in pbsd_init() */
1632
if (pbsd_init(server_init_type) != 0)
1634
log_err(-1, msg_daemonname, "pbsd_init failed");
1639
/* initialize the network interface */
1641
sprintf(log_buffer, "Using ports Server:%d Scheduler:%d MOM:%d (server: '%s')",
1642
pbs_server_port_dis,
1648
PBSEVENT_SYSTEM | PBSEVENT_ADMIN,
1649
PBS_EVENTCLASS_SERVER,
1653
if (init_network(pbs_server_port_dis, process_request) != 0)
1655
perror("pbs_server: network");
1657
log_err(-1, msg_daemonname, "init_network failed dis");
1662
if (init_network(0, process_request) != 0)
1664
perror("pbs_server: unix domain socket");
1666
log_err(-1, msg_daemonname, "init_network failed unix domain socket");
1672
/* handle running in the background or not if we're debugging */
1674
if(!high_availability_mode)
1676
if (daemonize_server(TDoBackground,&sid) == FAILURE)
1682
sprintf(log_buffer, "%ld\n", (long)sid);
1684
if (!high_availability_mode)
1686
if (write(lockfds, log_buffer, strlen(log_buffer)) !=
1687
(ssize_t)strlen(log_buffer))
1689
log_err(errno, msg_daemonname, "failed to write pid to lockfile");
1695
#if (PLOCK_DAEMONS & 1)
1700
if ((rppfd = rpp_bind(pbs_server_port_dis)) == -1)
1702
log_err(errno, msg_daemonname, "rpp_bind");
1707
rpp_fd = -1; /* force rpp_bind() to get another socket */
1709
tryport = IPPORT_RESERVED;
1713
while (--tryport > 0)
1715
if ((privfd = rpp_bind(tryport)) != -1)
1718
if ((errno != EADDRINUSE) && (errno != EADDRNOTAVAIL))
1724
log_err(errno, msg_daemonname, "no privileged ports");
1732
PBSEVENT_SYSTEM | PBSEVENT_FORCE,
1733
PBS_EVENTCLASS_SERVER,
1735
"creating rpp and private interfaces");
1738
add_conn(rppfd, Primary, (pbs_net_t)0, 0, PBS_SOCK_INET, rpp_request);
1740
add_conn(privfd, Primary, (pbs_net_t)0, 0, PBS_SOCK_INET, rpp_request);
1460
1746
RPPConfigure(1, 0); /* help rpp_shutdown go a bit faster */
1462
1747
rpp_shutdown();
1464
1749
shutdown_ack();
1599
* next_task - look for the next work task to perform:
1600
* 1. If svr_delay_entry is set, then a delayed task is ready so
1601
* find and process it.
1602
* 2. All items on the immediate list, then
1603
* 3. All items on the timed task list which have expired times
1605
* Returns: amount of time till next task
1608
static time_t next_task()
1611
static char id[] = "next_task";
1614
struct work_task *nxt;
1616
struct work_task *ptask;
1617
time_t tilwhen = server.sv_attr[(int)SRV_ATR_schedule_iteration].at_val.at_long;
1619
time_now = time((time_t *)0);
1621
if (svr_delay_entry)
1627
PBS_EVENTCLASS_REQUEST,
1629
"CHECKING svr_delay_entry");
1632
ptask = (struct work_task *)GET_NEXT(task_list_event);
1634
while (ptask != NULL)
1636
nxt = (struct work_task *)GET_NEXT(ptask->wt_linkall);
1638
if (ptask->wt_type == WORK_Deferred_Cmp)
1643
"DISPATCH Task WORK_Deferred_Cmp type %d, wt_event %ld, wt_aux %d",
1644
ptask->wt_type, ptask->wt_event, ptask->wt_aux);
1648
PBS_EVENTCLASS_REQUEST,
1654
dispatch_task(ptask);
1660
svr_delay_entry = 0;
1663
while ((ptask = (struct work_task *)GET_NEXT(task_list_immed)) != NULL)
1668
"DISPATCH Task #1 type %d, wt_event %ld, wt_aux %d",
1669
ptask->wt_type, ptask->wt_event, ptask->wt_aux);
1673
PBS_EVENTCLASS_REQUEST,
1679
dispatch_task(ptask);
1682
while ((ptask = (struct work_task *)GET_NEXT(task_list_timed)) != NULL)
1684
if ((delay = ptask->wt_event - time_now) > 0)
1686
if (tilwhen > delay)
1696
"DISPATCH Task #2 type %d, wt_event %ld, wt_aux %d",
1697
ptask->wt_type, ptask->wt_event, ptask->wt_aux);
1701
PBS_EVENTCLASS_REQUEST,
1706
dispatch_task(ptask); /* will delete link */
1710
/* should the scheduler be run? If so, adjust the delay time */
1712
if ((delay = server.sv_next_schedule - time_now) <= 0)
1713
svr_do_schedule = SCH_SCHEDULE_TIME;
1714
else if (delay < tilwhen)
1718
} /* END next_task() */
1725
* start_hot_jobs - place any job which is state QUEUED and has the
1726
* HOT start flag set into execution.
1728
* Returns the number of jobs to be hot started.
1731
static int start_hot_jobs(void)
1737
pjob = (job *)GET_NEXT(svr_alljobs);
1739
while (pjob != NULL)
1741
if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_QUEUED) &&
1742
(pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART))
1747
pjob->ji_qs.ji_jobid,
1748
"attempting to hot start job");
1750
svr_startjob(pjob, NULL, NULL, NULL);
1755
pjob = (job *)GET_NEXT(pjob->ji_alljobs);
1756
} /* END while (pjob != NULL) */
1759
} /* END start_hot_jobs() */
1767
* @return Zero on success, one on failure
1931
* This function will extract the directory portion of
1932
* the given path and copy it into the Dir parameter
1934
* @param FullPath (I)
1948
if ((FullPath == NULL) ||
1956
snprintf(Dir,sizeof(Dir),"%s",FullPath);
1958
ptr = strrchr(Dir,'/');
1966
} /* END extract_dir() */
1972
* Checks to see if HALockFile is defined, exists, and can be properly
1975
* @return TRUE if HALockFile is valid for use
1977
int is_ha_lock_file_valid(
1982
char LockDir[MAX_PATH_LEN];
1983
char ErrorString[MAX_LINE];
1984
char id[] = "is_ha_lock_file_valid";
1986
bool_t GoodPermissions = FALSE;
1988
if (HALockFile[0] == '\0')
1993
extract_dir(HALockFile,LockDir,sizeof(LockDir));
1995
if (stat(LockDir,&Stat) != 0)
1997
char tmpLine[MAX_LINE];
2000
strerror_r(errno,ErrorString,sizeof(ErrorString));
2002
snprintf(tmpLine,sizeof(tmpLine),"could not stat the lockfile dir '%s': %s",
2006
log_err(errno,id,tmpLine);
2011
/* directory must be owned by the TORQUE user and must have
2012
* read/write/exec permissions */
2013
if ((Stat.st_uid == getuid()) &&
2014
(Stat.st_mode & (S_IRUSR | S_IWUSR | S_IXUSR)))
2016
/* we can write to this directory */
2018
GoodPermissions = TRUE;
2020
else if ((Stat.st_gid == getgid()) &&
2021
(Stat.st_mode & (S_IRGRP | S_IWGRP | S_IXGRP)))
2023
GoodPermissions = TRUE;
2025
else if (Stat.st_mode & (S_IROTH | S_IWOTH | S_IXOTH))
2027
GoodPermissions = TRUE;
2030
if (GoodPermissions == FALSE)
2032
log_err(-1,id,"could not obtain the needed permissions for the lock file");
2035
return(GoodPermissions);
2036
} /* END is_ha_lock_file_valid() */
2041
* Try to release a lock on the given file
2043
* @param LockFile (I) Name of the file to unlock
2044
* @param LockFD (I) file descriptor to unlock [modified]
2046
* @return SUCCESS if the lock is released, FAILURE otherwise.
2048
int release_file_lock(
2057
if ((Lockfile == NULL) ||
2063
if (ISEMPTYSTR(Lockfile))
2074
fds = open(Lockfile,O_CREAT|O_TRUNC|O_WRONLY,0600);
2079
/* could not open lock file */
2084
flock.l_type = F_UNLCK;
2085
flock.l_whence = SEEK_SET;
2089
if (fcntl(fds,F_SETLK,&flock) != 0)
2101
} /* END release_file_lock() */
2107
* Try to acquire a lock on the given file
2109
* @param LockFile (I) the name of the file to lock.
2110
* @param LockFD (I/O) File descriptor for the lock file.
2111
* @param FileType (I) For logging (type of file lock)
2113
* @return SUCCESS if the lock is acquired, FAILURE otherwise
2115
int acquire_file_lock(
2124
char id[] = "acquire_file_lock";
2126
if ((LockFile == NULL) ||
2133
if (LockFile[0] == '\0')
2135
sprintf(log_buffer,"ALERT: empty %s lock filename\n",
2137
log_err(-1,id,log_buffer);
2142
fds = open(LockFile,O_CREAT|O_RDWR,0600);
2146
/* could not open lock file */
2148
sprintf(log_buffer,"ALERT: could not open %s lock file '%s' (errno: %d:%s)\n",
2153
log_err(errno,id,log_buffer);
2158
flock.l_type = F_WRLCK;
2159
flock.l_whence = SEEK_SET;
2162
flock.l_pid = getpid();
2164
if (fcntl(fds,F_SETLK,&flock) != 0)
2168
sprintf(log_buffer,"ALERT could not create lock on file '%s' (errno: %d:%s)\n",
2173
log_err(errno,id,log_buffer);
2178
/* don't close file; closing would lose the lock */
2183
} /* END acquire_file_lock() */
2189
* "Touch" the lockfile so that if locks are failing
2190
* other processes can see we still have possession
2191
* of the file. Also, we need to check that we can still
2196
* @return FAILURE if we don't have possession of the lock file anymore
2198
void *update_ha_lock_thread(
2203
char EMsg[MAX_LINE];
2207
struct stat statbuf;
2208
struct utimbuf timebuf;
2209
static long LastModifyTime = 0;
2210
char id[] = "update_ha_lock_thread";
2212
if (ISEMPTYSTR(HALockFile))
2214
/* locking HA not enabled */
2223
usleep(DEF_USPERSECOND * HALockUpdateTime);
2228
mutex_lock(&EUIDMutex);
2231
if (stat(HALockFile,&statbuf) == 0)
2233
/* check to make sure that no other process has modified this file
2234
* since the last time we did */
2236
if ((LastModifyTime > 0) && (LastModifyTime != statbuf.st_mtime))
2238
snprintf(EMsg,sizeof(EMsg),"update time changed unexpectedly");
2244
/* no one has touched this file since we last did--continue */
2246
LastModifyTime = time(NULL);
2247
timebuf.actime = LastModifyTime;
2248
timebuf.modtime = LastModifyTime;
2251
rc = utime(HALockFile,&timebuf);
2261
snprintf(EMsg,sizeof(EMsg),"could not stat file");
2264
mutex_unlock(&EUIDMutex);
2266
/* NOTE: HALockFile is emptied out when we delete the file during shutdown */
2268
if ((rc == -1) && !ISEMPTYSTR(HALockFile))
2270
char ErrorString[MAX_LINE];
2271
/* error occurred--immediate shutdown needed */
2273
if (LocalErrno != 0)
2275
strerror_r(LocalErrno,ErrorString,sizeof(ErrorString));
2277
sprintf(log_buffer,"could not update HA lock file '%s' in heartbeat thread (%s - errno %d:%s)",
2283
log_err(LocalErrno,id,log_buffer);
2287
sprintf(log_buffer,"could not update HA lock file '%s' in heartbeat thread (%s)",
2291
log_err(-1,id,log_buffer);
2294
/* restart pbs_server */
2298
} /* END while (TRUE) */
2302
} /* END update_ha_lock_thread() */
2308
int start_update_ha_lock_thread()
2311
#ifndef USE_HA_THREADS
2312
/* not compiled with threads */
2314
log_err(-1,"start_update_ha_lock_thread",
2315
"WARNING: cannot create HA update thread - pthreads not enabled\n");
2318
#else /* USE_HA_THREADS is defined */
2320
pthread_t HALockThread;
2321
pthread_attr_t HALockThreadAttr;
2326
char smallBuf[MAX_LINE];
2327
char id[] = "start_update_ha_lock_thread";
2329
/* write the pid to the lockfile for correctness */
2330
fds = open(HALockFile,O_TRUNC|O_WRONLY,0600);
2334
log_err(-1,id,"Couldn't write the pid to the lockfile\n");
2339
snprintf(smallBuf,sizeof(smallBuf),"%ld\n",(long)sid);
2340
if (write(fds,smallBuf,strlen(smallBuf)) != (ssize_t)strlen(smallBuf))
2342
log_err(-1,id,"Couldn't write the pid to the lockfile\n");
2347
/* we don't need an open handle on the lockfile, just correct update times */
2350
pthread_attr_init(&HALockThreadAttr);
2352
rc = pthread_create(&HALockThread,&HALockThreadAttr,update_ha_lock_thread,NULL);
2356
/* error creating thread */
2358
log_err(-1,id,"Could not create HA Lock Thread\n");
2365
PBS_EVENTCLASS_SERVER,
2367
"HA Lock update thread is now created\n");
2368
#endif /* ifndef USE_HA_THREADS */
2371
} /* END start_update_ha_lock_thread() */
2378
mutex_t *Mutex) /* I */
2381
#ifdef USE_HA_THREADS
2382
if (pthread_mutex_lock(Mutex) != 0)
2384
log_err(-1,"mutex_lock","ALERT: cannot lock mutex!\n");
2388
#endif /* ifdef USE_HA_THREADS */
2391
} /* END mutex_lock() */
2398
mutex_t *Mutex) /* I */
2401
#ifdef USE_HA_THREADS
2402
if (pthread_mutex_unlock(Mutex) != 0)
2404
log_err(-1,"mutex_unlock","ALERT: cannot unlock mutex!\n");
2408
#endif /* ifdef USE_HA_THREADS */
2411
} /* END mutex_unlock() */
2417
#ifdef USE_HA_THREADS
2419
* * lock_out_ha - lock out using moab style high availability
2421
static void lock_out_ha()
2424
bool_t UseFLock = TRUE;
2425
bool_t FilePossession = FALSE;
2426
bool_t FileIsMissing = FALSE;
2428
char MutexLockFile[MAX_NAME];
2429
char id[] = "lock_out_ha";
2431
int MutexLockFD = -1;
2434
struct stat StatBuf;
2436
snprintf(MutexLockFile,sizeof(MutexLockFile),"%s.mutex",
2439
time_now = time(NULL);
2441
while (!FilePossession)
2445
usleep(DEF_USPERSECOND * HALockCheckTime);
2447
time_now = time(NULL);
2450
if (MutexLockFD > 0)
2456
if (is_ha_lock_file_valid(HALockFile) == FALSE)
2461
if (UseFLock == TRUE)
2463
/* try to get a filesystem lock on the "mutex" file */
2465
while (acquire_file_lock(MutexLockFile,&MutexLockFD,"HA") == FAILURE)
2467
strcpy(log_buffer,"Could not acquire HA flock--trying again in 1 second\n");
2469
usleep(DEF_USPERSECOND);
2473
/* check if file lock exists */
2475
if (stat(HALockFile,&StatBuf) == 0)
2477
/* file DOES exist--check time */
2479
FileIsMissing = FALSE;
2481
if ((time_now - StatBuf.st_mtime) < HALockCheckTime)
2483
/* someone else probably has the lock */
2488
/* update the file to mark it as ours */
2490
utime(HALockFile,NULL);
2492
FilePossession = TRUE;
2496
/* file doesn't exist--wait required amount of time and check again */
2498
if (FileIsMissing == FALSE)
2500
FileIsMissing = TRUE;
2502
/* if we don't have a mutex to protect file creation
2503
* race conditions, we need to wait and check again:
2504
* otherwise we can safely create it immediately */
2506
if (UseFLock == FALSE)
2510
/* this is not the first time the file has been missing--we are
2511
* probably safe to create it */
2513
HALockFD = open(HALockFile,O_CREAT|O_EXCL|O_RDONLY,0600);
2517
sprintf(log_buffer,"could not create HA lock file '%s'--errno %d:%s",
2525
FilePossession = TRUE;
2528
if (FilePossession == TRUE)
2530
/* start heartbeat thread */
2532
start_update_ha_lock_thread();
2535
if (UseFLock == TRUE)
2536
close(MutexLockFD); /* unlock file mutex */
2537
} /* END while (!FilePossession) */
2539
/* we have the file lock--go ahead and log this fact */
2543
PBS_EVENTCLASS_SERVER,
2545
"high availability file lock obtained");
2546
} /* END lock_out_ha() */
2547
#endif /* USE_HA_THREADS */
2553
* daemonize_server()
2554
* figures out, based on the mode, whether or not to run in the background and does so
2556
* @param DoBackground - (I) indicates whether or not we should run in the background
2557
* @param sid - (O) set to the correct pid
2558
* @return success unless we could not run in the background and we're supposed to
2560
static int daemonize_server(
2562
int DoBackground, /* I */
2569
char id[] = "daemonize_server";
2573
/* handle foreground (i.e. debug mode) */
2577
setvbuf(stdout,NULL,_IOLBF,0);
2578
setvbuf(stderr,NULL,_IOLBF,0);
2583
/* run pbs_server in the background */
2585
/* fork to disconnect from terminal */
2587
if ((pid = fork()) == -1)
2589
log_err(errno,id,"cannot fork into background");
2597
/* exit if parent */
2601
PBS_EVENTCLASS_SERVER,
2603
"INFO: parent is exiting");
2608
/* NOTE: setsid() disconnects from controlling-terminal */
2610
if ((*sid = setsid()) == -1)
2612
log_err(errno,id,"Could not disconnect from controlling terminal");
2617
/* disconnect stdin,stdout,stderr */
2623
dummyfile = fopen("/dev/null","r");
2624
assert((dummyfile != 0) && (fileno(dummyfile) == 0));
2626
dummyfile = fopen("/dev/null","w");
2627
assert((dummyfile != 0) && (fileno(dummyfile) == 1));
2629
dummyfile = fopen("/dev/null","w");
2630
assert((dummyfile != 0) && (fileno(dummyfile) == 2));
2632
if ((pid = fork()) == -1)
2634
log_err(errno,id,"cannot fork into background");
2641
/* exit if parent */
2645
PBS_EVENTCLASS_SERVER,
2647
"INFO: parent is exiting");
2652
/* update the sid (pid written to the lock file) so that
2653
* the correct pid is present */
2658
PBS_EVENTCLASS_SERVER,
2660
"INFO: child process in background");
2663
} /* END daemonize_server() */
2667
#ifndef USE_HA_THREADS
2669
* lock_out - lock out other daemons from this directory.
2671
static void lock_out(
2674
int op) /* F_WRLCK or F_UNLCK */
2677
if (try_lock_out(fds,op))
2679
strcpy(log_buffer,"pbs_server: another server running\n");
2682
log_err(errno,msg_daemonname,log_buffer);
2684
fprintf(stderr,"%s", log_buffer);
2693
* * @return Zero on success, one on failure
1769
2695
static int try_lock_out(
1772
int op) /* F_WRLCK or F_UNLCK */
2698
int op) /* F_WRLCK or F_UNLCK */
1776
2701
struct flock flock;
1778
2703
flock.l_type = op;
1779
2704
flock.l_whence = SEEK_SET;
1780
2705
flock.l_start = 0;
1781
2706
flock.l_len = 0;
1783
return(fcntl(fds, F_SETLK, &flock) != 0);
1788
* lock_out - lock out other daemons from this directory.
1790
static void lock_out(
1793
int op) /* F_WRLCK or F_UNLCK */
1796
if (try_lock_out(fds, op))
1798
strcpy(log_buffer, "pbs_server: another server running\n");
1800
log_err(errno, msg_daemonname, log_buffer);
1802
fprintf(stderr, "%s", log_buffer);
2708
return(fcntl(fds,F_SETLK,&flock) != 0);
2710
#endif /* !USE_HA_THREADS */
2716
* gets attributes for the specified file/directory
2718
* @param FileName (I)
2719
* @param ModifyTime (O) [optional]
2720
* @param FileSize (O) [optional]
2721
* @param IsExe (O) [optional]
2722
* @param IsDir (O) [optional]
2727
char *FileName, /* I */
2728
unsigned long *ModifyTime, /* O (optional */
2729
long *FileSize, /* O (optional */
2730
bool_t *IsExe, /* O (optional */
2731
bool_t *IsDir) /* O (optional */
2736
char *id = "get_file_info";
2743
if (ModifyTime != NULL)
2746
if (FileSize != NULL)
2752
if ((FileName == NULL) || (FileName[0] == '\0'))
2757
/* FORMAT: <FILENAME>[ <ARG>]... */
2759
/* NOTE: mask off, then restore possible args */
2760
ptr = strchr(FileName,' ');
2765
rc = stat(FileName,&sbuf);
2769
sprintf(log_buffer,"INFO: cannot stat file '%s', errno: %d (%s)\n",
2774
log_err(errno,id,log_buffer);
2779
if (ModifyTime != NULL)
2781
*ModifyTime = (unsigned long)sbuf.st_mtime;
2784
if (FileSize != NULL)
2786
*FileSize = (long)sbuf.st_size;
2791
if (sbuf.st_mode & S_IXUSR)
2799
if (sbuf.st_mode & S_IFDIR)
2806
} /* end get_file_info() */
2812
* gets the full path for command
2814
* @return SUCCESS if the path is found, FAILURE otherwise
2820
char *GoodCmd, /* O */
2821
int GoodCmdLen) /* O */
2824
char *TokPtr = NULL;
2825
char *Delims = ":;"; /* windows and unix path deliminators */
2827
char tmpPath[MAX_LINE];
2828
bool_t IsExe = FALSE;
2829
bool_t IsDir = FALSE;
2833
/* absolute path specified */
2835
if (get_file_info(Cmd,NULL,NULL,&IsExe,&IsDir) == FAILURE)
2840
if ((IsExe == FALSE) && (IsDir == FALSE))
2845
snprintf(GoodCmd,GoodCmdLen,"%s",Cmd);
2850
PathLocation = strtok_r(OriginalPath,Delims,&TokPtr);
2852
while (PathLocation != NULL)
2854
if (strlen(PathLocation) <= 0)
2856
PathLocation = strtok_r(NULL,Delims,&TokPtr);
2861
if (PathLocation[strlen(PathLocation) - 1] == '/')
2863
sprintf(tmpPath,"%s%s",
2869
sprintf(tmpPath,"%s/%s",
2874
if (get_file_info(tmpPath,NULL,NULL,&IsExe,NULL) == FAILURE)
2876
PathLocation = strtok_r(NULL,Delims,&TokPtr);
2883
PathLocation = strtok_r(NULL,Delims,&TokPtr);
2888
snprintf(GoodCmd,GoodCmdLen,"%s",tmpPath);
2891
} /* END while (PathLocation != NULL) */
2894
} /* END get_full_path() */
2900
* * Restarts the pbs_server
2908
char FullCmd[MAX_LINE];
2909
char *id = "svr_restart";
2914
sizeof(FullCmd)) == FAILURE)
2916
sprintf(log_buffer,"ALERT: cannot locate full path for '%s'\n",
2919
log_err(-1,id,log_buffer);
2924
/* shut down network connections and rpp */
2926
RPPConfigure(1,0); /* help rpp_shutdown go a bit faster */
2929
net_close(-1); /* close all network connections */
2931
/* copying FullCmd to ArV[0] is necessary for multiple restarts because
2932
* the path changes when we run pbs_server in the background. */
2934
if (strcmp(FullCmd,ArgV[0]) != 0)
2938
ArgV[0] = malloc(sizeof(char) * (strlen(FullCmd) + 1));
2940
if (ArgV[0] == NULL)
2942
/* could not malloc */
2944
log_err(errno,id,"ERROR: cannot allocate memory for full command, cannot restart\n");
2949
strcpy(ArgV[0],FullCmd);
2952
sprintf(log_buffer,"INFO: about to exec '%s'\n",ArgV[0]);
2956
PBS_EVENTCLASS_SERVER,
2962
if ((rc = execv(FullCmd,ArgV)) == -1)
2974
} /* END svr_restart() */
2980
* restores this attribute to its default where supported/possible
2983
void restore_attr_default(
2985
struct attribute *attr) /* I */
2990
index = (int)(attr - server.sv_attr);
2992
attr->at_flags &= ~ATR_VFLAG_SET;
2996
case SRV_ATR_log_events:
2998
server.sv_attr[(int)SRV_ATR_log_events].at_val.at_long = PBSEVENT_MASK;
3002
case SRV_ATR_tcp_timeout:
3004
server.sv_attr[(int)SRV_ATR_tcp_timeout].at_val.at_long = PBS_TCPTIMEOUT;
3008
case SRV_ATR_JobStatRate:
3010
server.sv_attr[(int)SRV_ATR_JobStatRate].at_val.at_long = PBS_RESTAT_JOB;
3014
case SRV_ATR_PollJobs:
3016
server.sv_attr[(int)SRV_ATR_PollJobs].at_val.at_long = PBS_POLLJOBS;
3020
case SRV_ATR_LogLevel:
3022
server.sv_attr[(int)SRV_ATR_LogLevel].at_val.at_long = 0;
3028
/* should never get here, but if we do then reset the flags so the user knows
3029
* that the value hasn't been cleared */
3031
attr->at_flags |= ATR_VFLAG_SET;
3036
} /* END restore_attr_default() */
1808
3039
/* END pbsd_main.c */