1
/*___INFO__MARK_BEGIN__*/
2
/*************************************************************************
4
* The Contents of this file are made available subject to the terms of
5
* the Sun Industry Standards Source License Version 1.2
7
* Sun Microsystems Inc., March, 2001
10
* Sun Industry Standards Source License Version 1.2
11
* =================================================
12
* The contents of this file are subject to the Sun Industry Standards
13
* Source License Version 1.2 (the "License"); You may not use this file
14
* except in compliance with the License. You may obtain a copy of the
15
* License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
17
* Software provided under this License is provided on an "AS IS" basis,
18
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
19
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
20
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
21
* See the License for the specific provisions governing your rights and
22
* obligations concerning the Software.
24
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
26
* Copyright: 2001 by Sun Microsystems, Inc.
28
* All Rights Reserved.
30
************************************************************************/
31
/*___INFO__MARK_END__*/
34
#include <sys/types.h>
35
#include <sys/socket.h>
36
#include <netinet/in.h>
37
#include <arpa/inet.h>
39
#include <netinet/tcp.h>
42
#include <sys/types.h>
44
#include <sys/resource.h>
51
#include "uti/sge_stdio.h"
52
#include "uti/sge_uidgid.h"
54
#include "basis_types.h"
56
#include "config_file.h"
57
#include "err_trace.h"
58
#include "qlogin_starter.h"
60
#include "msg_common.h"
66
/****** qrsh_starter/delete_qrsh_pid_file() ***********************************
*  NAME
*     delete_qrsh_pid_file() -- delete the pid file created by qrsh_starter
*
*  SYNOPSIS
*     int delete_qrsh_pid_file()
*
*  FUNCTION
*     Looks up the path of the pid file in the configuration entry
*     "qrsh_pid_file" and removes that file.
*
*  RESULT
*     1, if the file could be deleted
*     0, if an error occurred. Possible error situations are:
*        - the configuration entry "qrsh_pid_file" cannot be read
*        - the file cannot be deleted
*******************************************************************************/
int delete_qrsh_pid_file()
{
   char *pid_file_name = search_conf_val("qrsh_pid_file");

   if (pid_file_name == NULL) {
      /*
       * Trace the name of the entry that was looked up. The looked-up
       * value is NULL in this branch and must not be passed to %s.
       */
      shepherd_trace("cannot get variable %s", "qrsh_pid_file");
      return 0;
   }

   if (unlink(pid_file_name) != 0) {
      shepherd_trace("cannot delete qrsh pid file %s", pid_file_name);
      return 0;
   }

   return 1;
}
106
/****** shepherd/qrsh/write_to_qrsh() *****************************************
*  NAME
*     write_to_qrsh -- send a string to the qrsh process over a TCP socket
*
*  SYNOPSIS
*     int write_to_qrsh(const char *data);
*
*  FUNCTION
*     Writes the contents of <data> to another (remote) process over
*     a socket connection.
*     Host and port of the communication partner are read from the
*     configuration entry "qrsh_control_port" (format "host:port").
*     A socket client connection is opened to the named host and port,
*     and the data is written, including its terminating null byte.
*
*  INPUTS
*     data - null terminated string with data to write
*
*  RESULT
*     0, if function finishes correctly
*     1, if the config entry qrsh_control_port does not exist
*     2, if qrsh_control_port contains illegal data
*     3, if opening the socket failed
*     4, if the hostname cannot be resolved
*     5, if connecting to the socket fails
*     6, if writing the data fails
*
*  NOTES
*     The configuration value is only read; the host part is copied into a
*     local buffer. The socket is closed on every path that opened it.
******************************************************************************/
int write_to_qrsh(const char *data)
{
   enum { HOST_BUFFER_SIZE = 256 };

   char host[HOST_BUFFER_SIZE];
   char *address = NULL;
   char *colon = NULL;
   const char *cursor = NULL;
   size_t host_len = 0;
   size_t remaining = 0;
   int port = 0;
   int sock = -1;
   struct sockaddr_in server;
   struct hostent *hp = NULL;

   shepherd_trace("write_to_qrsh - data = %s", data);

   /* read destination host and port from config */
   address = get_conf_val("qrsh_control_port");
   if (address == NULL) {
      shepherd_trace("config does not contain entry for qrsh_control_port");
      return 1;
   }

   shepherd_trace("write_to_qrsh - address = %s", address);

   /* split "host:port"; reject empty/oversized host and unusable ports */
   colon = strchr(address, ':');
   if (colon != NULL) {
      host_len = (size_t)(colon - address);
   }
   if (colon == NULL || host_len == 0 || host_len >= sizeof(host) ||
       sscanf(colon + 1, "%d", &port) != 1 || port <= 0 || port > 65535) {
      shepherd_trace("illegal value for qrsh_control_port: \"%s\". "
                     "Should be host:port", address);
      return 2;
   }
   memcpy(host, address, host_len);
   host[host_len] = '\0';

   shepherd_trace("write_to_qrsh - host = %s, port = %d", host, port);

   /* create socket */
   sock = socket(AF_INET, SOCK_STREAM, 0);
   if (sock == -1) {
      shepherd_trace("error opening stream socket: %s", strerror(errno));
      return 3;
   }

   /*
    * gethostbyname returns a structure including the network address
    * of the specified host. Only an IPv4 address of the expected size
    * may be copied into sin_addr.
    */
   hp = gethostbyname(host);
   if (hp == NULL || hp->h_addrtype != AF_INET ||
       hp->h_addr_list[0] == NULL ||
       (size_t)hp->h_length != sizeof(server.sin_addr)) {
      shepherd_trace("%s: unknown host", host);
      close(sock);
      return 4;
   }

   /* zero the whole address first so that no field is left indeterminate */
   memset(&server, 0, sizeof(server));
   server.sin_family = AF_INET;
   memcpy(&server.sin_addr, hp->h_addr_list[0], sizeof(server.sin_addr));
   server.sin_port = htons((unsigned short)port);

   if (connect(sock, (struct sockaddr *) &server, sizeof server) == -1) {
      shepherd_trace("error connecting stream socket: %s", strerror(errno));
      close(sock);
      return 5;
   }

   /*
    * write data including the terminating null byte; write() may transfer
    * fewer bytes than requested or be interrupted, so loop until done
    */
   cursor = data;
   remaining = strlen(data) + 1;
   while (remaining > 0) {
      ssize_t written = write(sock, cursor, remaining);

      if (written < 0 && errno == EINTR) {
         continue;
      }
      if (written <= 0) {
         shepherd_trace("error writing data to qrsh_control_port");
         close(sock);
         return 6;
      }
      cursor += written;
      remaining -= (size_t)written;
   }

   /* close connection */
   close(sock);
   return 0;
}
212
/****** shepherd/qrsh/write_exit_code_to_qrsh() *******************************
*  NAME
*     write_exit_code_to_qrsh -- write an exit code to qrsh
*
*  SYNOPSIS
*     void write_exit_code_to_qrsh(int exit_code)
*
*  FUNCTION
*     Formats <exit_code> as a decimal string and sends it to the
*     corresponding qrsh process via write_to_qrsh().
*     If sending fails, the failure is traced; no error is reported to
*     the caller.
*
*  INPUTS
*     exit_code - status of the calling process
*
*  NOTES
*     The function sends exactly the value it is given. It does not itself
*     check the "rsh_daemon" configuration value or read the qrsh_exit_code
*     file; see get_exit_code_of_qrsh_starter() for the latter.
*
*  SEE ALSO
*     shepherd/qrsh/write_to_qrsh()
******************************************************************************/
void write_exit_code_to_qrsh(int exit_code)
{
   /* large enough for any int in decimal, including sign and terminator */
   char buffer[32];

   shepherd_trace("write_exit_code_to_qrsh(%d)", exit_code);

   /* write exit code as string number to qrsh */
   snprintf(buffer, sizeof(buffer), "%d", exit_code);
   if (write_to_qrsh(buffer) != 0) {
      shepherd_trace("writing exit code to qrsh failed");
   }
}
250
/****** shepherd/qrsh/get_exit_code_of_qrsh_starter() *************************
252
* get_exit_code_of_qrsh_starter -- short description
255
* #include "qlogin_starter.h"
256
* int get_exit_code_of_qrsh_starter(int* exit_code);
259
* Reads the exit code from a process started via qrsh - qrsh_starter
260
* from a file in the jobs TMPDIR.
263
* exit_code - exit code of qrsh_starter
267
* 1, if an error occured while trying to get the exit code
268
******************************************************************************/
269
int get_exit_code_of_qrsh_starter(int* exit_code)
277
/* rshd exited with OK: try to get returncode from qrsh_starter file */
279
/* we only have an error file in TMPDIR in case of rsh,
280
* otherwise pass exit_code */
281
if (search_conf_val("rsh_daemon") != NULL) {
286
tmpdir = search_conf_val("qrsh_tmpdir");
287
taskid = search_conf_val("pe_task_id");
288
shepherd_trace("get_exit_code_of_qrsh_starter - TMPDIR = %s, pe_task_id = %s",
289
tmpdir ? tmpdir : "0", taskid ? taskid : "0");
290
if (tmpdir != NULL) {
291
if (taskid != NULL) {
292
sprintf(buffer, "%s/qrsh_exit_code.%s", tmpdir, taskid);
294
sprintf(buffer, "%s/qrsh_exit_code", tmpdir);
297
errorfile = fopen(buffer, "r");
298
if (errorfile != NULL) {
300
if (fscanf(errorfile, "%d", exit_code) == 1) {
301
shepherd_trace("error code from remote command is %d", *exit_code);
304
if (unlink(buffer) != 0) {
305
shepherd_trace("can't delete %s", buffer);
308
shepherd_trace("can't open file %s: %s", buffer, strerror(errno));
311
shepherd_trace("unable to get qrsh_tmpdir");
316
shepherd_trace(MSG_FILE_NOCLOSE_SS, buffer, strerror(errno));
320
/****** shepherd/qrsh/get_error_of_qrsh_starter() *************************
322
* get_error_of_qrsh_starter -- get error message from qrsh_starter
325
* #include "qlogin_starter.h"
327
* get_error_of_qrsh_starter(void);
330
* Reads an error message that qrsh_starter may have written to the
331
* qrsh jobs tmpdir due to an error in the startup phase of the qrsh job.
334
* the error message from qrsh_starter or
335
* NULL, if no error was generated (the job started up without problems)
338
* The returned string is dynamically allocated. It is in the responsibility
339
* of the caller to free it.
340
******************************************************************************/
341
const char *get_error_of_qrsh_starter(void)
343
char buffer[SGE_PATH_MAX];
348
/* rshd exited with OK: try to get error messages from qrsh_starter file */
349
shepherd_trace("get_error_of_qrsh_starter()");
351
/* we only have an error file in TMPDIR in case of rsh */
352
if (search_conf_val("rsh_daemon") != NULL) {
357
tmpdir = search_conf_val("qrsh_tmpdir");
358
taskid = search_conf_val("qrsh_task_id");
359
shepherd_trace("get_error_of_qrsh_starter - TMPDIR = %s, qrsh_task_id = %s",
360
tmpdir ? tmpdir : "0", taskid ? taskid : "0");
361
if (tmpdir != NULL) {
362
if (taskid != NULL) {
363
sprintf(buffer, "%s/qrsh_error.%s", tmpdir, taskid);
365
sprintf(buffer, "%s/qrsh_error", tmpdir);
368
errorfile = fopen(buffer, "r");
369
if (errorfile != NULL) {
370
char buffer[MAX_STRING_SIZE];
372
if (fgets(buffer, MAX_STRING_SIZE, errorfile) != NULL) {
373
shepherd_trace("error string from qrsh_starter is %s", buffer);
374
ret = strdup(buffer);
377
if (unlink(buffer) != 0) {
378
shepherd_trace("can't delete %s", buffer);
385
shepherd_trace(MSG_FILE_NOCLOSE_SS, buffer, strerror(errno));
390
/****** shepherd/qrsh/qlogin_starter() ****************************************
393
* qlogin_starter -- start a protocol daemon (telnetd, rshd, rlogind) inetd-style
396
* #include "qlogin_starter.h"
397
* int qlogin_starter(const char *cwd, char *daemon, char** env);
400
* The function is called from shepherd to start a protocol daemon
401
* like telnetd, rshd or rlogind.
402
* The mechanism used to call these daemons is that of inetd:
403
* - a socket is created (server side, any free port is assigned
404
* by the operating system)
405
* - qlogin_starter waits for someone to connect to this socket
406
* - the socket file handles are redirected to stdin, stdout
408
* - the daemon process is started
409
* Additionally to the inetd mechanism, the port number and some
410
* other information is sent to the qrsh process that initiated
411
* (over qmaster, schedd, execd, shepherd) the qlogin_starter call.
414
* cwd - the current working directory (the active_jobs directory)
415
* daemon - name and path of the daemon to start
418
* on success, the function will not return (it exec's)
419
* 4, if there is a problem with permissions
420
* 5, if a socket cannot be allocated
421
* 6, if a socket bind fails
422
* 7, if socket name (port) cannot be determined
423
* 8, if environment (to be passed to qrsh) cannot be read
424
* 9, if sending information to qrsh fails
425
* 10, if nobody connects to the socket within one minute
426
* 11, if accepting a connecting client fails
427
* 12, if the execution of the daemon fails
428
******************************************************************************/
429
int qlogin_starter(const char *cwd, char *daemon, char** env)
439
struct sockaddr_in serv_addr;
440
struct timeval timeout;
442
char *args[20]; /* JG: TODO: should be dynamically allocated */
444
const char *sge_root = NULL;
445
const char *arch = NULL;
447
#if defined(IRIX65) || defined(INTERIX) || defined(DARWIN6) || defined(ALPHA5) || defined(HP1164)
455
len = sizeof(serv_addr);
457
/* must be root because we must access /dev/something */
458
if( setgid(SGE_SUPERUSER_GID) ||
459
setuid(SGE_SUPERUSER_UID) ||
460
setegid(SGE_SUPERUSER_GID) ||
461
seteuid(SGE_SUPERUSER_UID)) {
462
shepherd_trace("cannot change uid/gid\n");
465
shepherd_trace("uid = "uid_t_fmt", euid = "uid_t_fmt", gid = "gid_t_fmt
466
", egid = "gid_t_fmt, getuid(), geteuid(), getgid(), getegid());
468
/* socket stuff from here */
469
sockfd = socket(AF_INET, SOCK_STREAM, 0);
472
shepherd_trace("cannot open socket.");
475
shepherd_trace("using sfd %d", sockfd);
477
setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *) &on, sizeof(on));
479
/* bind an address to any socket */
480
memset((char *) &serv_addr, 0, sizeof(serv_addr));
481
serv_addr.sin_port = 0;
482
serv_addr.sin_family = AF_INET;
483
serv_addr.sin_addr.s_addr = INADDR_ANY;
484
ret = bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr));
486
shepherd_trace("cannot bind socket: %s", strerror(errno));
492
/* find out assigned port number and pass it to caller */
493
length = sizeof(serv_addr);
494
if (getsockname(sockfd,(struct sockaddr *) &serv_addr, &length) == -1) {
495
shepherd_trace("getting socket name failed: %s", strerror(errno));
501
/* listen on socket - allow connections to be accepted */
502
if (listen(sockfd, 1) != 0) {
503
shepherd_trace("listen failed: %s", strerror(errno));
509
/* send necessary info to qrsh: port + utilbin directory + active job
512
port = ntohs(serv_addr.sin_port);
513
shepherd_trace("bound to port %d", port);
515
sge_root = sge_get_root_dir(0, NULL, 0, 1);
516
arch = sge_get_arch();
518
if (sge_root == NULL || arch == NULL) {
519
shepherd_trace("reading environment SGE_ROOT and ARC failed");
525
snprintf(buffer, 2048, "0:%d:%s/utilbin/%s:%s:%s",
526
port, sge_root, arch, cwd, get_conf_val("host"));
528
if (write_to_qrsh(buffer) != 0) {
529
shepherd_trace("communication with qrsh failed");
535
/* wait for connection */
536
shepherd_trace("waiting for connection.");
537
/* use a reasonable timeout (60 seconds) to prevent hanging here forever */
539
FD_SET(sockfd, &fds);
542
if (select(sockfd+1, &fds, NULL, NULL, &timeout) < 1) {
543
shepherd_trace("nobody connected to the socket");
549
/* accept connection */
550
newsfd = accept(sockfd, (struct sockaddr *)(&serv_addr), &len);
552
shepherd_trace("error when accepting socket conection");
557
shepherd_trace("accepted connection on fd %d", newsfd);
559
/* now we have a connection and do no longer need the "well known" port
560
* free this resource.
565
/* don't close on exec */
566
fcntl( newsfd, F_SETFD, 0 );
569
setsockopt(newsfd, IPPROTO_TCP, TCP_NODELAY, (const char *) &sso, sizeof(int));
571
/* use this fd as stdin,out,err */
576
/* close all the rest */
577
for (fd=3; fd<FD_SETSIZE; fd++)
580
shepherd_trace("daemon to start: |%s|", daemon);
582
/* split daemon commandline into single arguments */
583
/* JG: TODO: might contain quoted arguments containing spaces
584
* make function to split or use an already existing one
586
args[argc++] = strtok(daemon, " ");
587
while ((args[argc++] = strtok(NULL, " ")) != NULL);
591
shepherd_trace("daemon commandline split to %d arguments", argc);
592
while (args[i] != NULL) {
593
shepherd_trace("daemon argv[%d] = |%s|", i, args[i]);
600
execve(args[0], args, env);
602
/* oh oh, exec failed */
603
/* no way to tell anyone, because all FDs are closed */
604
/* last chance -> tell parent process */