1
/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/parallel.c,v 1.22 2005-02-22 18:47:02 manoj Exp $ */
12
#include <sys/socket.h>
14
#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \
15
defined(AIX) || defined(NEXT) || defined(LINUX)
25
extern char *getenv();
26
#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX) || \
27
defined(KSR) || defined(DECOSF)
28
extern void *malloc();
33
#if !(defined(SGI) || defined(LINUX))
34
extern char *strdup();
37
extern void NextValueServer();
39
extern int WaitAll(long nchild);
41
#if (defined(SUN) && !defined(SOLARIS))
42
extern char *sprintf();
46
static char *ProcgrpFile(argc, argv)
50
Find the name of the procgrp file from
52
1) the first argument on the command line with .p appended
53
2) as 1) but also prepending $HOME/pdir/
54
3) the translation of the environment variable PROCGRP
55
4) the file PROCGRP in the current directory
63
len = strlen(argv[1]);
64
tmp = malloc((unsigned) (len+3) );
65
(void) strcpy(tmp, argv[1]);
66
(void) strcpy(tmp+len, ".p");
68
if (stat(tmp, &buf) == 0) /* try ./arg1.p */
73
if ( (home = getenv("HOME")) != (char *) NULL ) {
74
tmp = malloc((unsigned) (strlen(home) + len + 9));
75
(void) strcpy(tmp, home);
76
(void) strcpy(tmp+strlen(home),"/pdir/");
77
(void) strcpy(tmp+strlen(home)+6,argv[1]);
78
(void) strcpy(tmp+strlen(home)+6+len,".p");
80
(void) printf("tmp = %s\n",tmp);
82
if (stat(tmp, &buf) == 0) /* try $HOME/pdir/arg1.p */
89
if ( (tmp = getenv("PROCGRP")) != (char *) NULL )
90
if (stat(tmp, &buf) == 0)
93
return strdup("PROCGRP");
96
static void SkipPastEOL(fp)
99
Read past first newline character
104
while ( (char) (test = getc(fp)) != '\n')
109
static char *GetProcgrp(filename, len_procgrp)
113
Read the entire contents of the PROCGRP into a NULL terminated
114
character string. Be lazy and read the file twice, first to
115
count the number of characters (ftell cannot be believed?).
122
if ( (file = fopen(filename,"r")) == (FILE *) NULL ) {
123
(void) fprintf(stderr,"Master: PROCGRP = %s\n",filename);
124
Error("Master: failed to open PROCGRP", (long) 0);
128
while ( (status = getc(file)) != EOF) {
137
if ( (tmp = procgrp = malloc((unsigned) *len_procgrp)) == (char *) NULL )
138
Error("GetProcgrp: failed in malloc", (long) *len_procgrp);
140
(void) fseek(file, 0L, (int) 0); /* Seek to beginning of file */
142
while ( (status = getc(file)) != EOF) {
146
*tmp++ = (char) status;
151
if ( (int) (tmp - procgrp + 1) != *len_procgrp )
152
Error("GetProcgrp: screwup dimensioning procgrp", (long) *len_procgrp);
159
char *Canonical(name)
162
Use gethostbyname and return the canonicalized name.
165
struct hostent *host;
167
if ( (host = gethostbyname(name)) != (struct hostent *) NULL )
168
return strdup(host->h_name);
170
return (char *) NULL;
173
static long RemoteCreate(remote_hostname, remote_username,
174
remote_executable, argc, argv,
175
n_clus, n_proc, clus_id, proc_id)
176
char *remote_hostname;
177
char *remote_username;
178
char *remote_executable;
186
Using rsh create a process on remote_hostname running the
187
executable in the remote file remote_executable. Through
188
arguments pass it my hostname and the port number of a socket
189
to connect on. Also propagate the arguments which this program
192
Listen for a connection to be established. The return value of
193
RemoteCreate is the filedescriptor of the socket connecting the
196
Rsh should ensure that the standard output of the remote
197
process is connected to the local standard output and that
198
local interrupts are propagated to the remote process.
201
char local_hostname[256], c_port[8];
202
char c_n_clus[8], c_n_proc[8], c_clus_id[8], c_proc_id[8];
204
int sock, port, i, pid;
207
/* Create and bind socket to wild card internet name */
209
CreateSocketAndBind(&sock, &port);
211
/* create remote process using rsh passing master hostname and
214
if (gethostname(local_hostname, 256) || strlen(local_hostname) == 0)
215
Error("RemoteCreate: gethostname failed", (long) 0);
217
(void) sprintf(c_port, "%d", port);
218
(void) sprintf(c_n_clus, "%ld", n_clus);
219
(void) sprintf(c_n_proc, "%ld", n_proc);
220
(void) sprintf(c_clus_id, "%ld", clus_id);
221
(void) sprintf(c_proc_id, "%ld", proc_id);
223
(void) printf(" Creating: host=%s, user=%s,\n\
225
remote_hostname, remote_username, remote_executable,
230
/* In child process */
232
sleep(1); /* So that parallel can make the sockets */
235
if (proc_id != 0) /* Close all unneeded files */
236
(void) fclose(stdin);
245
/* Overlay the desired executable */
247
if (strcmp(remote_hostname, local_hostname) != 0) {
249
argv2[1 ] = remote_hostname;
251
argv2[3 ] = remote_username;
253
argv2[5 ] = remote_executable;
255
for (i=2; i<argc; i++)
256
argv2[i+5] = argv[i];
257
argv2[argc+5 ] = "-master";
258
argv2[argc+6 ] = local_hostname;
259
argv2[argc+7 ] = c_port;
260
argv2[argc+8 ] = c_n_clus;
261
argv2[argc+9 ] = c_n_proc;
262
argv2[argc+10] = c_clus_id;
263
argv2[argc+11] = c_proc_id;
264
argv2[argc+12] = (char *) NULL;
266
if ( (tmp = getenv("TCGRSH")) != (char *) NULL )
267
(void) execv(tmp,argv2);
270
(void) execv("/usr/bsd/rsh",argv2);
273
(void) execv("/usr/bin/remsh",argv2);
276
(void) execv("/usr/bin/rsh",argv2);
278
#if !defined(SGI) && !defined(HPUX) && !defined(LINUX)
279
(void) execv("/usr/ucb/rsh",argv2);
283
argv2[0 ] = remote_executable;
284
for (i=1; i<(argc-1); i++) /* Don't copy the .p file name over */
285
argv2[i] = argv[i+1];
286
argv2[i+0] = "-master";
287
argv2[i+1] = Canonical(local_hostname);
289
argv2[i+3] = c_n_clus;
290
argv2[i+4] = c_n_proc;
291
argv2[i+5] = c_clus_id;
292
argv2[i+6] = c_proc_id;
293
argv2[i+7] = (char *) NULL;
295
(void) execv(remote_executable, argv2);
298
Error("RemoteCreate: in child after execv", (long) -1);
301
SR_pids[SR_numchild++] = pid;
303
Error("RemoteCreate: failed forking process", (long) pid);
305
/* accept one connection */
307
return ListenAndAccept(sock);
314
This is the master process of the cluster network.
316
a) read the procgrp file. This is found by trying in turn:
318
1) the first argument on the command line with .p appended
319
2) as 1) but also prepending $HOME/pdir/
3) the translation of the environment variable PROCGRP
320
4) the file PROCGRP in the current directory
322
b) create the remote processes specified in this file, connecting
323
to them via sockets and pass them the entire contents of the
324
PROCGRP file in ascii
326
c) Navigate messages to establish connections between the remote
329
d) wait for all the children to finish and exit with the appropriate
333
char hostname[256]; /* Me */
334
char *filename; /* The name of PROCGRP file */
335
char *procgrp; /* The contents of PROCGRP */
336
long len_procgrp; /* The length of PROCGRP */
337
long i, j, node, type, lenbuf, status=0, sync=1;
339
/* Initialize all the globals */
343
/* Set up handler for SIGINT and SIGCHLD */
349
/* on Solaris parallel gets SIGSEGV interrupted while polling in NxtVal */
354
/* Generate a name for the PROCGRP file */
356
filename = ProcgrpFile(argc, argv);
358
(void) printf("PROCGRP = %s\n",filename);
360
/* Read in the entire contents of the PROCGRP file */
362
procgrp = GetProcgrp(filename, &len_procgrp);
364
/* Parse the procgrp info filling in the ClusterInfo structure and
365
computing the number of clusters */
367
if (gethostname(hostname, sizeof hostname) || strlen(hostname) == 0)
368
Error("parallel: gethostname failed?", (long) sizeof hostname);
370
InitClusInfo(procgrp, hostname);
375
/* I am the master process so I have the highest ids */
377
SR_proc_id = SR_n_proc;
379
/* Now create the remote cluster master processes */
381
for (i=0; i<SR_n_clus; i++) {
382
node = SR_clus_info[i].masterid;
383
SR_proc_info[node].sock = RemoteCreate(SR_clus_info[i].hostname,
384
SR_clus_info[i].user,
385
SR_clus_info[i].image,
391
type = TYPE_BEGIN | MSGINT;
392
lenbuf = sizeof(long);
393
SND_(&type, (char *) &len_procgrp, &lenbuf, &node, &sync);
394
type = TYPE_BEGIN | MSGCHR;
395
SND_(&type, procgrp, &len_procgrp, &node, &sync);
398
/* Now have to route messages between the cluster masters as they connect */
400
for (i=1; i< SR_n_clus; i++)
401
for (j=0; j < i; j++)
402
RemoteConnect(SR_clus_info[i].masterid,
403
SR_clus_info[j].masterid,
406
/* Now for the next value service I need to connect to everyone else */
408
for (i=0; i < SR_n_clus; i++)
409
for (j=1; j<SR_clus_info[i].nslave; j++)
410
RemoteConnect(SR_proc_id,
411
SR_clus_info[i].masterid+j,
412
SR_clus_info[i].masterid);
414
/* Since we are only using sockets we can block in select when waiting for a message */
417
for (i=0; i<(SR_n_proc+1); i++) {
418
if (SR_proc_info[i].sock >= 0) {
419
SR_socks[SR_nsock] = SR_proc_info[i].sock;
420
SR_socks_proc[SR_nsock] = i;
425
/* Provide the next value service ... exit gracefully when we get a termination
426
message from everyone or detect error */
430
/* Now wait patiently for everything to finish, then close all
431
sockets and return */
433
status = WaitAll(SR_n_clus);