3
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4
* University Research and Technology
5
* Corporation. All rights reserved.
6
* Copyright (c) 2004-2005 The University of Tennessee and The University
7
* of Tennessee Research Foundation. All rights
9
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10
* University of Stuttgart. All rights reserved.
11
* Copyright (c) 2004-2005 The Regents of the University of California.
12
* All rights reserved.
15
* Additional copyrights may follow
20
#include "orte_config.h"
/*
 * Debugger support for orterun
 *
 * We interpret the MPICH debugger interface as follows:
 *
 * a) The launcher
 *      - spawns the other processes,
 *      - fills in the table MPIR_proctable, and sets MPIR_proctable_size
 *      - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
 *      - calls MPIR_Breakpoint() which the debugger will have a
 *        breakpoint on.
 *
 * b) Applications start and then spin until MPIR_debug_gate is set
 *    non-zero by the debugger.
 *
 * This file implements (a).
 */
43
#endif /* HAVE_STDLIB_H */
46
#endif /* HAVE_STRINGS_H */
49
#endif /* HAVE_UNISTD_H */
55
extern char **environ;
57
#include "opal/util/opal_environ.h"
58
#include "opal/util/output.h"
59
#include "opal/util/argv.h"
60
#include "opal/util/show_help.h"
61
#include "opal/util/path.h"
62
#include "opal/class/opal_list.h"
63
#include "opal/mca/base/base.h"
64
#include "orte/mca/errmgr/errmgr.h"
65
#include "orte/mca/rmgr/rmgr_types.h"
66
#include "orte/mca/rmaps/base/rmaps_base_map.h"
67
#include "orte/runtime/runtime.h"
68
#include "totalview.h"
/* +++ begin MPICH/TotalView interface definitions */

/* Values stored in MPIR_debug_state. */
#define MPIR_DEBUG_SPAWNED  1
#define MPIR_DEBUG_ABORTING 2

/*
 * One entry of the process table published to the debugger.
 * NOTE(review): the debugger presumably reads this structure directly
 * from the launcher's memory, so field order and types should not be
 * changed -- confirm against the MPIR interface description.
 */
struct MPIR_PROCDESC {
    char *host_name;        /* something that can be passed to inet_addr */
    char *executable_name;  /* name of binary */
    int pid;                /* process pid */
};

/* Process table and its number of entries; filled in after the spawn. */
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;

/* Compared against MPIR_DEBUG_SPAWNED before the spawn to detect that
   we were started by a debugger. */
int MPIR_being_debugged = 0;
int MPIR_force_to_main = 0;

/* One of the MPIR_DEBUG_* values above. */
volatile int MPIR_debug_state = 0;
volatile int MPIR_i_am_starter = 0;

/* Applications spin until the debugger sets this non-zero. */
volatile int MPIR_debug_gate = 0;
volatile int MPIR_acquired_pre_main = 0;

/* Function the debugger is expected to have a breakpoint on. */
void *MPIR_Breakpoint(void);

/* --- end MPICH/TotalView interface definitions */
/*
 * NOTE: The job description in the registry will likely evolve to use
 * the "jobgrp_t", but this works for now.
 *
 * An initial skeleton of how to implement this with jobgrp_t is
 * available in SVN as orte/tools/orterun/totalview.c, version 7075.
 */
103
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
105
static void dump(void)
109
DUMP_INT(MPIR_being_debugged);
110
DUMP_INT(MPIR_debug_gate);
111
DUMP_INT(MPIR_debug_state);
112
DUMP_INT(MPIR_acquired_pre_main);
113
DUMP_INT(MPIR_i_am_starter);
114
DUMP_INT(MPIR_proctable_size);
115
fprintf(stderr, " MPIR_proctable:\n");
116
for (i = 0; i < MPIR_proctable_size; i++) {
118
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
120
MPIR_proctable[i].host_name,
121
MPIR_proctable[i].executable_name,
122
MPIR_proctable[i].pid);
127
* Process one line from the orte_base_user_debugger MCA param and
128
* look for that debugger in the path. If we find it, fill in
131
static int process(char *orig_line, char *basename, int argc, char **argv,
135
char *line, *full_line = strdup(orig_line);
136
char *user_argv, *tmp, **tmp_argv;
141
return ORTE_ERR_OUT_OF_RESOURCE;
144
/* Trim off whitespace at the beginning and ending of line */
146
for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) {
149
for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) {
152
if (strlen(line) <= 0) {
156
/* Remove --debug, --debugger, and -tv from the user command line
160
user_argv = strdup("");
162
tmp_argv = opal_argv_copy(argv);
163
for (i = 0; NULL != tmp_argv[i]; ++i) {
164
if (0 == strcmp(tmp_argv[i], "-debug") ||
165
0 == strcmp(tmp_argv[i], "--debug")) {
167
tmp_argv[i] = strdup("");
168
} else if (0 == strcmp(tmp_argv[i], "-tv") ||
169
0 == strcmp(tmp_argv[i], "--tv")) {
171
tmp_argv[i] = strdup("");
172
} else if (0 == strcmp(tmp_argv[i], "--debugger") ||
173
0 == strcmp(tmp_argv[i], "-debugger")) {
175
tmp_argv[i] = strdup("");
176
if (NULL != tmp_argv[i + 1]) {
179
tmp_argv[i] = strdup("");
183
user_argv = opal_argv_join(tmp_argv + 1, ' ');
184
opal_argv_free(tmp_argv);
187
/* Replace @@ tokens - line should never realistically be bigger
188
than MAX_INT, so just cast to int to remove compiler warning */
190
for (i = 0; i < (int) strlen(line); ++i) {
192
if (0 == strncmp(line + i, "@mpirun@", 8)) {
194
asprintf(&tmp, "%s%s%s", line, argv[0], line + i + 8);
195
} else if (0 == strncmp(line + i, "@orterun@", 9)) {
197
asprintf(&tmp, "%s%s%s", line, argv[0], line + i + 9);
198
} else if (0 == strncmp(line + i, "@mpirun_args@", 13)) {
200
asprintf(&tmp, "%s%s%s", line, user_argv, line + i + 13);
201
} else if (0 == strncmp(line + i, "@orterun_args@", 14)) {
203
asprintf(&tmp, "%s%s%s", line, user_argv, line + i + 14);
208
full_line = line = tmp;
213
/* Split up into argv */
215
*new_argv = opal_argv_split(line, ' ');
218
/* Can we find argv[0] in the path? */
220
getcwd(cwd, PATH_MAX);
221
tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd);
227
/* All done -- didn't find it */
229
opal_argv_free(*new_argv);
231
return ORTE_ERR_NOT_FOUND;
235
* Run a user-level debugger
237
void orte_run_debugger(char *basename, int argc, char *argv[])
240
char **new_argv = NULL;
241
char *value, **lines;
243
/* Get the orte_base_debug MCA parameter and search for a debugger
246
id = mca_base_param_find("orte", NULL, "base_user_debugger");
248
opal_show_help("help-orterun.txt", "debugger-mca-param-not-found",
253
mca_base_param_lookup_string(id, &value);
255
opal_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
260
/* Look through all the values in the MCA param */
262
lines = opal_argv_split(value, ':');
264
for (i = 0; NULL != lines[i]; ++i) {
265
if (ORTE_SUCCESS == process(lines[i], basename, argc, argv,
271
/* If we didn't find one, abort */
273
if (NULL == lines[i]) {
274
opal_show_help("help-orterun.txt", "debugger-not-found", true);
277
opal_argv_free(lines);
281
execvp(new_argv[0], new_argv);
282
value = opal_argv_join(new_argv, ' ');
283
opal_show_help("help-orterun.txt", "debugger-exec-failed",
284
true, basename, value, new_argv[0]);
286
opal_argv_free(new_argv);
292
* Initialization of data structures for running under a debugger
293
* using the MPICH/TotalView parallel debugger interface. Before the
294
* spawn we need to check if we are being run under a TotalView-like
295
* debugger; if so then inform applications via an MCA parameter.
297
void orte_totalview_init_before_spawn(void)
299
if (MPIR_DEBUG_SPAWNED == MPIR_being_debugged) {
304
if (orte_debug_flag) {
305
opal_output(0, "Info: Spawned by a debugger");
308
if (mca_base_param_reg_int_name("orte", "mpi_wait_for_totalview",
309
"Whether the MPI application should wait for a debugger or not",
310
false, false, (int)false, &value) < 0) {
311
opal_output(0, "Error: mca_base_param_reg_int_name\n");
314
/* push mca parameter into the environment (not done automatically?) */
316
s = mca_base_param_environ_variable("orte", "mpi_wait_for_totalview", NULL);
317
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
318
opal_output(0, "Error: Can't setenv %s\n", s);
326
* Initialization of data structures for running under a debugger
327
* using the MPICH/TotalView parallel debugger interface. This stage
328
* of initialization must occur after stage2 of spawn and is invoked
331
* @param jobid The jobid returned by spawn.
333
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
335
opal_list_t list_of_resource_maps;
336
opal_list_item_t *item;
340
if (MPIR_proctable) {
341
/* already initialized */
345
if (0) { /* debugging daemons <<-- needs work */
347
if (orte_debug_flag) {
348
opal_output(0, "Info: Setting up debugger process table for daemons\n");
354
* Debugging applications or not being debugged.
356
* Either way, fill in the proc table for the application
357
* processes in case someone attaches later.
360
if (orte_debug_flag) {
361
opal_output(0, "Info: Setting up debugger process table for applications\n");
364
MPIR_debug_state = 1;
366
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
368
/* Get a list of the resource maps for this job */
370
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
371
if (ORTE_SUCCESS != rc) {
372
opal_output(0, "Error: Can't get list of resource maps\n");
376
/* find the total number of processes in the job */
378
for (item = opal_list_get_first(&list_of_resource_maps);
379
item != opal_list_get_end(&list_of_resource_maps);
380
item = opal_list_get_next(item)) {
381
orte_rmaps_base_map_t *map = (orte_rmaps_base_map_t*) item;
382
MPIR_proctable_size += map->num_procs;
385
/* allocate MPIR_proctable */
387
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
388
MPIR_proctable_size);
389
if (MPIR_proctable == NULL) {
390
opal_output(0, "Error: Out of memory\n");
391
OBJ_DESTRUCT(&list_of_resource_maps);
394
/* initialize MPIR_proctable */
396
for (item = opal_list_get_first(&list_of_resource_maps);
397
item != opal_list_get_end(&list_of_resource_maps);
398
item = opal_list_get_next(item)) {
399
orte_rmaps_base_map_t *map = (orte_rmaps_base_map_t*) item;
400
for (i = 0; i < map->num_procs; i++) {
401
orte_rmaps_base_proc_t *proc = map->procs[i];
402
MPIR_proctable[i].host_name = proc->proc_node->node->node_name;
403
MPIR_proctable[i].executable_name = proc->app;
404
MPIR_proctable[i].pid = proc->local_pid;
408
OBJ_DESTRUCT(&list_of_resource_maps);
412
if (orte_debug_flag) {
416
(void) MPIR_Breakpoint();
421
* Release resources associated with data structures for running under
422
* a debugger using the MPICH/TotalView parallel debugger interface.
424
void orte_totalview_finalize(void)
426
if (MPIR_proctable) {
427
free(MPIR_proctable);
432
* Breakpoint function for parallel debuggers
434
void *MPIR_Breakpoint(void)