Skip to content

Commit c15bf14

Browse files
committed
opal: Add opal_abort_print_stack mca variable with aliases for ompi/oshmem
This commit makes it possible to control the output produced during abnormal oshmem/ompi application termination. It also fixes an issue in the backtrace output: HAVE_BACKTRACE was never set, so the user had only limited control over this variable. The two related MCA variables are moved to the opal layer, and corresponding aliases are added for ompi and oshmem.
1 parent ab70ca6 commit c15bf14

File tree

7 files changed

+123
-47
lines changed

7 files changed

+123
-47
lines changed

ompi/runtime/ompi_mpi_abort.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1818
* reserved.
19+
* Copyright (c) 2015 Mellanox Technologies, Inc.
20+
* All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -40,6 +42,7 @@
4042
#include <errno.h>
4143

4244
#include "opal/mca/backtrace/backtrace.h"
45+
#include "opal/runtime/opal_params.h"
4346

4447
#include "ompi/communicator/communicator.h"
4548
#include "ompi/runtime/mpiruntime.h"
@@ -137,11 +140,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
137140

138141
/* Should we print a stack trace? Not aggregated because they
139142
might be different on all processes. */
140-
if (ompi_mpi_abort_print_stack) {
143+
if (opal_abort_print_stack) {
141144
char **messages;
142145
int len, i;
143146

144-
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
147+
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
145148
for (i = 0; i < len; ++i) {
146149
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
147150
i, messages[i]);
@@ -161,7 +164,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
161164
if (errcode < 0 ||
162165
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
163166
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
164-
ompi_mpi_abort_print_stack ?
167+
opal_abort_print_stack ?
165168
" (stack trace available on stderr)" : "") < 0) {
166169
msg = NULL;
167170
}
@@ -172,20 +175,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
172175

173176
/* Should we wait for a while before aborting? */
174177

175-
if (0 != ompi_mpi_abort_delay) {
176-
if (ompi_mpi_abort_delay < 0) {
177-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
178+
if (0 != opal_abort_delay) {
179+
if (opal_abort_delay < 0) {
180+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
178181
host, (int) pid);
179182
fflush(stderr);
180183
while (1) {
181184
sleep(5);
182185
}
183186
} else {
184187
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
185-
host, (int) pid, ompi_mpi_abort_delay);
188+
host, (int) pid, opal_abort_delay);
186189
do {
187190
sleep(1);
188-
} while (--ompi_mpi_abort_delay > 0);
191+
} while (--opal_abort_delay > 0);
189192
}
190193
}
191194

ompi/runtime/ompi_mpi_params.c

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* reserved.
1616
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
1717
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
18+
* Copyright (c) 2015 Mellanox Technologies, Inc.
19+
* All rights reserved.
1820
* $COPYRIGHT$
1921
*
2022
* Additional copyrights may follow
@@ -53,8 +55,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
5355
bool ompi_debug_no_free_handles = false;
5456
bool ompi_mpi_show_mca_params = false;
5557
char *ompi_mpi_show_mca_params_file = NULL;
56-
bool ompi_mpi_abort_print_stack = false;
57-
int ompi_mpi_abort_delay = 0;
5858
bool ompi_mpi_keep_fqdn_hostnames = false;
5959
bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6060
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
@@ -206,33 +206,6 @@ int ompi_mpi_register_params(void)
206206

207207
/* User-level process pinning controls */
208208

209-
/* MPI_ABORT controls */
210-
ompi_mpi_abort_delay = 0;
211-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
212-
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
213-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
214-
OPAL_INFO_LVL_9,
215-
MCA_BASE_VAR_SCOPE_READONLY,
216-
&ompi_mpi_abort_delay);
217-
218-
ompi_mpi_abort_print_stack = false;
219-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
220-
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
221-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
222-
/* If we do not have stack trace
223-
capability, make this a constant
224-
MCA variable */
225-
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
226-
0,
227-
OPAL_INFO_LVL_9,
228-
MCA_BASE_VAR_SCOPE_READONLY,
229-
#else
230-
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
231-
OPAL_INFO_LVL_9,
232-
MCA_BASE_VAR_SCOPE_CONSTANT,
233-
#endif
234-
&ompi_mpi_abort_print_stack);
235-
236209
ompi_mpi_preconnect_mpi = false;
237210
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
238211
"Whether to force MPI processes to fully "
@@ -307,6 +280,18 @@ int ompi_mpi_register_params(void)
307280
MCA_BASE_VAR_SCOPE_READONLY,
308281
&ompi_mpi_dynamics_enabled);
309282

283+
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
284+
if (0 <= value) {
285+
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
286+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
287+
}
288+
289+
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
290+
if (0 <= value) {
291+
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_print_stack",
292+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
293+
}
294+
310295
return OMPI_SUCCESS;
311296
}
312297

opal/runtime/opal_params.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
2020
* Copyright (c) 2015 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
22+
* Copyright (c) 2015 Mellanox Technologies, Inc.
23+
* All rights reserved.
2224
* $COPYRIGHT$
2325
*
2426
* Additional copyrights may follow
@@ -65,6 +67,8 @@ bool opal_base_distill_checkpoint_ready = false;
6567
*/
6668
int opal_leave_pinned = -1;
6769
bool opal_leave_pinned_pipeline = false;
70+
bool opal_abort_print_stack = false;
71+
int opal_abort_delay = 0;
6872

6973
static bool opal_register_done = false;
7074

@@ -280,6 +284,38 @@ int opal_register_params(void)
280284
MCA_BASE_VAR_SCOPE_READONLY,
281285
&opal_warn_on_fork);
282286

287+
opal_abort_delay = 0;
288+
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
289+
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
290+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
291+
OPAL_INFO_LVL_5,
292+
MCA_BASE_VAR_SCOPE_READONLY,
293+
&opal_abort_delay);
294+
if (0 > ret) {
295+
return ret;
296+
}
297+
298+
opal_abort_print_stack = false;
299+
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
300+
"If nonzero, print out a stack trace when abort is invoked",
301+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
302+
/* If we do not have stack trace
303+
capability, make this a constant
304+
MCA variable */
305+
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
306+
0,
307+
OPAL_INFO_LVL_5,
308+
MCA_BASE_VAR_SCOPE_READONLY,
309+
#else
310+
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
311+
OPAL_INFO_LVL_5,
312+
MCA_BASE_VAR_SCOPE_CONSTANT,
313+
#endif
314+
&opal_abort_print_stack);
315+
if (0 > ret) {
316+
return ret;
317+
}
318+
283319
/* The ddt engine has a few parameters */
284320
ret = opal_datatype_register_params();
285321
if (OPAL_SUCCESS != ret) {

opal/runtime/opal_params.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
1717
* All rights reserved.
1818
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
19+
* Copyright (c) 2015 Mellanox Technologies, Inc.
20+
* All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -55,6 +57,22 @@ OPAL_DECLSPEC extern int opal_leave_pinned;
5557
*/
5658
OPAL_DECLSPEC extern bool opal_leave_pinned_pipeline;
5759

60+
/**
61+
* Whether an abort operation should print out a stack trace or not.
62+
*/
63+
OPAL_DECLSPEC extern bool opal_abort_print_stack;
64+
65+
/**
66+
* Whether abort operation should print out an identifying message
67+
* (e.g., hostname and PID) and loop waiting for a debugger to
68+
* attach. The value of the integer is how many seconds to wait:
69+
*
70+
* 0 = do not print the message and do not loop
71+
* negative value = print the message and loop forever
72+
* positive value = print the message and delay for that many seconds
73+
*/
74+
OPAL_DECLSPEC extern int opal_abort_delay;
75+
5876
#if OPAL_ENABLE_DEBUG
5977
extern bool opal_progress_debug;
6078
#endif

oshmem/runtime/oshmem_shmem_abort.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#endif
2525

2626
#include "opal/mca/backtrace/backtrace.h"
27+
#include "opal/runtime/opal_params.h"
2728

2829
#include "orte/util/proc_info.h"
2930
#include "orte/runtime/runtime.h"
@@ -71,11 +72,11 @@ int oshmem_shmem_abort(int errcode)
7172

7273
/* Should we print a stack trace? Not aggregated because they
7374
might be different on all processes. */
74-
if (ompi_mpi_abort_print_stack) {
75+
if (opal_abort_print_stack) {
7576
char **messages;
7677
int len, i;
7778

78-
if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
79+
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
7980
for (i = 0; i < len; ++i) {
8081
fprintf(stderr,
8182
"[%s:%d] [%d] func:%s\n",
@@ -94,6 +95,25 @@ int oshmem_shmem_abort(int errcode)
9495
}
9596
}
9697

98+
/* Should we wait for a while before aborting? */
99+
100+
if (0 != opal_abort_delay) {
101+
if (opal_abort_delay < 0) {
102+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
103+
host, (int) pid);
104+
fflush(stderr);
105+
while (1) {
106+
sleep(5);
107+
}
108+
} else {
109+
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
110+
host, (int) pid, opal_abort_delay);
111+
do {
112+
sleep(1);
113+
} while (--opal_abort_delay > 0);
114+
}
115+
}
116+
97117
if (!orte_initialized || !oshmem_shmem_initialized) {
98118
if (orte_show_help_is_available()) {
99119
/* TODO help message from SHMEM not from MPI is needed*/

oshmem/runtime/oshmem_shmem_params.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013 Mellanox Technologies, Inc.
2+
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
33
* All rights reserved.
44
* $COPYRIGHT$
55
*
@@ -8,8 +8,12 @@
88
* $HEADER$
99
*/
1010

11-
#include "params.h"
12-
#include "runtime.h"
11+
#include "oshmem_config.h"
12+
13+
#include "opal/runtime/opal_params.h"
14+
15+
#include "oshmem/runtime/params.h"
16+
#include "oshmem/runtime/runtime.h"
1317
#include "oshmem/constants.h"
1418

1519

@@ -19,6 +23,8 @@ int oshmem_preconnect_all = 0;
1923

2024
int oshmem_shmem_register_params(void)
2125
{
26+
int value;
27+
2228
(void) mca_base_var_register("oshmem",
2329
"oshmem",
2430
NULL,
@@ -63,5 +69,17 @@ int oshmem_shmem_register_params(void)
6369
MCA_BASE_VAR_SCOPE_READONLY,
6470
&oshmem_preconnect_all);
6571

72+
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
73+
if (0 <= value) {
74+
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_delay",
75+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
76+
}
77+
78+
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
79+
if (0 <= value) {
80+
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_print_stack",
81+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
82+
}
83+
6684
return OSHMEM_SUCCESS;
6785
}

oshmem/runtime/params.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013 Mellanox Technologies, Inc.
2+
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
33
* All rights reserved.
44
* $COPYRIGHT$
55
*
@@ -19,10 +19,6 @@ BEGIN_C_DECLS
1919
* Global variables
2020
*/
2121

22-
/**
23-
* Whether an MPI_ABORT should print out a stack trace or not.
24-
*/
25-
OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
2622

2723
/**
2824
* Whether or not the lock routines are recursive

0 commit comments

Comments
 (0)