Skip to content

Commit d8c8573

Browse files
committed
Merge pull request #1151 from igor-ivanov/pr/opal-abort-vars
Add new MCA variables opal_abort_delay and opal_abort_print_stack
2 parents 324534b + c15bf14 commit d8c8573

File tree

7 files changed

+108
-86
lines changed

7 files changed

+108
-86
lines changed

ompi/runtime/ompi_mpi_abort.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1818
* reserved.
19+
* Copyright (c) 2015 Mellanox Technologies, Inc.
20+
* All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -40,6 +42,7 @@
4042
#include <errno.h>
4143

4244
#include "opal/mca/backtrace/backtrace.h"
45+
#include "opal/runtime/opal_params.h"
4346

4447
#include "ompi/communicator/communicator.h"
4548
#include "ompi/runtime/mpiruntime.h"
@@ -137,11 +140,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
137140

138141
/* Should we print a stack trace? Not aggregated because they
139142
might be different on all processes. */
140-
if (ompi_mpi_abort_print_stack) {
143+
if (opal_abort_print_stack) {
141144
char **messages;
142145
int len, i;
143146

144-
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
147+
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
145148
for (i = 0; i < len; ++i) {
146149
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
147150
i, messages[i]);
@@ -161,7 +164,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
161164
if (errcode < 0 ||
162165
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
163166
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
164-
ompi_mpi_abort_print_stack ?
167+
opal_abort_print_stack ?
165168
" (stack trace available on stderr)" : "") < 0) {
166169
msg = NULL;
167170
}
@@ -172,20 +175,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
172175

173176
/* Should we wait for a while before aborting? */
174177

175-
if (0 != ompi_mpi_abort_delay) {
176-
if (ompi_mpi_abort_delay < 0) {
177-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
178+
if (0 != opal_abort_delay) {
179+
if (opal_abort_delay < 0) {
180+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
178181
host, (int) pid);
179182
fflush(stderr);
180183
while (1) {
181184
sleep(5);
182185
}
183186
} else {
184187
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
185-
host, (int) pid, ompi_mpi_abort_delay);
188+
host, (int) pid, opal_abort_delay);
186189
do {
187190
sleep(1);
188-
} while (--ompi_mpi_abort_delay > 0);
191+
} while (--opal_abort_delay > 0);
189192
}
190193
}
191194

ompi/runtime/ompi_mpi_params.c

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* reserved.
1616
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
1717
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
18+
* Copyright (c) 2015 Mellanox Technologies, Inc.
19+
* All rights reserved.
1820
* $COPYRIGHT$
1921
*
2022
* Additional copyrights may follow
@@ -53,8 +55,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
5355
bool ompi_debug_no_free_handles = false;
5456
bool ompi_mpi_show_mca_params = false;
5557
char *ompi_mpi_show_mca_params_file = NULL;
56-
bool ompi_mpi_abort_print_stack = false;
57-
int ompi_mpi_abort_delay = 0;
5858
bool ompi_mpi_keep_fqdn_hostnames = false;
5959
bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
6060
bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
@@ -206,33 +206,6 @@ int ompi_mpi_register_params(void)
206206

207207
/* User-level process pinning controls */
208208

209-
/* MPI_ABORT controls */
210-
ompi_mpi_abort_delay = 0;
211-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
212-
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
213-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
214-
OPAL_INFO_LVL_9,
215-
MCA_BASE_VAR_SCOPE_READONLY,
216-
&ompi_mpi_abort_delay);
217-
218-
ompi_mpi_abort_print_stack = false;
219-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
220-
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
221-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
222-
/* If we do not have stack trace
223-
capability, make this a constant
224-
MCA variable */
225-
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
226-
0,
227-
OPAL_INFO_LVL_9,
228-
MCA_BASE_VAR_SCOPE_READONLY,
229-
#else
230-
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
231-
OPAL_INFO_LVL_9,
232-
MCA_BASE_VAR_SCOPE_CONSTANT,
233-
#endif
234-
&ompi_mpi_abort_print_stack);
235-
236209
ompi_mpi_preconnect_mpi = false;
237210
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
238211
"Whether to force MPI processes to fully "
@@ -307,6 +280,18 @@ int ompi_mpi_register_params(void)
307280
MCA_BASE_VAR_SCOPE_READONLY,
308281
&ompi_mpi_dynamics_enabled);
309282

283+
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
284+
if (0 <= value) {
285+
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
286+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
287+
}
288+
289+
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
290+
if (0 <= value) {
291+
(void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_print_stack",
292+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
293+
}
294+
310295
return OMPI_SUCCESS;
311296
}
312297

opal/runtime/opal_params.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
2020
* Copyright (c) 2015 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
22+
* Copyright (c) 2015 Mellanox Technologies, Inc.
23+
* All rights reserved.
2224
* $COPYRIGHT$
2325
*
2426
* Additional copyrights may follow
@@ -65,6 +67,8 @@ bool opal_base_distill_checkpoint_ready = false;
6567
*/
6668
int opal_leave_pinned = -1;
6769
bool opal_leave_pinned_pipeline = false;
70+
bool opal_abort_print_stack = false;
71+
int opal_abort_delay = 0;
6872

6973
static bool opal_register_done = false;
7074

@@ -280,6 +284,38 @@ int opal_register_params(void)
280284
MCA_BASE_VAR_SCOPE_READONLY,
281285
&opal_warn_on_fork);
282286

287+
opal_abort_delay = 0;
288+
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
289+
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
290+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
291+
OPAL_INFO_LVL_5,
292+
MCA_BASE_VAR_SCOPE_READONLY,
293+
&opal_abort_delay);
294+
if (0 > ret) {
295+
return ret;
296+
}
297+
298+
opal_abort_print_stack = false;
299+
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
300+
"If nonzero, print out a stack trace when abort is invoked",
301+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
302+
/* If we do not have stack trace
303+
capability, make this a constant
304+
MCA variable */
305+
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
306+
0,
307+
OPAL_INFO_LVL_5,
308+
MCA_BASE_VAR_SCOPE_READONLY,
309+
#else
310+
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
311+
OPAL_INFO_LVL_5,
312+
MCA_BASE_VAR_SCOPE_CONSTANT,
313+
#endif
314+
&opal_abort_print_stack);
315+
if (0 > ret) {
316+
return ret;
317+
}
318+
283319
/* The ddt engine has a few parameters */
284320
ret = opal_datatype_register_params();
285321
if (OPAL_SUCCESS != ret) {

opal/runtime/opal_params.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
1717
* All rights reserved.
1818
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
19+
* Copyright (c) 2015 Mellanox Technologies, Inc.
20+
* All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -55,6 +57,22 @@ OPAL_DECLSPEC extern int opal_leave_pinned;
5557
*/
5658
OPAL_DECLSPEC extern bool opal_leave_pinned_pipeline;
5759

60+
/**
61+
* Whether an abort operation should print out a stack trace or not.
62+
*/
63+
OPAL_DECLSPEC extern bool opal_abort_print_stack;
64+
65+
/**
66+
* Whether an abort operation should print out an identifying message
67+
* (e.g., hostname and PID) and loop waiting for a debugger to
68+
* attach. The value of the integer is how many seconds to wait:
69+
*
70+
* 0 = do not print the message and do not loop
71+
* negative value = print the message and loop forever
72+
* positive value = print the message and delay for that many seconds
73+
*/
74+
OPAL_DECLSPEC extern int opal_abort_delay;
75+
5876
#if OPAL_ENABLE_DEBUG
5977
extern bool opal_progress_debug;
6078
#endif

oshmem/runtime/oshmem_shmem_abort.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#endif
2525

2626
#include "opal/mca/backtrace/backtrace.h"
27+
#include "opal/runtime/opal_params.h"
2728

2829
#include "orte/util/proc_info.h"
2930
#include "orte/runtime/runtime.h"
@@ -71,7 +72,7 @@ int oshmem_shmem_abort(int errcode)
7172

7273
/* Should we print a stack trace? Not aggregated because they
7374
might be different on all processes. */
74-
if (oshmem_shmem_abort_print_stack) {
75+
if (opal_abort_print_stack) {
7576
char **messages;
7677
int len, i;
7778

@@ -95,20 +96,21 @@ int oshmem_shmem_abort(int errcode)
9596
}
9697

9798
/* Should we wait for a while before aborting? */
98-
if (0 != oshmem_shmem_abort_delay) {
99-
if (oshmem_shmem_abort_delay < 0) {
100-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
99+
100+
if (0 != opal_abort_delay) {
101+
if (opal_abort_delay < 0) {
102+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
101103
host, (int) pid);
102104
fflush(stderr);
103105
while (1) {
104106
sleep(5);
105107
}
106108
} else {
107109
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
108-
host, (int) pid, oshmem_shmem_abort_delay);
110+
host, (int) pid, opal_abort_delay);
109111
do {
110112
sleep(1);
111-
} while (--oshmem_shmem_abort_delay > 0);
113+
} while (--opal_abort_delay > 0);
112114
}
113115
}
114116

oshmem/runtime/oshmem_shmem_params.c

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013 Mellanox Technologies, Inc.
2+
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
33
* All rights reserved.
44
* $COPYRIGHT$
55
*
@@ -8,41 +8,22 @@
88
* $HEADER$
99
*/
1010

11-
#include "params.h"
12-
#include "runtime.h"
11+
#include "oshmem_config.h"
12+
13+
#include "opal/runtime/opal_params.h"
14+
15+
#include "oshmem/runtime/params.h"
16+
#include "oshmem/runtime/runtime.h"
1317
#include "oshmem/constants.h"
1418

1519

16-
bool oshmem_shmem_abort_print_stack = false;
17-
int oshmem_shmem_abort_delay = 0;
1820
int oshmem_shmem_lock_recursive = 0;
1921
int oshmem_shmem_api_verbose = 0;
2022
int oshmem_preconnect_all = 0;
2123

2224
int oshmem_shmem_register_params(void)
2325
{
24-
oshmem_shmem_abort_delay = 0;
25-
(void) mca_base_var_register("oshmem",
26-
"oshmem",
27-
NULL,
28-
"abort_delay",
29-
"If nonzero, print out an identifying message when abort is invoked (hostname, PID of the process that called abort operation) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
30-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
31-
OPAL_INFO_LVL_9,
32-
MCA_BASE_VAR_SCOPE_READONLY,
33-
&oshmem_shmem_abort_delay);
34-
35-
oshmem_shmem_abort_print_stack = false;
36-
(void) mca_base_var_register("oshmem",
37-
"oshmem",
38-
NULL,
39-
"abort_print_stack",
40-
"If nonzero, print out a stack trace when abort is invoked",
41-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
42-
0,
43-
OPAL_INFO_LVL_9,
44-
MCA_BASE_VAR_SCOPE_READONLY,
45-
&oshmem_shmem_abort_print_stack);
26+
int value;
4627

4728
(void) mca_base_var_register("oshmem",
4829
"oshmem",
@@ -88,5 +69,17 @@ int oshmem_shmem_register_params(void)
8869
MCA_BASE_VAR_SCOPE_READONLY,
8970
&oshmem_preconnect_all);
9071

72+
value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
73+
if (0 <= value) {
74+
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_delay",
75+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
76+
}
77+
78+
value = mca_base_var_find ("opal", "opal", NULL, "abort_print_stack");
79+
if (0 <= value) {
80+
(void) mca_base_var_register_synonym(value, "oshmem", "oshmem", NULL, "abort_print_stack",
81+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
82+
}
83+
9184
return OSHMEM_SUCCESS;
9285
}

oshmem/runtime/params.h

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013 Mellanox Technologies, Inc.
2+
* Copyright (c) 2013-2015 Mellanox Technologies, Inc.
33
* All rights reserved.
44
* $COPYRIGHT$
55
*
@@ -19,21 +19,6 @@ BEGIN_C_DECLS
1919
* Global variables
2020
*/
2121

22-
/**
23-
* Whether an abort should print out a stack trace or not.
24-
*/
25-
OSHMEM_DECLSPEC extern bool oshmem_shmem_abort_print_stack;
26-
27-
/**
28-
* Whether abort should print out an identifying message
29-
* (e.g., hostname and PID) and loop waiting for a debugger to
30-
* attach. The value of the integer is how many seconds to wait:
31-
*
32-
* 0 = do not print the message and do not loop
33-
* negative value = print the message and loop forever
34-
* positive value = print the message and delay for that many seconds
35-
*/
36-
OSHMEM_DECLSPEC extern int oshmem_shmem_abort_delay;
3722

3823
/**
3924
* Whether or not the lock routines are recursive

0 commit comments

Comments
 (0)