Skip to content

Commit 90501f5

Browse files
committed
Fix part initialization deadlock.
Deadlocks were observed when doing multiple `ompi_comm_idup` at once during initialization when multiple threads could call `opal_progress`. This may be due to a request ordering mismatch, or some other error: the precise cause was not identified. Doing the `ompi_comm_idup` one at a time avoids the issue. Signed-off-by: Keluaa <34173752+Keluaa@users.noreply.github.com>
1 parent 59e3a83 commit 90501f5

File tree

2 files changed

+25
-33
lines changed

2 files changed

+25
-33
lines changed

ompi/mca/part/persist/part_persist.h

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,14 @@ struct ompi_part_persist_t {
6666
int32_t next_recv_tag;
6767
ompi_communicator_t *part_comm; /* This approach requires a separate tag space, so we need a dedicated communicator. */
6868
ompi_request_t *part_comm_req;
69-
int32_t part_comm_ready;
7069
ompi_communicator_t *part_comm_setup; /* We create a second communicator to send set-up messages (rational: these
7170
messages go in the opposite direction of normal messages, need to use MPI_ANY_SOURCE
7271
to support different communicators, and thus need to have a unique tag. Because tags
7372
are controlled by the sender in this model, we cannot assume that the tag will be
7473
unused in part_comm. */
7574
ompi_request_t *part_comm_sreq;
76-
int32_t part_comm_sready;
77-
int32_t init_comms;
7875
int32_t init_world;
76+
int32_t init_world_step;
7977
int32_t my_world_rank; /* Because the back end communicators use a world rank, we need to communicate ours
8078
to set up the requests. */
8179
opal_atomic_int32_t block_entry;
@@ -182,32 +180,29 @@ mca_part_persist_progress(void)
182180

183181
/* Can't do anything if we don't have world */
184182
if(0 == ompi_part_persist.init_world) {
185-
ompi_part_persist.my_world_rank = ompi_comm_rank(&ompi_mpi_comm_world.comm);
186-
err = ompi_comm_idup(&ompi_mpi_comm_world.comm, &ompi_part_persist.part_comm, &ompi_part_persist.part_comm_req);
187-
if(OMPI_SUCCESS != err) goto end_part_progress;
188-
ompi_part_persist.part_comm_ready = 0;
189-
err = ompi_comm_idup(&ompi_mpi_comm_world.comm, &ompi_part_persist.part_comm_setup, &ompi_part_persist.part_comm_sreq);
190-
if(OMPI_SUCCESS != err) goto end_part_progress;
191-
ompi_part_persist.part_comm_sready = 0;
192-
ompi_part_persist.init_world = 1;
193-
completed++;
194-
goto end_part_progress;
195-
}
196-
197-
/* Check to see if Comms are setup */
198-
if(0 == ompi_part_persist.init_comms) {
199-
if(0 == ompi_part_persist.part_comm_ready) {
200-
err = ompi_request_test(&ompi_part_persist.part_comm_req, &ompi_part_persist.part_comm_ready, MPI_STATUS_IGNORE);
201-
if(OMPI_SUCCESS != err) goto end_part_progress;
202-
}
203-
if(0 == ompi_part_persist.part_comm_sready) {
204-
err = ompi_request_test(&ompi_part_persist.part_comm_sreq, &ompi_part_persist.part_comm_sready, MPI_STATUS_IGNORE);
205-
if(OMPI_SUCCESS != err) goto end_part_progress;
206-
}
207-
if(0 != ompi_part_persist.part_comm_ready && 0 != ompi_part_persist.part_comm_sready) {
208-
ompi_part_persist.init_comms = 1;
183+
int done = 0;
184+
switch (ompi_part_persist.init_world_step) {
185+
case 0:
186+
ompi_part_persist.my_world_rank = ompi_comm_rank(&ompi_mpi_comm_world.comm);
187+
err = ompi_comm_idup(&ompi_mpi_comm_world.comm, &ompi_part_persist.part_comm, &ompi_part_persist.part_comm_req);
188+
done = 1;
189+
break;
190+
case 1:
191+
err = ompi_request_test(&ompi_part_persist.part_comm_req, &done, MPI_STATUS_IGNORE);
192+
break;
193+
case 2:
194+
err = ompi_comm_idup(&ompi_mpi_comm_world.comm, &ompi_part_persist.part_comm_setup, &ompi_part_persist.part_comm_sreq);
195+
done = 1;
196+
break;
197+
case 3:
198+
err = ompi_request_test(&ompi_part_persist.part_comm_sreq, &done, MPI_STATUS_IGNORE);
199+
break;
200+
default:
201+
ompi_part_persist.init_world = 1;
202+
break;
209203
}
210-
completed++;
204+
ompi_part_persist.init_world_step += 0 != done;
205+
completed += 0 != done;
211206
goto end_part_progress;
212207
}
213208

@@ -479,7 +474,7 @@ mca_part_persist_psend_init(const void* buf,
479474
if(OMPI_SUCCESS != err) return OMPI_ERROR;
480475

481476
/* Non-blocking receive on setup info */
482-
if(1 == ompi_part_persist.init_comms) {
477+
if(1 == ompi_part_persist.init_world) {
483478
err = MCA_PML_CALL(irecv(&(req->setup_info[1]), sizeof(struct ompi_mca_persist_setup_t), MPI_BYTE, MPI_ANY_SOURCE, req->my_recv_tag, ompi_part_persist.part_comm_setup, &req->setup_req[1]));
484479
if(OMPI_SUCCESS != err) return OMPI_ERROR;
485480
req->flag_post_setup_recv = false;

ompi/mca/part/persist/part_persist_component.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,8 @@ mca_part_persist_component_open(void)
9898

9999
mca_part_persist_init_lists();
100100

101-
ompi_part_persist.init_comms = 0;
102101
ompi_part_persist.init_world = -1;
103-
104-
ompi_part_persist.part_comm_ready = 0;
105-
ompi_part_persist.part_comm_ready = 0;
102+
ompi_part_persist.init_world_step = 0;
106103

107104
ompi_part_persist.block_entry = 0;
108105
return OMPI_SUCCESS;

0 commit comments

Comments
 (0)