Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit bd3d6ff

Browse files
committed
Merge pull request #977 from ggouaillardet/topic/v2.x/hetero_spawn
fix spawn on heterogeneous clusters
2 parents ae525ec + dab2c1a commit bd3d6ff

File tree

3 files changed

+25
-17
lines changed

3 files changed

+25
-17
lines changed

ompi/dpm/dpm.c

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
378378
opal_list_append(&ilist, &cd->super);
379379
}
380380
/* either way, add to the remote list */
381-
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
381+
cd = OBJ_NEW(ompi_dpm_proct_caddy_t);
382382
cd->p = proc;
383383
opal_list_append(&rlist, &cd->super);
384384
}
@@ -403,24 +403,19 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
403403
i = 0;
404404
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
405405
opal_value_t *kv;
406-
new_proc_list[i] = cd->p;
407-
/* set the locality */
408-
new_proc_list[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
409-
/* have to save it for later */
406+
proc = cd->p;
407+
new_proc_list[i] = proc ;
408+
/* ompi_proc_complete_init_single() initializes and optionally retrieves
409+
* OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. Since we can live without
410+
* them, its return value is deliberately ignored. */
411+
ompi_proc_complete_init_single(proc);
412+
/* save the locality for later */
410413
kv = OBJ_NEW(opal_value_t);
411414
kv->key = strdup(OPAL_PMIX_LOCALITY);
412415
kv->type = OPAL_UINT16;
413-
kv->data.uint16 = OPAL_PROC_NON_LOCAL;
414-
opal_pmix.store_local(&cd->p->super.proc_name, kv);
416+
kv->data.uint16 = proc->super.proc_flags;
417+
opal_pmix.store_local(&proc->super.proc_name, kv);
415418
OBJ_RELEASE(kv); // maintain accounting
416-
/* we can retrieve the hostname at no cost because it
417-
* was provided at connect */
418-
OPAL_MODEX_RECV_VALUE(rc, OPAL_PMIX_HOSTNAME, &new_proc_list[i]->super.proc_name,
419-
(char**)&(new_proc_list[i]->super.proc_hostname), OPAL_STRING);
420-
if (OPAL_SUCCESS != rc) {
421-
/* we can live without it */
422-
new_proc_list[i]->super.proc_hostname = NULL;
423-
}
424419
++i;
425420
}
426421
/* call add_procs on the new ones */

ompi/proc/proc.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,15 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
129129
* retrieving the hostname (if below the modex cutoff), determining the
130130
* remote architecture, and calculating the locality of the process.
131131
*/
132-
static int ompi_proc_complete_init_single (ompi_proc_t *proc)
132+
int ompi_proc_complete_init_single (ompi_proc_t *proc)
133133
{
134134
uint16_t u16, *u16ptr;
135135
int ret;
136136

137137
u16ptr = &u16;
138138

139-
if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid) {
139+
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
140+
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
140141
/* nothing else to do */
141142
return OMPI_SUCCESS;
142143
}

ompi/proc/proc.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,18 @@ OMPI_DECLSPEC int ompi_proc_init(void);
122122
*/
123123
OMPI_DECLSPEC int ompi_proc_complete_init(void);
124124

125+
/**
126+
* Complete filling up the proc information (arch, hostname and locality) for
127+
* a given proc. This function is to be called only after the modex exchange
128+
* has been completed.
129+
*
130+
* @param[in] proc the proc whose information will be filled up
131+
*
132+
* @retval OMPI_SUCCESS All information correctly set.
133+
* @retval OMPI_ERROR Some info could not be initialized.
134+
*/
135+
OMPI_DECLSPEC int ompi_proc_complete_init_single(ompi_proc_t* proc);
136+
125137
/**
126138
* Finalize the OMPI Process subsystem
127139
*

0 commit comments

Comments
 (0)