Skip to content

Commit f8957f2

Browse files
committed
Merge pull request #1768 from hjelmn/cq_fix
btl/openib: fix cq resize calculation
2 parents 97c1643 + dd519c5 commit f8957f2

File tree

2 files changed

+40
-26
lines changed

2 files changed

+40
-26
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 31 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -425,13 +425,20 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl)
425425
static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl)
426426
{
427427
uint32_t send_cqes, recv_cqes;
428-
int rc = OPAL_SUCCESS, qp;
428+
int rc = OPAL_SUCCESS;
429429
mca_btl_openib_device_t *device = openib_btl->device;
430+
uint32_t requested[BTL_OPENIB_MAX_CQ];
431+
bool need_resize = false;
430432

431433
opal_mutex_lock(&openib_btl->ib_lock);
434+
435+
for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) {
436+
requested[cq] = 0;
437+
}
438+
432439
/* figure out reasonable sizes for completion queues */
433-
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
434-
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
440+
for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; qp++) {
441+
if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
435442
send_cqes = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
436443
recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num;
437444
} else {
@@ -440,24 +447,30 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl)
440447
recv_cqes = send_cqes;
441448
}
442449

443-
opal_mutex_lock(&openib_btl->device->device_lock);
444-
openib_btl->device->cq_size[qp_cq_prio(qp)] += recv_cqes;
445-
openib_btl->device->cq_size[BTL_OPENIB_LP_CQ] += send_cqes;
446-
opal_mutex_unlock(&openib_btl->device->device_lock);
450+
requested[qp_cq_prio(qp)] += recv_cqes;
451+
requested[BTL_OPENIB_LP_CQ] += send_cqes;
447452
}
448453

449-
rc = adjust_cq(device, BTL_OPENIB_HP_CQ);
450-
if (OPAL_SUCCESS != rc) {
451-
goto out;
452-
}
454+
opal_mutex_lock (&openib_btl->device->device_lock);
455+
for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) {
456+
if (requested[cq] < mca_btl_openib_component.ib_cq_size[cq]) {
457+
requested[cq] = mca_btl_openib_component.ib_cq_size[cq];
458+
} else if (requested[cq] > openib_btl->device->ib_dev_attr.max_cqe) {
459+
requested[cq] = openib_btl->device->ib_dev_attr.max_cqe;
460+
}
453461

454-
rc = adjust_cq(device, BTL_OPENIB_LP_CQ);
455-
if (OPAL_SUCCESS != rc) {
456-
goto out;
457-
}
462+
if (openib_btl->device->cq_size[cq] < requested[cq]) {
463+
openib_btl->device->cq_size[cq] = requested[cq];
458464

459-
out:
465+
rc = adjust_cq (device, cq);
466+
if (OPAL_SUCCESS != rc) {
467+
break;
468+
}
469+
}
470+
}
471+
opal_mutex_unlock (&openib_btl->device->device_lock);
460472
opal_mutex_unlock(&openib_btl->ib_lock);
473+
461474
return rc;
462475
}
463476

@@ -1107,7 +1120,7 @@ int mca_btl_openib_add_procs(
11071120
}
11081121

11091122
if (nprocs_new) {
1110-
OPAL_THREAD_ADD32(&openib_btl->num_peers, nprocs_new);
1123+
opal_atomic_add_32 (&openib_btl->num_peers, nprocs_new);
11111124

11121125
/* adjust cq sizes given the new procs */
11131126
rc = openib_btl_size_queues (openib_btl);
@@ -1217,7 +1230,7 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
12171230

12181231
/* this is a new process to this openib btl
12191232
* account this procs if need */
1220-
OPAL_THREAD_ADD32(&openib_btl->num_peers, 1);
1233+
opal_atomic_add_32 (&openib_btl->num_peers, 1);
12211234
rc = openib_btl_size_queues(openib_btl);
12221235
if (OPAL_SUCCESS != rc) {
12231236
BTL_ERROR(("error creating cqs"));

opal/mca/btl/openib/btl_openib.h

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,12 @@ BEGIN_C_DECLS
8282
* Infiniband (IB) BTL component.
8383
*/
8484

85+
enum {
86+
BTL_OPENIB_HP_CQ,
87+
BTL_OPENIB_LP_CQ,
88+
BTL_OPENIB_MAX_CQ,
89+
};
90+
8591
typedef enum {
8692
MCA_BTL_OPENIB_TRANSPORT_IB,
8793
MCA_BTL_OPENIB_TRANSPORT_IWARP,
@@ -206,7 +212,7 @@ struct mca_btl_openib_component_t {
206212
uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */
207213
uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */
208214

209-
uint32_t ib_cq_size[2]; /**< Max outstanding CQE on the CQ */
215+
uint32_t ib_cq_size[BTL_OPENIB_MAX_CQ]; /**< Max outstanding CQE on the CQ */
210216

211217
int ib_max_inline_data; /**< Max size of inline data */
212218
unsigned int ib_pkey_val;
@@ -379,8 +385,8 @@ typedef struct mca_btl_openib_device_t {
379385
#endif
380386
struct ibv_device_attr ib_dev_attr;
381387
struct ibv_pd *ib_pd;
382-
struct ibv_cq *ib_cq[2];
383-
uint32_t cq_size[2];
388+
struct ibv_cq *ib_cq[BTL_OPENIB_MAX_CQ];
389+
uint32_t cq_size[BTL_OPENIB_MAX_CQ];
384390
mca_mpool_base_module_t *mpool;
385391
mca_rcache_base_module_t *rcache;
386392
/* MTU for this device */
@@ -863,11 +869,6 @@ extern int mca_btl_openib_ft_event(int state);
863869
*/
864870
void mca_btl_openib_show_init_error(const char *file, int line,
865871
const char *func, const char *dev);
866-
867-
#define BTL_OPENIB_HP_CQ 0
868-
#define BTL_OPENIB_LP_CQ 1
869-
870-
871872
/**
872873
* Post to Shared Receive Queue with certain priority
873874
*

0 commit comments

Comments
 (0)