@@ -181,6 +181,7 @@ static int accelerator_rocm_component_register(void)
181
181
182
182
int opal_accelerator_rocm_lazy_init ()
183
183
{
184
+ hipError_t hip_err ;
184
185
int err = OPAL_SUCCESS ;
185
186
186
187
/* Double checked locking to avoid having to
@@ -196,41 +197,94 @@ int opal_accelerator_rocm_lazy_init()
196
197
goto out ;
197
198
}
198
199
199
- err = hipGetDeviceCount (& opal_accelerator_rocm_num_devices );
200
- if (hipSuccess != err ) {
200
+ hip_err = hipGetDeviceCount (& opal_accelerator_rocm_num_devices );
201
+ if (hipSuccess != hip_err ) {
201
202
opal_output (0 , "Failed to query device count, err=%d %s\n" ,
202
- err , hipGetErrorString (err ));
203
- err = OPAL_ERROR ;
203
+ hip_err , hipGetErrorString (hip_err ));
204
+ err = OPAL_ERROR ;
204
205
goto out ;
205
206
}
206
207
207
208
hipStream_t memcpy_stream ;
208
- err = hipStreamCreate (& memcpy_stream );
209
- if (hipSuccess != err ) {
209
+ hip_err = hipStreamCreate (& memcpy_stream );
210
+ if (hipSuccess != hip_err ) {
210
211
opal_output (0 , "Could not create hipStream, err=%d %s\n" ,
211
- err , hipGetErrorString (err ));
212
- err = OPAL_ERROR ; // we got hipErrorInvalidValue, pretty bad
212
+ hip_err , hipGetErrorString (hip_err ));
213
+ err = OPAL_ERROR ; // we got hipErrorInvalidValue, pretty bad
213
214
goto out ;
214
215
}
216
+
215
217
opal_accelerator_rocm_MemcpyStream = malloc (sizeof (hipStream_t ));
218
+ if (NULL == opal_accelerator_rocm_MemcpyStream ) {
219
+ opal_output (0 , "Could not allocate hipStream\n" );
220
+ err = OPAL_ERR_OUT_OF_RESOURCE ;
221
+ goto out ;
222
+ }
216
223
* opal_accelerator_rocm_MemcpyStream = memcpy_stream ;
217
224
218
225
opal_accelerator_rocm_mem_bw = malloc (sizeof (float )* opal_accelerator_rocm_num_devices );
226
+ if (NULL == opal_accelerator_rocm_mem_bw ) {
227
+ opal_output (0 , "Could not allocate memory_bw array\n" );
228
+ err = OPAL_ERR_OUT_OF_RESOURCE ;
229
+ goto out ;
230
+ }
231
+
219
232
for (int i = 0 ; i < opal_accelerator_rocm_num_devices ; ++ i ) {
220
233
int mem_clock_rate ; // kHz
221
- err = hipDeviceGetAttribute (& mem_clock_rate ,
222
- hipDeviceAttributeMemoryClockRate ,
223
- i );
234
+ hip_err = hipDeviceGetAttribute (& mem_clock_rate ,
235
+ hipDeviceAttributeMemoryClockRate ,
236
+ i );
237
+ if (hipSuccess != hip_err ) {
238
+ opal_output (0 , "Failed to query device MemoryClockRate, err=%d %s\n" ,
239
+ hip_err , hipGetErrorString (hip_err ));
240
+ err = OPAL_ERROR ;
241
+ goto out ;
242
+ }
243
+
224
244
int bus_width ; // bit
225
- err = hipDeviceGetAttribute (& bus_width ,
226
- hipDeviceAttributeMemoryBusWidth ,
227
- i );
228
- /* bw = clock_rate * bus width * 2bit multiplier
229
- * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
230
- */
245
+ hip_err = hipDeviceGetAttribute (& bus_width ,
246
+ hipDeviceAttributeMemoryBusWidth ,
247
+ i );
248
+ if (hipSuccess != hip_err ) {
249
+ opal_output (0 , "Failed to query device MemoryBusWidth, err=%d %s\n" ,
250
+ hip_err , hipGetErrorString (hip_err ));
251
+ err = OPAL_ERROR ;
252
+ goto out ;
253
+ }
254
+
255
+ /* bw = clock_rate * bus width * 2bit multiplier */
231
256
float bw = ((float )mem_clock_rate * (float )bus_width * 2.0 ) / 1024 / 1024 / 8 ;
232
257
opal_accelerator_rocm_mem_bw [i ] = bw ;
233
258
}
259
+
260
+ #if HIP_VERSION >= 60000000
261
+ int dev_id ;
262
+ hip_err = hipGetDevice (& dev_id );
263
+ if (hipSuccess != hip_err ) {
264
+ opal_output (0 , "error retrieving current device" );
265
+ err = OPAL_ERROR ;
266
+ goto out ;
267
+ }
268
+
269
+ int has_large_bar = 0 ;
270
+ hip_err = hipDeviceGetAttribute (& has_large_bar , hipDeviceAttributeIsLargeBar ,
271
+ dev_id );
272
+ if (hipSuccess != hip_err ) {
273
+ opal_output (0 , "error retrieving current device" );
274
+ err = OPAL_ERROR ;
275
+ goto out ;
276
+ }
277
+
278
+ if (0 == has_large_bar ) {
279
+ // Without large BAR we have to use hipMemcpy(Async) for all data transfers
280
+ opal_output (0 , "Large BAR support is not enabled on current device. "
281
+ "Enable large BAR support in BIOS (Above 4G Encoding) for "
282
+ "better performance\n." );
283
+ opal_accelerator_rocm_memcpyH2D_limit = 0 ;
284
+ opal_accelerator_rocm_memcpyD2H_limit = 0 ;
285
+ }
286
+ #endif
287
+
234
288
err = OPAL_SUCCESS ;
235
289
opal_atomic_wmb ();
236
290
accelerator_rocm_init_complete = true;
0 commit comments