Skip to content

Commit 54ddbc6

Browse files
authored
[AMDGPU] fix amdgpu_max_num_work_groups in templates (#141633)
Clang does not instantiate amdgpu_max_num_work_groups attribute with one template argument, causing assertion codegen. Fixes: #139570
1 parent f98bdd9 commit 54ddbc6

File tree

2 files changed

+40
-13
lines changed

2 files changed

+40
-13
lines changed

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -648,21 +648,30 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
648648
EnterExpressionEvaluationContext Unevaluated(
649649
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
650650

651-
ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
652-
if (!ResultX.isUsable())
653-
return;
654-
ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
655-
if (!ResultY.isUsable())
656-
return;
657-
ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
658-
if (!ResultZ.isUsable())
659-
return;
651+
Expr *XExpr = nullptr;
652+
Expr *YExpr = nullptr;
653+
Expr *ZExpr = nullptr;
654+
655+
if (Attr.getMaxNumWorkGroupsX()) {
656+
ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
657+
if (ResultX.isUsable())
658+
XExpr = ResultX.getAs<Expr>();
659+
}
660+
661+
if (Attr.getMaxNumWorkGroupsY()) {
662+
ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
663+
if (ResultY.isUsable())
664+
YExpr = ResultY.getAs<Expr>();
665+
}
660666

661-
Expr *XExpr = ResultX.getAs<Expr>();
662-
Expr *YExpr = ResultY.getAs<Expr>();
663-
Expr *ZExpr = ResultZ.getAs<Expr>();
667+
if (Attr.getMaxNumWorkGroupsZ()) {
668+
ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
669+
if (ResultZ.isUsable())
670+
ZExpr = ResultZ.getAs<Expr>();
671+
}
664672

665-
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
673+
if (XExpr)
674+
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
666675
}
667676

668677
// This doesn't take any template parameters, but we have a custom action that

clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,24 @@ __global__ void template_32_4_a_max_num_work_groups() {}
7878
template __global__ void template_32_4_a_max_num_work_groups<2>();
7979
// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
8080

81+
template<unsigned a>
82+
__attribute__((amdgpu_max_num_work_groups(a)))
83+
__global__ void template_a_max_num_work_groups() {}
84+
template __global__ void template_a_max_num_work_groups<32>();
85+
// CHECK: define{{.*}} amdgpu_kernel void @_Z30template_a_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_1_1]]
86+
87+
template<unsigned a, unsigned b>
88+
__attribute__((amdgpu_max_num_work_groups(a, b)))
89+
__global__ void template_a_b_max_num_work_groups() {}
90+
template __global__ void template_a_b_max_num_work_groups<32, 1>();
91+
// CHECK: define{{.*}} amdgpu_kernel void @_Z32template_a_b_max_num_work_groupsILj32ELj1EEvv() [[MAX_NUM_WORK_GROUPS_32_1_1]]
92+
93+
template<unsigned a, unsigned b, unsigned c>
94+
__attribute__((amdgpu_max_num_work_groups(a, b, c)))
95+
__global__ void template_a_b_c_max_num_work_groups() {}
96+
template __global__ void template_a_b_c_max_num_work_groups<32, 4, 2>();
97+
// CHECK: define{{.*}} amdgpu_kernel void @_Z34template_a_b_c_max_num_work_groupsILj32ELj4ELj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2]]
98+
8199
// Make sure this is silently accepted on other targets.
82100
// NAMD-NOT: "amdgpu-flat-work-group-size"
83101
// NAMD-NOT: "amdgpu-waves-per-eu"

0 commit comments

Comments
 (0)