
Commit 8b5e841

[MLIR][XeGPU] Updates XeGPU TensorDescAttr and Refine Gather/Scatter definition (#109675)
Bring back #109144 with fixes to VectorToXeGPU
1 parent 3fbf6f8 commit 8b5e841

8 files changed: 365 additions & 175 deletions

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Lines changed: 45 additions & 19 deletions
@@ -19,12 +19,18 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }
 
-def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+class XeGPU_TensorDescAttr<string name, string attrMnemonic, list<Trait> traits = [],
+                           string baseCppClass = "::mlir::Attribute">
+    : XeGPUAttr<name, attrMnemonic, traits, baseCppClass> {
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> {
   let summary = [{a composite attribute for `TensorDescType`}];
-  let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
+  let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite
     attribute defined for `TensorDescType` for describing following
     properties of a `TensorDesc`.
-    1. `memory_scope`: It describes where the data block described by the
+    1. `memory_space`: It describes where the data block described by the
        TensorDesc is located, `Global` device memory or `Shared` local memory.
        It is default to `Global`.
     2. `array_length`: It describes how many horizontally consecutive blocks
@@ -33,43 +39,63 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
        8x32. Its default value is 1.
     3. `boundary_check`: It is used to indicates the hardware whether to do
        out-of-boundary check. The default value is true.
-    4. `scattered`: It is used to differenciate TensorDescs created from
-       `create_nd_tdesc` vs from `create_tdesc`.
   }];
 
   let parameters = (ins
-    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+    OptionalParameter<"MemorySpaceAttr">: $memory_space,
     OptionalParameter<"IntegerAttr", "1">: $array_length,
-    OptionalParameter<"BoolAttr", "true">: $boundary_check,
-    OptionalParameter<"BoolAttr", "false">: $scattered
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
   );
 
   let builders = [
     AttrBuilder<(ins
-      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+      CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
       CArg<"int", "1">:$array_length,
-      CArg<"bool", "true">: $boundary_check,
-      CArg<"bool", "false">: $scattered
+      CArg<"bool", "true">: $boundary_check
     )>
   ];
 
-  let assemblyFormat = "`<` struct(params) `>`";
 }
 
+def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> {
+  let summary = [{a composite attribute for `TensorDescType`}];
+  let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite
+    attribute defined for `TensorDescType` for describing following
+    properties of a `TensorDesc`.
+    1. `memory_space`: It describes where the data block described by the
+       TensorDesc is located, `Global` device memory or `Shared` local memory.
+       It is default to `Global`.
+    2. `chunk_size`: indicates number of continious elements accessed for each
+       offset, default is 1. It is used with `scattered` attr only.
+  }];
+
+  let parameters = (ins
+    OptionalParameter<"MemorySpaceAttr">: $memory_space,
+    OptionalParameter<"IntegerAttr", "1">: $chunk_size
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
+      CArg<"int", "1">: $chunk_size
+    )>
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU Memory Scope Enums.
 //===----------------------------------------------------------------------===//
-def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
-def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+def XeGPU_MemorySpaceGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemorySpaceShared: I32EnumAttrCase<"SLM", 3, "slm">;
+def XeGPU_MemorySpace: I32EnumAttr<"MemorySpace",
     "The address space of the memory the tensor descritor is created for",
-    [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+    [XeGPU_MemorySpaceGlobal, XeGPU_MemorySpaceShared]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
 
-def XeGPU_MemoryScopeAttr:
-  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+def XeGPU_MemorySpaceAttr:
+  EnumAttr<XeGPU_Dialect, XeGPU_MemorySpace, "memory_space"> {
   let summary = [{Describe the location of data described by a `TensorDesc`:
     Global device memory (`Global`) or Shared local memory (`SLM`).}];
   let assemblyFormat = "$value";
@@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr:
   let assemblyFormat = "$value";
 }
 
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
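Assuming the `struct(params)` assembly format declared above, the two new attributes would print roughly as follows; element types and shapes are illustrative, and the `scatter_tdesc_attr` spelling matches the op example later in this commit:

```mlir
// Block descriptor: two consecutive 8x16 blocks in shared local memory.
!xegpu.tensor_desc<8x16xf32,
  #xegpu.block_tdesc_attr<memory_space = slm, array_length = 2, boundary_check = true>>

// Scatter descriptor: 16 offsets with 8 contiguous elements per offset, in global memory.
!xegpu.tensor_desc<16x8xf32,
  #xegpu.scatter_tdesc_attr<memory_space = global, chunk_size = 8>>
```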

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 61 additions & 25 deletions
@@ -218,6 +218,23 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
 
     mlir::Value getViewSource() { return getSource(); }
+
+    unsigned getSourceMemorySpace() {
+      auto srcTy = getSourceType();
+      if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
+        auto attr = memrefTy.getMemorySpace();
+        if (attr) {
+          if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr)) {
+            return static_cast<unsigned>(intAttr.getInt());
+          }
+          if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
+            return static_cast<unsigned>(memSpaceAttr.getValue());
+        }
+      }
+      // take global as default memory scope.
+      return static_cast<unsigned>(MemorySpace::Global);
+    }
+
   }];
 }
 
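For illustration only (names and shapes are made up, and the op syntax follows the existing `create_nd_tdesc` examples), two sources the helper above would classify differently: a memref carrying integer address space 3, which now matches `MemorySpace::SLM`, and a plain memref that falls back to `Global`:

```mlir
// memref with integer memory space 3: getSourceMemorySpace() reports SLM.
%src_slm = memref.alloc() : memref<64x64xf16, 3>
%td_slm  = xegpu.create_nd_tdesc %src_slm[0, 0] : memref<64x64xf16, 3>
             -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_space = slm>>

// memref with no memory-space attribute: defaults to Global.
%src_glb = memref.alloc() : memref<64x64xf16>
%td_glb  = xegpu.create_nd_tdesc %src_glb[0, 0] : memref<64x64xf16>
             -> !xegpu.tensor_desc<8x16xf16>
```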
@@ -411,8 +428,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
     implying each element in the array corresponds to a work-item (SIMT lane)
     in the subgroup.
-    * chunk_size: [optional attribute] indicates number of continious
-      elements accessed for each offset, default is 1.
+
+    The first dimension of the result TensorDesc corresponds to work-items, so it should
+    match the dimension of offsets. It may also has a second dimension corresponding to
+    the chunk_size if the chunk size is larger than 1.
 
     Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
     ```mlir
@@ -424,29 +443,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
     ```mlir
     %0 = memref.alloc() : memref<1024xf32>
-    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
     ```
 
     Example 3. It is similar to Example 2, but there is some overlaps among workitems.
     It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
     ```mlir
     %0 = memref.alloc() : memref<1024xf32>
-    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>>
     ```
   }];
 
   let arguments = (ins XeGPU_BaseAddrType: $source,
                        Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets,
-                       DefaultValuedAttr<I64Attr, "1">: $chunk_size);
+                       DenseI64ArrayAttr: $const_offsets);
   let results = (outs XeGPU_TensorDesc:$TensorDesc);
 
-  let builders = [
-    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
-               "llvm::ArrayRef<OpFoldResult>": $offsets,
-               CArg<"uint32_t", "1"> : $chunk_size)>,
-  ];
-
   let assemblyFormat = [{
     $source
     custom<DynamicIndexList>($offsets, $const_offsets)
@@ -473,6 +485,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
       assert(idx < getNumOffsets() && "Invalid out of bound access.");
       return getMixedOffsets()[idx];
     }
+
+    unsigned getSourceMemorySpace() {
+      auto srcTy = getSource().getType();
+      if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
+        auto attr = memrefTy.getMemorySpace();
+        if (attr) {
+          if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
+            return static_cast<unsigned>(intAttr.getInt());
+          if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
+            return static_cast<unsigned>(memSpaceAttr.getValue());
+        }
+      }
+      // take global as default memory scope.
+      return static_cast<unsigned>(MemorySpace::Global);
+    }
+
   }];
 
   let hasVerifier = 1;
@@ -520,28 +548,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
 
   let description = [{ It (aka. load) load data per each work-item. The output
     describes the data being loaded at the subgroup level, so its size is
-    consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
-    attribute is larger than 1 in TensorDesc, the output vector will be 2D vector,
-    with dim-1 correspoding to the chunk size.
+    consistent with the number of work-items in a subgroup. When the chunk size
+    is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
+    to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+    Specially, there is a transpose effect on the result (as compared to the TensorDesc)
+    due to the hardware implementation. Therefore, a transpose attribute is introduced
+    on purpose, making sure users are aware of this implicit transformation.
 
     The mask operand masks out memory access so that it is safe to pass out-of-boundary
     addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
 
     Example:
     ```mlir
-      %2 = xegpu.load %1, %0 {transpose = [1, 0],
+      %2 = xegpu.load %1, %0 {transpose,
                               l1_hint = #xegpu.cache_hint<cached>,
                               l2_hint = #xegpu.cache_hint<uncached>,
                               l3_hint = #xegpu.cache_hint<uncached>}
-            : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
-              -> vector<16xf32>
+            : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+              vector<16xi1> -> vector<16xf32>
     ```
 
   }];
 
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        XeGPU_MaskType: $mask,
-                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<UnitAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
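A hedged sketch of the chunked case described above, reusing the kind of descriptor built in Example 2 of `create_tdesc`; `%tdesc` and `%mask` are placeholders and the cache hint is illustrative. With 16 offsets and `chunk_size = 8`, the loaded vector comes back transposed, chunk dimension first:

```mlir
%v = xegpu.load %tdesc, %mask {transpose,
                               l1_hint = #xegpu.cache_hint<cached>}
       : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>,
         vector<16xi1> -> vector<8x16xf32>
```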
@@ -573,11 +604,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
   let hasVerifier = 1;
 }
 
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
-                                        AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
+                                        AllElementTypesMatch<["value", "TensorDesc"]>]> {
   let summary = "store data to scattered memory locations.";
-  let description = [{ It (aka. store) stores data to scattered memory locations.
-  It has similar semantic to `load_gather`.
+  let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+  typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+  a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
+  and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+  has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
+  introduced on purpose, making sure users are aware of this implicit transformation.
 
     Example:
     ```mlir
@@ -592,6 +627,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
                        XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        XeGPU_MaskType: $mask,
+                       OptionalAttr<UnitAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
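And a matching sketch of the chunked store; the textual form is assumed from the operand order above, with `%val`, `%tdesc`, and `%mask` as placeholders. The stored value is the transposed 2D vector, chunk dimension first, lanes second:

```mlir
xegpu.store %val, %tdesc, %mask {transpose,
                                 l1_hint = #xegpu.cache_hint<uncached>}
    : vector<8x16xf32>,
      !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>,
      vector<16xi1>
```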
@@ -723,7 +759,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
 
 def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
       AllElementTypesMatch<["tensorDesc", "value", "result"]>,
-      AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
+      AllShapesMatch<["tensorDesc", "value", "result"]>]> {
   let summary = "Atomic ready-modify-write operation on the TensorDesc. ";
 
   let description = [{
@@ -808,7 +844,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
     2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
        within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
   }];
-  let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
+  let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind,
                        XeGPU_FenceScopeAttr: $fence_scope);
   let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
   let extraClassDeclaration = extraBaseClassDeclaration;
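Given the assembly format above, a fence on global memory scoped to the workgroup would look roughly like this; the lowercase enum spellings are assumed from the `global`/`slm` cases defined in XeGPUAttrs.td and from the "Workgroup"/"GPU" scopes described in the op documentation:

```mlir
xegpu.fence memory_kind = global, fence_scope = workgroup
```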
