@@ -218,6 +218,23 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }

     mlir::Value getViewSource() { return getSource(); }
+
+    unsigned getSourceMemorySpace() {
+      auto srcTy = getSourceType();
+      if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
+        auto attr = memrefTy.getMemorySpace();
+        if (attr) {
+          if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr)) {
+            return static_cast<unsigned>(intAttr.getInt());
+          }
+          if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
+            return static_cast<unsigned>(memSpaceAttr.getValue());
+        }
+      }
+      // Take global as the default memory space.
+      return static_cast<unsigned>(MemorySpace::Global);
+    }
+
   }];
 }
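For illustration, a minimal sketch of the IR this helper inspects; the numeric memref memory space and the `#xegpu.block_tdesc_attr` spelling are assumptions, not part of this patch:

```mlir
// Hypothetical sketch: the source memref is annotated with memory space 3
// (shared local memory), which getSourceMemorySpace() would report. An
// unannotated memref<8x16xf32> would fall back to MemorySpace::Global.
%src = memref.alloc() : memref<8x16xf32, 3>
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf32, 3>
    -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
```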
@@ -411,8 +428,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
     implying each element in the array corresponds to a work-item (SIMT lane)
     in the subgroup.
-    * chunk_size: [optional attribute] indicates number of continious
-      elements accessed for each offset, default is 1.
+
+    The first dimension of the result TensorDesc corresponds to work-items, so its size
+    should match the number of offsets. It may also have a second dimension corresponding
+    to the chunk_size if the chunk size is larger than 1.

     Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
     ```mlir
@@ -424,29 +443,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
     ```mlir
     %0 = memref.alloc() : memref<1024xf32>
-    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8} : memref<1024xf32> -> TensorDesc<4x8xf32>
+    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
     ```

     Example 3. It is similar to Example 2, but there is some overlap among work-items.
     It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
     ```mlir
     %0 = memref.alloc() : memref<1024xf32>
-    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8} : memref<1024xf32> -> TensorDesc<4x8xf32>
+    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
     ```
   }];

   let arguments = (ins XeGPU_BaseAddrType: $source,
                        Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets,
-                       DefaultValuedAttr<I64Attr, "1">: $chunk_size);
+                       DenseI64ArrayAttr: $const_offsets);
   let results = (outs XeGPU_TensorDesc:$TensorDesc);

-  let builders = [
-    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
-                   "llvm::ArrayRef<OpFoldResult>": $offsets,
-                   CArg<"uint32_t", "1"> : $chunk_size)>,
-  ];
-
   let assemblyFormat = [{
     $source
     custom<DynamicIndexList>($offsets, $const_offsets)
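A complementary sketch (assumed, not from this patch) of the default case described above: with the chunk size left at 1, the result TensorDesc stays 1D and its single dimension matches the number of offsets.

```mlir
// Hypothetical default-chunk example: 4 offsets and chunk_size = 1, so the
// result is the 1D TensorDesc<4xf32> (one element per work-item).
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4xf32>
```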
@@ -473,6 +485,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
       assert(idx < getNumOffsets() && "Invalid out of bound access.");
       return getMixedOffsets()[idx];
     }
+
+    unsigned getSourceMemorySpace() {
+      auto srcTy = getSource().getType();
+      if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
+        auto attr = memrefTy.getMemorySpace();
+        if (attr) {
+          if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
+            return static_cast<unsigned>(intAttr.getInt());
+          if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
+            return static_cast<unsigned>(memSpaceAttr.getValue());
+        }
+      }
+      // Take global as the default memory space.
+      return static_cast<unsigned>(MemorySpace::Global);
+    }
+
   }];

   let hasVerifier = 1;
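For illustration only, the scattered variant resolves its memory space from the same source memref; the `slm` value and the memref annotation here are assumptions mirroring the `memory_space = global` spelling used later in this patch:

```mlir
// Hypothetical sketch: creating a scattered TensorDesc over shared local
// memory (memory space 3); getSourceMemorySpace() would return slm here.
%0 = memref.alloc() : memref<1024xf32, 3>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32, 3>
    -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<memory_space = slm>>
```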
@@ -520,28 +548,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]

   let description = [{ It (aka. load) loads data per work-item. The output
     describes the data being loaded at the subgroup level, so its size is
-    consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
-    attribute is larger than 1 in TensorDesc, the output vector will be 2D vector,
-    with dim-1 correspoding to the chunk size.
+    consistent with the number of work-items in a subgroup. When the chunk size
+    is larger than 1, the output vector is a 2D vector, with dim-1 corresponding
+    to work-items and dim-0 corresponding to the chunk_size loaded by each work-item.
+    Notably, there is a transpose effect on the result (as compared to the TensorDesc)
+    due to the hardware implementation. Therefore, a transpose attribute is introduced
+    on purpose, making sure users are aware of this implicit transformation.

     The mask operand masks out memory access so that it is safe to pass out-of-boundary
     addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.

     Example:
     ```mlir
-      %2 = xegpu.load %1, %0 {transpose = [1, 0],
+      %2 = xegpu.load %1, %0 {transpose,
                               l1_hint = #xegpu.cache_hint<cached>,
                               l2_hint = #xegpu.cache_hint<uncached>,
                               l3_hint = #xegpu.cache_hint<uncached>}
-            : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
-              -> vector<16xf32>
+            : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+              vector<16xi1> -> vector<16xf32>
     ```
   }];

   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        XeGPU_MaskType: $mask,
-                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<UnitAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
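A sketch of the transpose effect the new unit attribute flags, with shapes assumed to match the 4-offset, chunk_size = 8 descriptor from Example 2 above:

```mlir
// Hypothetical chunked gather: the TensorDesc is 4x8 (dim-0 = work-items,
// dim-1 = chunk), but the loaded value is 8x4 (dim-0 = chunk, dim-1 = lanes),
// i.e. transposed relative to the descriptor.
%v = xegpu.load %td, %mask {transpose}
    : TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1> -> vector<8x4xf32>
```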
@@ -573,11 +604,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
   let hasVerifier = 1;
 }

-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
-                                        AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
+                                        AllElementTypesMatch<["value", "TensorDesc"]>]> {
   let summary = "store data to scattered memory locations.";
-  let description = [{ It (aka. store) stores data to scattered memory locations.
-  It has similar semantic to `load_gather`.
+  let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+  typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+  a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+  and dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+  has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+  introduced on purpose, making sure users are aware of this implicit transformation.

   Example:
   ```mlir
@@ -592,6 +627,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
                        XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        XeGPU_MaskType: $mask,
+                       OptionalAttr<UnitAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
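For symmetry with the load sketch above, a hypothetical chunked store (shapes assumed): the 2D value arrives pre-transposed relative to the descriptor.

```mlir
// Hypothetical chunked scatter: dim-0 of %val is the chunk (8), dim-1 the
// lanes (4); the descriptor itself is 4x8, hence the transpose marker.
xegpu.store %val, %td, %mask {transpose}
    : vector<8x4xf32>, TensorDesc<4x8xf32, chunk_size = 8>, vector<4xi1>
```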
@@ -723,7 +759,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]

 def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
       AllElementTypesMatch<["tensorDesc", "value", "result"]>,
-      AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
+      AllShapesMatch<["tensorDesc", "value", "result"]>]> {
   let summary = "Atomic read-modify-write operation on the TensorDesc.";

   let description = [{
@@ -808,7 +844,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
     2. `Fence_scope` describes the scope of the fence. "Workgroup" means that the scope would be
     within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
   }];
-  let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
+  let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind,
                        XeGPU_FenceScopeAttr: $fence_scope);
   let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
   let extraClassDeclaration = extraBaseClassDeclaration;
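A usage sketch following the assembly format above; the `global` and `workgroup` values are illustrative:

```mlir
// Fence over global memory, scoped to the workgroup; `memory_kind` now
// takes the renamed MemorySpace attribute.
xegpu.fence memory_kind = global, fence_scope = workgroup
```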