Matthew Poremba (Gerrit)
Thu, May 25, 2023 7:14 PM
Matthew Poremba has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email )
Change subject: configs,dev-amdgpu: GPUFS MI200/gfx90a support
......................................................................
configs,dev-amdgpu: GPUFS MI200/gfx90a support
Add support for MI200-like device. This includes adding PCI IDs and new
MMIOs for the device, a different MAP_PROCESS packet, and a different
calculation for the number of VGPRs.
Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70317
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
M configs/example/gpufs/runfs.py
M configs/example/gpufs/system/amdgpu.py
M configs/example/gpufs/system/system.py
M src/dev/amdgpu/amdgpu_device.cc
M src/dev/amdgpu/amdgpu_device.hh
M src/dev/amdgpu/amdgpu_nbio.cc
M src/dev/amdgpu/amdgpu_nbio.hh
M src/dev/amdgpu/amdgpu_vm.hh
M src/dev/amdgpu/pm4_defines.hh
M src/dev/amdgpu/pm4_packet_processor.cc
M src/dev/amdgpu/pm4_packet_processor.hh
M src/gpu-compute/GPU.py
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/hsa_queue_entry.hh
14 files changed, 173 insertions(+), 27 deletions(-)
Approvals:
Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 4c90601..f8ef70d 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -132,8 +132,9 @@
parser.add_argument(
"--gpu-device",
default="Vega10",
- choices=["Vega10", "MI100"],
- help="GPU model to run: Vega10 (gfx900) or MI100 (gfx908)",
+ choices=["Vega10", "MI100", "MI200"],
+ help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
+ "MI200 (gfx90a)",
)
diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index 5f98b55..9697e50 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -177,6 +177,10 @@
system.pc.south_bridge.gpu.DeviceID = 0x738C
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
+ elif args.gpu_device == "MI200":
+ system.pc.south_bridge.gpu.DeviceID = 0x740F
+ system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
+ system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "Vega10":
system.pc.south_bridge.gpu.DeviceID = 0x6863
else:
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 90c5c01..263ffc0 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -152,6 +152,16 @@
0x7D000,
]
sdma_sizes = [0x1000] * 8
+ elif args.gpu_device == "MI200":
+ num_sdmas = 5
+ sdma_bases = [
+ 0x4980,
+ 0x6180,
+ 0x78000,
+ 0x79000,
+ 0x7A000,
+ ]
+ sdma_sizes = [0x1000] * 5
else:
m5.util.panic(f"Unknown GPU device {args.gpu_device}")
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 7037e6f..3260d05 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -115,7 +115,7 @@
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
- } else if (p.device_name == "MI100") {
+ } else if (p.device_name == "MI100" || p.device_name == "MI200") {
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -144,10 +144,19 @@
if (p.device_name == "Vega10") {
setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
+ gfx_version = GfxVersion::gfx900;
} else if (p.device_name == "MI100") {
setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
+ gfx_version = GfxVersion::gfx908;
+ } else if (p.device_name == "MI200") {
+ // This device can have either 64GB or 128GB of device memory.
+ // This limits to 16GB for simulation.
+ setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
+ setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
+ setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
+ gfx_version = GfxVersion::gfx90a;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index cab7991..56ed2f4 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -42,6 +42,7 @@
#include "dev/amdgpu/mmio_reader.hh"
#include "dev/io_device.hh"
#include "dev/pci/device.hh"
+#include "enums/GfxVersion.hh"
#include "params/AMDGPUDevice.hh"
namespace gem5
@@ -145,6 +146,9 @@
*/
memory::PhysicalMemory deviceMem;
+ /* Device information */
+ GfxVersion gfx_version = GfxVersion::gfx900;
+
public:
AMDGPUDevice(const AMDGPUDeviceParams &p);
@@ -206,6 +210,9 @@
uint16_t getVMID(Addr doorbell) { return doorbellVMIDMap[doorbell]; }
std::unordered_map<uint16_t, std::set<int>>& getUsedVMIDs();
void insertQId(uint16_t vmid, int id);
+
+ /* Device information */
+ GfxVersion getGfxVersion() const { return gfx_version; }
};
} // namespace gem5
diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc
index 69e4373..07027c3 100644
--- a/src/dev/amdgpu/amdgpu_nbio.cc
+++ b/src/dev/amdgpu/amdgpu_nbio.cc
@@ -75,12 +75,14 @@
case VEGA10_INV_ENG17_ACK2:
case MI100_INV_ENG17_ACK2:
case MI100_INV_ENG17_ACK3:
+ case MI200_INV_ENG17_ACK2:
pkt->setLE<uint32_t>(0x10001);
break;
case VEGA10_INV_ENG17_SEM1:
case VEGA10_INV_ENG17_SEM2:
case MI100_INV_ENG17_SEM2:
case MI100_INV_ENG17_SEM3:
+ case MI200_INV_ENG17_SEM2:
pkt->setLE<uint32_t>(0x1);
break;
// PSP responds with bit 31 set when ready
diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh
index d1e5391..dc95443 100644
--- a/src/dev/amdgpu/amdgpu_nbio.hh
+++ b/src/dev/amdgpu/amdgpu_nbio.hh
@@ -80,6 +80,11 @@
#define MI100_INV_ENG17_SEM2 0x6a888
#define MI100_INV_ENG17_SEM3 0x76888
+#define MI200_INV_ENG17_ACK1 0x0a318
+#define MI200_INV_ENG17_ACK2 0x6b018
+#define MI200_INV_ENG17_SEM1 0x0a288
+#define MI200_INV_ENG17_SEM2 0x6af88
+
class AMDGPUNbio
{
public:
diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh
index ac35a11..f35a735 100644
--- a/src/dev/amdgpu/amdgpu_vm.hh
+++ b/src/dev/amdgpu/amdgpu_vm.hh
@@ -81,6 +81,10 @@
#define MI100_FB_LOCATION_BASE 0x6ac00
#define MI100_FB_LOCATION_TOP 0x6ac04
+#define MI200_MEM_SIZE_REG 0x0378c
+#define MI200_FB_LOCATION_BASE 0x6b300
+#define MI200_FB_LOCATION_TOP 0x6b304
+
// AMD GPUs support 16 different virtual address spaces
static constexpr int AMDGPU_VM_COUNT = 16;
diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh
index 42832d5..a303f8e 100644
--- a/src/dev/amdgpu/pm4_defines.hh
+++ b/src/dev/amdgpu/pm4_defines.hh
@@ -275,6 +275,64 @@
typedef struct GEM5_PACKED
{
+ uint32_t pasid : 16;
+ uint32_t reserved0 : 8;
+ uint32_t diq : 1;
+ uint32_t processQuantum : 7;
+ union
+ {
+ struct
+ {
+ uint32_t ptBaseLo;
+ uint32_t ptBaseHi;
+ };
+ uint64_t ptBase;
+ };
+ uint32_t shMemBases;
+ uint32_t shMemConfig;
+ uint32_t sqShaderTbaLo;
+ uint32_t sqShaderTbaHi;
+ uint32_t sqShaderTmaLo;
+ uint32_t sqShaderTmaHi;
+ uint32_t reserved1;
+ union
+ {
+ struct
+ {
+ uint32_t gdsAddrLo;
+ uint32_t gdsAddrHi;
+ };
+ uint64_t gdsAddr;
+ };
+ union
+ {
+ struct
+ {
+ uint32_t numGws : 7;
+ uint32_t sdma_enable : 1;
+ uint32_t numOac : 4;
+ uint32_t reserved3 : 4;
+ uint32_t gdsSize : 6;
+ uint32_t numQueues : 10;
+ };
+ uint32_t ordinal14;
+ };
+ uint32_t spiGdbgPerVmidCntl;
+ uint32_t tcpWatchCntl[4];
+ union
+ {
+ struct
+ {
+ uint32_t completionSignalLo;
+ uint32_t completionSignalHi;
+ };
+ uint64_t completionSignal;
+ };
+} PM4MapProcessMI200;
+static_assert(sizeof(PM4MapProcessMI200) == 80);
+
+typedef struct GEM5_PACKED
+{
uint32_t function : 4;
uint32_t memSpace : 2;
uint32_t operation : 2;
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 3690113..e7b8465 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -271,12 +271,21 @@
dmaBuffer);
} break;
case IT_MAP_PROCESS: {
- dmaBuffer = new PM4MapProcess();
- cb = new DmaVirtCallback<uint64_t>(
- [ = ] (const uint64_t &)
- { mapProcess(q, (PM4MapProcess *)dmaBuffer); });
- dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
- dmaBuffer);
+ if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
+ dmaBuffer = new PM4MapProcessMI200();
+ cb = new DmaVirtCallback<uint64_t>(
+ [ = ] (const uint64_t &)
+                    { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
+ dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
+ cb, dmaBuffer);
+ } else {
+ dmaBuffer = new PM4MapProcess();
+ cb = new DmaVirtCallback<uint64_t>(
+ [ = ] (const uint64_t &)
+ { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
+ dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
+ dmaBuffer);
+ }
} break;
case IT_UNMAP_QUEUES: {
@@ -613,27 +622,50 @@
}
void
-PM4PacketProcessor::mapProcess(PM4Queue *q, PM4MapProcess *pkt)
+PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
+ uint32_t shMemBases)
{
- q->incRptr(sizeof(PM4MapProcess));
- uint16_t vmid = gpuDevice->allocateVMID(pkt->pasid);
+ uint16_t vmid = gpuDevice->allocateVMID(pasid);
- DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p vmid: %d quantum: "
- "%d pt: %p signal: %p\n", pkt->pasid, vmid, pkt->processQuantum,
- pkt->ptBase, pkt->completionSignal);
-
- gpuDevice->getVM().setPageTableBase(vmid, pkt->ptBase);
- gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, pkt->shMemBases);
+ gpuDevice->getVM().setPageTableBase(vmid, ptBase);
+ gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases);
// Setup the apertures that gem5 uses. These values are bits [63:48].
- Addr lds_base = (Addr)bits(pkt->shMemBases, 31, 16) << 48;
- Addr scratch_base = (Addr)bits(pkt->shMemBases, 15, 0) << 48;
+ Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48;
+ Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48;
// There does not seem to be any register for the limit, but the driver
// assumes scratch and LDS have a 4GB aperture, so use that.
gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF);
gpuDevice->CP()->shader()->setScratchApe(scratch_base,
scratch_base + 0xFFFFFFFF);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
+{
+ q->incRptr(sizeof(PM4MapProcess));
+
+ DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+ "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+ pkt->ptBase, pkt->completionSignal);
+
+ mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
+
+ delete pkt;
+ decodeNext(q);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
+{
+ q->incRptr(sizeof(PM4MapProcessMI200));
+
+ DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+ "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+ pkt->ptBase, pkt->completionSignal);
+
+ mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
delete pkt;
decodeNext(q);
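As an aside, the aperture setup that the refactored mapProcess() performs can be summarized with a small self-contained C++ sketch. bitExtract() below is an illustrative stand-in for gem5's bits(), and the SH_MEM_BASES value is made up; this is not gem5 code.

#include <cstdint>
#include <iostream>

// Illustrative stand-in for gem5's bits(): extract bits [hi:lo] of val.
static uint64_t bitExtract(uint64_t val, int hi, int lo)
{
    return (val >> lo) & ((1ULL << (hi - lo + 1)) - 1);
}

int main()
{
    uint32_t shMemBases = 0x00010002;  // example SH_MEM_BASES value

    // Bits [31:16] and [15:0] of SH_MEM_BASES become bits [63:48] of the
    // LDS and scratch aperture base addresses, respectively.
    uint64_t ldsBase     = bitExtract(shMemBases, 31, 16) << 48;
    uint64_t scratchBase = bitExtract(shMemBases, 15, 0) << 48;

    // The driver assumes a 4 GiB aperture for both LDS and scratch, so the
    // limit is base + 0xFFFFFFFF.
    uint64_t ldsLimit     = ldsBase + 0xFFFFFFFF;
    uint64_t scratchLimit = scratchBase + 0xFFFFFFFF;

    std::cout << std::hex
              << "LDS:     [0x" << ldsBase << ", 0x" << ldsLimit << "]\n"
              << "Scratch: [0x" << scratchBase << ", 0x" << scratchLimit
              << "]\n";
    return 0;
}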
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 4617a21..3fb0551 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -141,7 +141,9 @@
void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
void doneMQDWrite(Addr mqdAddr, Addr addr);
- void mapProcess(PM4Queue *q, PM4MapProcess *pkt);
+ void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
+ void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
+ void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
uint16_t vmid);
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 3a87186..c5449cc 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -45,7 +45,7 @@
class GfxVersion(ScopedEnum):
- vals = ["gfx801", "gfx803", "gfx900", "gfx902"]
+ vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"]
class PoolManager(SimObject):
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index af59b78..9755180 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -228,7 +228,8 @@
DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
- dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
+ dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
+ gpuDevice->getGfxVersion());
DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
"grid size (%dx%dx%d) kernarg addr: %#x, completion "
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index fbe0efe..4083c1c 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -51,6 +51,7 @@
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
#include "gpu-compute/kernel_code.hh"
namespace gem5
@@ -61,7 +62,7 @@
public:
HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
- Addr host_pkt_addr, Addr code_addr)
+ Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
: kernName(kernel_name),
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
@@ -92,9 +93,19 @@
// we need to rip register usage from the resource registers.
//
// We can't get an exact number of registers from the resource
- // registers because they round, but we can get an upper bound on it
- if (!numVgprs)
- numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+ // registers because they round, but we can get an upper bound on it.
+ // We determine the number of registers by solving for "vgprs_used"
+ // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
+ // #code-object-v3-kernel-descriptor
+ // Currently, the only supported gfx version in gem5 that computes
+ // this differently is gfx90a.
+ if (!numVgprs) {
+ if (gfx_version == GfxVersion::gfx90a) {
+ numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
+ } else {
+ numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+ }
+ }
if (!numSgprs || numSgprs ==
std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
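For reference, the VGPR upper-bound recovery described in the new comment can be illustrated with a short standalone C++ sketch; the helper name and example value below are illustrative and not part of the change.

#include <cstdint>
#include <iostream>

// Recover an upper bound on vgprs_used from the kernel descriptor field,
// following the formula in the LLVM AMDGPUUsage documentation:
//   granulated = ceil(vgprs_used / granularity) - 1
// so (granulated + 1) * granularity >= vgprs_used.
static uint32_t vgprUpperBound(uint32_t granulatedWorkitemVgprCount,
                               bool isGfx90a)
{
    // gfx90a allocates VGPRs with a granularity of 8; the other gfx9
    // targets supported here use a granularity of 4.
    const uint32_t granularity = isGfx90a ? 8 : 4;
    return (granulatedWorkitemVgprCount + 1) * granularity;
}

int main()
{
    // Example: a kernel descriptor with granulated_workitem_vgpr_count = 3.
    std::cout << "gfx908: " << vgprUpperBound(3, false) << " VGPRs\n";  // 16
    std::cout << "gfx90a: " << vgprUpperBound(3, true)  << " VGPRs\n";  // 32
    return 0;
}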
--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings?usp=email
Gerrit-MessageType: merged
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
Gerrit-Change-Number: 70317
Gerrit-PatchSet: 2
Gerrit-Owner: Matthew Poremba <matthew.poremba@amd.com>
Gerrit-Reviewer: Jason Lowe-Power <jason@lowepower.com>
Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba@amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>