gem5-dev@gem5.org

The gem5 Developer List

[M] Change in gem5/gem5[develop]: configs,dev-amdgpu: GPUFS MI200/gfx90a support

Matthew Poremba (Gerrit)
Thu, May 25, 2023 7:14 PM

Matthew Poremba has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email )

Change subject: configs,dev-amdgpu: GPUFS MI200/gfx90a support
......................................................................

configs,dev-amdgpu: GPUFS MI200/gfx90a support

Add support for MI200-like device. This includes adding PCI IDs and new
MMIOs for the device, a different MAP_PROCESS packet, and a different
calculation for the number of VGPRs.

Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70317
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
M configs/example/gpufs/runfs.py
M configs/example/gpufs/system/amdgpu.py
M configs/example/gpufs/system/system.py
M src/dev/amdgpu/amdgpu_device.cc
M src/dev/amdgpu/amdgpu_device.hh
M src/dev/amdgpu/amdgpu_nbio.cc
M src/dev/amdgpu/amdgpu_nbio.hh
M src/dev/amdgpu/amdgpu_vm.hh
M src/dev/amdgpu/pm4_defines.hh
M src/dev/amdgpu/pm4_packet_processor.cc
M src/dev/amdgpu/pm4_packet_processor.hh
M src/gpu-compute/GPU.py
M src/gpu-compute/gpu_command_processor.cc
M src/gpu-compute/hsa_queue_entry.hh
14 files changed, 173 insertions(+), 27 deletions(-)

Approvals:
Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass

diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 4c90601..f8ef70d 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -132,8 +132,9 @@
     parser.add_argument(
         "--gpu-device",
         default="Vega10",
-        choices=["Vega10", "MI100"],
-        help="GPU model to run: Vega10 (gfx900) or MI100 (gfx908)",
+        choices=["Vega10", "MI100", "MI200"],
+        help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
+        "MI200 (gfx90a)",
     )
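
For reference, a hypothetical invocation selecting the new model might look
like the following; the disk image, kernel, and application values are
placeholders for the usual gpufs options, not part of this change:

    build/VEGA_X86/gem5.opt configs/example/gpufs/runfs.py \
        --disk-image ./disk.img --kernel ./vmlinux --app ./benchmark \
        --gpu-device MI200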

diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index 5f98b55..9697e50 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -177,6 +177,10 @@
         system.pc.south_bridge.gpu.DeviceID = 0x738C
         system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
         system.pc.south_bridge.gpu.SubsystemID = 0x0C34
+    elif args.gpu_device == "MI200":
+        system.pc.south_bridge.gpu.DeviceID = 0x740F
+        system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
+        system.pc.south_bridge.gpu.SubsystemID = 0x0C34
     elif args.gpu_device == "Vega10":
         system.pc.south_bridge.gpu.DeviceID = 0x6863
     else:

diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 90c5c01..263ffc0 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -152,6 +152,16 @@
             0x7D000,
         ]
         sdma_sizes = [0x1000] * 8
+    elif args.gpu_device == "MI200":
+        num_sdmas = 5
+        sdma_bases = [
+            0x4980,
+            0x6180,
+            0x78000,
+            0x79000,
+            0x7A000,
+        ]
+        sdma_sizes = [0x1000] * 5
     else:
         m5.util.panic(f"Unknown GPU device {args.gpu_device}")

diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 7037e6f..3260d05 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -115,7 +115,7 @@
         sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
         sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
-    } else if (p.device_name == "MI100") {
+    } else if (p.device_name == "MI100" || p.device_name == "MI200") {
         sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
         sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
         sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -144,10 +144,19 @@
     if (p.device_name == "Vega10") {
         setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
+        gfx_version = GfxVersion::gfx900;
     } else if (p.device_name == "MI100") {
         setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
         setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
+        gfx_version = GfxVersion::gfx908;
+    } else if (p.device_name == "MI200") {
+        // This device can have either 64GB or 128GB of device memory.
+        // This limits to 16GB for simulation.
+        setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
+        setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
+        setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
+        gfx_version = GfxVersion::gfx90a;
     } else {
         panic("Unknown GPU device %s\n", p.device_name);
     }

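For context, device_name is the AMDGPUDevice parameter the constructor code
above switches on; the gpufs configs presumably set it from --gpu-device. A
minimal, hypothetical Python sketch (not a complete PCI device configuration):

    from m5.objects import AMDGPUDevice

    # Hypothetical: "MI200" selects the MI200 MMIO offsets above and sets
    # gfx_version to gfx90a inside the model.
    gpu = AMDGPUDevice(device_name="MI200")
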
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index cab7991..56ed2f4 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -42,6 +42,7 @@
#include "dev/amdgpu/mmio_reader.hh"
#include "dev/io_device.hh"
#include "dev/pci/device.hh"
+#include "enums/GfxVersion.hh"
#include "params/AMDGPUDevice.hh"

namespace gem5
@@ -145,6 +146,9 @@
      */
     memory::PhysicalMemory deviceMem;
 
+    /* Device information */
+    GfxVersion gfx_version = GfxVersion::gfx900;
+
   public:
     AMDGPUDevice(const AMDGPUDeviceParams &p);
 
@@ -206,6 +210,9 @@
     uint16_t getVMID(Addr doorbell) { return doorbellVMIDMap[doorbell]; }
     std::unordered_map<uint16_t, std::set<int>>& getUsedVMIDs();
     void insertQId(uint16_t vmid, int id);
+
+    /* Device information */
+    GfxVersion getGfxVersion() const { return gfx_version; }
 };

} // namespace gem5
diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc
index 69e4373..07027c3 100644
--- a/src/dev/amdgpu/amdgpu_nbio.cc
+++ b/src/dev/amdgpu/amdgpu_nbio.cc
@@ -75,12 +75,14 @@
       case VEGA10_INV_ENG17_ACK2:
       case MI100_INV_ENG17_ACK2:
       case MI100_INV_ENG17_ACK3:
+      case MI200_INV_ENG17_ACK2:
         pkt->setLE<uint32_t>(0x10001);
         break;
       case VEGA10_INV_ENG17_SEM1:
       case VEGA10_INV_ENG17_SEM2:
       case MI100_INV_ENG17_SEM2:
       case MI100_INV_ENG17_SEM3:
+      case MI200_INV_ENG17_SEM2:
         pkt->setLE<uint32_t>(0x1);
         break;
       // PSP responds with bit 31 set when ready

diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh
index d1e5391..dc95443 100644
--- a/src/dev/amdgpu/amdgpu_nbio.hh
+++ b/src/dev/amdgpu/amdgpu_nbio.hh
@@ -80,6 +80,11 @@
#define MI100_INV_ENG17_SEM2                              0x6a888
#define MI100_INV_ENG17_SEM3                              0x76888

+#define MI200_INV_ENG17_ACK1                              0x0a318
+#define MI200_INV_ENG17_ACK2                              0x6b018
+#define MI200_INV_ENG17_SEM1                              0x0a288
+#define MI200_INV_ENG17_SEM2                              0x6af88
+
class AMDGPUNbio
{
public:
diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh
index ac35a11..f35a735 100644
--- a/src/dev/amdgpu/amdgpu_vm.hh
+++ b/src/dev/amdgpu/amdgpu_vm.hh
@@ -81,6 +81,10 @@
 #define MI100_FB_LOCATION_BASE                            0x6ac00
 #define MI100_FB_LOCATION_TOP                             0x6ac04
 
+#define MI200_MEM_SIZE_REG                                0x0378c
+#define MI200_FB_LOCATION_BASE                            0x6b300
+#define MI200_FB_LOCATION_TOP                             0x6b304
+
 // AMD GPUs support 16 different virtual address spaces
 static constexpr int AMDGPU_VM_COUNT = 16;

diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh
index 42832d5..a303f8e 100644
--- a/src/dev/amdgpu/pm4_defines.hh
+++ b/src/dev/amdgpu/pm4_defines.hh
@@ -275,6 +275,64 @@
 
 typedef struct GEM5_PACKED
 {
+    uint32_t pasid : 16;
+    uint32_t reserved0 : 8;
+    uint32_t diq : 1;
+    uint32_t processQuantum : 7;
+    union
+    {
+        struct
+        {
+            uint32_t ptBaseLo;
+            uint32_t ptBaseHi;
+        };
+        uint64_t ptBase;
+    };
+    uint32_t shMemBases;
+    uint32_t shMemConfig;
+    uint32_t sqShaderTbaLo;
+    uint32_t sqShaderTbaHi;
+    uint32_t sqShaderTmaLo;
+    uint32_t sqShaderTmaHi;
+    uint32_t reserved1;
+    union
+    {
+        struct
+        {
+            uint32_t gdsAddrLo;
+            uint32_t gdsAddrHi;
+        };
+        uint64_t gdsAddr;
+    };
+    union
+    {
+        struct
+        {
+            uint32_t numGws : 7;
+            uint32_t sdma_enable : 1;
+            uint32_t numOac : 4;
+            uint32_t reserved3 : 4;
+            uint32_t gdsSize : 6;
+            uint32_t numQueues : 10;
+        };
+        uint32_t ordinal14;
+    };
+    uint32_t spiGdbgPerVmidCntl;
+    uint32_t tcpWatchCntl[4];
+    union
+    {
+        struct
+        {
+            uint32_t completionSignalLo;
+            uint32_t completionSignalHi;
+        };
+        uint64_t completionSignal;
+    };
+} PM4MapProcessMI200;
+static_assert(sizeof(PM4MapProcessMI200) == 80);
+
+typedef struct GEM5_PACKED
+{
     uint32_t function : 4;
     uint32_t memSpace : 2;
     uint32_t operation : 2;
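
As a quick sanity check on the static_assert above, the packed
PM4MapProcessMI200 layout adds up to 80 bytes: 4 (pasid/reserved0/diq/
processQuantum bitfields) + 8 (ptBase) + 4 (shMemBases) + 4 (shMemConfig) +
16 (sqShaderTba/Tma Lo/Hi) + 4 (reserved1) + 8 (gdsAddr) + 4 (ordinal14) +
4 (spiGdbgPerVmidCntl) + 16 (tcpWatchCntl[4]) + 8 (completionSignal) = 80.
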
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 3690113..e7b8465 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -271,12 +271,21 @@
                         dmaBuffer);
         } break;
       case IT_MAP_PROCESS: {
-            dmaBuffer = new PM4MapProcess();
-            cb = new DmaVirtCallback<uint64_t>(
-                [ = ] (const uint64_t &)
-                    { mapProcess(q, (PM4MapProcess *)dmaBuffer); });
-            dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
-                        dmaBuffer);
+            if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
+                dmaBuffer = new PM4MapProcessMI200();
+                cb = new DmaVirtCallback<uint64_t>(
+                    [ = ] (const uint64_t &)
+                        { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
+                dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
+                            cb, dmaBuffer);
+            } else {
+                dmaBuffer = new PM4MapProcess();
+                cb = new DmaVirtCallback<uint64_t>(
+                    [ = ] (const uint64_t &)
+                        { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
+                dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
+                            dmaBuffer);
+            }
         } break;
       case IT_UNMAP_QUEUES: {
@@ -613,27 +622,50 @@
 }
 
 void
-PM4PacketProcessor::mapProcess(PM4Queue *q, PM4MapProcess *pkt)
+PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
+                               uint32_t shMemBases)
 {
-    q->incRptr(sizeof(PM4MapProcess));
-    uint16_t vmid = gpuDevice->allocateVMID(pkt->pasid);
+    uint16_t vmid = gpuDevice->allocateVMID(pasid);
 
-    DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p vmid: %d quantum: "
-            "%d pt: %p signal: %p\n", pkt->pasid, vmid, pkt->processQuantum,
-            pkt->ptBase, pkt->completionSignal);
-
-    gpuDevice->getVM().setPageTableBase(vmid, pkt->ptBase);
-    gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, pkt->shMemBases);
+    gpuDevice->getVM().setPageTableBase(vmid, ptBase);
+    gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases);
 
     // Setup the apertures that gem5 uses. These values are bits [63:48].
-    Addr lds_base = (Addr)bits(pkt->shMemBases, 31, 16) << 48;
-    Addr scratch_base = (Addr)bits(pkt->shMemBases, 15, 0) << 48;
+    Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48;
+    Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48;
 
     // There does not seem to be any register for the limit, but the driver
     // assumes scratch and LDS have a 4GB aperture, so use that.
     gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF);
     gpuDevice->CP()->shader()->setScratchApe(scratch_base,
                                              scratch_base + 0xFFFFFFFF);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
+{
+    q->incRptr(sizeof(PM4MapProcess));
+
+    DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+            "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+            pkt->ptBase, pkt->completionSignal);
+
+    mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
+
+    delete pkt;
+    decodeNext(q);
+}
+
+void
+PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
+{
+    q->incRptr(sizeof(PM4MapProcessMI200));
+
+    DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
+            "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
+            pkt->ptBase, pkt->completionSignal);
+
+    mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
 
     delete pkt;
     decodeNext(q);
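
To illustrate the aperture setup in mapProcess() above, here is a small
Python sketch of the same bit manipulation; the SH_MEM_BASES value is made
up for the example, the real one arrives in the MAP_PROCESS packet:

    # Hypothetical SH_MEM_BASES value: LDS aperture selector in bits [31:16],
    # scratch aperture selector in bits [15:0].
    sh_mem_bases = 0x00010002

    lds_base = ((sh_mem_bases >> 16) & 0xFFFF) << 48      # 0x0001000000000000
    scratch_base = (sh_mem_bases & 0xFFFF) << 48          # 0x0002000000000000

    # The driver assumes each aperture spans 4GB: [base, base + 0xFFFFFFFF].
    lds_limit = lds_base + 0xFFFFFFFF
    scratch_limit = scratch_base + 0xFFFFFFFF
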
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 4617a21..3fb0551 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -141,7 +141,9 @@
     void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
     void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
     void doneMQDWrite(Addr mqdAddr, Addr addr);
-    void mapProcess(PM4Queue *q, PM4MapProcess *pkt);
+    void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
+    void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
+    void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
     void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
                     uint16_t vmid);
     void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 3a87186..c5449cc 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -45,7 +45,7 @@
 
 class GfxVersion(ScopedEnum):
-    vals = ["gfx801", "gfx803", "gfx900", "gfx902"]
+    vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"]
 
 class PoolManager(SimObject):
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index af59b78..9755180 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -228,7 +228,8 @@
     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
 
     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
-        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
+        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
+        gpuDevice->getGfxVersion());
 
     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
         "grid size (%dx%dx%d) kernarg addr: %#x, completion "

diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index fbe0efe..4083c1c 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -51,6 +51,7 @@
 #include "base/types.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
 #include "gpu-compute/kernel_code.hh"
 
 namespace gem5
@@ -61,7 +62,7 @@
   public:
     HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                   int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
-                  Addr host_pkt_addr, Addr code_addr)
+                  Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
       : kernName(kernel_name),
         _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
                 (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
@@ -92,9 +93,19 @@
         // we need to rip register usage from the resource registers.
         //
         // We can't get an exact number of registers from the resource
-        // registers because they round, but we can get an upper bound on it
-        if (!numVgprs)
-            numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+        // registers because they round, but we can get an upper bound on it.
+        // We determine the number of registers by solving for "vgprs_used"
+        // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
+        //     #code-object-v3-kernel-descriptor
+        // Currently, the only supported gfx version in gem5 that computes
+        // this differently is gfx90a.
+        if (!numVgprs) {
+            if (gfx_version == GfxVersion::gfx90a) {
+                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
+            } else {
+                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
+            }
+        }
 
         if (!numSgprs || numSgprs ==
             std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
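
As a worked example of the new VGPR bound: for a hypothetical kernel
descriptor with granulated_workitem_vgpr_count = 3, the pre-gfx90a upper
bound is (3 + 1) * 4 = 16 VGPRs, while on gfx90a the allocation granule is
twice as large, giving (3 + 1) * 8 = 32 VGPRs.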

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70317?usp=email
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I0fb7b3ad928826beaa5386d52a94ba504369cb0d
Gerrit-Change-Number: 70317
Gerrit-PatchSet: 2
Gerrit-Owner: Matthew Poremba <matthew.poremba@amd.com>
Gerrit-Reviewer: Jason Lowe-Power <jason@lowepower.com>
Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba@amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>
