gem5-dev@gem5.org

The gem5 Developer List

View all threads

[M] Change in gem5/gem5[develop]: arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.

BB
Bobby Bruce (Gerrit)
Thu, May 25, 2023 9:36 PM

Bobby Bruce has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email )

Change subject: arch-arm: Support Arm SVE Load-Broadcast Octaword
instructions.
......................................................................

arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.

Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D})
instructions. These are similar to the Load-Broadcast
Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory
segment rather than a 16-byte memory segment. Consequently, the LD1ROx
implementations build on the code for the LD1RQx implementations.

For more information please refer to the "ARM Architecture Reference
Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A"
(https://developer.arm.com/architectures/cpu-architecture/a-profile/docs/arm-architecture-reference-manual-supplement-armv8-a)

Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>

M src/arch/arm/isa/formats/sve_2nd_level.isa
M src/arch/arm/isa/insts/sve_mem.isa
2 files changed, 112 insertions(+), 51 deletions(-)

Approvals:
Andreas Sandberg: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass

diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa
b/src/arch/arm/isa/formats/sve_2nd_level.isa
index 440722a..f74181a 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -3219,66 +3219,96 @@
}  // decodeSveMemGather32

  StaticInstPtr
  • decodeSveLoadBcastQuadSS(ExtMachInst machInst)
  • decodeSveLoadBcastMultiSS(ExtMachInst machInst)
    {
  •    uint8_t num = bits(machInst, 22, 21);
    
  •    if (num != 0x00) {
    
  •        return new Unknown64(machInst);
    
  •    }
    
  •     RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
        RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
        RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
        RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16);
    
  •    uint8_t msz = bits(machInst, 24, 23);
    
  •    switch (msz) {
    
  •        case 0:
    
  •    uint8_t msz_esz = bits(machInst, 24, 21);
    
  •    switch (msz_esz) {
    
  •        // Load-Broadcast Quad-word Variants
    
  •        case 0b0000: // 0x0:
                return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
                        machInst, zt, pg, rn, rm);
    
  •        case 1:
    
  •            return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
    
  •        case 0b0100: // 0x4:
    
  •             return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
                        machInst, zt, pg, rn, rm);
    
  •        case 2:
    
  •        case 0b1000: // 0x8:
                return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
                        machInst, zt, pg, rn, rm);
    
  •        case 3:
    
  •        case 0b1100: // 0xc:
                return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
                        machInst, zt, pg, rn, rm);
    
  •        // Load-Broadcast Octa-word Variants
    
  •        case 0b0001: // 0x1:
    
  •            return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob",
    
  •                    machInst, zt, pg, rn, rm);
    
  •        case 0b0101: // 0x5:
    
  •            return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh",
    
  •                    machInst, zt, pg, rn, rm);
    
  •        case 0b1001: // 0x9:
    
  •            return new SveLd1RoSS<uint32_t, uint32_t>("ld1row",
    
  •                    machInst, zt, pg, rn, rm);
    
  •        case 0b1101: // 0xd:
    
  •            return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod",
    
  •                    machInst, zt, pg, rn, rm);
    
  •        default:
    
  •          return new Unknown64(machInst);
        }
    
        return new Unknown64(machInst);
    
  • }  // decodeSveLoadBcastQuadSS
  • }  // decodeSveLoadBcastMultiSS

    StaticInstPtr

  • decodeSveLoadBcastQuadSI(ExtMachInst machInst)
  • decodeSveLoadBcastMultiSI(ExtMachInst machInst)
    {
  •    uint8_t num = bits(machInst, 22, 21);
    
  •    if (num != 0x00) {
    
  •        return new Unknown64(machInst);
    
  •    }
    
  •     RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
        RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
        RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
        uint64_t imm = sext<4>(bits(machInst, 19, 16));
    
  •    uint8_t msz = bits(machInst, 24, 23);
    
  •    switch (msz) {
    
  •        case 0:
    
  •    uint8_t msz_esz = bits(machInst, 24, 21);
    
  •    switch (msz_esz) {
    
  •        // Load-Broadcast Quad-word Variants
    
  •        case 0b0000: // 0x0:
                return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
                        machInst, zt, pg, rn, imm);
    
  •        case 1:
    
  •        case 0b0100: // 0x4:
                return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
                        machInst, zt, pg, rn, imm);
    
  •        case 2:
    
  •        case 0b1000: // 0x8:
                return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
                        machInst, zt, pg, rn, imm);
    
  •        case 3:
    
  •        case 0b1100: // 0xc:
                return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
                        machInst, zt, pg, rn, imm);
    
  •        // Load-Broadcast Octa-word Variants
    
  •        case 0b0001: // 0x1:
    
  •            return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob",
    
  •                    machInst, zt, pg, rn, imm);
    
  •        case 0b0101: // 0x5:
    
  •            return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh",
    
  •                    machInst, zt, pg, rn, imm);
    
  •        case 0b1001: // 0x9:
    
  •            return new SveLd1RoSI<uint32_t, uint32_t>("ld1row",
    
  •                    machInst, zt, pg, rn, imm);
    
  •        case 0b1101: // 0xd:
    
  •            return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod",
    
  •                    machInst, zt, pg, rn, imm);
    
  •        default:
    
  •          return new Unknown64(machInst);
        }
    
        return new Unknown64(machInst);
    
  • }  // decodeSveLoadBcastQuadSI
  • }  // decodeSveLoadBcastMultiSI

    StaticInstPtr
    decodeSveContigLoadSS(ExtMachInst machInst)
    @@ -3388,10 +3418,10 @@
    {
    switch (bits(machInst, 15, 13)) {
    case 0x0:

  •        return decodeSveLoadBcastQuadSS(machInst);
    
  •        return decodeSveLoadBcastMultiSS(machInst);
          case 0x1:
            if (bits(machInst, 20) == 0x0) {
    
  •            return decodeSveLoadBcastQuadSI(machInst);
    
  •            return decodeSveLoadBcastMultiSI(machInst);
            }
            break;
          case 0x2:
    

diff --git a/src/arch/arm/isa/insts/sve_mem.isa
b/src/arch/arm/isa/insts/sve_mem.isa
index 8a73d13..bece368 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2019 ARM Limited
+// Copyright (c) 2017-2020 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -1480,20 +1480,33 @@
exec_output += SveStructMemExecDeclare.subst(substDict)

  # Generates definitions for SVE load-and-replicate quadword instructions

  • def emitSveLoadAndReplQuad(offsetIsImm):
  • def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments):
    global header_output, exec_output, decoders
  •    assert(numQwordSegments in (1, 2)) # Quadword or Octaword
    
  •    from collections import namedtuple
    
  •    InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass")

  •    INST_CONFIGURATIONS = {
    
  •        # (offsetIsImm, numQwordSegments) -> InstConfig Recors
    
  •        (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"),
    
  •        (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"),

  •        (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"),
    
  •        (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"),

  •    }
    
  •    inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)]
    
  •    memAccessSize = numQwordSegments * 16;
        tplHeader = 'template <class RegElemType, class MemElemType>'
        tplArgs = '<RegElemType, MemElemType>'
        eaCode = SPAlignmentCheckCode + '''
    
  •    int memAccessSize = 16;
    
  •    EA = XBase + '''
    
  •    int memAccessSize = %(memAccessSize)d;
    
  •    EA = XBase + ''' % dict(memAccessSize=memAccessSize)
        if offsetIsImm:
    
  •        eaCode += '(((int64_t) this->imm) * 16);'
    
  •        eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);'
    
  •                   % dict(memAccessSize=memAccessSize))
        else:
            eaCode += '(XOffset * sizeof(MemElemType));'
        loadRdEnableCode = '''
    
  •    eCount = 16/sizeof(RegElemType);
    
  •    auto rdEn = std::vector<bool>(16, true);
    
  •    eCount = %(memAccessSize)d/sizeof(RegElemType);
    
  •    auto rdEn = std::vector<bool>(%(memAccessSize)d, true);
        for (int i = 0; i < eCount; ++i) {
            if (!GpOp_x[i]) {
                for (int j = 0; j < sizeof(RegElemType); ++j) {
    

@@ -1501,26 +1514,40 @@
}
}
}

  •    '''
    
  •    ''' % dict(memAccessSize=memAccessSize)
        memAccCode = '''
    
  •    __uint128_t qword;
    
  •    RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);
    
  •    for (int i = 0; i < 16/sizeof(RegElemType); ++i) {
    
  •    // Copy active elements of the data from memory into a temporary
    
  •    // quadword/octaword
    
  •    __uint128_t qwords[%(numQwordSegments)d];
    
  •    eCount = %(memAccessSize)d/sizeof(RegElemType);
    
  •    RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords);
    
  •    for (int i = 0; i < eCount; ++i) {
            if (GpOp_x[i]) {
                qp[i] = memDataView[i];
            } else {
                qp[i] = 0;
            }
        }
    
  •    eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(
    
  •    // Repeat the temporary quadword/octaword segment into the
    
  •    // vector register. Zero fill the remainder for non-full
    
  •    // octawords.
    
  •    unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>(
                xc->tcBase());
    
  •    for (int i = 0; i < eCount; ++i) {
    
  •        AA64FpDest_uq[i] = qword;
    
  •    unsigned numFullQwords = numQwords -
    
  •                             (numQwords %% %(numQwordSegments)d);
    
  •    for (int i = 0; i < numQwords; ++i) {
    
  •        if (i < numFullQwords) {
    
  •            AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d];
    
  •        } else {
    
  •            AA64FpDest_uq[i] = 0;
    
  •        }
        }
    
  •    '''
    
  •    iop = ArmInstObjParams('ld1rq',
    
  •            'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',
    
  •            'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
    
  •    ''' % dict(memAccessSize=memAccessSize,
    
  •               numQwordSegments=numQwordSegments)
    
  •    iop = ArmInstObjParams(
    
  •            inst_config.mnemonic,
    
  •            inst_config.classname,
    
  •            inst_config.baseclass,
                {'tpl_header': tplHeader,
                 'tpl_args': tplArgs,
                 'rden_code': loadRdEnableCode,
    

@@ -1539,8 +1566,7 @@
SveContigLoadCompleteAcc.subst(iop))
for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),

  •                'class_name': 'SveLd1RqSI' if offsetIsImm
    
  •                              else 'SveLd1RqSS'}
    
  •                     'class_name': inst_config.classname}
            exec_output += SveContigMemExecDeclare.subst(substDict)
    
    # LD1[S]{B,H,W,D} (scalar plus immediate)
    

@@ -1556,9 +1582,14 @@
emitSveLoadAndRepl()

  # LD1RQ{B,H,W,D} (scalar plus immediate)
  • emitSveLoadAndReplQuad(offsetIsImm = True)
  • emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1)

    # LD1RQ{B,H,W,D} (scalar plus scalar)

  • emitSveLoadAndReplQuad(offsetIsImm = False)
  • emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1)

  •    # LD1RO{B,H,W,D} (scalar plus immediate)

  • emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2)

  •    # LD1RO{B,H,W,D} (scalar plus scalar)

  • emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2)

    # LD{2,3,4}{B,H,W,D} (scalar plus immediate)
    # ST{2,3,4}{B,H,W,D} (scalar plus immediate)

--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Gerrit-Change-Number: 70727
Gerrit-PatchSet: 8
Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com>
Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com>
Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu>
Gerrit-Reviewer: Giacomo Travaglini <giacomo.travaglini@arm.com>
Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com>
Gerrit-Reviewer: Richard Cooper <richard.cooper@arm.com>
Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>

Bobby Bruce has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email ) Change subject: arch-arm: Support Arm SVE Load-Broadcast Octaword instructions. ...................................................................... arch-arm: Support Arm SVE Load-Broadcast Octaword instructions. Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D}) instructions. These are similar to the Load-Broadcast Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory segment rather than a 16-byte memory segment. Consequently, the LD1ROx implementations build on the code for the LD1RQx implementations. For more information please refer to the "ARM Architecture Reference Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A" (https://developer.arm.com/architectures/cpu-architecture/a-profile/ docs/arm-architecture-reference-manual-supplement-armv8-a) Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa Reviewed-by: Richard Cooper <richard.cooper@arm.com> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727 Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com> Maintainer: Andreas Sandberg <andreas.sandberg@arm.com> Tested-by: kokoro <noreply+kokoro@google.com> --- M src/arch/arm/isa/formats/sve_2nd_level.isa M src/arch/arm/isa/insts/sve_mem.isa 2 files changed, 112 insertions(+), 51 deletions(-) Approvals: Andreas Sandberg: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 440722a..f74181a 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -3219,66 +3219,96 @@ } // decodeSveMemGather32 StaticInstPtr - decodeSveLoadBcastQuadSS(ExtMachInst machInst) + decodeSveLoadBcastMultiSS(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } 
- RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16); - uint8_t msz = bits(machInst, 24, 23); - switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb", machInst, zt, pg, rn, rm); - case 1: - return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh", + case 0b0100: // 0x4: + return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh", machInst, zt, pg, rn, rm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw", machInst, zt, pg, rn, rm); - case 3: + case 0b1100: // 0xc: return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd", machInst, zt, pg, rn, rm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob", + machInst, zt, pg, rn, rm); + case 0b0101: // 0x5: + return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh", + machInst, zt, pg, rn, rm); + case 0b1001: // 0x9: + return new SveLd1RoSS<uint32_t, uint32_t>("ld1row", + machInst, zt, pg, rn, rm); + case 0b1101: // 0xd: + return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod", + machInst, zt, pg, rn, rm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSS + } // decodeSveLoadBcastMultiSS StaticInstPtr - decodeSveLoadBcastQuadSI(ExtMachInst machInst) + decodeSveLoadBcastMultiSI(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } - RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); uint64_t imm = sext<4>(bits(machInst, 19, 16)); - uint8_t msz = bits(machInst, 24, 23); - 
switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb", machInst, zt, pg, rn, imm); - case 1: + case 0b0100: // 0x4: return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh", machInst, zt, pg, rn, imm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw", machInst, zt, pg, rn, imm); - case 3: + case 0b1100: // 0xc: return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd", machInst, zt, pg, rn, imm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob", + machInst, zt, pg, rn, imm); + case 0b0101: // 0x5: + return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh", + machInst, zt, pg, rn, imm); + case 0b1001: // 0x9: + return new SveLd1RoSI<uint32_t, uint32_t>("ld1row", + machInst, zt, pg, rn, imm); + case 0b1101: // 0xd: + return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod", + machInst, zt, pg, rn, imm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSI + } // decodeSveLoadBcastMultiSI StaticInstPtr decodeSveContigLoadSS(ExtMachInst machInst) @@ -3388,10 +3418,10 @@ { switch (bits(machInst, 15, 13)) { case 0x0: - return decodeSveLoadBcastQuadSS(machInst); + return decodeSveLoadBcastMultiSS(machInst); case 0x1: if (bits(machInst, 20) == 0x0) { - return decodeSveLoadBcastQuadSI(machInst); + return decodeSveLoadBcastMultiSI(machInst); } break; case 0x2: diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa index 8a73d13..bece368 100644 --- a/src/arch/arm/isa/insts/sve_mem.isa +++ b/src/arch/arm/isa/insts/sve_mem.isa @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019 ARM Limited +// Copyright (c) 2017-2020 ARM Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -1480,20 +1480,33 @@ exec_output += 
SveStructMemExecDeclare.subst(substDict) # Generates definitions for SVE load-and-replicate quadword instructions - def emitSveLoadAndReplQuad(offsetIsImm): + def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments): global header_output, exec_output, decoders + assert(numQwordSegments in (1, 2)) # Quadword or Octaword + from collections import namedtuple + InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass") + INST_CONFIGURATIONS = { + # (offsetIsImm, numQwordSegments) -> InstConfig Recors + (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"), + (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"), + (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"), + (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"), + } + inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)] + memAccessSize = numQwordSegments * 16; tplHeader = 'template <class RegElemType, class MemElemType>' tplArgs = '<RegElemType, MemElemType>' eaCode = SPAlignmentCheckCode + ''' - int memAccessSize = 16; - EA = XBase + ''' + int memAccessSize = %(memAccessSize)d; + EA = XBase + ''' % dict(memAccessSize=memAccessSize) if offsetIsImm: - eaCode += '(((int64_t) this->imm) * 16);' + eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);' + % dict(memAccessSize=memAccessSize)) else: eaCode += '(XOffset * sizeof(MemElemType));' loadRdEnableCode = ''' - eCount = 16/sizeof(RegElemType); - auto rdEn = std::vector<bool>(16, true); + eCount = %(memAccessSize)d/sizeof(RegElemType); + auto rdEn = std::vector<bool>(%(memAccessSize)d, true); for (int i = 0; i < eCount; ++i) { if (!GpOp_x[i]) { for (int j = 0; j < sizeof(RegElemType); ++j) { @@ -1501,26 +1514,40 @@ } } } - ''' + ''' % dict(memAccessSize=memAccessSize) memAccCode = ''' - __uint128_t qword; - RegElemType* qp = reinterpret_cast<RegElemType*>(&qword); - for (int i = 0; i < 16/sizeof(RegElemType); ++i) { + // Copy active elements of the data from memory into a temporary + // 
quadword/octaword + __uint128_t qwords[%(numQwordSegments)d]; + eCount = %(memAccessSize)d/sizeof(RegElemType); + RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords); + for (int i = 0; i < eCount; ++i) { if (GpOp_x[i]) { qp[i] = memDataView[i]; } else { qp[i] = 0; } } - eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>( + // Repeat the temporary quadword/octaword segment into the + // vector register. Zero fill the remainder for non-full + // octawords. + unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>( xc->tcBase()); - for (int i = 0; i < eCount; ++i) { - AA64FpDest_uq[i] = qword; + unsigned numFullQwords = numQwords - + (numQwords %% %(numQwordSegments)d); + for (int i = 0; i < numQwords; ++i) { + if (i < numFullQwords) { + AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d]; + } else { + AA64FpDest_uq[i] = 0; + } } - ''' - iop = ArmInstObjParams('ld1rq', - 'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS', - 'SveContigMemSI' if offsetIsImm else 'SveContigMemSS', + ''' % dict(memAccessSize=memAccessSize, + numQwordSegments=numQwordSegments) + iop = ArmInstObjParams( + inst_config.mnemonic, + inst_config.classname, + inst_config.baseclass, {'tpl_header': tplHeader, 'tpl_args': tplArgs, 'rden_code': loadRdEnableCode, @@ -1539,8 +1566,7 @@ SveContigLoadCompleteAcc.subst(iop)) for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'): substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype), - 'class_name': 'SveLd1RqSI' if offsetIsImm - else 'SveLd1RqSS'} + 'class_name': inst_config.classname} exec_output += SveContigMemExecDeclare.subst(substDict) # LD1[S]{B,H,W,D} (scalar plus immediate) @@ -1556,9 +1582,14 @@ emitSveLoadAndRepl() # LD1RQ{B,H,W,D} (scalar plus immediate) - emitSveLoadAndReplQuad(offsetIsImm = True) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1) # LD1RQ{B,H,W,D} (scalar plus scalar) - emitSveLoadAndReplQuad(offsetIsImm = False) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1) + + # 
LD1RO{B,H,W,D} (scalar plus immediate) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2) + # LD1RO{B,H,W,D} (scalar plus scalar) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2) # LD{2,3,4}{B,H,W,D} (scalar plus immediate) # ST{2,3,4}{B,H,W,D} (scalar plus immediate) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa Gerrit-Change-Number: 70727 Gerrit-PatchSet: 8 Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com> Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com> Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu> Gerrit-Reviewer: Giacomo Travaglini <giacomo.travaglini@arm.com> Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com> Gerrit-Reviewer: Richard Cooper <richard.cooper@arm.com> Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>