Empathy List Archives

gem5-dev@gem5.org

The gem5 Developer List

[M] Change in gem5/gem5[develop]: arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.

Bobby Bruce (Gerrit)

Thu, May 25, 2023 9:36 PM

Bobby Bruce has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email )

Change subject: arch-arm: Support Arm SVE Load-Broadcast Octaword
instructions.
......................................................................

arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.

Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D})
instructions. These are similar to the Load-Broadcast
Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory
segment rather than a 16-byte memory segment. Consequently, the LD1ROx
implementations build on the code for the LD1RQx implementations.

For more information please refer to the "ARM Architecture Reference
Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A"
(https://developer.arm.com/architectures/cpu-architecture/a-profile/
docs/arm-architecture-reference-manual-supplement-armv8-a)

Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>

M src/arch/arm/isa/formats/sve_2nd_level.isa
M src/arch/arm/isa/insts/sve_mem.isa
2 files changed, 112 insertions(+), 51 deletions(-)

Approvals:
Andreas Sandberg: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass

diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa
b/src/arch/arm/isa/formats/sve_2nd_level.isa
index 440722a..f74181a 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -3219,66 +3219,96 @@
} // decodeSveMemGather32

  StaticInstPtr

decodeSveLoadBcastQuadSS(ExtMachInst machInst)

decodeSveLoadBcastMultiSS(ExtMachInst machInst)
{

   uint8_t num = bits(machInst, 22, 21);

```
   if (num != 0x00) {
```
```
       return new Unknown64(machInst);
```
```
   }
```

    RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
    RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
    RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
    RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16);

   uint8_t msz = bits(machInst, 24, 23);

```
   switch (msz) {
```
```
       case 0:
```

   uint8_t msz_esz = bits(machInst, 24, 21);

```
   switch (msz_esz) {
```

       // Load-Broadcast Quad-word Variants

       case 0b0000: // 0x0:
            return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
                    machInst, zt, pg, rn, rm);

```
       case 1:
```

           return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",

```
       case 0b0100: // 0x4:
```

            return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
                    machInst, zt, pg, rn, rm);

```
       case 2:
```

       case 0b1000: // 0x8:
            return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
                    machInst, zt, pg, rn, rm);

```
       case 3:
```

       case 0b1100: // 0xc:
            return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
                    machInst, zt, pg, rn, rm);

       // Load-Broadcast Octa-word Variants

```
       case 0b0001: // 0x1:
```

           return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob",

                   machInst, zt, pg, rn, rm);

```
       case 0b0101: // 0x5:
```

           return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh",

                   machInst, zt, pg, rn, rm);

```
       case 0b1001: // 0x9:
```

           return new SveLd1RoSS<uint32_t, uint32_t>("ld1row",

                   machInst, zt, pg, rn, rm);

```
       case 0b1101: // 0xd:
```

           return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod",

                   machInst, zt, pg, rn, rm);

```
       default:
```

         return new Unknown64(machInst);
    }

    return new Unknown64(machInst);

} // decodeSveLoadBcastQuadSS

} // decodeSveLoadBcastMultiSS

StaticInstPtr

decodeSveLoadBcastQuadSI(ExtMachInst machInst)

decodeSveLoadBcastMultiSI(ExtMachInst machInst)
{

   uint8_t num = bits(machInst, 22, 21);

```
   if (num != 0x00) {
```
```
       return new Unknown64(machInst);
```
```
   }
```

    RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
    RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
    RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
    uint64_t imm = sext<4>(bits(machInst, 19, 16));

   uint8_t msz = bits(machInst, 24, 23);

```
   switch (msz) {
```
```
       case 0:
```

   uint8_t msz_esz = bits(machInst, 24, 21);

```
   switch (msz_esz) {
```

       // Load-Broadcast Quad-word Variants

       case 0b0000: // 0x0:
            return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
                    machInst, zt, pg, rn, imm);

```
       case 1:
```

       case 0b0100: // 0x4:
            return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
                    machInst, zt, pg, rn, imm);

```
       case 2:
```

       case 0b1000: // 0x8:
            return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
                    machInst, zt, pg, rn, imm);

```
       case 3:
```

       case 0b1100: // 0xc:
            return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
                    machInst, zt, pg, rn, imm);

       // Load-Broadcast Octa-word Variants

```
       case 0b0001: // 0x1:
```

           return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob",

                   machInst, zt, pg, rn, imm);

```
       case 0b0101: // 0x5:
```

           return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh",

                   machInst, zt, pg, rn, imm);

```
       case 0b1001: // 0x9:
```

           return new SveLd1RoSI<uint32_t, uint32_t>("ld1row",

                   machInst, zt, pg, rn, imm);

```
       case 0b1101: // 0xd:
```

           return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod",

                   machInst, zt, pg, rn, imm);

```
       default:
```

         return new Unknown64(machInst);
    }

    return new Unknown64(machInst);

} // decodeSveLoadBcastQuadSI

} // decodeSveLoadBcastMultiSI

StaticInstPtr
decodeSveContigLoadSS(ExtMachInst machInst)
@@ -3388,10 +3418,10 @@
{
switch (bits(machInst, 15, 13)) {
case 0x0:

       return decodeSveLoadBcastQuadSS(machInst);

       return decodeSveLoadBcastMultiSS(machInst);
      case 0x1:
        if (bits(machInst, 20) == 0x0) {

           return decodeSveLoadBcastQuadSI(machInst);

           return decodeSveLoadBcastMultiSI(machInst);
        }
        break;
      case 0x2:

diff --git a/src/arch/arm/isa/insts/sve_mem.isa
b/src/arch/arm/isa/insts/sve_mem.isa
index 8a73d13..bece368 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2019 ARM Limited
+// Copyright (c) 2017-2020 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -1480,20 +1480,33 @@
exec_output += SveStructMemExecDeclare.subst(substDict)

  # Generates definitions for SVE load-and-replicate quadword instructions

def emitSveLoadAndReplQuad(offsetIsImm):

def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments):
global header_output, exec_output, decoders

   assert(numQwordSegments in (1, 2)) # Quadword or Octaword

   from collections import namedtuple
   InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass")

   INST_CONFIGURATIONS = {
       # (offsetIsImm, numQwordSegments) -> InstConfig Records
       (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"),
       (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"),
       (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"),
       (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"),
   }

   inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)]

   memAccessSize = numQwordSegments * 16;
    tplHeader = 'template <class RegElemType, class MemElemType>'
    tplArgs = '<RegElemType, MemElemType>'
    eaCode = SPAlignmentCheckCode + '''

```
   int memAccessSize = 16;
```
```
   EA = XBase + '''
```

   int memAccessSize = %(memAccessSize)d;

   EA = XBase + ''' % dict(memAccessSize=memAccessSize)
    if offsetIsImm:

       eaCode += '(((int64_t) this->imm) * 16);'

       eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);'

                  % dict(memAccessSize=memAccessSize))
    else:
        eaCode += '(XOffset * sizeof(MemElemType));'
    loadRdEnableCode = '''

```
   eCount = 16/sizeof(RegElemType);
```

   auto rdEn = std::vector<bool>(16, true);

   eCount = %(memAccessSize)d/sizeof(RegElemType);

   auto rdEn = std::vector<bool>(%(memAccessSize)d, true);
    for (int i = 0; i < eCount; ++i) {
        if (!GpOp_x[i]) {
            for (int j = 0; j < sizeof(RegElemType); ++j) {

@@ -1501,26 +1514,40 @@
}
}
}

```
   '''
```

   ''' % dict(memAccessSize=memAccessSize)
    memAccCode = '''

```
   __uint128_t qword;
```

   RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);

   for (int i = 0; i < 16/sizeof(RegElemType); ++i) {

   // Copy active elements of the data from memory into a temporary

```
   // quadword/octaword
```

   __uint128_t qwords[%(numQwordSegments)d];

   eCount = %(memAccessSize)d/sizeof(RegElemType);

   RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords);

   for (int i = 0; i < eCount; ++i) {
        if (GpOp_x[i]) {
            qp[i] = memDataView[i];
        } else {
            qp[i] = 0;
        }
    }

   eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(

   // Repeat the temporary quadword/octaword segment into the

   // vector register. Zero fill the remainder for non-full

```
   // octawords.
```

   unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>(
            xc->tcBase());

```
   for (int i = 0; i < eCount; ++i) {
```
```
       AA64FpDest_uq[i] = qword;
```

   unsigned numFullQwords = numQwords -

                            (numQwords %% %(numQwordSegments)d);

   for (int i = 0; i < numQwords; ++i) {

```
       if (i < numFullQwords) {
```

           AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d];

```
       } else {
```
```
           AA64FpDest_uq[i] = 0;
```
```
       }
    }
```

```
   '''
```
```
   iop = ArmInstObjParams('ld1rq',
```

           'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',

           'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',

   ''' % dict(memAccessSize=memAccessSize,

              numQwordSegments=numQwordSegments)

```
   iop = ArmInstObjParams(
```
```
           inst_config.mnemonic,
```
```
           inst_config.classname,
```

           inst_config.baseclass,
            {'tpl_header': tplHeader,
             'tpl_args': tplArgs,
             'rden_code': loadRdEnableCode,

@@ -1539,8 +1566,7 @@
SveContigLoadCompleteAcc.subst(iop))
for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),

               'class_name': 'SveLd1RqSI' if offsetIsImm

                             else 'SveLd1RqSS'}

                    'class_name': inst_config.classname}
        exec_output += SveContigMemExecDeclare.subst(substDict)

# LD1[S]{B,H,W,D} (scalar plus immediate)

@@ -1556,9 +1582,14 @@
emitSveLoadAndRepl()

  # LD1RQ{B,H,W,D} (scalar plus immediate)

emitSveLoadAndReplQuad(offsetIsImm = True)

emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1)
# LD1RQ{B,H,W,D} (scalar plus scalar)

emitSveLoadAndReplQuad(offsetIsImm = False)

emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1)
# LD1RO{B,H,W,D} (scalar plus immediate)
emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2)
# LD1RO{B,H,W,D} (scalar plus scalar)
emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2)

# LD{2,3,4}{B,H,W,D} (scalar plus immediate)

# ST{2,3,4}{B,H,W,D} (scalar plus immediate)

--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Gerrit-Change-Number: 70727
Gerrit-PatchSet: 8
Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com>
Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com>
Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu>
Gerrit-Reviewer: Giacomo Travaglini <giacomo.travaglini@arm.com>
Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com>
Gerrit-Reviewer: Richard Cooper <richard.cooper@arm.com>
Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>

Bobby Bruce has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email ) Change subject: arch-arm: Support Arm SVE Load-Broadcast Octaword instructions. ...................................................................... arch-arm: Support Arm SVE Load-Broadcast Octaword instructions. Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D}) instructions. These are similar to the Load-Broadcast Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory segment rather than a 16-byte memory segment. Consequently, the LD1ROx implementations build on the code for the LD1RQx implementations. For more information please refer to the "ARM Architecture Reference Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A" (https://developer.arm.com/architectures/cpu-architecture/a-profile/ docs/arm-architecture-reference-manual-supplement-armv8-a) Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa Reviewed-by: Richard Cooper <richard.cooper@arm.com> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727 Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com> Maintainer: Andreas Sandberg <andreas.sandberg@arm.com> Tested-by: kokoro <noreply+kokoro@google.com> --- M src/arch/arm/isa/formats/sve_2nd_level.isa M src/arch/arm/isa/insts/sve_mem.isa 2 files changed, 112 insertions(+), 51 deletions(-) Approvals: Andreas Sandberg: Looks good to me, approved; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 440722a..f74181a 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -3219,66 +3219,96 @@ } // decodeSveMemGather32 StaticInstPtr - decodeSveLoadBcastQuadSS(ExtMachInst machInst) + decodeSveLoadBcastMultiSS(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } 
- RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16); - uint8_t msz = bits(machInst, 24, 23); - switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb", machInst, zt, pg, rn, rm); - case 1: - return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh", + case 0b0100: // 0x4: + return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh", machInst, zt, pg, rn, rm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw", machInst, zt, pg, rn, rm); - case 3: + case 0b1100: // 0xc: return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd", machInst, zt, pg, rn, rm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob", + machInst, zt, pg, rn, rm); + case 0b0101: // 0x5: + return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh", + machInst, zt, pg, rn, rm); + case 0b1001: // 0x9: + return new SveLd1RoSS<uint32_t, uint32_t>("ld1row", + machInst, zt, pg, rn, rm); + case 0b1101: // 0xd: + return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod", + machInst, zt, pg, rn, rm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSS + } // decodeSveLoadBcastMultiSS StaticInstPtr - decodeSveLoadBcastQuadSI(ExtMachInst machInst) + decodeSveLoadBcastMultiSI(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } - RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); uint64_t imm = sext<4>(bits(machInst, 19, 16)); - uint8_t msz = bits(machInst, 24, 23); - 
switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb", machInst, zt, pg, rn, imm); - case 1: + case 0b0100: // 0x4: return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh", machInst, zt, pg, rn, imm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw", machInst, zt, pg, rn, imm); - case 3: + case 0b1100: // 0xc: return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd", machInst, zt, pg, rn, imm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob", + machInst, zt, pg, rn, imm); + case 0b0101: // 0x5: + return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh", + machInst, zt, pg, rn, imm); + case 0b1001: // 0x9: + return new SveLd1RoSI<uint32_t, uint32_t>("ld1row", + machInst, zt, pg, rn, imm); + case 0b1101: // 0xd: + return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod", + machInst, zt, pg, rn, imm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSI + } // decodeSveLoadBcastMultiSI StaticInstPtr decodeSveContigLoadSS(ExtMachInst machInst) @@ -3388,10 +3418,10 @@ { switch (bits(machInst, 15, 13)) { case 0x0: - return decodeSveLoadBcastQuadSS(machInst); + return decodeSveLoadBcastMultiSS(machInst); case 0x1: if (bits(machInst, 20) == 0x0) { - return decodeSveLoadBcastQuadSI(machInst); + return decodeSveLoadBcastMultiSI(machInst); } break; case 0x2: diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa index 8a73d13..bece368 100644 --- a/src/arch/arm/isa/insts/sve_mem.isa +++ b/src/arch/arm/isa/insts/sve_mem.isa @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019 ARM Limited +// Copyright (c) 2017-2020 ARM Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -1480,20 +1480,33 @@ exec_output += 
SveStructMemExecDeclare.subst(substDict) # Generates definitions for SVE load-and-replicate quadword instructions - def emitSveLoadAndReplQuad(offsetIsImm): + def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments): global header_output, exec_output, decoders + assert(numQwordSegments in (1, 2)) # Quadword or Octaword + from collections import namedtuple + InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass") + INST_CONFIGURATIONS = { + # (offsetIsImm, numQwordSegments) -> InstConfig Recors + (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"), + (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"), + (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"), + (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"), + } + inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)] + memAccessSize = numQwordSegments * 16; tplHeader = 'template <class RegElemType, class MemElemType>' tplArgs = '<RegElemType, MemElemType>' eaCode = SPAlignmentCheckCode + ''' - int memAccessSize = 16; - EA = XBase + ''' + int memAccessSize = %(memAccessSize)d; + EA = XBase + ''' % dict(memAccessSize=memAccessSize) if offsetIsImm: - eaCode += '(((int64_t) this->imm) * 16);' + eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);' + % dict(memAccessSize=memAccessSize)) else: eaCode += '(XOffset * sizeof(MemElemType));' loadRdEnableCode = ''' - eCount = 16/sizeof(RegElemType); - auto rdEn = std::vector<bool>(16, true); + eCount = %(memAccessSize)d/sizeof(RegElemType); + auto rdEn = std::vector<bool>(%(memAccessSize)d, true); for (int i = 0; i < eCount; ++i) { if (!GpOp_x[i]) { for (int j = 0; j < sizeof(RegElemType); ++j) { @@ -1501,26 +1514,40 @@ } } } - ''' + ''' % dict(memAccessSize=memAccessSize) memAccCode = ''' - __uint128_t qword; - RegElemType* qp = reinterpret_cast<RegElemType*>(&qword); - for (int i = 0; i < 16/sizeof(RegElemType); ++i) { + // Copy active elements of the data from memory into a temporary + // 
quadword/octaword + __uint128_t qwords[%(numQwordSegments)d]; + eCount = %(memAccessSize)d/sizeof(RegElemType); + RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords); + for (int i = 0; i < eCount; ++i) { if (GpOp_x[i]) { qp[i] = memDataView[i]; } else { qp[i] = 0; } } - eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>( + // Repeat the temporary quadword/octaword segment into the + // vector register. Zero fill the remainder for non-full + // octawords. + unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>( xc->tcBase()); - for (int i = 0; i < eCount; ++i) { - AA64FpDest_uq[i] = qword; + unsigned numFullQwords = numQwords - + (numQwords %% %(numQwordSegments)d); + for (int i = 0; i < numQwords; ++i) { + if (i < numFullQwords) { + AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d]; + } else { + AA64FpDest_uq[i] = 0; + } } - ''' - iop = ArmInstObjParams('ld1rq', - 'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS', - 'SveContigMemSI' if offsetIsImm else 'SveContigMemSS', + ''' % dict(memAccessSize=memAccessSize, + numQwordSegments=numQwordSegments) + iop = ArmInstObjParams( + inst_config.mnemonic, + inst_config.classname, + inst_config.baseclass, {'tpl_header': tplHeader, 'tpl_args': tplArgs, 'rden_code': loadRdEnableCode, @@ -1539,8 +1566,7 @@ SveContigLoadCompleteAcc.subst(iop)) for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'): substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype), - 'class_name': 'SveLd1RqSI' if offsetIsImm - else 'SveLd1RqSS'} + 'class_name': inst_config.classname} exec_output += SveContigMemExecDeclare.subst(substDict) # LD1[S]{B,H,W,D} (scalar plus immediate) @@ -1556,9 +1582,14 @@ emitSveLoadAndRepl() # LD1RQ{B,H,W,D} (scalar plus immediate) - emitSveLoadAndReplQuad(offsetIsImm = True) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1) # LD1RQ{B,H,W,D} (scalar plus scalar) - emitSveLoadAndReplQuad(offsetIsImm = False) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1) + + # 
LD1RO{B,H,W,D} (scalar plus immediate) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2) + # LD1RO{B,H,W,D} (scalar plus scalar) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2) # LD{2,3,4}{B,H,W,D} (scalar plus immediate) # ST{2,3,4}{B,H,W,D} (scalar plus immediate) -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa Gerrit-Change-Number: 70727 Gerrit-PatchSet: 8 Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com> Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com> Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu> Gerrit-Reviewer: Giacomo Travaglini <giacomo.travaglini@arm.com> Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com> Gerrit-Reviewer: Richard Cooper <richard.cooper@arm.com> Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>