gem5-dev@gem5.org

The gem5 Developer List

View all threads

[M] Change in gem5/gem5[develop]: arch-arm: Add support for Armv8.2-DotProd NEON extension.

BB
Bobby Bruce (Gerrit)
Thu, May 25, 2023 9:36 PM

Bobby Bruce has submitted this change.
( https://gem5-review.googlesource.com/c/public/gem5/+/70736?usp=email )

(7 is the latest approved patch-set.
No files were changed between the latest approved patch-set and the
submitted one.)

Change subject: arch-arm: Add support for Armv8.2-DotProd NEON extension.
......................................................................

arch-arm: Add support for Armv8.2-DotProd NEON extension.

Add support for the Armv8.2-DotProd NEON extension. This provides the
SDOT and UDOT SIMD Dot Product instructions.

For more information please refer to the Arm Architecture Reference
Manual (https://developer.arm.com/documentation/ddi0487/latest/).

Change-Id: I4caa3b97a74c65f32421487c55c3e36427194e61
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70736
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>

M src/arch/arm/ArmISA.py
M src/arch/arm/ArmSystem.py
M src/arch/arm/isa/formats/neon64.isa
M src/arch/arm/isa/insts/neon64.isa
M src/arch/arm/regs/misc.cc
5 files changed, 107 insertions(+), 0 deletions(-)

Approvals:
Andreas Sandberg: Looks good to me, approved; Looks good to me, approved
Jason Lowe-Power: Looks good to me, but someone else must approve; Looks
good to me, approved
kokoro: Regressions pass

diff --git a/src/arch/arm/ArmISA.py b/src/arch/arm/ArmISA.py
index ffe63eb..8c1ee5a 100644
--- a/src/arch/arm/ArmISA.py
+++ b/src/arch/arm/ArmISA.py
@@ -57,6 +57,7 @@
         "FEAT_F64MM",
         "FEAT_SVE",
         "FEAT_I8MM",
+        "FEAT_DOTPROD",
         # Armv8.3
         "FEAT_FCMA",
         "FEAT_JSCVT",
    

diff --git a/src/arch/arm/ArmSystem.py b/src/arch/arm/ArmSystem.py
index c5c0f43..eaaf4b1 100644
--- a/src/arch/arm/ArmSystem.py
+++ b/src/arch/arm/ArmSystem.py
@@ -81,6 +81,7 @@
         "FEAT_F32MM",  # Optional in Armv8.2
         "FEAT_F64MM",  # Optional in Armv8.2
         "FEAT_I8MM",  # Optional in Armv8.2
+        "FEAT_DOTPROD",  # Optional in Armv8.2
         # Armv8.3
         "FEAT_FCMA",
         "FEAT_JSCVT",
    

@@ -169,6 +170,7 @@
         "FEAT_F32MM",
         "FEAT_F64MM",
         "FEAT_I8MM",
+        "FEAT_DOTPROD",
         # Armv8.3
         "FEAT_FCMA",
         "FEAT_JSCVT",
    

@@ -205,6 +207,7 @@
         "FEAT_F32MM",
         "FEAT_F64MM",
         "FEAT_I8MM",
+        "FEAT_DOTPROD",
     ]
    

diff --git a/src/arch/arm/isa/formats/neon64.isa b/src/arch/arm/isa/formats/neon64.isa
index 5cce0d7..e083f6f 100644
--- a/src/arch/arm/isa/formats/neon64.isa
+++ b/src/arch/arm/isa/formats/neon64.isa
@@ -510,6 +510,7 @@
     decodeNeon3RegExtension(ExtMachInst machInst)
     {
         uint8_t q      = bits(machInst, 30);
+        uint8_t qu     = bits(machInst, 30, 29);
         uint8_t size   = bits(machInst, 23, 22);
         uint8_t opcode = bits(machInst, 15, 11);
    

@@ -532,6 +533,19 @@
             else
                 return decodeNeonSThreeHAndWReg<SqrdmlshDX>(
                     size, machInst, vd, vn, vm);
+          case 0x12:
+              switch (qu) {
+                case 0b00:
+                  return new SdotDX<int32_t>(machInst, vd, vn, vm);
+                case 0b01:
+                  return new UdotDX<uint32_t>(machInst, vd, vn, vm);
+                case 0b10:
+                  return new SdotQX<int32_t>(machInst, vd, vn, vm);
+                case 0b11:
+                  return new UdotQX<uint32_t>(machInst, vd, vn, vm);
+                default:
+                  return new Unknown64(machInst);
+              }
           case 0x18:
           case 0x19:
           case 0x1a:
    

@@ -1351,6 +1365,7 @@
     {
         uint8_t q = bits(machInst, 30);
         uint8_t u = bits(machInst, 29);
+        uint8_t qu = bits(machInst, 30, 29);
         uint8_t size = bits(machInst, 23, 22);
         uint8_t L = bits(machInst, 21);
         uint8_t M = bits(machInst, 20);
    

@@ -1387,6 +1402,11 @@
         }
         RegIndex vm_fp = (RegIndex) (uint8_t) (vmh << 4 | vm_bf);

+        // Index and 2nd register operand for FEAT_DOTPROD and
+        // FEAT_I8MM instructions
+        uint8_t index_dp = (H << 1) | L;
+        RegIndex vm_dp = (RegIndex) (uint8_t) (M << 4 | vm_bf);
+
         switch (opcode) {
           case 0x0:
             if (!u || (size == 0x0 || size == 0x3))
    

@@ -1573,6 +1593,23 @@
case 0xf:
return decodeNeonSThreeImmHAndWReg<SqrdmlshElemDX,
SqrdmlshElemQX>(
q, size, machInst, vd, vn, vm, index);

+          case 0xe:
+              switch (qu) {
+                case 0b00:
+                  return new SdotElemDX<int32_t>(machInst,
+                                                 vd, vn, vm_dp, index_dp);
+                case 0b01:
+                  return new UdotElemDX<uint32_t>(machInst,
+                                                  vd, vn, vm_dp, index_dp);
+                case 0b10:
+                  return new SdotElemQX<int32_t>(machInst,
+                                                 vd, vn, vm_dp, index_dp);
+                case 0b11:
+                  return new UdotElemQX<uint32_t>(machInst,
+                                                  vd, vn, vm_dp, index_dp);
+                default:
+                  return new Unknown64(machInst);
+              }
           default:
             return new Unknown64(machInst);
         }
    

diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa
index 0da7f06..53c0f11 100644
--- a/src/arch/arm/isa/insts/neon64.isa
+++ b/src/arch/arm/isa/insts/neon64.isa
@@ -1082,6 +1082,71 @@
                        complex=True)
     threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
                        floatTypes, 4, fcmla_vec, True, complex=True)
+
+    def intDotInst(name, Name, opClass,
+                   destIsSigned, src1IsSigned, src2IsSigned,
+                   rCount, byElem):
+        destType = "int32_t" if destIsSigned else "uint32_t"
+        src1Type = "int8_t" if src1IsSigned else "uint8_t"
+        src2Type = "int8_t" if src2IsSigned else "uint8_t"
+        dotCode = '''
+        using Src1Element = %(src1Type)s;
+        using Src2Element = %(src2Type)s;
+
+        // Neon dot instructions always generate one output element
+        // from 4 pairs of source elements.
+        static_assert(sizeof(Element) == 4 * sizeof(Src1Element));
+        static_assert(sizeof(Element) == 4 * sizeof(Src2Element));
+
+        // Extended source element types to avoid overflow of intermediate
+        // calculations.
+        using ExtendedSrc1Element =
+                typename vector_element_traits::
+                    extend_element<Element, Src1Element>::type;
+        using ExtendedSrc2Element =
+                typename vector_element_traits::
+                    extend_element<Element, Src2Element>::type;
+
+        for (unsigned i = 0; i < eCount; ++i) {
+            Element src1ElemsPacked = letoh(srcReg1.elements[i]);
+            Element src2ElemsPacked = letoh(srcReg2.elements[%(src2Index)s]);
+
+            Src1Element *src1Elems =
+                reinterpret_cast<Src1Element*>(&src1ElemsPacked);
+            Src2Element *src2Elems =
+                reinterpret_cast<Src2Element*>(&src2ElemsPacked);
+
+            // Dot instructions accumulate into the dest reg
+            Element destElem = letoh(destReg.elements[i]);
+
+            for (unsigned j = 0; j < 4; ++j) {
+                ExtendedSrc1Element src1Elem =
+                    static_cast<ExtendedSrc1Element>(src1Elems[j]);
+                ExtendedSrc2Element src2Elem =
+                    static_cast<ExtendedSrc2Element>(src2Elems[j]);
+                destElem += src1Elem * src2Elem;
+            }
+            destReg.elements[i] = htole(destElem);
+        }
+        ''' % dict(src1Type=src1Type, src2Type=src2Type,
+                   src2Index="imm" if byElem else "i")
+        threeEqualRegInstX(name, Name, opClass, (destType,), rCount,
+                           dotCode, readDest=True, byElem=byElem,
+                           complex=True)
+
+    # SDOT (vector)
+    intDotInst('sdot', 'SdotDX', 'SimdAluOp', True, True, True, 2, False)
+    intDotInst('sdot', 'SdotQX', 'SimdAluOp', True, True, True, 4, False)
+    # SDOT (element)
+    intDotInst('sdot', 'SdotElemDX', 'SimdAluOp', True, True, True, 2, True)
+    intDotInst('sdot', 'SdotElemQX', 'SimdAluOp', True, True, True, 4, True)
+    # UDOT (vector)
+    intDotInst('udot', 'UdotDX', 'SimdAluOp', False, False, False, 2, False)
+    intDotInst('udot', 'UdotQX', 'SimdAluOp', False, False, False, 4, False)
+    # UDOT (element)
+    intDotInst('udot', 'UdotElemDX', 'SimdAluOp', False, False, False, 2, True)
+    intDotInst('udot', 'UdotElemQX', 'SimdAluOp', False, False, False, 4, True)
+
     # CLS
     clsCode = '''
             unsigned count = 0;
    

diff --git a/src/arch/arm/regs/misc.cc b/src/arch/arm/regs/misc.cc
index b978044..ed15f25 100644
--- a/src/arch/arm/regs/misc.cc
+++ b/src/arch/arm/regs/misc.cc
@@ -3988,6 +3988,7 @@
               isar0_el1.sha1 = 0;
               isar0_el1.aes = 0;
           }
+          isar0_el1.dp = release->has(ArmExtension::FEAT_DOTPROD) ? 0x1 : 0x0;
           isar0_el1.atomic = release->has(ArmExtension::FEAT_LSE) ? 0x2 : 0x0;
           isar0_el1.rdm = release->has(ArmExtension::FEAT_RDM) ? 0x1 : 0x0;
           isar0_el1.tme = release->has(ArmExtension::TME) ? 0x1 : 0x0;

--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/70736?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I4caa3b97a74c65f32421487c55c3e36427194e61
Gerrit-Change-Number: 70736
Gerrit-PatchSet: 9
Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com>
Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com>
Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu>
Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com>
Gerrit-Reviewer: kokoro <noreply+kokoro@google.com>
Gerrit-CC: Richard Cooper <richard.cooper@arm.com>

Bobby Bruce has submitted this change. ( https://gem5-review.googlesource.com/c/public/gem5/+/70736?usp=email ) ( 7 is the latest approved patch-set. No files were changed between the latest approved patch-set and the submitted one. )Change subject: arch-arm: Add support for Armv8.2-DotProd NEON extension. ...................................................................... arch-arm: Add support for Armv8.2-DotProd NEON extension. Add support for the Armv8.2-DotProd NEON extension. This provides the SDOT and UDOT SIMD Dot Product instructions. For more information please refer to the Arm Architecture Reference Manual (https://developer.arm.com/documentation/ddi0487/latest/). Change-Id: I4caa3b97a74c65f32421487c55c3e36427194e61 Reviewed-by: Richard Cooper <richard.cooper@arm.com> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70736 Maintainer: Jason Lowe-Power <power.jg@gmail.com> Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com> Maintainer: Andreas Sandberg <andreas.sandberg@arm.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com> --- M src/arch/arm/ArmISA.py M src/arch/arm/ArmSystem.py M src/arch/arm/isa/formats/neon64.isa M src/arch/arm/isa/insts/neon64.isa M src/arch/arm/regs/misc.cc 5 files changed, 107 insertions(+), 0 deletions(-) Approvals: Andreas Sandberg: Looks good to me, approved; Looks good to me, approved Jason Lowe-Power: Looks good to me, but someone else must approve; Looks good to me, approved kokoro: Regressions pass diff --git a/src/arch/arm/ArmISA.py b/src/arch/arm/ArmISA.py index ffe63eb..8c1ee5a 100644 --- a/src/arch/arm/ArmISA.py +++ b/src/arch/arm/ArmISA.py @@ -57,6 +57,7 @@ "FEAT_F64MM", "FEAT_SVE", "FEAT_I8MM", + "FEAT_DOTPROD", # Armv8.3 "FEAT_FCMA", "FEAT_JSCVT", diff --git a/src/arch/arm/ArmSystem.py b/src/arch/arm/ArmSystem.py index c5c0f43..eaaf4b1 100644 --- a/src/arch/arm/ArmSystem.py +++ b/src/arch/arm/ArmSystem.py @@ -81,6 +81,7 @@ "FEAT_F32MM", # 
Optional in Armv8.2 "FEAT_F64MM", # Optional in Armv8.2 "FEAT_I8MM", # Optional in Armv8.2 + "FEAT_DOTPROD", # Optional in Armv8.2 # Armv8.3 "FEAT_FCMA", "FEAT_JSCVT", @@ -169,6 +170,7 @@ "FEAT_F32MM", "FEAT_F64MM", "FEAT_I8MM", + "FEAT_DOTPROD", # Armv8.3 "FEAT_FCMA", "FEAT_JSCVT", @@ -205,6 +207,7 @@ "FEAT_F32MM", "FEAT_F64MM", "FEAT_I8MM", + "FEAT_DOTPROD", ] diff --git a/src/arch/arm/isa/formats/neon64.isa b/src/arch/arm/isa/formats/neon64.isa index 5cce0d7..e083f6f 100644 --- a/src/arch/arm/isa/formats/neon64.isa +++ b/src/arch/arm/isa/formats/neon64.isa @@ -510,6 +510,7 @@ decodeNeon3RegExtension(ExtMachInst machInst) { uint8_t q = bits(machInst, 30); + uint8_t qu = bits(machInst, 30, 29); uint8_t size = bits(machInst, 23, 22); uint8_t opcode = bits(machInst, 15, 11); @@ -532,6 +533,19 @@ else return decodeNeonSThreeHAndWReg<SqrdmlshDX>( size, machInst, vd, vn, vm); + case 0x12: + switch (qu) { + case 0b00: + return new SdotDX<int32_t>(machInst, vd, vn, vm); + case 0b01: + return new UdotDX<uint32_t>(machInst, vd, vn, vm); + case 0b10: + return new SdotQX<int32_t>(machInst, vd, vn, vm); + case 0b11: + return new UdotQX<uint32_t>(machInst, vd, vn, vm); + default: + return new Unknown64(machInst); + } case 0x18: case 0x19: case 0x1a: @@ -1351,6 +1365,7 @@ { uint8_t q = bits(machInst, 30); uint8_t u = bits(machInst, 29); + uint8_t qu = bits(machInst, 30, 29); uint8_t size = bits(machInst, 23, 22); uint8_t L = bits(machInst, 21); uint8_t M = bits(machInst, 20); @@ -1387,6 +1402,11 @@ } RegIndex vm_fp = (RegIndex) (uint8_t) (vmh << 4 | vm_bf); + // Index and 2nd register operand for FEAT_DOTPROD and + // FEAT_I8MM instructions + uint8_t index_dp = (H << 1) | L; + RegIndex vm_dp = (RegIndex) (uint8_t) (M << 4 | vm_bf); + switch (opcode) { case 0x0: if (!u || (size == 0x0 || size == 0x3)) @@ -1573,6 +1593,23 @@ case 0xf: return decodeNeonSThreeImmHAndWReg<SqrdmlshElemDX, SqrdmlshElemQX>( q, size, machInst, vd, vn, vm, index); + case 0xe: + switch (qu) { + case 0b00: 
+ return new SdotElemDX<int32_t>(machInst, + vd, vn, vm_dp, index_dp); + case 0b01: + return new UdotElemDX<uint32_t>(machInst, + vd, vn, vm_dp, index_dp); + case 0b10: + return new SdotElemQX<int32_t>(machInst, + vd, vn, vm_dp, index_dp); + case 0b11: + return new UdotElemQX<uint32_t>(machInst, + vd, vn, vm_dp, index_dp); + default: + return new Unknown64(machInst); + } default: return new Unknown64(machInst); } diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa index 0da7f06..53c0f11 100644 --- a/src/arch/arm/isa/insts/neon64.isa +++ b/src/arch/arm/isa/insts/neon64.isa @@ -1082,6 +1082,71 @@ complex=True) threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp", floatTypes, 4, fcmla_vec, True, complex=True) + + def intDotInst(name, Name, opClass, + destIsSigned, src1IsSigned, src2IsSigned, + rCount, byElem): + destType = "int32_t" if destIsSigned else "uint32_t" + src1Type = "int8_t" if src1IsSigned else "uint8_t" + src2Type = "int8_t" if src2IsSigned else "uint8_t" + dotCode = ''' + using Src1Element = %(src1Type)s; + using Src2Element = %(src2Type)s; + + // Neon dot instructions always generate one output element + // from 4 pairs of source elements. + static_assert(sizeof(Element) == 4 * sizeof(Src1Element)); + static_assert(sizeof(Element) == 4 * sizeof(Src2Element)); + + // Extended source element types to avoid overflow of intermediate + // calculations. 
+ using ExtendedSrc1Element = + typename vector_element_traits:: + extend_element<Element, Src1Element>::type; + using ExtendedSrc2Element = + typename vector_element_traits:: + extend_element<Element, Src2Element>::type; + + for (unsigned i = 0; i < eCount; ++i) { + Element src1ElemsPacked = letoh(srcReg1.elements[i]); + Element src2ElemsPacked = letoh(srcReg2.elements[%(src2Index)s]); + + Src1Element *src1Elems = + reinterpret_cast<Src1Element*>(&src1ElemsPacked); + Src2Element *src2Elems = + reinterpret_cast<Src2Element*>(&src2ElemsPacked); + + // Dot instructions accumulate into the dest reg + Element destElem = letoh(destReg.elements[i]); + + for (unsigned j = 0; j < 4; ++j) { + ExtendedSrc1Element src1Elem = + static_cast<ExtendedSrc1Element>(src1Elems[j]); + ExtendedSrc2Element src2Elem = + static_cast<ExtendedSrc2Element>(src2Elems[j]); + destElem += src1Elem * src2Elem; + } + destReg.elements[i] = htole(destElem); + } + ''' % dict(src1Type=src1Type, src2Type=src2Type, + src2Index="imm" if byElem else "i") + threeEqualRegInstX(name, Name, opClass, (destType,), rCount, + dotCode, readDest=True, byElem=byElem, + complex=True) + + # SDOT (vector) + intDotInst('sdot', 'SdotDX', 'SimdAluOp', True, True, True, 2, False) + intDotInst('sdot', 'SdotQX', 'SimdAluOp', True, True, True, 4, False) + # SDOT (element) + intDotInst('sdot', 'SdotElemDX', 'SimdAluOp', True, True, True, 2, True) + intDotInst('sdot', 'SdotElemQX', 'SimdAluOp', True, True, True, 4, True) + # UDOT (vector) + intDotInst('udot', 'UdotDX', 'SimdAluOp', False, False, False, 2, False) + intDotInst('udot', 'UdotQX', 'SimdAluOp', False, False, False, 4, False) + # UDOT (element) + intDotInst('udot', 'UdotElemDX', 'SimdAluOp', False, False, False, 2, True) + intDotInst('udot', 'UdotElemQX', 'SimdAluOp', False, False, False, 4, True) + # CLS clsCode = ''' unsigned count = 0; diff --git a/src/arch/arm/regs/misc.cc b/src/arch/arm/regs/misc.cc index b978044..ed15f25 100644 --- a/src/arch/arm/regs/misc.cc 
+++ b/src/arch/arm/regs/misc.cc @@ -3988,6 +3988,7 @@ isar0_el1.sha1 = 0; isar0_el1.aes = 0; } + isar0_el1.dp = release->has(ArmExtension::FEAT_DOTPROD) ? 0x1 : 0x0; isar0_el1.atomic = release->has(ArmExtension::FEAT_LSE) ? 0x2 : 0x0; isar0_el1.rdm = release->has(ArmExtension::FEAT_RDM) ? 0x1 : 0x0; isar0_el1.tme = release->has(ArmExtension::TME) ? 0x1 : 0x0; -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70736?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I4caa3b97a74c65f32421487c55c3e36427194e61 Gerrit-Change-Number: 70736 Gerrit-PatchSet: 9 Gerrit-Owner: Giacomo Travaglini <giacomo.travaglini@arm.com> Gerrit-Reviewer: Andreas Sandberg <andreas.sandberg@arm.com> Gerrit-Reviewer: Bobby Bruce <bbruce@ucdavis.edu> Gerrit-Reviewer: Jason Lowe-Power <power.jg@gmail.com> Gerrit-Reviewer: kokoro <noreply+kokoro@google.com> Gerrit-CC: Richard Cooper <richard.cooper@arm.com>