gem5-dev@gem5.org

The gem5 Developer List

Change in gem5/gem5[develop]: mem: Make DRAMCtrl a ClockedObject

Wendy Elsasser (Gerrit)
Tue, May 12, 2020 6:30 PM

Wendy Elsasser has uploaded this change for review. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make DRAMCtrl a ClockedObject
......................................................................

mem: Make DRAMCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter pointing to
the interface, which is set up when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8

M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/configs/base_config.py
24 files changed, 1,934 insertions(+), 1,760 deletions(-)
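
For readers skimming the diff below: the net effect on a user
configuration is the controller/interface split sketched here. This is a
minimal sketch assembled from the learning_gem5 hunks in this change; the
System/clock/membus boilerplate around it is assumed for illustration and
is not part of the patch.

    import m5
    from m5.objects import *

    system = System()
    system.clk_domain = SrcClockDomain(clock='1GHz',
                                       voltage_domain=VoltageDomain())
    system.mem_mode = 'timing'
    system.mem_ranges = [AddrRange('512MB')]
    system.membus = SystemXBar()

    # Before this change, the technology class *was* the controller:
    #   system.mem_ctrl = DDR3_1600_8x8()
    #   system.mem_ctrl.range = system.mem_ranges[0]
    # After it, DRAMCtrl is a ClockedObject driving a DRAMInterface (an
    # AbstractMemory) that owns the address range and timing parameters:
    system.mem_ctrl = DRAMCtrl()
    system.mem_ctrl.dram = DDR3_1600_8x8()
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master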

diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py
index 9443520..ab6b933 100644
--- a/configs/common/MemConfig.py
+++ b/configs/common/MemConfig.py
@@ -40,7 +40,7 @@
 from common import ObjectList
 from common import HMC
 
-def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size):
+def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size):
     """
     Helper function for creating a single memory controller from the given
     options.  This function is invoked multiple times in config_mem
     function
@@ -59,33 +59,33 @@
 
     # Create an instance so we can figure out the address
     # mapping and row-buffer size
-    ctrl = cls()
+    interface = intf()
 
     # Only do this for DRAMs
-    if issubclass(cls, m5.objects.DRAMCtrl):
+    if issubclass(intf, m5.objects.DRAMInterface):
         # If the channel bits are appearing after the column
         # bits, we need to add the appropriate number of bits
         # for the row buffer size
-        if ctrl.addr_mapping.value == 'RoRaBaChCo':
+        if interface.addr_mapping.value == 'RoRaBaChCo':
             # This computation only really needs to happen
             # once, but as we rely on having an instance we
             # end up having to repeat it for each and every
             # one
-            rowbuffer_size = ctrl.device_rowbuffer_size.value * \
-                ctrl.devices_per_rank.value
+            rowbuffer_size = interface.device_rowbuffer_size.value * \
+                interface.devices_per_rank.value
 
             intlv_low_bit = int(math.log(rowbuffer_size, 2))
 
     # We got all we need to configure the appropriate address
     # range
-    ctrl.range = m5.objects.AddrRange(r.start, size = r.size(),
+    interface.range = m5.objects.AddrRange(r.start, size = r.size(),
                                       intlvHighBit = \
                                           intlv_low_bit + intlv_bits - 1,
                                       xorHighBit = \
                                           xor_low_bit + intlv_bits - 1,
                                       intlvBits = intlv_bits,
                                       intlvMatch = i)
-    return ctrl
+    return interface
 def config_mem(options, system):
     """
@@ -144,10 +144,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")
 
-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []
-    if opt_elastic_trace_en and not issubclass(cls, m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf, m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
               "simple-mem.")
@@ -158,36 +158,56 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)
 
     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
     for r in system.mem_ranges:
         for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
-                                       intlv_size)
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
+                                        intlv_size)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks
 
             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown
 
             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")
 
+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Override buffer sizes with interface specific values
+            mem_ctrl.write_buffer_size = dram_intf.write_buffer_size
+            mem_ctrl.read_buffer_size = dram_intf.read_buffer_size
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)
 
-    subsystem.mem_ctrls = mem_ctrls
-
-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
         if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
             # Set memory device size. There is an independent controller for
             # each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
         else:
-            subsystem.mem_ctrls[i].port = xbar.master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
+    subsystem.mem_ctrls = mem_ctrls
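
With create_mem_ctrl() split into create_mem_intf() plus an explicitly
constructed DRAMCtrl, the per-technology state now lives on the interface.
A quick, hypothetical way to inspect the resulting hierarchy from a config
script (illustration only, not part of the patch):

    # each entry in mem_ctrls is a DRAMCtrl; its 'dram' child carries the
    # address range and the device geometry that used to sit on the
    # controller itself
    for ctrl in system.mem_ctrls:
        print(ctrl.dram.range, ctrl.dram.ranks_per_channel)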
diff --git a/configs/dram/low_power_sweep.py b/configs/dram/low_power_sweep.py
index 9a62393..4a97fcb 100644
--- a/configs/dram/low_power_sweep.py
+++ b/configs/dram/low_power_sweep.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2014-2015, 2017, 2019 ARM Limited
+# Copyright (c) 2014-2015, 2017, 2019-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@
 from __future__ import absolute_import
 
 import argparse
+import math
 
 import m5
 from m5.objects import *
@@ -57,6 +58,10 @@
 parser = argparse.ArgumentParser(
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
+dram_generators = {
+    "DRAM" : lambda x: x.createDram,
+}
+
 # Use a single-channel DDR4-2400 in 16x4 configuration by default
 parser.add_argument("--mem-type", default="DDR4_2400_16x4",
                     choices=ObjectList.mem_list.get_names(),
@@ -77,7 +82,7 @@
                     help = "Percentage of read commands")
 
 parser.add_argument("--addr-map",
-                    choices=m5.objects.AddrMap.vals,
+                    choices=ObjectList.dram_addr_map_list.get_names(),
                     default="RoRaBaCoCh", help = "DRAM address map policy")
 
 parser.add_argument("--idle-end", type=int, default=50000000,
@@ -111,14 +116,19 @@
 
 # Sanity check for memory controller class.
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")
 
 # There is no point slowing things down by saving any data.
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
+
+# enable DRAM low power states
+system.mem_ctrls[0].dram.enable_dram_powerdown = True
 
 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = args.addr_map
-system.mem_ctrls[0].page_policy = args.page_policy
+system.mem_ctrls[0].dram.addr_mapping = args.addr_map
+system.mem_ctrls[0].dram.page_policy = args.page_policy
 
 # We create a traffic generator state for each param combination we want to
 # test. Each traffic generator state is specified in the config file and the
@@ -126,28 +136,23 @@
 # Stats are dumped and reset at the state transition.
 period = 250000000
 
-# We specify the states in a config file input to the traffic generator.
-cfg_file_name = "lowp_sweep.cfg"
-cfg_file_path = os.path.dirname(__file__) + "/" + cfg_file_name
-cfg_file = open(cfg_file_path, 'w')
-
 # Get the number of banks
-nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value)
+nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value)
 
 # determine the burst size in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)
 
 # next, get the page size in bytes (the rowbuffer size is already in bytes)
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value
 
 # Inter-request delay should be such that we can hit as many transitions
 # to/from low power states as possible to. We provide a min and max itt to the
 # traffic generator and it randomises in the range. The parameter is in
 # seconds and we need it in ticks (ps).
-itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000
+itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000
 
 # The itt value when set to (tRAS + tRP + tCK) covers the case where
 # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the
@@ -155,9 +160,9 @@
 # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK.
 # As we use this delay as a unit and create multiples of it as bigger delays
 # for the sweep, this parameter works for reads, writes and mix of them.
-pd_entry_time = (system.mem_ctrls[0].tRAS.value +
-                 system.mem_ctrls[0].tRP.value +
-                 system.mem_ctrls[0].tCK.value) * 1000000000000
+pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value +
+                 system.mem_ctrls[0].dram.tRP.value +
+                 system.mem_ctrls[0].dram.tCK.value) * 1000000000000
 
 # We sweep itt max using the multipliers specified by the user.
 itt_max_str = args.itt_list.strip().split()
@@ -180,42 +185,11 @@
 # banks
 bank_util_values = [1, int(nbr_banks/2), nbr_banks]
 
-# Next we create the config file, but first a comment
-cfg_file.write("""# STATE state# period mode=DRAM
-# read_percent start_addr end_addr req_size min_itt max_itt data_limit
-# stride_size page_size #banks #banks_util addr_map #ranks\n""")
-
-addr_map = m5.objects.AddrMap.map[args.addr_map]
-
-nxt_state = 0
-for itt_max in itt_max_values:
-    for bank in bank_util_values:
-        for stride_size in stride_values:
-            cfg_file.write("STATE %d %d %s %d 0 %d %d "
-                           "%d %d %d %d %d %d %d %d %d\n" %
-                           (nxt_state, period, "DRAM", args.rd_perc, max_addr,
-                            burst_size, itt_min, itt_max, 0, stride_size,
-                            page_size, nbr_banks, bank, addr_map,
-                            args.mem_ranks))
-            nxt_state = nxt_state + 1
-
 # State for idle period
 idle_period = args.idle_end
-cfg_file.write("STATE %d %d IDLE\n" % (nxt_state, idle_period))
-
-# Init state is state 0
-cfg_file.write("INIT 0\n")
-
-# Go through the states one by one
-for state in range(1, nxt_state + 1):
-    cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state))
-
-# Transition from last state to itself to not break the probability math
-cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state, nxt_state))
-cfg_file.close()
 
 # create a traffic generator, and point it to the file we just created
-system.tgen = TrafficGen(config_file = cfg_file_path)
+system.tgen = PyTrafficGen()
 
 # add a communication monitor
 system.monitor = CommMonitor()
@@ -230,14 +204,34 @@
 # every period, dump and reset all stats
 periodicStatDump(period)
 
+# run Forrest, run!
 root = Root(full_system = False, system = system)
 root.system.mem_mode = 'timing'
 
 m5.instantiate()
 
+def trace():
+    addr_map = ObjectList.dram_addr_map_list.get(args.addr_map)
+    generator = dram_generators["DRAM"](system.tgen)
+    for itt_max in itt_max_values:
+        for bank in bank_util_values:
+            for stride_size in stride_values:
+                num_seq_pkts = int(math.ceil(float(stride_size) / burst_size))
+                yield generator(period,
+                                0, max_addr, burst_size, int(itt_min),
+                                int(itt_max), args.rd_perc, 0,
+                                num_seq_pkts, page_size, nbr_banks, bank,
+                                addr_map, args.mem_ranks)
+    yield system.tgen.createIdle(idle_period)
+    yield system.tgen.createExit(0)
+
+system.tgen.start(trace())
+
 # Simulate for exactly as long as it takes to go through all the states
 # This is why sim exists.
-m5.simulate(nxt_state * period + idle_period)
+m5.simulate()
+
 print("--- Done DRAM low power sweep ---")
 print("Fixed params - ")
 print("\tburst: %d, banks: %d, max stride: %d, itt min: %s ns" %
@@ -247,4 +241,3 @@
 print("\titt max values", itt_max_values)
 print("\tbank utilization values", bank_util_values)
 print("\tstride values:", stride_values)
-print("Traffic gen config file:", cfg_file_name)
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index d3c86c3..6a49f44 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@

 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")
 
 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
 
 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map
 
 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated
 
 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value
 
 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)
 
 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value
 
 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt = getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-              system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt = getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+              system.mem_ctrls[0].dram.tBURST.value) * 1000000000000
 
 # assume we start at 0
 max_addr = mem_range.end
diff --git a/configs/learning_gem5/part1/simple.py b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part1/two_level.py b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
 system.system_port = system.membus.slave
 
 # Create a DDR3 memory controller
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Create a process for a simple "Hello World" application
diff --git a/configs/learning_gem5/part2/simple_cache.py b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part2/simple_memobj.py b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part3/simple_ruby.py b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
 system.cpu = [TimingSimpleCPU() for i in range(2)]
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 
 # create the interrupt controller for the CPU and connect to the membus
 for cpu in system.cpu:
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..dff5000 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@

 from m5.params import *
 from m5.proxy import *
-from m5.objects.AbstractMemory import *
 from m5.objects.QoSMemCtrl import *
 
 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']
 
-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
-
-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']
-
 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM
@@ -72,8 +58,11 @@
     # bus in front of the controller for multiple ports
     port = SlavePort("Slave port")
 
-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface(Parent.any, "DRAM interface")
+
+    # Set default buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
     # the cacheline size or request/packet size
     write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
    @@ -93,15 +82,6 @@

     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")
 
-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")
-
     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +89,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")

-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst lenght (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-    # DRAMPower provides in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
-    # timing behaviour and constraints - all in nanoseconds
-    # the base clock period of the DRAM
-    tCK = Param.Latency("Clock period")
-    # the amount of time in nanoseconds from issuing an activate command
-    # to the data being available in the row buffer for a read/write
-    tRCD = Param.Latency("RAS to CAS delay")
-    # the time from issuing a read/write command to seeing the actual data
-    tCL = Param.Latency("CAS latency")
-    # minimum time between a precharge and subsequent activate
-    tRP = Param.Latency("Row precharge time")
-    # minimum time between an activate and a precharge to the same row
-    tRAS = Param.Latency("ACT to PRE delay")
-    # minimum time between a write data transfer and a precharge
-    tWR = Param.Latency("Write recovery time")
-    # minimum time between a read and precharge command
-    tRTP = Param.Latency("Read to precharge")
-    # time to complete a burst transfer, typically the burst length
-    # divided by two due to the DDR bus, but by making it a parameter
-    # it is easier to also evaluate SDR memories like WideIO.
-    # This parameter has to account for burst length.
-    # Read/Write requests with data size larger than one full burst are broken
-    # down into multiple requests in the controller
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = Param.Latency("Burst duration "
-                           "(typically burst length / 2 cycles)")
-    # tBURST_MAX is the column array cycle delay required before next access,
-    # which could be greater than tBURST when the memory access time is greater
-    # than tBURST
-    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
-    # tBURST_MIN is the minimum delay between bursts, which could be less than
-    # tBURST when interleaving is supported
-    tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts")
-    # CAS-to-CAS delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
-    # Write-to-Write delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # This will be used to enable different same bank group delays
-    # for writes versus reads
-    tCCD_L_WR = Param.Latency(Self.tCCD_L,
-        "Same bank group Write to Write delay")
-    # time taken to complete one refresh cycle (N rows in all banks)
-    tRFC = Param.Latency("Refresh cycle time")
-    # refresh command interval, how often a "ref" command needs
-    # to be sent. It is 7.8 us for a 64ms refresh requirement
-    tREFI = Param.Latency("Refresh command interval")
-    # write-to-read, same rank turnaround penalty
-    tWTR = Param.Latency("Write to read, same rank switching time")
-    # write-to-read, same rank turnaround penalty for same bank group
-    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
-                           "time, same bank group")
-    # read-to-write, same rank turnaround penalty
-    tRTW = Param.Latency("Read to write, same rank switching time")
-    # rank-to-rank bus delay penalty
-    # this does not correlate to a memory timing parameter and encompasses:
-    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
-    # different rank bus delay
-    tCS = Param.Latency("Rank to rank switching time")
-    # minimum precharge to precharge delay time
-    tPPD = Param.Latency("0ns", "PRE to PRE delay")
-    # maximum delay between two-cycle ACT command phases
-    tAAD = Param.Latency(Self.tCK,
-                         "Maximum delay between two-cycle ACT commands")
-    two_cycle_activate = Param.Bool(False,
-                         "Two cycles required to send activate")
-    # minimum row activate to row activate delay time
-    tRRD = Param.Latency("ACT to ACT delay")
-    # only utilized with bank group architectures; set to 0 for default case
-    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
-    # time window in which a maximum number of activates are allowed
-    # to take place, set to 0 to disable
-    tXAW = Param.Latency("X activation window")
-    activation_limit = Param.Unsigned("Max number of activates in window")
-    # time to exit power-down mode
-    # Exit power-down to next valid command delay
-    tXP = Param.Latency("0ns", "Power-up Delay")
-    # Exit Powerdown to commands requiring a locked DLL
-    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
-    # time to exit self-refresh mode
-    tXS = Param.Latency("0ns", "Self-refresh exit latency")
-    # time to exit self-refresh mode with locked DLL
-    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
-    # number of data beats per clock. with DDR, default is 2, one per edge
-    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
-    data_clock_sync = Param.Bool(False, "Synchronization commands required")
-    # Currently rolled into other params
-    ######################################################################
-    # tRC  - assumed to be tRAS + tRP
-    # Power Behaviour and Constraints
-    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
-    # defined as VDD and VDD2. Each current is defined for each voltage domain
-    # separately. For example, current IDD0 is active-precharge current for
-    # voltage domain VDD and current IDD02 is active-precharge current for
-    # voltage domain VDD2.
-    # By default all currents are set to 0mA. Users who are only interested in
-    # the performance of DRAMs can leave them at 0.
-    # Operating 1 Bank Active-Precharge current
-    IDD0 = Param.Current("0mA", "Active precharge current")
-    # Operating 1 Bank Active-Precharge current multiple voltage Range
-    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
-    # Precharge Power-down Current: Slow exit
-    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
-    # Precharge Power-down Current: Slow exit multiple voltage Range
-    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
-    # Precharge Power-down Current: Fast exit
-    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
-    # Precharge Power-down Current: Fast exit multiple voltage Range
-    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
-    # Precharge Standby current
-    IDD2N = Param.Current("0mA", "Precharge Standby current")
-    # Precharge Standby current multiple voltage range
-    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
-    # Active Power-down current: slow exit
-    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
-    # Active Power-down current: slow exit multiple voltage range
-    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
-    # Active Power-down current : fast exit
-    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
-    # Active Power-down current : fast exit multiple voltage range
-    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
-    # Active Standby current
-    IDD3N = Param.Current("0mA", "Active Standby current")
-    # Active Standby current multiple voltage range
-    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
-    # Burst Read Operating Current
-    IDD4R = Param.Current("0mA", "READ current")
-    # Burst Read Operating Current multiple voltage range
-    IDD4R2 = Param.Current("0mA", "READ current VDD2")
-    # Burst Write Operating Current
-    IDD4W = Param.Current("0mA", "WRITE current")
-    # Burst Write Operating Current multiple voltage range
-    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
-    # Refresh Current
-    IDD5 = Param.Current("0mA", "Refresh current")
-    # Refresh Current multiple voltage range
-    IDD52 = Param.Current("0mA", "Refresh current VDD2")
-    # Self-Refresh Current
-    IDD6 = Param.Current("0mA", "Self-refresh Current")
-    # Self-Refresh Current multiple voltage range
-    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
-    # Main voltage range of the DRAM
-    VDD = Param.Voltage("0V", "Main Voltage Range")
-    # Second voltage range defined by some DRAMs
-    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
-# A single DDR3-1600 x64 channel (one command and address bus), with
-# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
-# an 8x8 configuration.
-class DDR3_1600_8x8(DRAMCtrl):
-    # size of device in bytes
-    device_size = '512MB'
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # DDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # Use two ranks
-    ranks_per_channel = 2
-    # DDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
-    tBURST = '5ns'
-    # DDR3-1600 11-11-11
-    tRCD = '13.75ns'
-    tCL = '13.75ns'
-    tRP = '13.75ns'
-    tRAS = '35ns'
-    tRRD = '6ns'
-    tXAW = '30ns'
-    activation_limit = 4
-    tRFC = '260ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns
-    tWTR = '7.5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    tXS = '270ns'
-    # Current values from datasheet Die Rev E,J
-    IDD0 = '55mA'
-    IDD2N = '32mA'
-    IDD3N = '38mA'
-    IDD4W = '125mA'
-    IDD4R = '157mA'
-    IDD5 = '235mA'
-    IDD3P1 = '38mA'
-    IDD2P1 = '32mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single HMC-2500 x32 model based on:
-# [1] DRAMSpec: a high-level DRAM bank modelling tool
-# developed at the University of Kaiserslautern. This high level tool
-# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
-# estimate the DRAM bank latency and power numbers.
-# [2] High performance AXI-4.0 based interconnect for extensible smart memory
-# cubes (E. Azarkhish et. al)
-# Assumed for the HMC model is a 30 nm technology node.
-# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4
-# layers).
-# Each layer has 16 vaults and each vault consists of 2 banks per layer.
-# In order to be able to use the same controller used for 2D DRAM generations
-# for HMC, the following analogy is done:
-# Channel (DDR) => Vault (HMC)
-# device_size (DDR) => size of a single layer in a vault
-# ranks per channel (DDR) => number of layers
-# banks per rank (DDR) => banks per layer
-# devices per rank (DDR) => devices per layer ( 1 for HMC).
-# The parameters for which no input is available are inherited from the DDR3
-# configuration.
-# This configuration includes the latencies from the DRAM to the logic layer
-# of the HMC
-class HMC_2500_1x32(DDR3_1600_8x8):
-    # size of device
-    # two banks per device with each bank 4MB [2]
-    device_size = '8MB'
-    # 1x32 configuration, 1 device with 32 TSVs [2]
-    device_bus_width = 32
-    # HMC is a BL8 device [2]
-    burst_length = 8
-    # Each device has a page (row buffer) size of 256 bytes [2]
-    device_rowbuffer_size = '256B'
-    # 1x32 configuration, so 1 device [2]
-    devices_per_rank = 1
-    # 4 layers so 4 ranks [2]
-    ranks_per_channel = 4
-    # HMC has 2 banks per layer [2]
-    # Each layer represents a rank. With 4 layers and 8 banks in total, each
-    # layer has 2 banks; thus 2 banks per rank.
-    banks_per_rank = 2
-    # 1250 MHz [2]
-    tCK = '0.8ns'
-    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
-    tBURST = '3.2ns'
-    # Values using DRAMSpec HMC model [1]
-    tRCD = '10.2ns'
-    tCL = '9.9ns'
-    tRP = '7.7ns'
-    tRAS = '21.6ns'
-    # tRRD depends on the power supply network for each vendor.
-    # We assume a tRRD of a double bank approach to be equal to 4 clock
-    # cycles (Assumption)
-    tRRD = '3.2ns'
-    # activation limit is set to 0 since there are only 2 banks per vault
-    # layer.
-    activation_limit = 0
-    # Values using DRAMSpec HMC model [1]
-    tRFC = '59ns'
-    tWR = '8ns'
-    tRTP = '4.9ns'
-    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz =
-    # 0.8 ns (Assumption)
-    tCS = '0.8ns'
-    # Value using DRAMSpec HMC model [1]
-    tREFI = '3.9us'
-    # The default page policy in the vault controllers is simple closed page
-    # [2] nevertheless 'close' policy opens and closes the row multiple times
-    # for bursts largers than 32Bytes. For this reason we use 'close_adaptive'
-    page_policy = 'close_adaptive'
-    # RoCoRaBaCh resembles the default address mapping in HMC
-    addr_mapping = 'RoCoRaBaCh'
-    min_writes_per_switch = 8
-    # These parameters do not directly correlate with buffer_size in real
-    # hardware. Nevertheless, their value has been tuned to achieve a
-    # bandwidth similar to the cycle-accurate model in [2]
-    write_buffer_size = 32
-    read_buffer_size = 32
-    # The static latency of the vault controllers is estimated to be smaller
-    # than a full DRAM channel controller
-    static_backend_latency='4ns'
-    static_frontend_latency='4ns'

-# A single DDR3-2133 x64 channel refining a selected subset of the
-# options for the DDR-1600 configuration, based on the same DDR3-1600
-# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
-# consistent across the two configurations.
-class DDR3_2133_8x8(DDR3_1600_8x8):
-    # 1066 MHz
-    tCK = '0.938ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
-    tBURST = '3.752ns'
-    # DDR3-2133 14-14-14
-    tRCD = '13.09ns'
-    tCL = '13.09ns'
-    tRP = '13.09ns'
-    tRAS = '33ns'
-    tRRD = '5ns'
-    tXAW = '25ns'
-    # Current values from datasheet
-    IDD0 = '70mA'
-    IDD2N = '37mA'
-    IDD3N = '44mA'
-    IDD4W = '157mA'
-    IDD4R = '191mA'
-    IDD5 = '250mA'
-    IDD3P1 = '44mA'
-    IDD2P1 = '43mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
-# in an 16x4 configuration.
-# Total channel capacity is 32GB
-# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
-class DDR4_2400_16x4(DRAMCtrl):
-    # size of device
-    device_size = '1GB'
-    # 16x4 configuration, 16 devices each with a 4-bit interface
-    device_bus_width = 4
-    # DDR4 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
-    device_rowbuffer_size = '512B'
-    # 16x4 configuration, so 16 devices
-    devices_per_rank = 16
-    # Match our DDR3 configurations which is dual rank
-    ranks_per_channel = 2
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 4 for x4 case
-    bank_groups_per_rank = 4
-    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 16
-    # override the default buffer sizes and go for something larger to
-    # accommodate the larger bank count
-    write_buffer_size = 128
-    read_buffer_size = 64
-    # 1200 MHz
-    tCK = '0.833ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '3.332ns'
-    # @2400 data rate, tCCD_L is 6 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '5ns';
-    # DDR4-2400 17-17-17
-    tRCD = '14.16ns'
-    tCL = '14.16ns'
-    tRP = '14.16ns'
-    tRAS = '32ns'
-    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
-    tRRD = '3.332ns'
-    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns';
-    # tFAW for 512B page is MAX(16 CK, 13ns)
-    tXAW = '13.328ns'
-    activation_limit = 4
-    # tRFC is 350ns
-    tRFC = '350ns'
-    tWR = '15ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
-    tRTW = '1.666ns'
-    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
-    tCS = '1.666ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
-    # tRFC + 10ns = 340ns
-    tXS = '340ns'
-    # Current values from datasheet
-    IDD0 = '43mA'
-    IDD02 = '3mA'
-    IDD2N = '34mA'
-    IDD3N = '38mA'
-    IDD3N2 = '3mA'
-    IDD4W = '103mA'
-    IDD4R = '110mA'
-    IDD5 = '250mA'
-    IDD3P1 = '32mA'
-    IDD2P1 = '25mA'
-    IDD6 = '30mA'
-    VDD = '1.2V'
-    VDD2 = '2.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
-# in an 8x8 configuration.
-# Total channel capacity is 16GB
-# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
-class DDR4_2400_8x8(DDR4_2400_16x4):
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns';
-    tXAW = '21ns'
-    # Current values from datasheet
-    IDD0 = '48mA'
-    IDD3N = '43mA'
-    IDD4W = '123mA'
-    IDD4R = '135mA'
-    IDD3P1 = '37mA'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
-# in an 4x16 configuration.
-# Total channel capacity is 4GB
-# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel
-class DDR4_2400_4x16(DDR4_2400_16x4):
-    # 4x16 configuration, 4 devices each with an 16-bit interface
-    device_bus_width = 16
-    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
-    device_rowbuffer_size = '2kB'
-    # 4x16 configuration, so 4 devices
-    devices_per_rank = 4
-    # Single rank for x16
-    ranks_per_channel = 1
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 2 for x16 case
-    bank_groups_per_rank = 2
-    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 8
-    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
-    tRRD = '5.3ns'
-    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
-    tRRD_L = '6.4ns';
-    tXAW = '30ns'
-    # Current values from datasheet
-    IDD0 = '80mA'
-    IDD02 = '4mA'
-    IDD2N = '34mA'
-    IDD3N = '47mA'
-    IDD4W = '228mA'
-    IDD4R = '243mA'
-    IDD5 = '280mA'
-    IDD3P1 = '41mA'

-# A single LPDDR2-S4 x32 interface (one command/address bus), with
-# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
-# in a 1x32 configuration.
-class LPDDR2_S4_1066_1x32(DRAMCtrl):
-    # No DLL in LPDDR2
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR2_S4 is a BL4 and BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '1kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR2-S4 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 533 MHz
-    tCK = '1.876ns'
-    # Fixed at 15 ns
-    tRCD = '15ns'
-    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
-    tCL = '15ns'
-    # Pre-charge one bank 15 ns (all banks 18 ns)
-    tRP = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    tRTP = '7.5ns'
-    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '7.5ns'
-    # LPDDR2-S4, 4 Gbit
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
-    tRTW = '3.75ns'
-    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
-    tCS = '3.75ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of density, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '15mA'
-    IDD02 = '70mA'
-    IDD2N = '2mA'
-    IDD2N2 = '30mA'
-    IDD3N = '2.5mA'
-    IDD3N2 = '30mA'
-    IDD4W = '10mA'
-    IDD4W2 = '190mA'
-    IDD4R = '3mA'
-    IDD4R2 = '220mA'
-    IDD5 = '40mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.2mA'
-    IDD3P12 = '8mA'
-    IDD2P1 = '0.6mA'
-    IDD2P12 = '0.8mA'
-    IDD6 = '1mA'
-    IDD62 = '3.2mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single WideIO x128 interface (one command and address bus), with
-# default timings based on an estimated WIO-200 8 Gbit part.
-class WideIO_200_1x128(DRAMCtrl):
-    # No DLL for WideIO
-    dll = False
-    # size of device
-    device_size = '1024MB'
-    # 1x128 configuration, 1 device with a 128-bit interface
-    device_bus_width = 128
-    # This is a BL4 device
-    burst_length = 4
-    # Each device has a page (row buffer) size of 4KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '4kB'
-    # 1x128 configuration, so 1 device
-    devices_per_rank = 1
-    # Use one rank for a one-high die stack
-    ranks_per_channel = 1
-    # WideIO has 4 banks in all configurations
-    banks_per_rank = 4
-    # 200 MHz
-    tCK = '5ns'
-    # WIO-200
-    tRCD = '18ns'
-    tCL = '18ns'
-    tRP = '18ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Read to precharge is same as the burst
-    tRTP = '20ns'
-    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
-    # Note this is a BL4 SDR device.
-    tBURST = '20ns'
-    # WIO 8 Gb
-    tRFC = '210ns'
-    # WIO 8 Gb, <=85C, half for >85C
-    tREFI = '3.9us'
-    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
-    tWTR = '15ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
-    tRTW = '10ns'
-    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
-    tCS = '10ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Two instead of four activation window
-    tXAW = '50ns'
-    activation_limit = 2
-    # The WideIO specification does not provide current information

-# A single LPDDR3 x32 interface (one command/address bus), with
-# default timings based on a LPDDR3-1600 4 Gbit part (Micron
-# EDF8132A1MC) in a 1x32 configuration.
-class LPDDR3_1600_1x32(DRAMCtrl):
-    # No DLL for LPDDR3
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 4KB
-    device_rowbuffer_size = '4kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Technically the datasheet is a dual-rank package, but for
-    # comparison with the LPDDR2 config we stick to a single rank
-    ranks_per_channel = 1
-    # LPDDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    tRCD = '18ns'
-    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
-    tCL = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
-    tRTP = '7.5ns'
-    # Pre-charge one bank 18 ns (all banks 21 ns)
-    tRP = '18ns'
-    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '5ns'
-    # LPDDR3, 4 Gb
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of size, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '8mA'
-    IDD02 = '60mA'
-    IDD2N = '0.8mA'
-    IDD2N2 = '26mA'
-    IDD3N = '2mA'
-    IDD3N2 = '34mA'
-    IDD4W = '2mA'
-    IDD4W2 = '190mA'
-    IDD4R = '2mA'
-    IDD4R2 = '230mA'
-    IDD5 = '28mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.4mA'
-    IDD3P12 = '11mA'
-    IDD2P1 = '0.8mA'
-    IDD2P12 = '1.8mA'
-    IDD6 = '0.5mA'
-    IDD62 = '1.8mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single GDDR5 x64 interface, with
-# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
-# H5GQ1H24AFR) in a 2x32 configuration.
-class GDDR5_4000_2x32(DRAMCtrl):
-    # size of device
-    device_size = '128MB'
-    # 2x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # GDDR5 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
-    device_rowbuffer_size = '256B'
-    # 2x32 configuration, so 2 devices
-    devices_per_rank = 2
-    # assume single rank
-    ranks_per_channel = 1
-    # GDDR5 has 4 bank groups
-    bank_groups_per_rank = 4
-    # GDDR5 has 16 banks with 4 bank groups
-    banks_per_rank = 16
-    # 1000 MHz
-    tCK = '1ns'
-    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
-    # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz )
-    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '2ns'
-    # @1000MHz data rate, tCCD_L is 3 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '3ns';
-    tRCD = '12ns'
-    # tCL is not directly found in datasheet and assumed equal tRCD
-    tCL = '12ns'
-    tRP = '12ns'
-    tRAS = '28ns'
-    # RRD_S (different bank group)
-    # RRD_S is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD = '6ns'
-    # RRD_L (same bank group)
-    # RRD_L is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD_L = '6ns'
-    tXAW = '23ns'
-    # tXAW < 4 x tRRD.
-    # Therefore, activation limit is set to 0
-    activation_limit = 0
-    tRFC = '65ns'
-    tWR = '12ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Read-to-Precharge 2 CK
-    tRTP = '2ns'
-    # Assume 2 cycles
-    tRTW = '2ns'

-# A single HBM x128 interface (one command and address bus), with
-# default timings based on data publically released
-# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
-# IDD measurement values, and by extrapolating data from other classes.
-# Architecture values based on published HBM spec
-# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
-class HBM_1000_4H_1x128(DRAMCtrl):
-    # HBM gen1 supports up to 8 128-bit physical channels
-    # Configuration defines a single channel, with the capacity
-    # set to (full_ stack_capacity / 8) based on 2Gb dies
-    # To use all 8 channels, set 'channels' parameter to 8 in
-    # system configuration
-    # 128-bit interface legacy mode
-    device_bus_width = 128
-    # HBM supports BL4 and BL2 (legacy mode only)
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
-    # with 8 channels, 128MB per channel
-    device_size = '128MB'
-    device_rowbuffer_size = '2kB'
-    # 1x128 configuration
-    devices_per_rank = 1
-    # HBM does not have a CS pin; set rank to 1
-    ranks_per_channel = 1
-    # HBM has 8 or 16 banks depending on capacity
-    # 2Gb dies have 8 banks
-    banks_per_rank = 8
-    # depending on frequency, bank groups may be required
-    # will always have 4 bank groups when enabled
-    # current specifications do not define the minimum frequency for
-    # bank group architecture
-    # setting bank_groups_per_rank to 0 to disable until range is defined
-    bank_groups_per_rank = 0
-    # 500 MHz for 1Gbps DDR data rate
-    tCK = '2ns'
-    # use values from IDD measurement in JEDEC spec
-    # use tRP value for tRCD and tCL similar to other classes
-    tRP = '15ns'
-    tRCD = '15ns'
-    tCL = '15ns'
-    tRAS = '33ns'
-    # BL2 and BL4 supported, default to BL4
-    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
-    tBURST = '4ns'
-    # value for 2Gb device from JEDEC spec
-    tRFC = '160ns'
-    # value for 2Gb device from JEDEC spec
-    tREFI = '3.9us'
-    # extrapolate the following from LPDDR configs, using ns values
-    # to minimize burst length, prefetch differences
-    tWR = '18ns'
-    tRTP = '7.5ns'
-    tWTR = '10ns'
-    # start with 2 cycles turnaround, similar to other memory classes
-    # could be more with variations across the stack
-    tRTW = '4ns'
-    # single rank device, set to 0
-    tCS = '0ns'
-    # from MemCon example, tRRD is 4ns with 2ns tCK
-    tRRD = '4ns'
-    # from MemCon example, tFAW is 30ns with 2ns tCK
-    tXAW = '30ns'
-    activation_limit = 4
-    # 4tCK
-    tXP = '8ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '168ns'

-# A single HBM x64 interface (one command and address bus), with
-# default timings based on HBM gen1 and data publically released
-# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
-# Note: This defines a pseudo-channel with a unique controller
-# instantiated per pseudo-channel
-# Stay at same IO rate (1Gbps) to maintain timing relationship with
-# HBM gen1 class (HBM_1000_4H_x128) where possible
-class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
-    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
-    # Configuration defines a single pseudo channel, with the capacity
-    # set to (full_ stack_capacity / 16) based on 8Gb dies
-    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
-    # system configuration
-    # 64-bit pseudo-channle interface
-    device_bus_width = 64
-    # HBM pseudo-channel only supports BL4
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
-    # with 16 channels, 256MB per channel
-    device_size = '256MB'
-    # page size is halved with pseudo-channel; maintaining the same same number
-    # of rows per pseudo-channel with 2X banks across 2 channels
-    device_rowbuffer_size = '1kB'
-    # HBM has 8 or 16 banks depending on capacity
-    # Starting with 4Gb dies, 16 banks are defined
-    banks_per_rank = 16
-    # reset tRFC for larger, 8Gb device
-    # use HBM1 4Gb value as a starting point
-    tRFC = '260ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '268ns'
-    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
-    tCS = '2ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '10ns'
-    # self refresh exit time
-    tXS = '65ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl):

  • Increase buffer size to account for more bank resources

  • read_buffer_size = 64
  • Set page policy to better suit DMC Huxley

  • page_policy = 'close_adaptive'
  • 16-bit channel interface

  • device_bus_width = 16
  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL32 for higher command bandwidth

  • burst_length = 32
  • size of device in bytes

  • device_size = '1GB'
  • 2kB page with BG mode

  • device_rowbuffer_size = '2kB'
  • Use a 1x16 configuration

  • devices_per_rank = 1
  • Use a single rank

  • ranks_per_channel = 1
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Initial configuration will have 16 banks with Bank Group Arch

  • to maximim resources and enable higher data rates

  • banks_per_rank = 16
  • bank_groups_per_rank = 4
  • 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK

  • tCK = '1.455ns'
  • Greater of 2 CK or 18ns

  • tRCD = '18ns'
  • Base RL is 16 CK @ 687.5 MHz = 23.28ns

  • tCL = '23.280ns'
  • Greater of 2 CK or 18ns

  • tRP = '18ns'
  • Greater of 3 CK or 42ns

  • tRAS = '42ns'
  • Greater of 3 CK or 34ns

  • tWR = '34ns'
  • active powerdown and precharge powerdown exit time

  • Greater of 3 CK or 7ns

  • tXP = '7ns'
  • self refresh exit time (tRFCab + 7.5ns)

  • tXS = '217.5ns'
  • Greater of 2 CK or 7.5 ns minus 2 CK

  • tRTP = '4.59ns'
  • With BG architecture, burst of 32 transferred in two 16-beat

  • sub-bursts, with a 16-beat gap in between.

  • Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz

  • tBURST is the delay to transfer the Bstof32 =  6 CK @ 687.5 MHz

  • tBURST = '8.73ns'
  • can interleave a Bstof32 from another bank group at tBURST_MIN

  • 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz

  • tBURST_MIN = '2.91ns'
  • tBURST_MAX is the maximum burst delay for same bank group timing

  • this is 8 CK @ 687.5 MHz

  • tBURST_MAX = '11.64ns'
  • 8 CK @ 687.5 MHz

  • tCCD_L = "11.64ns"
  • LPDDR5, 8 Gbit/channel for 280ns tRFCab

  • tRFC = '210ns'
  • tREFI = '3.9us'
  • Greater of 4 CK or 6.25 ns

  • tWTR = '6.25ns'
  • Greater of 4 CK or 12 ns

  • tWTR_L = '12ns'
  • Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL

  • tWCKDQ0/tCK will be 1 CK for most cases

  • For gem5 RL = WL and BL/n is already accounted for with tBURST

  • Result is and additional 1 CK is required

  • tRTW = '1.455ns'
  • Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns

  • tCS = '2.91ns'
  • 2 CK

  • tPPD = '2.91ns'
  • Greater of 2 CK or 5 ns

  • tRRD = '5ns'
  • tRRD_L = '5ns'
  • With Bank Group Arch mode tFAW is 20 ns

  • tXAW = '20ns'
  • activation_limit = 4
  • at 5Gbps, 4:1 WCK to CK ratio required

  • 2 data beats per WCK (DDR) -> 8 per CK

  • beats_per_clock = 8
  • 2 cycles required to send activate command

  • 2 command phases can be sent back-to-back or

  • with a gap up to tAAD = 8 CK

  • two_cycle_activate = True
  • tAAD = '11.640ns'
  • data_clock_sync = True

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):

  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL16 for smaller access granularity

  • burst_length = 16
  • For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST = '2.91ns'
  • tBURST_MIN = '2.91ns'
  • For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST_MAX = '5.82ns'
  • 4 CK @ 687.5 MHz

  • tCCD_L = "5.82ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):

  • 4kB page with 8B mode

  • device_rowbuffer_size = '4kB'
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Select 8B

  • banks_per_rank = 8
  • bank_groups_per_rank = 0
  • For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST = '5.82ns'
  • tBURST_MIN = '5.82ns'
  • tBURST_MAX = '5.82ns'
  • Greater of 4 CK or 12 ns

  • tWTR = '12ns'
  • Greater of 2 CK or 10 ns

  • tRRD = '10ns'
  • With 8B mode tFAW is 40 ns

  • tXAW = '40ns'
  • activation_limit = 4
  • Reset BG arch timing for 8B mode

  • tCCD_L = "0ns"
  • tRRD_L = "0ns"
  • tWTR_L = "0ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):

  • 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK

  • tCK = '1.25ns'
  • Base RL is 17 CK @ 800 MHz = 21.25ns

  • tCL = '21.25ns'
  • With BG architecture, burst of 32 transferred in two 16-beat

  • sub-bursts, with a 16-beat gap in between.

  • Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz

  • tBURST is the delay to transfer the Bstof32 =  6 CK @ 800 MHz

  • tBURST = '7.5ns'
  • can interleave a Bstof32 from another bank group at tBURST_MIN

  • 16-beats is 8 WCK @2.3 GHz or 2 CK @ 800 MHz

  • tBURST_MIN = '2.5ns'
  • tBURST_MAX is the maximum burst delay for same bank group timing

  • this is 8 CK @ 800 MHz

  • tBURST_MAX = '10ns'
  • 8 CK @ 800 MHz

  • tCCD_L = "10ns"
  • Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL

  • tWCKDQ0/tCK will be 1 CK for most cases

  • For gem5 RL = WL and BL/n is already accounted for with tBURST

  • Result is and additional 1 CK is required

  • tRTW = '1.25ns'
  • Default different rank bus delay to 2 CK, @687.5 MHz = 2.5 ns

  • tCS = '2.5ns'
  • 2 CK

  • tPPD = '2.5ns'
  • 2 command phases can be sent back-to-back or

  • with a gap up to tAAD = 8 CK

  • tAAD = '10ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on initial
-# JEDEC specifcation
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):

  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL16 for smaller access granularity

  • burst_length = 16
  • For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio

  • tBURST = '2.5ns'
  • tBURST_MIN = '2.5ns'
  • For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio

  • tBURST_MAX = '5ns'
  • 4 CK @ 800 MHz

  • tCCD_L = "5ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):

  • 4kB page with 8B mode

  • device_rowbuffer_size = '4kB'
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Select 8B

  • banks_per_rank = 8
  • bank_groups_per_rank = 0
  • For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio

  • tBURST = '5ns'
  • tBURST_MIN = '5ns'
  • tBURST_MAX = '5ns'
  • Greater of 4 CK or 12 ns

  • tWTR = '12ns'
  • Greater of 2 CK or 10 ns

  • tRRD = '10ns'
  • With 8B mode tFAW is 40 ns

  • tXAW = '40ns'
  • activation_limit = 4
  • Reset BG arch timing for 8B mode

  • tCCD_L = "0ns"
  • tRRD_L = "0ns"
  • tWTR_L = "0ns"
    diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py
    new file mode 100644
    index 0000000..35bf8a3
    --- /dev/null
    +++ b/src/mem/DRAMInterface.py
    @@ -0,0 +1,1483 @@
    +# Copyright (c) 2012-2020 ARM Limited
    +# All rights reserved.
    +#
    +# The license below extends only to copyright in the software and shall
    +# not be construed as granting a license to any other intellectual
    +# property including but not limited to intellectual property relating
    +# to a hardware implementation of the functionality of the software
    +# licensed hereunder.  You may use the software subject to the license
    +# terms below provided that you ensure that this notice is replicated
    +# unmodified and in its entirety in all distributions of the software,
    +# modified or unmodified, in source code or in binary form.
    +#
    +# Copyright (c) 2013 Amin Farmahini-Farahani
    +# Copyright (c) 2015 University of Kaiserslautern
    +# Copyright (c) 2015 The University of Bologna
    +# All rights reserved.
    +#
    +# Redistribution and use in source and binary forms, with or without
    +# modification, are permitted provided that the following conditions are
    +# met: redistributions of source code must retain the above copyright
    +# notice, this list of conditions and the following disclaimer;
    +# redistributions in binary form must reproduce the above copyright
    +# notice, this list of conditions and the following disclaimer in the
    +# documentation and/or other materials provided with the distribution;
    +# neither the name of the copyright holders nor the names of its
    +# contributors may be used to endorse or promote products derived from
    +# this software without specific prior written permission.
    +#
    +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+from AbstractMemory import AbstractMemory
+from DRAMCtrl import *
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']

+class DRAMInterface(AbstractMemory):

+    type = 'DRAMInterface'
+    cxx_header = "mem/dram_ctrl.hh"
+
+    # scheduler, address map and page policy
+    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
+    page_policy = Param.PageManage('open_adaptive', "Page management policy")
+
+    # Allow the interface to set required controller buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
+    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
+    # the cacheline size or request/packet size
+    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
+    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+
+    # enforce a limit on the number of accesses per row
+    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
+                                          "closing");
+
+    # size of DRAM Chip in Bytes
+    device_size = Param.MemorySize("Size of DRAM chip")
+
+    # the physical organisation of the DRAM
+    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
+                                      "device/chip")
+    burst_length = Param.Unsigned("Burst length (BL) in beats")
+    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
+                                             "device/chip")
+    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
+    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
+
+    # default to 0 bank groups per rank, indicating bank group architecture
+    # is not used
+    # update per memory class when bank group architecture is supported
+    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
+    banks_per_rank = Param.Unsigned("Number of banks per rank")
+
+    # Enable DRAM powerdown states if True. This is False by default due to
+    # performance being lower when enabled
+    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
+
+    # For power modelling we need to know if the DRAM has a DLL or not
+    dll = Param.Bool(True, "DRAM has DLL or not")
+
+    # DRAMPower provides in addition to the core power, the possibility to
+    # include RD/WR termination and IO power. This calculation assumes some
+    # default values. The integration of DRAMPower with gem5 does not include
+    # IO and RD/WR termination power by default. This might be added as an
+    # additional feature in the future.
+
+    # timing behaviour and constraints - all in nanoseconds
+
+    # the base clock period of the DRAM
+    tCK = Param.Latency("Clock period")
+
+    # rank-to-rank bus delay penalty
+    # this does not correlate to a memory timing parameter and encompasses:
+    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
+    # different rank bus delay
+    tCS = Param.Latency("Rank to rank switching time")
+
+    # the amount of time in nanoseconds from issuing an activate command
+    # to the data being available in the row buffer for a read/write
+    tRCD = Param.Latency("RAS to CAS delay")
+
+    # the time from issuing a read/write command to seeing the actual data
+    tCL = Param.Latency("CAS latency")
+
+    # minimum time between a precharge and subsequent activate
+    tRP = Param.Latency("Row precharge time")
+
+    # minimum time between an activate and a precharge to the same row
+    tRAS = Param.Latency("ACT to PRE delay")
+
+    # minimum time between a write data transfer and a precharge
+    tWR = Param.Latency("Write recovery time")
+
+    # minimum time between a read and precharge command
+    tRTP = Param.Latency("Read to precharge")
+
+    # time to complete a burst transfer, typically the burst length
+    # divided by two due to the DDR bus, but by making it a parameter
+    # it is easier to also evaluate SDR memories like WideIO.
+    # This parameter has to account for burst length.
+    # Read/Write requests with data size larger than one full burst are broken
+    # down into multiple requests in the controller
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = Param.Latency("Burst duration "
+                           "(typically burst length / 2 cycles)")
+
+    # tBURST_MAX is the column array cycle delay required before next access,
+    # which could be greater than tBURST when the memory access time is
+    # greater than tBURST
+    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
+
+    # tBURST_MIN is the minimum delay between bursts, which could be less
+    # than tBURST when interleaving is supported
+    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
+
+    # CAS-to-CAS delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
+
+    # Write-to-Write delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # This will be used to enable different same bank group delays
+    # for writes versus reads
+    tCCD_L_WR = Param.Latency(Self.tCCD_L, "Same bank group Write to Write " \
+                                           "delay")
+
+    # time taken to complete one refresh cycle (N rows in all banks)
+    tRFC = Param.Latency("Refresh cycle time")
+
+    # refresh command interval, how often a "ref" command needs
+    # to be sent. It is 7.8 us for a 64ms refresh requirement
+    tREFI = Param.Latency("Refresh command interval")
+
+    # write-to-read, same rank turnaround penalty
+    tWTR = Param.Latency("Write to read, same rank switching time")
+
+    # write-to-read, same rank turnaround penalty for same bank group
+    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
+                           "time, same bank group")
+
+    # read-to-write, same rank turnaround penalty
+    tRTW = Param.Latency("Read to write, same rank switching time")
+
+    # minimum precharge to precharge delay time
+    tPPD = Param.Latency("0ns", "PRE to PRE delay")
+
+    # maximum delay between two-cycle ACT command phases
+    tAAD = Param.Latency(Self.tCK,
+                         "Maximum delay between two-cycle ACT commands")
+
+    two_cycle_activate = Param.Bool(False,
+                         "Two cycles required to send activate")
+
+    # minimum row activate to row activate delay time
+    tRRD = Param.Latency("ACT to ACT delay")
+
+    # only utilized with bank group architectures; set to 0 for default case
+    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
+
+    # time window in which a maximum number of activates are allowed
+    # to take place, set to 0 to disable
+    tXAW = Param.Latency("X activation window")
+    activation_limit = Param.Unsigned("Max number of activates in window")
+
+    # time to exit power-down mode
+    # Exit power-down to next valid command delay
+    tXP = Param.Latency("0ns", "Power-up Delay")
+
+    # Exit Powerdown to commands requiring a locked DLL
+    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
+
+    # time to exit self-refresh mode
+    tXS = Param.Latency("0ns", "Self-refresh exit latency")
+
+    # time to exit self-refresh mode with locked DLL
+    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
+
+    # number of data beats per clock. with DDR, default is 2, one per edge
+    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
+
+    data_clock_sync = Param.Bool(False, "Synchronization commands required")
+
+    # Currently rolled into other params
+    ######################################################################
+    # tRC  - assumed to be tRAS + tRP
+
+    # Power Behaviour and Constraints
+    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
+    # defined as VDD and VDD2. Each current is defined for each voltage domain
+    # separately. For example, current IDD0 is active-precharge current for
+    # voltage domain VDD and current IDD02 is active-precharge current for
+    # voltage domain VDD2.
+    # By default all currents are set to 0mA. Users who are only interested in
+    # the performance of DRAMs can leave them at 0.
+
+    # Operating 1 Bank Active-Precharge current
+    IDD0 = Param.Current("0mA", "Active precharge current")
+
+    # Operating 1 Bank Active-Precharge current multiple voltage Range
+    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
+
+    # Precharge Power-down Current: Slow exit
+    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
+
+    # Precharge Power-down Current: Slow exit multiple voltage Range
+    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
+
+    # Precharge Power-down Current: Fast exit
+    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
+
+    # Precharge Power-down Current: Fast exit multiple voltage Range
+    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
+
+    # Precharge Standby current
+    IDD2N = Param.Current("0mA", "Precharge Standby current")
+
+    # Precharge Standby current multiple voltage range
+    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
+
+    # Active Power-down current: slow exit
+    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
+
+    # Active Power-down current: slow exit multiple voltage range
+    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
+
+    # Active Power-down current: fast exit
+    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
+
+    # Active Power-down current: fast exit multiple voltage range
+    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
+
+    # Active Standby current
+    IDD3N = Param.Current("0mA", "Active Standby current")
+
+    # Active Standby current multiple voltage range
+    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
+
+    # Burst Read Operating Current
+    IDD4R = Param.Current("0mA", "READ current")
+
+    # Burst Read Operating Current multiple voltage range
+    IDD4R2 = Param.Current("0mA", "READ current VDD2")
+
+    # Burst Write Operating Current
+    IDD4W = Param.Current("0mA", "WRITE current")
+
+    # Burst Write Operating Current multiple voltage range
+    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
+
+    # Refresh Current
+    IDD5 = Param.Current("0mA", "Refresh current")
+
+    # Refresh Current multiple voltage range
+    IDD52 = Param.Current("0mA", "Refresh current VDD2")
+
+    # Self-Refresh Current
+    IDD6 = Param.Current("0mA", "Self-refresh Current")
+
+    # Self-Refresh Current multiple voltage range
+    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
+
+    # Main voltage range of the DRAM
+    VDD = Param.Voltage("0V", "Main Voltage Range")
+
+    # Second voltage range defined by some DRAMs
+    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
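
A minimal usage sketch for the parameters above: after this change a config
script instantiates the controller and hangs an interface off its new 'dram'
parameter (see the dram_ctrl.cc hunk below), with the address range set on
the interface rather than the controller. The 'system' and 'membus' objects
are the usual config-script assumptions, not part of this patch:

    system.mem_ctrl = DRAMCtrl()
    # the interface, not the controller, is now the AbstractMemory
    system.mem_ctrl.dram = DDR3_1600_8x8()
    # the address range is defined per interface
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master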

+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):

+    # size of device in bytes
+    device_size = '512MB'
+
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # DDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # Use two ranks
+    ranks_per_channel = 2
+
+    # DDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
+    tBURST = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # DDR3-1600 11-11-11
+    tRCD = '13.75ns'
+    tCL = '13.75ns'
+    tRP = '13.75ns'
+    tRAS = '35ns'
+    tRRD = '6ns'
+    tXAW = '30ns'
+    activation_limit = 4
+    tRFC = '260ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    tXS = '270ns'
+
+    # Current values from datasheet Die Rev E,J
+    IDD0 = '55mA'
+    IDD2N = '32mA'
+    IDD3N = '38mA'
+    IDD4W = '125mA'
+    IDD4R = '157mA'
+    IDD5 = '235mA'
+    IDD3P1 = '38mA'
+    IDD2P1 = '32mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart
+# memory cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of
+# memory (4 layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM
+# generations for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer (1 for HMC).
+# The parameters for which no input is available are inherited from the
+# DDR3 configuration.
+# This configuration includes the latencies from the DRAM to the logic
+# layer of the HMC
+class HMC_2500_1x32_Interface(DDR3_1600_8x8):
+    # A single HMC-2500 x32 controller
+    # The buffer parameters do not directly correlate with buffer_size in
+    # real hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32
+
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
+    # = 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed page
+    # [2]; nevertheless the 'close' policy opens and closes the row multiple
+    # times for bursts larger than 32 bytes. For this reason we use
+    # 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'

+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):

+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):

+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns';
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):

+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns';
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'

+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):

+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):

+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information

+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):

+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):

+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # @1000 MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns';
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'

+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'

+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_1x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+

+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+

+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+

+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+

+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+

+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+

+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
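
The classes in this file follow a single pattern: a new device variant
subclasses an existing interface and overrides only the timings that differ.
As a hypothetical sketch (a DDR3-1333 derived by scaling the DDR3_1600_8x8
comments, not taken from any datasheet):

    class DDR3_1333_8x8(DDR3_1600_8x8):
        # 667 MHz
        tCK = '1.5ns'
        # 8 beats across an x64 interface translates to 4 clocks @ 667 MHz
        tBURST = '6ns'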
    diff --git a/src/mem/SConscript b/src/mem/SConscript
    index b77dbb1..76ffdbd 100644
    --- a/src/mem/SConscript
    +++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-

-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018, 2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall

@@ -47,6 +47,7 @@
SimObject('AddrMapper.py')
SimObject('Bridge.py')
SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
SimObject('ExternalMaster.py')
SimObject('ExternalSlave.py')
SimObject('MemObject.py')
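
Registering DRAMInterface.py as a SimObject makes the interface classes above
importable in config scripts alongside the controller, e.g.:

    from m5.objects import DRAMCtrl, DDR3_1600_8x8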
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index dc244fe..533aa01 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
+#include "params/DRAMInterface.hh"
 #include "sim/system.hh"

 using namespace std;
@@ -58,12 +59,13 @@
     retryRdReq(false), retryWrReq(false),
     nextReqEvent([this]{ processNextReqEvent(); }, name()),
     respondEvent([this]{ processRespondEvent(); }, name()),
+    dram(p->dram),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
-    writesThisTime(0), readsThisTime(0), tCS(p->tCS),
+    writesThisTime(0), readsThisTime(0),
     memSchedPolicy(p->mem_sched_policy),
     frontendLatency(p->static_frontend_latency),
     backendLatency(p->static_backend_latency),
@@ -75,37 +77,23 @@
     readQueue.resize(p->qos_priorities);
     writeQueue.resize(p->qos_priorities);

+    dram->setCtrl(this);
+
     // perform a basic check of the write thresholds
     if (p->write_low_thresh_perc >= p->write_high_thresh_perc)
         fatal("Write buffer low threshold %d must be smaller than the "
               "high threshold %d\n", p->write_low_thresh_perc,
               p->write_high_thresh_perc);
-
-    // determine the rows per bank by looking at the total capacity
-    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
-
-    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
-            AbstractMemory::size());
-
-    // create a DRAM interface
-    // will only populate the ranks if DRAM is configured
-    dram = new DRAMInterface(*this, p, capacity, range);
-    DPRINTF(DRAM, "Created DRAM interface \n");
 }

 void
 DRAMCtrl::init()
 {
-    MemCtrl::init();
-
     if (!port.isConnected()) {
         fatal("DRAMCtrl %s is unconnected!\n", name());
     } else {
         port.sendRangeChange();
     }
-
-    dram->init(range);
 }

 void
@@ -115,8 +103,6 @@
     isTimingMode = system()->isTimingMode();

     if (isTimingMode) {
-        dram->startup();
-
         // shift the bus busy time sufficiently far ahead that we never
         // have to worry about negative values when computing the time for
         // the next request, this will add an insignificant bubble at the
@@ -134,7 +120,7 @@
              "is responding");

     // do the actual memory access and turn the packet into a response
-    access(pkt);
+    dram->access(pkt);

     Tick latency = 0;
     if (pkt->hasData()) {
@@ -264,7 +250,7 @@
     // address of first DRAM packet is kept unaliged. Subsequent DRAM packets
     // are aligned to burst size boundaries. This is to ensure we accurately
     // check read packets against packets in write queue.
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     unsigned pktsServicedByWrQ = 0;
     BurstHelper* burst_helper = NULL;
@@ -364,7 +350,7 @@

     // if the request size is larger than burst size, the pkt is split into
     // multiple DRAM packets
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     uint32_t burstSize = dram->bytesPerBurst();
     for (int cnt = 0; cnt < pktCount; ++cnt) {
@@ -527,7 +513,7 @@
     DRAMPacket* dram_pkt = respQueue.front();

     // media specific checks and functions when read response is complete
-    dram->respondEventDRAM(dram_pkt->rank);
+    dram->respondEvent(dram_pkt->rank);

     if (dram_pkt->burstHelper) {
         // it is a split packet
@@ -726,12 +712,12 @@
 void
 DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency)
 {
-    DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
+    DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr());

     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
     // response
-    access(pkt);
+    dram->access(pkt);

     // turn packet around to go back to requester if response expected
     if (needsResponse) {
@@ -876,9 +862,9 @@
     // if not, shift to next burst window
     Tick act_at;
     if (twoCycleActivate)
-        act_at = ctrl.verifyMultiCmd(act_tick, tAAD);
+        act_at = ctrl->verifyMultiCmd(act_tick, tAAD);
     else
-        act_at = ctrl.verifySingleCmd(act_tick);
+        act_at = ctrl->verifySingleCmd(act_tick);

     DPRINTF(DRAM, "Activate at tick %d\n", act_at);

@@ -996,7 +982,7 @@
         // Issuing an explicit PRE command
         // Verify that we have command bandwidth to issue the precharge
         // if not, shift to next burst window
-        pre_at = ctrl.verifySingleCmd(pre_tick);
+        pre_at = ctrl->verifySingleCmd(pre_tick);
         // enforce tPPD
         for (int i = 0; i < banksPerRank; i++) {
             rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD,
@@ -1046,7 +1032,7 @@

     // first clean up the burstTick set, removing old entries
     // before adding new entries for next burst
-    ctrl.pruneBurstTick();
+    ctrl->pruneBurstTick();

     // get the rank
     Rank& rank_ref = *ranks[dram_pkt->rank];
@@ -1098,9 +1084,9 @@
     // verify that we have command bandwidth to issue the burst
     // if not, shift to next burst window
     if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay))
-        cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK);
+        cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK);
     else
-        cmd_at = ctrl.verifySingleCmd(cmd_at);
+        cmd_at = ctrl->verifySingleCmd(cmd_at);

     // if we are interleaving bursts, ensure that
     // 1) we don't double interleave on next burst issue
@@ -1200,9 +1186,9 @@

         // either look at the read queue or write queue
         const std::vector<DRAMPacketQueue>& queue =
-                ctrl.selQueue(dram_pkt->isRead());
+                ctrl->selQueue(dram_pkt->isRead());

-        for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) {
+        for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) {
             auto p = queue[i].begin();
             // keep on looking until we find a hit or reach the end of the
             // queue
@@ -1273,6 +1259,7 @@
         // Update latency stats
         stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime;
         stats.totQLat += cmd_at - dram_pkt->entryTime;
+        stats.totBusLat += tBURST;
     } else {
         // Schedule write done event to decrement event count
         // after the readyTime has been reached
@@ -1338,13 +1325,9 @@
         // Update latency stats
         stats.masterReadTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
-
-        stats.bytesRead += dram->bytesPerBurst();
-        stats.totBusLat += dram->burstDly();
         stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size;
     } else {
         ++writesThisTime;
-        stats.bytesWritten += dram->bytesPerBurst();
         stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size;
         stats.masterWriteTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
@@ -1446,8 +1429,9 @@

                 // Figure out which read request goes next
                 // If we are changing command type, incorporate the minimum
-                // bus turnaround delay which will be tCS (different rank) case
-                to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0);
+                // bus turnaround delay which will be rank to rank delay
+                to_read = chooseNext((*queue), switched_cmd_type ?
+                                               dram->rankDelay() : 0);

                 if (to_read != queue->end()) {
                     // candidate read found
@@ -1526,7 +1510,8 @@
             // If we are changing command type, incorporate the minimum
             // bus turnaround delay
             to_write = chooseNext((*queue),
-                     switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0);
+                     switched_cmd_type ? std::min(dram->minRdToWr(),
+                                                  dram->rankDelay()) : 0);

             if (to_write != queue->end()) {
                 write_found = true;
@@ -1599,11 +1584,8 @@
     }
 }

-DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl,
-                             const DRAMCtrlParams* _p,
-                             const uint64_t capacity,
-                             const AddrRange range)
-    : SimObject(_p), ctrl(_ctrl),
+DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p)
+    : AbstractMemory(_p),
       addrMapping(_p->addr_mapping),
       burstSize((_p->devices_per_rank * _p->burst_length *
                  _p->device_bus_width) / 8),
@@ -1618,7 +1600,7 @@
       bankGroupsPerRank(_p->bank_groups_per_rank),
       bankGroupArch(_p->bank_groups_per_rank > 0),
       banksPerRank(_p->banks_per_rank), rowsPerBank(0),
-      tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST),
+      tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST),
       tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW),
       tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD),
       tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP),
@@ -1634,12 +1616,12 @@
       wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW),
       wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L),
       rdToWrDlySameBG(tRTW + _p->tBURST_MAX),
-      rankToRankDly(ctrl.rankDelay() + tBURST),
+      rankToRankDly(tCS + tBURST),
       pageMgmt(_p->page_policy),
       maxAccessesPerRow(_p->max_accesses_per_row),
       timeStampOffset(0), activeRank(0),
       enableDRAMPowerdown(_p->enable_dram_powerdown),
-      stats(_ctrl, *this)
+      stats(*this)
 {
     fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, "
              "must be a power of two\n", burstSize);
@@ -1651,7 +1633,7 @@

     for (int i = 0; i < ranksPerChannel; i++) {
         DPRINTF(DRAM, "Creating DRAM rank %d \n", i);
-        Rank* rank = new Rank(ctrl, _p, i, *this);
+        Rank* rank = new Rank(_p, i, *this);
         ranks.push_back(rank);
     }

@@ -1659,6 +1641,11 @@
     uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank *
                               ranksPerChannel;

+    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
+
+    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
+            AbstractMemory::size());
+
     // if actual DRAM size does not match memory capacity in system warn!
     if (deviceCapacity != capacity / (1024 * 1024))
         warn("DRAM device capacity (%d Mbytes) does not match the "
@@ -1713,8 +1700,10 @@
 }

 void
-DRAMInterface::init(AddrRange range)
+DRAMInterface::init()
 {
+    AbstractMemory::init();
+
     // a bit of sanity checks on the interleaving, save it for here to
     // ensure that the system pointer is initialised
     if (range.interleaved()) {
@@ -1736,7 +1725,7 @@

             // channel striping has to be done at a granularity that
             // is equal or larger to a cache line
-            if (ctrl.system()->cacheLineSize() > range.granularity()) {
+            if (system()->cacheLineSize() > range.granularity()) {
                 fatal("Channel interleaving of %s must be at least as large "
                       "as the cache line size\n", name());
             }
@@ -1755,8 +1744,10 @@
 void
 DRAMInterface::startup()
 {
-    // timestamp offset should be in clock cycles for DRAMPower
-    timeStampOffset = divCeil(curTick(), tCK);
+    if (system()->isTimingMode()) {
+        // timestamp offset should be in clock cycles for DRAMPower
+        timeStampOffset = divCeil(curTick(), tCK);
+    }

     for (auto r : ranks) {
         r->startup(curTick() + tREFI - tRP);
@@ -1802,7 +1793,7 @@
 }

 void
-DRAMInterface::respondEventDRAM(uint8_t rank)
+DRAMInterface::respondEvent(uint8_t rank)
 {
     Rank& rank_ref = *ranks[rank];

@@ -1943,7 +1934,7 @@
                 std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP;

             // When is the earliest the R/W burst can issue?
-            const Tick col_allowed_at = ctrl.inReadBusState(false) ?
+            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
                                           ranks[i]->banks[j].rdAllowedAt :
                                           ranks[i]->banks[j].wrAllowedAt;
             Tick col_at = std::max(col_allowed_at, act_at + tRCD);
@@ -1983,9 +1974,15 @@
     return make_pair(bank_mask, hidden_bank_prep);
 }

-DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
-                          DRAMInterface& _dram)
-    : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram),
+DRAMInterface*
+DRAMInterfaceParams::create()
+{
+    return new DRAMInterface(this);
+}
+
+DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p,
+                          int _rank, DRAMInterface& _dram)
+    : EventManager(&_dram), dram(_dram),
       pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE),
       pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE),
       refreshState(REF_IDLE), inLowPowerState(false), rank(_rank),
@@ -1998,7 +1995,7 @@
       refreshEvent([this]{ processRefreshEvent(); }, name()),
       powerEvent([this]{ processPowerEvent(); }, name()),
       wakeUpEvent([this]{ processWakeUpEvent(); }, name()),
-      stats(_ctrl, *this)
+      stats(_dram, *this)
 {
     for (int b = 0; b < _p->banks_per_rank; b++) {
         banks[b].bank = b;
@@ -2049,8 +2046,10 @@
 DRAMInterface::Rank::isQueueEmpty() const
 {
     // check commmands in Q based on current bus direction
-    bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0))
-                       || (!ctrl.inReadBusState(true) && (writeEntries == 0));
+    bool no_queued_cmds = (dram.ctrl->inReadBusState(true) &&
+                          (readEntries == 0))
+                       || (!dram.ctrl->inReadBusState(true) &&
+                          (writeEntries == 0));
     return no_queued_cmds;
 }

@@ -2174,7 +2173,7 @@
     // if a request is at the moment being handled and this request is
     // accessing the current rank then wait for it to finish
     if ((rank == dram.activeRank)
-        && (ctrl.nextReqEvent.scheduled())) {
+        && (dram.ctrl->nextReqEvent.scheduled())) {
         // hand control over to the request loop until it is
         // evaluated next
         DPRINTF(DRAM, "Refresh awaiting draining\n");
@@ -2249,7 +2248,7 @@
             // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled
             // should have outstanding precharge or read response event
             assert(prechargeEvent.scheduled() ||
-                   ctrl.respondEvent.scheduled());
+                   dram.ctrl->respondEvent.scheduled());
             // will start refresh when pwrState transitions to IDLE
         }

@@ -2309,8 +2308,8 @@

         assert(!powerEvent.scheduled());

-        if ((ctrl.drainState() == DrainState::Draining) ||
-            (ctrl.drainState() == DrainState::Drained)) {
+        if ((dram.ctrl->drainState() == DrainState::Draining) ||
+            (dram.ctrl->drainState() == DrainState::Drained)) {
             // if draining, do not re-enter low-power mode.
             // simply go to IDLE and wait
             schedulePowerEvent(PWR_IDLE, curTick());
@@ -2535,10 +2534,10 @@
         }

         // completed refresh event, ensure next request is scheduled
-        if (!ctrl.nextReqEvent.scheduled()) {
+        if (!dram.ctrl->nextReqEvent.scheduled()) {
             DPRINTF(DRAM, "Scheduling next request after refreshing"
                            " rank %d\n", rank);
-            schedule(ctrl.nextReqEvent, curTick());
+            schedule(dram.ctrl->nextReqEvent, curTick());
         }
     }

@@ -2597,8 +2596,8 @@
         // bypass auto-refresh and go straight to SREF, where memory
         // will issue refresh immediately upon entry
         if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() &&
-           (ctrl.drainState() != DrainState::Draining) &&
-           (ctrl.drainState() != DrainState::Drained) &&
+           (dram.ctrl->drainState() != DrainState::Draining) &&
+           (dram.ctrl->drainState() != DrainState::Drained) &&
            dram.enableDRAMPowerdown) {
             DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning "
                     "to self refresh at %11u tick\n", rank, curTick());
@@ -2669,7 +2668,7 @@
     // power (mW) = ----------- * ----------
     //              time (tick)  tick_frequency
     stats.averagePower = (stats.totalEnergy.value() /
-                          (curTick() - ctrl.lastStatsResetTick)) *
+                          (curTick() - dram.ctrl->lastStatsResetTick)) *
                          (SimClock::Frequency / 1000000000.0);
 }

@@ -2699,7 +2698,7 @@
 bool
 DRAMInterface::Rank::forceSelfRefreshExit() const {
     return (readEntries != 0) ||
-           (!ctrl.inReadBusState(true) && (writeEntries != 0));
+           (!dram.ctrl->inReadBusState(true) && (writeEntries != 0));
 }

 DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl)
@@ -2710,15 +2709,15 @@
     ADD_STAT(writeReqs, "Number of write requests accepted"),

     ADD_STAT(readBursts,
-             "Number of DRAM read bursts, "
+             "Number of controller read bursts, "
              "including those serviced by the write queue"),
     ADD_STAT(writeBursts,
-             "Number of DRAM write bursts, "
+             "Number of controller write bursts, "
              "including those merged in the write queue"),
     ADD_STAT(servicedByWrQ,
-             "Number of DRAM read bursts serviced by the write queue"),
+             "Number of controller read bursts serviced by the write queue"),
     ADD_STAT(mergedWrBursts,
-             "Number of DRAM write bursts merged with an existing one"),
+             "Number of controller write bursts merged with an existing one"),
     ADD_STAT(neitherReadNorWriteReqs,
              "Number of requests that are neither read nor write"),

@@ -2726,9 +2725,6 @@
     ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"),
     ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"),

-    ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
-    ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
-
     ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"),
     ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"),

@@ -2743,22 +2739,13 @@
     ADD_STAT(wrPerTurnAround,
              "Writes before turning the bus around for reads"),

-    ADD_STAT(bytesRead, "Total number of bytes read from memory"),
     ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"),
-    ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
     ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"),
     ADD_STAT(bytesWrittenSys,
              "Total written bytes from the system interface side"),

-    ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"),
-    ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"),
     ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"),
     ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"),
-    ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
-
-    ADD_STAT(busUtil, "Data bus utilization in percentage"),
-    ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
-    ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),

     ADD_STAT(totGap, "Total gap between requests"),
     ADD_STAT(avgGap, "Average gap between requests"),
@@ -2790,12 +2777,11 @@
 {
     using namespace Stats;

-    assert(ctrl._system);
-    const auto max_masters = ctrl._system->maxMasters();
+    assert(ctrl.system());
+    const auto max_masters = ctrl.system()->maxMasters();

     avgRdQLen.precision(2);
     avgWrQLen.precision(2);
-    avgBusLat.precision(2);

     readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
     writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
@@ -2810,14 +2796,9 @@
         .init(ctrl.writeBufferSize)
         .flags(nozero);

-    avgRdBW.precision(2);
-    avgWrBW.precision(2);
     avgRdBWSys.precision(2);
     avgWrBWSys.precision(2);
-    peakBW.precision(2);
-    busUtil.precision(2);
     avgGap.precision(2);
-    busUtilWrite.precision(2);

     // per-master bytes read and written to memory
     masterReadBytes
@@ -2849,9 +2830,6 @@
         .flags(nonan)
         .precision(2);

-    busUtilRead
-        .precision(2);
-
     masterWriteRate
         .flags(nozero | nonan)
         .precision(12);
@@ -2865,7 +2843,7 @@
         .precision(2);

     for (int i = 0; i < max_masters; i++) {
-        const std::string master = ctrl._system->getMasterName(i);
+        const std::string master = ctrl.system()->getMasterName(i);
         masterReadBytes.subname(i, master);
         masterReadRate.subname(i, master);
         masterWriteBytes.subname(i, master);
@@ -2879,22 +2857,11 @@
     }

     // Formula stats
-    avgBusLat = totBusLat / (readBursts - servicedByWrQ);
-
-    avgRdBW = (bytesRead / 1000000) / simSeconds;
-    avgWrBW = (bytesWritten / 1000000) / simSeconds;
     avgRdBWSys = (bytesReadSys / 1000000) / simSeconds;
     avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds;
-    peakBW = (SimClock::Frequency / ctrl.dram->burstDataDly()) *
-              ctrl.dram->bytesPerBurst() / 1000000;
-    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;

     avgGap = totGap / (readReqs + writeReqs);

-    busUtilRead = avgRdBW / peakBW * 100;
-    busUtilWrite = avgWrBW / peakBW * 100;
-
     masterReadRate = masterReadBytes / simSeconds;
     masterWriteRate = masterWriteBytes / simSeconds;
     masterReadAvgLat = masterReadTotalLat / masterReadAccesses;
@@ -2907,8 +2874,8 @@
     ctrl.lastStatsResetTick = curTick();
 }

-DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram)
-    : Stats::Group(&_ctrl, csprintf("dram").c_str()),
+DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram)
+    : Stats::Group(&_dram),
       dram(_dram),

     ADD_STAT(readBursts, "Number of DRAM read bursts"),
@@ -2918,10 +2885,13 @@
     ADD_STAT(perBankWrBursts, "Per bank write bursts"),

     ADD_STAT(totQLat, "Total ticks spent queuing"),
+    ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
     ADD_STAT(totMemAccLat,
              "Total ticks spent from burst creation until serviced "
              "by the DRAM"),

     ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"),
+    ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
     ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"),

     ADD_STAT(readRowHits, "Number of row buffer hits during reads"),
@@ -2934,6 +2904,12 @@
     ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
     ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"),
     ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"),
+    ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
+
+    ADD_STAT(busUtil, "Data bus utilization in percentage"),
+    ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
+    ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),
+
     ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined")
 {
@@ -2945,6 +2921,7 @@
     using namespace Stats;

     avgQLat.precision(2);
+    avgBusLat.precision(2);
     avgMemAccLat.precision(2);

     readRowHitRate.precision(2);
@@ -2958,10 +2935,16 @@
              dram.maxAccessesPerRow : dram.rowBufferSize)
         .flags(nozero);

+    peakBW.precision(2);
+    busUtil.precision(2);
+    busUtilWrite.precision(2);
+    busUtilRead.precision(2);
+
     pageHitRate.precision(2);

     // Formula stats
     avgQLat = totQLat / readBursts;
+    avgBusLat = totBusLat / readBursts;
     avgMemAccLat = totMemAccLat / readBursts;

     readRowHitRate = (readRowHits / readBursts) * 100;
@@ -2969,13 +2952,19 @@

     avgRdBW = (bytesRead / 1000000) / simSeconds;
     avgWrBW = (bytesWritten / 1000000) / simSeconds;
+    peakBW = (SimClock::Frequency / dram.burstDataDly()) *
+              dram.bytesPerBurst() / 1000000;
+    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;
+    busUtilRead = avgRdBW / peakBW * 100;
+    busUtilWrite = avgWrBW / peakBW * 100;

     pageHitRate = (writeRowHits + readRowHits) /
                   (writeBursts + readBursts) * 100;
 }

-DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank)
-    : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()),
+DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank)
+    : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()),
       rank(_rank),

     ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"),
@@ -3034,7 +3023,7 @@
 DRAMCtrl::recvFunctional(PacketPtr pkt)
 {
     // rely on the abstract memory
-    functionalAccess(pkt);
+    dram->functionalAccess(pkt);
 }

 Port &
@@ -3099,7 +3088,7 @@
 DRAMCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(ctrl.getAddrRange());
+    ranges.push_back(ctrl.dram->getAddrRange());
     return ranges;
 }

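Note that moving DRAMStats and RankStats under the interface changes where
per-DRAM statistics land in stats.txt: the rank groups are renamed from
"dram_rank<N>" to "rank<N>" and now hang off the dram child. A small,
illustrative post-processing sketch (not part of the patch, path assumed):

    # Print only the interface-level stats, e.g. ...dram.avgBusLat and
    # ...dram.rank0.*; controller-level stats keep their old position.
    for line in open('m5out/stats.txt'):
        if '.dram.' in line:
            print(line.rstrip())
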
diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
index 4464f7a..1b6d8b5 100644
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -56,12 +56,15 @@
 #include "enums/AddrMap.hh"
 #include "enums/MemSched.hh"
 #include "enums/PageManage.hh"
+#include "mem/abstract_mem.hh"
 #include "mem/drampower.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/DRAMCtrl.hh"
 #include "sim/eventq.hh"

+class DRAMInterfaceParams;
+
 /**
  * A basic class to track the bank state, i.e. what row is
  * currently open (if any), when is the bank free to accept a new
@@ -243,7 +246,7 @@
  * The DRAMInterface includes a class for individual ranks
  * and per rank functions.
  */
-class DRAMInterface : public SimObject
+class DRAMInterface : public AbstractMemory
 {
   private:
     /**
@@ -340,7 +343,7 @@
     class Rank;
     struct RankStats : public Stats::Group
     {
-        RankStats(DRAMCtrl &ctrl, Rank &rank);
+        RankStats(DRAMInterface &dram, Rank &rank);

         void regStats() override;
         void resetStats() override;
@@ -406,13 +409,6 @@
      */
     class Rank : public EventManager
     {
-      protected:
-
-        /**
-         * A reference to the parent DRAMCtrl instance
-         */
-        DRAMCtrl& ctrl;
-
       private:

         /**
@@ -532,10 +528,10 @@
          */
         Tick lastBurstTick;

-        Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
+        Rank(const DRAMInterfaceParams* _p, int _rank,
              DRAMInterface& _dram);

-        const std::string name() const { return csprintf("dram_%d", rank); }
+        const std::string name() const { return csprintf("%d", rank); }

         /**
          * Kick off accounting for power and refresh states and
@@ -662,9 +658,9 @@
     };

     /**
-     * A reference to the parent DRAMCtrl instance
+     * A pointer to the parent DRAMCtrl instance
      */
-    DRAMCtrl& ctrl;
+    DRAMCtrl* ctrl;

     /**
      * Memory controller configuration initialized based on parameter
@@ -695,6 +691,7 @@
      * DRAM timing requirements
      */
     const Tick M5_CLASS_VAR_USED tCK;
+    const Tick tCS;
     const Tick tCL;
     const Tick tBURST;
     const Tick tBURST_MIN;
@@ -774,7 +771,7 @@
                    bool trace = true);

     struct DRAMStats : public Stats::Group {
-        DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram);
+        DRAMStats(DRAMInterface &dram);

         void regStats() override;

@@ -790,10 +787,12 @@

         // Latencies summed over all requests
         Stats::Scalar totQLat;
+        Stats::Scalar totBusLat;
         Stats::Scalar totMemAccLat;

         // Average latencies per request
         Stats::Formula avgQLat;
+        Stats::Formula avgBusLat;
         Stats::Formula avgMemAccLat;

         // Row hit count and rate
@@ -809,6 +808,11 @@
         // Average bandwidth
         Stats::Formula avgRdBW;
         Stats::Formula avgWrBW;
+        Stats::Formula peakBW;
+        // bus utilization
+        Stats::Formula busUtil;
+        Stats::Formula busUtilRead;
+        Stats::Formula busUtilWrite;
         Stats::Formula pageHitRate;
     };

@@ -820,11 +824,16 @@
     std::vector<Rank*> ranks;

   public:
+    /** Setting a pointer to the controller */
+    void setCtrl(DRAMCtrl* _ctrl)
+    {
+        ctrl = _ctrl;
+    };
+
     /**
      * Initialize the DRAM interface and verify parameters
-     * @param range is the address range for this interface
      */
-    void init(AddrRange range);
+    void init();

     /**
      * Iterate through dram ranks and instantiate per rank startup routine
@@ -853,6 +862,20 @@
     void suspend();

     /**
+     * Get an address in a dense range which starts from 0. The input
+     * address is the physical address of the request in an address
+     * space that contains other SimObjects apart from this
+     * controller.
+     *
+     * @param addr The intput address which should be in the addrRange
+     * @return An address in the continues range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr)
+    {
+        return range.getOffset(addr);
+    }
+
+    /**
      * @return number of bytes in a burst for this interface
      */
     uint32_t bytesPerBurst() { return burstSize; };
@@ -887,6 +910,13 @@
      */
     Tick minRdToWr() { return tRTW; };

+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() { return tCS; };
+
     /*
      * Function to calulate RAS cycle time for use within and
      * outside of this class
@@ -968,7 +998,7 @@
      *
      * @param rank Specifies rank associated with read burst
      */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);

     /**
      * Check the refresh state to determine if refresh needs
@@ -1004,8 +1034,7 @@
         virtual void process() { rank->resetStats(); };
     };

-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
 };

 /**
@@ -1170,20 +1199,6 @@
     void accessAndRespond(PacketPtr pkt, Tick static_latency);

     /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The intput address which should be in the addrRange
-     * @return An address in the continues range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
      * The memory schduler/arbiter - picks which request needs to
      * go next, based on the specified policy such as FCFS or FR-FCFS
      * and moves it to the head of the queue.
@@ -1265,6 +1280,11 @@
     std::unordered_multiset<Tick> burstTicks;

     /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
      * The following are basic design parameters of the memory
      * controller, and are initialized based on parameter values.
      * The rowsPerBank is determined based on the capacity, number of
@@ -1279,12 +1299,6 @@
     uint32_t readsThisTime;

     /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
      * Memory controller configuration initialized based on parameter
      * values.
      */
@@ -1338,10 +1352,6 @@
         // Average queue lengths
         Stats::Average avgRdQLen;
         Stats::Average avgWrQLen;
-        // Latencies summed over all requests
-        Stats::Scalar totBusLat;
-
-        // Average latencies per request
-        Stats::Formula avgBusLat;

         Stats::Scalar numRdRetry;
         Stats::Scalar numWrRetry;
@@ -1352,21 +1362,12 @@
         Stats::Histogram rdPerTurnAround;
         Stats::Histogram wrPerTurnAround;

-        Stats::Scalar bytesRead;
         Stats::Scalar bytesReadWrQ;
-        Stats::Scalar bytesWritten;
         Stats::Scalar bytesReadSys;
         Stats::Scalar bytesWrittenSys;
         // Average bandwidth
-        Stats::Formula avgRdBW;
-        Stats::Formula avgWrBW;
         Stats::Formula avgRdBWSys;
         Stats::Formula avgWrBWSys;
-        Stats::Formula peakBW;
-        // bus utilization
-        Stats::Formula busUtil;
-        Stats::Formula busUtilRead;
-        Stats::Formula busUtilWrite;

         Stats::Scalar totGap;
         Stats::Formula avgGap;
@@ -1405,11 +1406,6 @@
     /** The time when stats were last reset used to calculate average power */
     Tick lastStatsResetTick;

-    /**
-     * Create pointer to interfasce to the actual media
-     */
-    DRAMInterface* dram;
-
     DRAMCtrl(const DRAMCtrlParams* p);

     DrainState drain() override;
@@ -1458,13 +1454,6 @@
     };

     /**
-     * Determine the required delay for an access to a different rank
-     *
-     * @return required rank to rank delay
-     */
-    Tick rankDelay() { return tCS; };
-
-    /**
      * Check the current direction of the memory channel
      * @param next_state Check either the current or next bus state
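
Because the timing and geometry parameters move with the interface, scripts
that used to read them off the controller now go through its dram child. A
hedged sketch, assuming a script context with a single DRAMCtrl in
mem_ctrls (this mirrors the low_power_sweep.py changes in this patch):

    dram = system.mem_ctrls[0].dram

    # geometry: burst size in bytes, from interface parameters
    burst_size = int((dram.devices_per_rank.value *
                      dram.device_bus_width.value *
                      dram.burst_length.value) / 8)

    # timing: tBURST is in seconds; convert to ticks (ps)
    itt_min = dram.tBURST.value * 1000000000000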
diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc
index f506928..7a44aa1 100644
--- a/src/mem/drampower.cc
+++ b/src/mem/drampower.cc
@@ -40,13 +40,13 @@
 #include "base/intmath.hh"
 #include "sim/core.hh"

-DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) :
+DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) :
     powerlib(libDRAMPower(getMemSpec(p), include_io))
 {
 }

 Data::MemArchitectureSpec
-DRAMPower::getArchParams(const DRAMCtrlParams* p)
+DRAMPower::getArchParams(const DRAMInterfaceParams* p)
 {
     Data::MemArchitectureSpec archSpec;
     archSpec.burstLength = p->burst_length;
@@ -68,7 +68,7 @@
 }

 Data::MemTimingSpec
-DRAMPower::getTimingParams(const DRAMCtrlParams* p)
+DRAMPower::getTimingParams(const DRAMInterfaceParams* p)
 {
     // Set the values that are used for power calculations and ignore
     // the ones only used by the controller functionality in DRAMPower
@@ -100,7 +100,7 @@
 }

 Data::MemPowerSpec
-DRAMPower::getPowerParams(const DRAMCtrlParams* p)
+DRAMPower::getPowerParams(const DRAMInterfaceParams* p)
 {
     // All DRAMPower currents are in mA
     Data::MemPowerSpec powerSpec;
@@ -132,7 +132,7 @@
 }

 Data::MemorySpecification
-DRAMPower::getMemSpec(const DRAMCtrlParams* p)
+DRAMPower::getMemSpec(const DRAMInterfaceParams* p)
 {
     Data::MemorySpecification memSpec;
     memSpec.memArchSpec = getArchParams(p);
@@ -142,13 +142,13 @@
 }

 bool
-DRAMPower::hasTwoVDD(const DRAMCtrlParams* p)
+DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p)
 {
     return p->VDD2 == 0 ? false : true;
 }

 uint8_t
-DRAMPower::getDataRate(const DRAMCtrlParams* p)
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
 {
     uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
     uint8_t data_rate = p->burst_length / burst_cycles;
diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
index ed47476..da68a78 100644
--- a/src/mem/drampower.hh
+++ b/src/mem/drampower.hh
@@ -44,7 +44,7 @@
 #define MEM_DRAM_POWER_HH

 #include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"

 /**
  * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,43 +57,44 @@

     /**
      * Transform the architechture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+                                     const DRAMInterfaceParams* p);

     /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
      * the memSpec of DRAMPower
      */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);

     /**
      * Transforms the power and current parameters defined in
-     * DRAMCtrlParam to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);

     /**
      * Determine data rate, either one or two.
      */
-    static uint8_t getDataRate(const DRAMCtrlParams* p);
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);

     /**
      * Determine if DRAM has two voltage domains (or one)
      */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);

     /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
      */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);

   public:

     // Instance of DRAMPower Library
     libDRAMPower powerlib;

-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);
 };

diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -34,18 +34,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 from m5.params import *
-from m5.objects.AbstractMemory import AbstractMemory
+from m5.proxy import *
+from m5.objects.ClockedObject import ClockedObject
 from m5.objects.QoSTurnaround import *

 # QoS Queue Selection policy used to select packets among same-QoS queues
 class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"]

-class QoSMemCtrl(AbstractMemory):
+class QoSMemCtrl(ClockedObject):
     type = 'QoSMemCtrl'
     cxx_header = "mem/qos/mem_ctrl.hh"
     cxx_class = 'QoS::MemCtrl'
     abstract = True

+    system = Param.System(Parent.any, "System that the controller belongs to.")
+
     ##### QoS support parameters #####

     # Number of priorities in the system

diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py
index 572cad5..03a988a 100644
--- a/src/mem/qos/QoSMemSinkCtrl.py
+++ b/src/mem/qos/QoSMemSinkCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@

 from m5.params import *
 from m5.objects.QoSMemCtrl import *
+from QoSMemSinkInterface import *

 class QoSMemSinkCtrl(QoSMemCtrl):
     type = 'QoSMemSinkCtrl'
@@ -44,6 +45,10 @@
     cxx_class = "QoS::MemSinkCtrl"

     port = SlavePort("Slave ports")

+    intf = Param.QoSMemSinkInterface(QoSMemSinkInterface(), "Interface to "\
+                                     "memory")
+
     # the basic configuration of the controller architecture, note
     # that each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
@@ -59,5 +64,3 @@

     # response latency - time to issue a response once a request is serviced
     response_latency = Param.Latency("20ns", "Memory response latency")

diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py
new file mode 100644
index 0000000..fd8254f
--- /dev/null
+++ b/src/mem/qos/QoSMemSinkInterface.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Matteo Andreozzi
+#          Wendy Elsasser
+
+from AbstractMemory import AbstractMemory
+
+class QoSMemSinkInterface(AbstractMemory):
+    type = 'QoSMemSinkInterface'
+    cxx_header = "mem/qos/mem_sink.hh"
diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript
index f8601b6..1d90f9c 100644
--- a/src/mem/qos/SConscript
+++ b/src/mem/qos/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@

 SimObject('QoSMemCtrl.py')
 SimObject('QoSMemSinkCtrl.py')
+SimObject('QoSMemSinkInterface.py')
 SimObject('QoSPolicy.py')
 SimObject('QoSTurnaround.py')

diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc
index 50e6035..190960b 100644
--- a/src/mem/qos/mem_ctrl.cc
+++ b/src/mem/qos/mem_ctrl.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited
+ * Copyright (c) 2017-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -42,7 +42,7 @@
 namespace QoS {

 MemCtrl::MemCtrl(const QoSMemCtrlParams * p)
-    : AbstractMemory(p),
+    : ClockedObject(p),
       policy(p->qos_policy),
       turnPolicy(p->qos_turnaround_policy),
       queuePolicy(QueuePolicy::create(p)),
@@ -51,7 +51,8 @@
       qosSyncroScheduler(p->qos_syncro_scheduler),
       totalReadQueueSize(0), totalWriteQueueSize(0),
       busState(READ), busStateNext(READ),
-      stats(*this)
+      stats(*this),
+      _system(p->system)
 {
     // Set the priority policy
     if (policy) {
@@ -77,12 +78,6 @@
 {}

 void
-MemCtrl::init()
-{
-    AbstractMemory::init();
-}
-
-void
 MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos,
                     Addr addr, uint64_t entries)
 {
diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh
index 0e29fcc..50ddc94 100644
--- a/src/mem/qos/mem_ctrl.hh
+++ b/src/mem/qos/mem_ctrl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited
+ * Copyright (c) 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -36,9 +36,9 @@
  */

 #include "debug/QOS.hh"
-#include "mem/abstract_mem.hh"
-#include "mem/qos/q_policy.hh"
+#include "mem/mem_object.hh"
 #include "mem/qos/policy.hh"
+#include "mem/qos/q_policy.hh"
 #include "params/QoSMemCtrl.hh"
 #include "sim/system.hh"

@@ -49,6 +49,8 @@
 #ifndef MEM_QOS_MEM_CTRL_HH
 #define MEM_QOS_MEM_CTRL_HH

+class System;
+
 namespace QoS {

 /**
@@ -56,7 +58,7 @@
  * which support QoS - it provides access to a set of QoS
  * scheduling policies
 */
-class MemCtrl: public AbstractMemory
+class MemCtrl: public ClockedObject
 {
   public:
     /** Bus Direction */
@@ -151,6 +153,9 @@
         Stats::Scalar numStayWriteState;
     } stats;

+    /** Pointer to the System object */
+    System* _system;
+
     /**
      * Initializes dynamically counters and
      * statistics for a given Master
@@ -266,11 +271,6 @@
     virtual ~MemCtrl();

     /**
-     * Initializes this object
-     */
-    void init() override;
-
-    /**
      * Gets the current bus state
      * @return current bus state
@@ -346,6 +346,10 @@
      * @return total number of priority levels
      */
     uint8_t numPriorities() const { return _numPriorities; }
+
+    /** read the system pointer
+     * @return pointer to the system object */
+    System* system() const { return _system; }
 };

 template<typename Queues>
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..fb06b9d 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
 #include "mem_sink.hh"
+#include "params/QoSMemSinkInterface.hh"
 #include "sim/system.hh"

 namespace QoS {
@@ -50,12 +51,15 @@
     memoryPacketSize(p->memory_packet_size),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size), port(name() + ".port", *this),
+    intf(p->intf),
     retryRdReq(false), retryWrReq(false), nextRequest(0), nextReqEvent(this)
 {
     // Resize read and write queue to allocate space
     // for configured QoS priorities
     readQueue.resize(numPriorities());
     writeQueue.resize(numPriorities());
+
+    intf->setMemCtrl(this);
 }

 MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
              "%s Should not see packets where cache is responding\n",
              func);

-    access(pkt);
+    intf->access(pkt);
     return responseLatency;
 }

@@ -101,7 +105,7 @@
 {
     pkt->pushLabel(name());

-    functionalAccess(pkt);
+    intf->functionalAccess(pkt);

     pkt->popLabel();
 }
@@ -279,7 +283,7 @@

     // Do the actual memory access which also turns the packet
     // into a response
-    access(pkt);
+    intf->access(pkt);

     // Log the response
     logResponse(pkt->isRead()? READ : WRITE,
@@ -351,7 +355,7 @@
 MemSinkCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.intf->getAddrRange());
     return ranges;
 }

@@ -390,3 +394,19 @@
     return new QoS::MemSinkCtrl(this);
 }

+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+void
+QoSMemSinkInterface::init()
+{
+    AbstractMemory::init();
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
index 9a51269..3b10abd 100644
--- a/src/mem/qos/mem_sink.hh
+++ b/src/mem/qos/mem_sink.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -41,10 +41,14 @@
 #ifndef MEM_QOS_MEM_SINK_HH
 #define MEM_QOS_MEM_SINK_HH

+#include "mem/abstract_mem.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
 namespace QoS {

 /**
@@ -163,6 +167,11 @@
     /** Memory slave port */
     MemoryPort port;

+    /**
+     * Create pointer to interface of actual media
+     */
+    QoSMemSinkInterface* const intf;
+
     /** Read request pending */
     bool retryRdReq;

@@ -244,4 +253,23 @@

 } // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Initialize the memory interface */
+    void init();
+
+    /** Setting a pointer to the interface */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl)
+    {
+        ctrl = _ctrl;
+    };
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
 #endif /* MEM_QOS_MEM_SINK_HH */
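
The QoS memory sink follows the same controller/interface pairing. A
minimal usage sketch (illustrative only; the priority count and range
value are placeholders, not defaults from this patch):

    from m5.objects import *

    sink = QoSMemSinkCtrl(qos_priorities = 4)
    # the backing AbstractMemory is attached via the new 'intf' parameter
    sink.intf = QoSMemSinkInterface(range = AddrRange('512MB'))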
diff --git a/tests/configs/base_config.py b/tests/configs/base_config.py
index 0f79938..e2d3851 100644
--- a/tests/configs/base_config.py
+++ b/tests/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -221,7 +221,12 @@
         super(BaseSESystem, self).init_system(system)

     def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                         membus = SystemXBar(),
                         mem_mode = self.mem_mode,
                         multi_thread = (self.num_threads > 1))
@@ -275,6 +280,16 @@
         # the physmem name to avoid bumping all the reference stats
-        system.physmem = [self.mem_class(range = r)
-                          for r in system.mem_ranges]
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrls = []
+            for r in system.mem_ranges:
+                mem_ctrl = DRAMCtrl()
+                mem_ctrl.dram = self.mem_class(range = r)
+                mem_ctrls.append(mem_ctrl)
+            system.physmem = mem_ctrls
+        else:
+            system.physmem = [self.mem_class(range = r)
+                              for r in system.mem_ranges]
         for i in range(len(system.physmem)):
             system.physmem[i].port = system.membus.master
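
The wrap-or-passthrough pattern above generalizes. A helper along these
lines (a sketch, not part of this patch) keeps a script working whether
mem_class is a DRAMInterface subclass or a classic single-object memory
such as SimpleMemory:

    def make_mem(mem_class, rng):
        # DRAM interfaces need a DRAMCtrl in front of them; anything
        # else already acts as its own controller
        if issubclass(mem_class, m5.objects.DRAMInterface):
            ctrl = m5.objects.DRAMCtrl()
            ctrl.dram = mem_class(range = rng)
            return ctrl
        return mem_class(range = rng)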
    

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 1
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-MessageType: newchange

Wendy Elsasser has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/28968 ) Change subject: mem: Make DRAMCtrl a ClockedObject ...................................................................... mem: Make DRAMCtrl a ClockedObject Made DRAMCtrl a ClockedObject, with DRAMInterface defined as an AbstractMemory. The address ranges are now defined per interface. Currently the model only includes a DRAMInterface but this can be expanded for other media types. The controller object includes a parameter to the interface, which is setup when gem5 is configured. Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8 --- M configs/common/MemConfig.py M configs/dram/low_power_sweep.py M configs/dram/sweep.py M configs/learning_gem5/part1/simple.py M configs/learning_gem5/part1/two_level.py M configs/learning_gem5/part2/simple_cache.py M configs/learning_gem5/part2/simple_memobj.py M configs/learning_gem5/part3/simple_ruby.py M src/mem/DRAMCtrl.py A src/mem/DRAMInterface.py M src/mem/SConscript M src/mem/dram_ctrl.cc M src/mem/dram_ctrl.hh M src/mem/drampower.cc M src/mem/drampower.hh M src/mem/qos/QoSMemCtrl.py M src/mem/qos/QoSMemSinkCtrl.py A src/mem/qos/QoSMemSinkInterface.py M src/mem/qos/SConscript M src/mem/qos/mem_ctrl.cc M src/mem/qos/mem_ctrl.hh M src/mem/qos/mem_sink.cc M src/mem/qos/mem_sink.hh M tests/configs/base_config.py 24 files changed, 1,934 insertions(+), 1,760 deletions(-) diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py index 9443520..ab6b933 100644 --- a/configs/common/MemConfig.py +++ b/configs/common/MemConfig.py @@ -40,7 +40,7 @@ from common import ObjectList from common import HMC -def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size): +def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size): """ Helper function for creating a single memoy controller from the given options. 
-    return ctrl
+    return interface

 def config_mem(options, system):
     """
@@ -144,10 +144,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")

-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []

-    if opt_elastic_trace_en and not issubclass(cls, m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf, m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
                 "simple-mem.")
@@ -158,36 +158,56 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)

     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
     for r in system.mem_ranges:
         for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
                                        intlv_size)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks

             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown

             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")

+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Override buffer sizes with interface specific values
+            mem_ctrl.write_buffer_size = dram_intf.write_buffer_size
+            mem_ctrl.read_buffer_size = dram_intf.read_buffer_size
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)

-    subsystem.mem_ctrls = mem_ctrls
-
-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
         if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
             # Set memory device size. There is an independent controller
             # for each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
         else:
-            subsystem.mem_ctrls[i].port = xbar.master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
+    subsystem.mem_ctrls = mem_ctrls
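The learning_gem5 hunks further down show the same change in its smallest form: a memory is now built from two objects, a DRAMCtrl that owns the port, queues and scheduler, and a DRAMInterface subclass that owns the address range and the device timing. Pulled out of the diff, a complete minimal setup looks roughly like this (a sketch only, using the classes as renamed by this patch plus the usual System boilerplate around them):

import m5
from m5.objects import *

system = System()
system.clk_domain = SrcClockDomain(clock = '1GHz',
                                   voltage_domain = VoltageDomain())
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('512MB')]
system.membus = SystemXBar()

# the controller is a ClockedObject holding the port, queues and scheduler
system.mem_ctrl = DRAMCtrl()
# the interface is an AbstractMemory holding the range and DRAM timing
system.mem_ctrl.dram = DDR3_1600_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master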
-cfg_file_name = "lowp_sweep.cfg" -cfg_file_path = os.path.dirname(__file__) + "/" +cfg_file_name -cfg_file = open(cfg_file_path, 'w') - # Get the number of banks -nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value) +nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value) # determine the burst size in bytes -burst_size = int((system.mem_ctrls[0].devices_per_rank.value * - system.mem_ctrls[0].device_bus_width.value * - system.mem_ctrls[0].burst_length.value) / 8) +burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value * + system.mem_ctrls[0].dram.device_bus_width.value * + system.mem_ctrls[0].dram.burst_length.value) / 8) # next, get the page size in bytes (the rowbuffer size is already in bytes) -page_size = system.mem_ctrls[0].devices_per_rank.value * \ - system.mem_ctrls[0].device_rowbuffer_size.value +page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \ + system.mem_ctrls[0].dram.device_rowbuffer_size.value # Inter-request delay should be such that we can hit as many transitions # to/from low power states as possible to. We provide a min and max itt to the # traffic generator and it randomises in the range. The parameter is in # seconds and we need it in ticks (ps). -itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000 +itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000 #The itt value when set to (tRAS + tRP + tCK) covers the case where # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the @@ -155,9 +160,9 @@ # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK. # As we use this delay as a unit and create multiples of it as bigger delays # for the sweep, this parameter works for reads, writes and mix of them. -pd_entry_time = (system.mem_ctrls[0].tRAS.value + - system.mem_ctrls[0].tRP.value + - system.mem_ctrls[0].tCK.value) * 1000000000000 +pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value + + system.mem_ctrls[0].dram.tRP.value + + system.mem_ctrls[0].dram.tCK.value) * 1000000000000 # We sweep itt max using the multipliers specified by the user. 
itt_max_str = args.itt_list.strip().split() @@ -180,42 +185,11 @@ # banks bank_util_values = [1, int(nbr_banks/2), nbr_banks] -# Next we create the config file, but first a comment -cfg_file.write("""# STATE state# period mode=DRAM -# read_percent start_addr end_addr req_size min_itt max_itt data_limit -# stride_size page_size #banks #banks_util addr_map #ranks\n""") - -addr_map = m5.objects.AddrMap.map[args.addr_map] - -nxt_state = 0 -for itt_max in itt_max_values: - for bank in bank_util_values: - for stride_size in stride_values: - cfg_file.write("STATE %d %d %s %d 0 %d %d " - "%d %d %d %d %d %d %d %d %d\n" % - (nxt_state, period, "DRAM", args.rd_perc, max_addr, - burst_size, itt_min, itt_max, 0, stride_size, - page_size, nbr_banks, bank, addr_map, - args.mem_ranks)) - nxt_state = nxt_state + 1 - # State for idle period idle_period = args.idle_end -cfg_file.write("STATE %d %d IDLE\n" % (nxt_state, idle_period)) - -# Init state is state 0 -cfg_file.write("INIT 0\n") - -# Go through the states one by one -for state in range(1, nxt_state + 1): - cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state)) - -# Transition from last state to itself to not break the probability math -cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state, nxt_state)) -cfg_file.close() # create a traffic generator, and point it to the file we just created -system.tgen = TrafficGen(config_file = cfg_file_path) +system.tgen = PyTrafficGen() # add a communication monitor system.monitor = CommMonitor() @@ -230,14 +204,34 @@ # every period, dump and reset all stats periodicStatDump(period) +# run Forrest, run! root = Root(full_system = False, system = system) root.system.mem_mode = 'timing' m5.instantiate() +def trace(): + addr_map = ObjectList.dram_addr_map_list.get(args.addr_map) + generator = dram_generators["DRAM"](system.tgen) + for itt_max in itt_max_values: + for bank in bank_util_values: + for stride_size in stride_values: + num_seq_pkts = int(math.ceil(float(stride_size) / burst_size)) + yield generator(period, + 0, max_addr, burst_size, int(itt_min), + int(itt_max), args.rd_perc, 0, + num_seq_pkts, page_size, nbr_banks, bank, + addr_map, args.mem_ranks) + + yield system.tgen.createIdle(idle_period) + yield system.tgen.createExit(0) + +system.tgen.start(trace()) + # Simulate for exactly as long as it takes to go through all the states # This is why sim exists. 
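With the .cfg plumbing gone, PyTrafficGen is driven directly from Python: each value yielded by a generator describes one traffic state, and the generator is handed to start(). Reduced to a single state (argument order exactly as in the trace() generator above; the variable names are the script's own), the idiom is roughly:

def states():
    # one DRAM state: run for `period` ticks with the given access pattern
    yield system.tgen.createDram(period, 0, max_addr, burst_size,
                                 int(itt_min), int(itt_max), args.rd_perc,
                                 0, num_seq_pkts, page_size, nbr_banks,
                                 bank, addr_map, args.mem_ranks)
    yield system.tgen.createIdle(idle_period)   # then sit idle
    yield system.tgen.createExit(0)             # and finally exit

system.tgen.start(states())
m5.simulate()   # runs until the exit state is reached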
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index d3c86c3..6a49f44 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@
 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map

 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated

 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value

 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt = getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-              system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt = getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+              system.mem_ctrls[0].dram.tBURST.value) * 1000000000000

 # assume we start at 0
 max_addr = mem_range.end
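Both sweep scripts now pull the device geometry through the .dram child rather than off the controller. As a concrete check of that arithmetic, these are the numbers it produces for the default DDR3_1600_8x8 interface (plain Python, values copied from the class definition in this patch):

devices_per_rank = 8        # 8x8 configuration
device_bus_width = 8        # bits per device
burst_length = 8            # DDR3 is a BL8 device

# bits moved by one burst across the whole rank, converted to bytes
burst_size = (devices_per_rank * device_bus_width * burst_length) // 8
assert burst_size == 64     # bytes per burst on the x64 channel

# row buffer per rank: per-device page times the number of devices
page_size = devices_per_rank * 1024   # '1kB' page per device
assert page_size == 8192              # 8kB per rank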
diff --git a/configs/learning_gem5/part1/simple.py b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part1/two_level.py b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
 system.system_port = system.membus.slave

 # Create a DDR3 memory controller
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Create a process for a simple "Hello World" application
diff --git a/configs/learning_gem5/part2/simple_cache.py b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part2/simple_memobj.py b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part3/simple_ruby.py b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
 system.cpu = [TimingSimpleCPU() for i in range(2)]

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]

 # create the interrupt controller for the CPU and connect to the membus
 for cpu in system.cpu:
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..dff5000 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@
 from m5.params import *
 from m5.proxy import *
-from m5.objects.AbstractMemory import *
 from m5.objects.QoSMemCtrl import *

 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']

-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
-
-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']
-
 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM
@@ -72,8 +58,11 @@
     # bus in front of the controller for multiple ports
     port = SlavePort("Slave port")

-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface(Parent.any, "DRAM interface")
+
+    # Set default buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
     # the cacheline size or request/packet size
     write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
@@ -93,15 +82,6 @@
     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")

-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")

     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +89,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")
-
-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst lenght (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-
-    # DRAMPower provides in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
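Everything deleted from here to the end of DRAMCtrl.py is the bulk of the move: the remaining timing and current parameters plus all the per-technology classes (DDR3, DDR4, LPDDR2/3/5, WideIO, GDDR5, HBM and HMC) reappear in the new DRAMInterface.py. For scripts that set these knobs by hand, the porting rule implied by the hunks above is that scheduling and queueing stay on the controller while anything device specific moves behind the dram child; a hedged before/after sketch (attribute names as in this patch):

ctrl = system.mem_ctrls[0]

# stays on the controller (DRAMCtrl)
ctrl.mem_sched_policy = 'frfcfs'
ctrl.write_buffer_size = 128

# moves to the interface (DRAMInterface)
ctrl.dram.addr_mapping = 'RoRaBaCoCh'
ctrl.dram.page_policy = 'open_adaptive'
ctrl.dram.ranks_per_channel = 2
ctrl.dram.enable_dram_powerdown = True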
- - # timing behaviour and constraints - all in nanoseconds - - # the base clock period of the DRAM - tCK = Param.Latency("Clock period") - - # the amount of time in nanoseconds from issuing an activate command - # to the data being available in the row buffer for a read/write - tRCD = Param.Latency("RAS to CAS delay") - - # the time from issuing a read/write command to seeing the actual data - tCL = Param.Latency("CAS latency") - - # minimum time between a precharge and subsequent activate - tRP = Param.Latency("Row precharge time") - - # minimum time between an activate and a precharge to the same row - tRAS = Param.Latency("ACT to PRE delay") - - # minimum time between a write data transfer and a precharge - tWR = Param.Latency("Write recovery time") - - # minimum time between a read and precharge command - tRTP = Param.Latency("Read to precharge") - - # time to complete a burst transfer, typically the burst length - # divided by two due to the DDR bus, but by making it a parameter - # it is easier to also evaluate SDR memories like WideIO. - # This parameter has to account for burst length. - # Read/Write requests with data size larger than one full burst are broken - # down into multiple requests in the controller - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = Param.Latency("Burst duration " - "(typically burst length / 2 cycles)") - - # tBURST_MAX is the column array cycle delay required before next access, - # which could be greater than tBURST when the memory access time is greater - # than tBURST - tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay") - - # tBURST_MIN is the minimum delay between bursts, which could be less than - # tBURST when interleaving is supported - tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts") - - # CAS-to-CAS delay for bursts to the same bank group - # only utilized with bank group architectures; set to 0 for default case - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay") - - # Write-to-Write delay for bursts to the same bank group - # only utilized with bank group architectures; set to 0 for default case - # This will be used to enable different same bank group delays - # for writes versus reads - tCCD_L_WR = Param.Latency(Self.tCCD_L, - "Same bank group Write to Write delay") - - # time taken to complete one refresh cycle (N rows in all banks) - tRFC = Param.Latency("Refresh cycle time") - - # refresh command interval, how often a "ref" command needs - # to be sent. 
It is 7.8 us for a 64ms refresh requirement - tREFI = Param.Latency("Refresh command interval") - - # write-to-read, same rank turnaround penalty - tWTR = Param.Latency("Write to read, same rank switching time") - - # write-to-read, same rank turnaround penalty for same bank group - tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching " - "time, same bank group") - - # read-to-write, same rank turnaround penalty - tRTW = Param.Latency("Read to write, same rank switching time") - - # rank-to-rank bus delay penalty - # this does not correlate to a memory timing parameter and encompasses: - # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD - # different rank bus delay - tCS = Param.Latency("Rank to rank switching time") - - # minimum precharge to precharge delay time - tPPD = Param.Latency("0ns", "PRE to PRE delay") - - # maximum delay between two-cycle ACT command phases - tAAD = Param.Latency(Self.tCK, - "Maximum delay between two-cycle ACT commands") - - two_cycle_activate = Param.Bool(False, - "Two cycles required to send activate") - - # minimum row activate to row activate delay time - tRRD = Param.Latency("ACT to ACT delay") - - # only utilized with bank group architectures; set to 0 for default case - tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay") - - # time window in which a maximum number of activates are allowed - # to take place, set to 0 to disable - tXAW = Param.Latency("X activation window") - activation_limit = Param.Unsigned("Max number of activates in window") - - # time to exit power-down mode - # Exit power-down to next valid command delay - tXP = Param.Latency("0ns", "Power-up Delay") - - # Exit Powerdown to commands requiring a locked DLL - tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL") - - # time to exit self-refresh mode - tXS = Param.Latency("0ns", "Self-refresh exit latency") - - # time to exit self-refresh mode with locked DLL - tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL") - - # number of data beats per clock. with DDR, default is 2, one per edge - beats_per_clock = Param.Unsigned(2, "Data beats per clock") - - data_clock_sync = Param.Bool(False, "Synchronization commands required") - - # Currently rolled into other params - ###################################################################### - - # tRC - assumed to be tRAS + tRP - - # Power Behaviour and Constraints - # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are - # defined as VDD and VDD2. Each current is defined for each voltage domain - # separately. For example, current IDD0 is active-precharge current for - # voltage domain VDD and current IDD02 is active-precharge current for - # voltage domain VDD2. - # By default all currents are set to 0mA. Users who are only interested in - # the performance of DRAMs can leave them at 0. 
- - # Operating 1 Bank Active-Precharge current - IDD0 = Param.Current("0mA", "Active precharge current") - - # Operating 1 Bank Active-Precharge current multiple voltage Range - IDD02 = Param.Current("0mA", "Active precharge current VDD2") - - # Precharge Power-down Current: Slow exit - IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow") - - # Precharge Power-down Current: Slow exit multiple voltage Range - IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2") - - # Precharge Power-down Current: Fast exit - IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast") - - # Precharge Power-down Current: Fast exit multiple voltage Range - IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2") - - # Precharge Standby current - IDD2N = Param.Current("0mA", "Precharge Standby current") - - # Precharge Standby current multiple voltage range - IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2") - - # Active Power-down current: slow exit - IDD3P0 = Param.Current("0mA", "Active Powerdown slow") - - # Active Power-down current: slow exit multiple voltage range - IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2") - - # Active Power-down current : fast exit - IDD3P1 = Param.Current("0mA", "Active Powerdown fast") - - # Active Power-down current : fast exit multiple voltage range - IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2") - - # Active Standby current - IDD3N = Param.Current("0mA", "Active Standby current") - - # Active Standby current multiple voltage range - IDD3N2 = Param.Current("0mA", "Active Standby current VDD2") - - # Burst Read Operating Current - IDD4R = Param.Current("0mA", "READ current") - - # Burst Read Operating Current multiple voltage range - IDD4R2 = Param.Current("0mA", "READ current VDD2") - - # Burst Write Operating Current - IDD4W = Param.Current("0mA", "WRITE current") - - # Burst Write Operating Current multiple voltage range - IDD4W2 = Param.Current("0mA", "WRITE current VDD2") - - # Refresh Current - IDD5 = Param.Current("0mA", "Refresh current") - - # Refresh Current multiple voltage range - IDD52 = Param.Current("0mA", "Refresh current VDD2") - - # Self-Refresh Current - IDD6 = Param.Current("0mA", "Self-refresh Current") - - # Self-Refresh Current multiple voltage range - IDD62 = Param.Current("0mA", "Self-refresh Current VDD2") - - # Main voltage range of the DRAM - VDD = Param.Voltage("0V", "Main Voltage Range") - - # Second voltage range defined by some DRAMs - VDD2 = Param.Voltage("0V", "2nd Voltage Range") - -# A single DDR3-1600 x64 channel (one command and address bus), with -# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in -# an 8x8 configuration. 
-class DDR3_1600_8x8(DRAMCtrl): - # size of device in bytes - device_size = '512MB' - - # 8x8 configuration, 8 devices each with an 8-bit interface - device_bus_width = 8 - - # DDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) - device_rowbuffer_size = '1kB' - - # 8x8 configuration, so 8 devices - devices_per_rank = 8 - - # Use two ranks - ranks_per_channel = 2 - - # DDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz - tBURST = '5ns' - - # DDR3-1600 11-11-11 - tRCD = '13.75ns' - tCL = '13.75ns' - tRP = '13.75ns' - tRAS = '35ns' - tRRD = '6ns' - tXAW = '30ns' - activation_limit = 4 - tRFC = '260ns' - - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns - tWTR = '7.5ns' - - # Greater of 4 CK or 7.5 ns - tRTP = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # <=85C, half for >85C - tREFI = '7.8us' - - # active powerdown and precharge powerdown exit time - tXP = '6ns' - - # self refresh exit time - tXS = '270ns' - - # Current values from datasheet Die Rev E,J - IDD0 = '55mA' - IDD2N = '32mA' - IDD3N = '38mA' - IDD4W = '125mA' - IDD4R = '157mA' - IDD5 = '235mA' - IDD3P1 = '38mA' - IDD2P1 = '32mA' - IDD6 = '20mA' - VDD = '1.5V' - -# A single HMC-2500 x32 model based on: -# [1] DRAMSpec: a high-level DRAM bank modelling tool -# developed at the University of Kaiserslautern. This high level tool -# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to -# estimate the DRAM bank latency and power numbers. -# [2] High performance AXI-4.0 based interconnect for extensible smart memory -# cubes (E. Azarkhish et. al) -# Assumed for the HMC model is a 30 nm technology node. -# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4 -# layers). -# Each layer has 16 vaults and each vault consists of 2 banks per layer. -# In order to be able to use the same controller used for 2D DRAM generations -# for HMC, the following analogy is done: -# Channel (DDR) => Vault (HMC) -# device_size (DDR) => size of a single layer in a vault -# ranks per channel (DDR) => number of layers -# banks per rank (DDR) => banks per layer -# devices per rank (DDR) => devices per layer ( 1 for HMC). -# The parameters for which no input is available are inherited from the DDR3 -# configuration. -# This configuration includes the latencies from the DRAM to the logic layer -# of the HMC -class HMC_2500_1x32(DDR3_1600_8x8): - # size of device - # two banks per device with each bank 4MB [2] - device_size = '8MB' - - # 1x32 configuration, 1 device with 32 TSVs [2] - device_bus_width = 32 - - # HMC is a BL8 device [2] - burst_length = 8 - - # Each device has a page (row buffer) size of 256 bytes [2] - device_rowbuffer_size = '256B' - - # 1x32 configuration, so 1 device [2] - devices_per_rank = 1 - - # 4 layers so 4 ranks [2] - ranks_per_channel = 4 - - # HMC has 2 banks per layer [2] - # Each layer represents a rank. With 4 layers and 8 banks in total, each - # layer has 2 banks; thus 2 banks per rank. 
- banks_per_rank = 2 - - # 1250 MHz [2] - tCK = '0.8ns' - - # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz - tBURST = '3.2ns' - - # Values using DRAMSpec HMC model [1] - tRCD = '10.2ns' - tCL = '9.9ns' - tRP = '7.7ns' - tRAS = '21.6ns' - - # tRRD depends on the power supply network for each vendor. - # We assume a tRRD of a double bank approach to be equal to 4 clock - # cycles (Assumption) - tRRD = '3.2ns' - - # activation limit is set to 0 since there are only 2 banks per vault - # layer. - activation_limit = 0 - - # Values using DRAMSpec HMC model [1] - tRFC = '59ns' - tWR = '8ns' - tRTP = '4.9ns' - - # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz = - # 0.8 ns (Assumption) - tCS = '0.8ns' - - # Value using DRAMSpec HMC model [1] - tREFI = '3.9us' - - # The default page policy in the vault controllers is simple closed page - # [2] nevertheless 'close' policy opens and closes the row multiple times - # for bursts largers than 32Bytes. For this reason we use 'close_adaptive' - page_policy = 'close_adaptive' - - # RoCoRaBaCh resembles the default address mapping in HMC - addr_mapping = 'RoCoRaBaCh' - min_writes_per_switch = 8 - - # These parameters do not directly correlate with buffer_size in real - # hardware. Nevertheless, their value has been tuned to achieve a - # bandwidth similar to the cycle-accurate model in [2] - write_buffer_size = 32 - read_buffer_size = 32 - - # The static latency of the vault controllers is estimated to be smaller - # than a full DRAM channel controller - static_backend_latency='4ns' - static_frontend_latency='4ns' - -# A single DDR3-2133 x64 channel refining a selected subset of the -# options for the DDR-1600 configuration, based on the same DDR3-1600 -# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept -# consistent across the two configurations. -class DDR3_2133_8x8(DDR3_1600_8x8): - # 1066 MHz - tCK = '0.938ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz - tBURST = '3.752ns' - - # DDR3-2133 14-14-14 - tRCD = '13.09ns' - tCL = '13.09ns' - tRP = '13.09ns' - tRAS = '33ns' - tRRD = '5ns' - tXAW = '25ns' - - # Current values from datasheet - IDD0 = '70mA' - IDD2N = '37mA' - IDD3N = '44mA' - IDD4W = '157mA' - IDD4R = '191mA' - IDD5 = '250mA' - IDD3P1 = '44mA' - IDD2P1 = '43mA' - IDD6 ='20mA' - VDD = '1.5V' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4) -# in an 16x4 configuration. -# Total channel capacity is 32GB -# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel -class DDR4_2400_16x4(DRAMCtrl): - # size of device - device_size = '1GB' - - # 16x4 configuration, 16 devices each with a 4-bit interface - device_bus_width = 4 - - # DDR4 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 512 byte (1K columns x4) - device_rowbuffer_size = '512B' - - # 16x4 configuration, so 16 devices - devices_per_rank = 16 - - # Match our DDR3 configurations which is dual rank - ranks_per_channel = 2 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 4 for x4 case - bank_groups_per_rank = 4 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). 
Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 16 - - # override the default buffer sizes and go for something larger to - # accommodate the larger bank count - write_buffer_size = 128 - read_buffer_size = 64 - - # 1200 MHz - tCK = '0.833ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '3.332ns' - - # @2400 data rate, tCCD_L is 6 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '5ns'; - - # DDR4-2400 17-17-17 - tRCD = '14.16ns' - tCL = '14.16ns' - tRP = '14.16ns' - tRAS = '32ns' - - # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns) - tRRD = '3.332ns' - - # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns) - tRRD_L = '4.9ns'; - - # tFAW for 512B page is MAX(16 CK, 13ns) - tXAW = '13.328ns' - activation_limit = 4 - # tRFC is 350ns - tRFC = '350ns' - - tWR = '15ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Greater of 4 CK or 7.5 ns - tRTP = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns - tRTW = '1.666ns' - - # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns - tCS = '1.666ns' - - # <=85C, half for >85C - tREFI = '7.8us' - - # active powerdown and precharge powerdown exit time - tXP = '6ns' - - # self refresh exit time - # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is: - # tRFC + 10ns = 340ns - tXS = '340ns' - - # Current values from datasheet - IDD0 = '43mA' - IDD02 = '3mA' - IDD2N = '34mA' - IDD3N = '38mA' - IDD3N2 = '3mA' - IDD4W = '103mA' - IDD4R = '110mA' - IDD5 = '250mA' - IDD3P1 = '32mA' - IDD2P1 = '25mA' - IDD6 = '30mA' - VDD = '1.2V' - VDD2 = '2.5V' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8) -# in an 8x8 configuration. -# Total channel capacity is 16GB -# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel -class DDR4_2400_8x8(DDR4_2400_16x4): - # 8x8 configuration, 8 devices each with an 8-bit interface - device_bus_width = 8 - - # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) - device_rowbuffer_size = '1kB' - - # 8x8 configuration, so 8 devices - devices_per_rank = 8 - - # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns) - tRRD_L = '4.9ns'; - - tXAW = '21ns' - - # Current values from datasheet - IDD0 = '48mA' - IDD3N = '43mA' - IDD4W = '123mA' - IDD4R = '135mA' - IDD3P1 = '37mA' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16) -# in an 4x16 configuration. 
-# Total channel capacity is 4GB -# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel -class DDR4_2400_4x16(DDR4_2400_16x4): - # 4x16 configuration, 4 devices each with an 16-bit interface - device_bus_width = 16 - - # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) - device_rowbuffer_size = '2kB' - - # 4x16 configuration, so 4 devices - devices_per_rank = 4 - - # Single rank for x16 - ranks_per_channel = 1 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 2 for x16 case - bank_groups_per_rank = 2 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 8 - - # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) - tRRD = '5.3ns' - - # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) - tRRD_L = '6.4ns'; - - tXAW = '30ns' - - # Current values from datasheet - IDD0 = '80mA' - IDD02 = '4mA' - IDD2N = '34mA' - IDD3N = '47mA' - IDD4W = '228mA' - IDD4R = '243mA' - IDD5 = '280mA' - IDD3P1 = '41mA' - -# A single LPDDR2-S4 x32 interface (one command/address bus), with -# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) -# in a 1x32 configuration. -class LPDDR2_S4_1066_1x32(DRAMCtrl): - # No DLL in LPDDR2 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR2_S4 is a BL4 and BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1KB - # (this depends on the memory density) - device_rowbuffer_size = '1kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR2-S4 has 8 banks in all configurations - banks_per_rank = 8 - - # 533 MHz - tCK = '1.876ns' - - # Fixed at 15 ns - tRCD = '15ns' - - # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time - tCL = '15ns' - - # Pre-charge one bank 15 ns (all banks 18 ns) - tRP = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - tRTP = '7.5ns' - - # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. - # Note this is a BL8 DDR device. - # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '7.5ns' - - # LPDDR2-S4, 4 Gbit - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns - tRTW = '3.75ns' - - # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns - tCS = '3.75ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of density, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '15mA' - IDD02 = '70mA' - IDD2N = '2mA' - IDD2N2 = '30mA' - IDD3N = '2.5mA' - IDD3N2 = '30mA' - IDD4W = '10mA' - IDD4W2 = '190mA' - IDD4R = '3mA' - IDD4R2 = '220mA' - IDD5 = '40mA' - IDD52 = '150mA' - IDD3P1 = '1.2mA' - IDD3P12 = '8mA' - IDD2P1 = '0.6mA' - IDD2P12 = '0.8mA' - IDD6 = '1mA' - IDD62 = '3.2mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single WideIO x128 interface (one command and address bus), with -# default timings based on an estimated WIO-200 8 Gbit part. 
-class WideIO_200_1x128(DRAMCtrl): - # No DLL for WideIO - dll = False - - # size of device - device_size = '1024MB' - - # 1x128 configuration, 1 device with a 128-bit interface - device_bus_width = 128 - - # This is a BL4 device - burst_length = 4 - - # Each device has a page (row buffer) size of 4KB - # (this depends on the memory density) - device_rowbuffer_size = '4kB' - - # 1x128 configuration, so 1 device - devices_per_rank = 1 - - # Use one rank for a one-high die stack - ranks_per_channel = 1 - - # WideIO has 4 banks in all configurations - banks_per_rank = 4 - - # 200 MHz - tCK = '5ns' - - # WIO-200 - tRCD = '18ns' - tCL = '18ns' - tRP = '18ns' - tRAS = '42ns' - tWR = '15ns' - # Read to precharge is same as the burst - tRTP = '20ns' - - # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. - # Note this is a BL4 SDR device. - tBURST = '20ns' - - # WIO 8 Gb - tRFC = '210ns' - - # WIO 8 Gb, <=85C, half for >85C - tREFI = '3.9us' - - # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns - tWTR = '15ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns - tRTW = '10ns' - - # Default different rank bus delay to 2 CK, @200 MHz = 10 ns - tCS = '10ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Two instead of four activation window - tXAW = '50ns' - activation_limit = 2 - - # The WideIO specification does not provide current information - -# A single LPDDR3 x32 interface (one command/address bus), with -# default timings based on a LPDDR3-1600 4 Gbit part (Micron -# EDF8132A1MC) in a 1x32 configuration. -class LPDDR3_1600_1x32(DRAMCtrl): - # No DLL for LPDDR3 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 4KB - device_rowbuffer_size = '4kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Technically the datasheet is a dual-rank package, but for - # comparison with the LPDDR2 config we stick to a single rank - ranks_per_channel = 1 - - # LPDDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - tRCD = '18ns' - - # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time - tCL = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns - tRTP = '7.5ns' - - # Pre-charge one bank 18 ns (all banks 21 ns) - tRP = '18ns' - - # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. - # Note this is a BL8 DDR device. 
- # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '5ns' - - # LPDDR3, 4 Gb - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of size, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '8mA' - IDD02 = '60mA' - IDD2N = '0.8mA' - IDD2N2 = '26mA' - IDD3N = '2mA' - IDD3N2 = '34mA' - IDD4W = '2mA' - IDD4W2 = '190mA' - IDD4R = '2mA' - IDD4R2 = '230mA' - IDD5 = '28mA' - IDD52 = '150mA' - IDD3P1 = '1.4mA' - IDD3P12 = '11mA' - IDD2P1 = '0.8mA' - IDD2P12 = '1.8mA' - IDD6 = '0.5mA' - IDD62 = '1.8mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single GDDR5 x64 interface, with -# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix -# H5GQ1H24AFR) in a 2x32 configuration. -class GDDR5_4000_2x32(DRAMCtrl): - # size of device - device_size = '128MB' - - # 2x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # GDDR5 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 2Kbits (256Bytes) - device_rowbuffer_size = '256B' - - # 2x32 configuration, so 2 devices - devices_per_rank = 2 - - # assume single rank - ranks_per_channel = 1 - - # GDDR5 has 4 bank groups - bank_groups_per_rank = 4 - - # GDDR5 has 16 banks with 4 bank groups - banks_per_rank = 16 - - # 1000 MHz - tCK = '1ns' - - # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz - # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz ) - # 8 beats at 4000 MHz = 2 beats at 1000 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '2ns' - - # @1000MHz data rate, tCCD_L is 3 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '3ns'; - - tRCD = '12ns' - - # tCL is not directly found in datasheet and assumed equal tRCD - tCL = '12ns' - - tRP = '12ns' - tRAS = '28ns' - - # RRD_S (different bank group) - # RRD_S is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD = '6ns' - - # RRD_L (same bank group) - # RRD_L is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD_L = '6ns' - - tXAW = '23ns' - - # tXAW < 4 x tRRD. - # Therefore, activation limit is set to 0 - activation_limit = 0 - - tRFC = '65ns' - tWR = '12ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Read-to-Precharge 2 CK - tRTP = '2ns' - - # Assume 2 cycles - tRTW = '2ns' - -# A single HBM x128 interface (one command and address bus), with -# default timings based on data publically released -# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014), -# IDD measurement values, and by extrapolating data from other classes. -# Architecture values based on published HBM spec -# A 4H stack is defined, 2Gb per die for a total of 1GB of memory. 
-class HBM_1000_4H_1x128(DRAMCtrl): - # HBM gen1 supports up to 8 128-bit physical channels - # Configuration defines a single channel, with the capacity - # set to (full_ stack_capacity / 8) based on 2Gb dies - # To use all 8 channels, set 'channels' parameter to 8 in - # system configuration - - # 128-bit interface legacy mode - device_bus_width = 128 - - # HBM supports BL4 and BL2 (legacy mode only) - burst_length = 4 - - # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack; - # with 8 channels, 128MB per channel - device_size = '128MB' - - device_rowbuffer_size = '2kB' - - # 1x128 configuration - devices_per_rank = 1 - - # HBM does not have a CS pin; set rank to 1 - ranks_per_channel = 1 - - # HBM has 8 or 16 banks depending on capacity - # 2Gb dies have 8 banks - banks_per_rank = 8 - - # depending on frequency, bank groups may be required - # will always have 4 bank groups when enabled - # current specifications do not define the minimum frequency for - # bank group architecture - # setting bank_groups_per_rank to 0 to disable until range is defined - bank_groups_per_rank = 0 - - # 500 MHz for 1Gbps DDR data rate - tCK = '2ns' - - # use values from IDD measurement in JEDEC spec - # use tRP value for tRCD and tCL similar to other classes - tRP = '15ns' - tRCD = '15ns' - tCL = '15ns' - tRAS = '33ns' - - # BL2 and BL4 supported, default to BL4 - # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns - tBURST = '4ns' - - # value for 2Gb device from JEDEC spec - tRFC = '160ns' - - # value for 2Gb device from JEDEC spec - tREFI = '3.9us' - - # extrapolate the following from LPDDR configs, using ns values - # to minimize burst length, prefetch differences - tWR = '18ns' - tRTP = '7.5ns' - tWTR = '10ns' - - # start with 2 cycles turnaround, similar to other memory classes - # could be more with variations across the stack - tRTW = '4ns' - - # single rank device, set to 0 - tCS = '0ns' - - # from MemCon example, tRRD is 4ns with 2ns tCK - tRRD = '4ns' - - # from MemCon example, tFAW is 30ns with 2ns tCK - tXAW = '30ns' - activation_limit = 4 - - # 4tCK - tXP = '8ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '168ns' - -# A single HBM x64 interface (one command and address bus), with -# default timings based on HBM gen1 and data publically released -# A 4H stack is defined, 8Gb per die for a total of 4GB of memory. -# Note: This defines a pseudo-channel with a unique controller -# instantiated per pseudo-channel -# Stay at same IO rate (1Gbps) to maintain timing relationship with -# HBM gen1 class (HBM_1000_4H_x128) where possible -class HBM_1000_4H_1x64(HBM_1000_4H_1x128): - # For HBM gen2 with pseudo-channel mode, configure 2X channels. 
- # Configuration defines a single pseudo channel, with the capacity - # set to (full_ stack_capacity / 16) based on 8Gb dies - # To use all 16 pseudo channels, set 'channels' parameter to 16 in - # system configuration - - # 64-bit pseudo-channle interface - device_bus_width = 64 - - # HBM pseudo-channel only supports BL4 - burst_length = 4 - - # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack; - # with 16 channels, 256MB per channel - device_size = '256MB' - - # page size is halved with pseudo-channel; maintaining the same same number - # of rows per pseudo-channel with 2X banks across 2 channels - device_rowbuffer_size = '1kB' - - # HBM has 8 or 16 banks depending on capacity - # Starting with 4Gb dies, 16 banks are defined - banks_per_rank = 16 - - # reset tRFC for larger, 8Gb device - # use HBM1 4Gb value as a starting point - tRFC = '260ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '268ns' - # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns - tCS = '2ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '10ns' - - # self refresh exit time - tXS = '65ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl): - - # Increase buffer size to account for more bank resources - read_buffer_size = 64 - - # Set page policy to better suit DMC Huxley - page_policy = 'close_adaptive' - - # 16-bit channel interface - device_bus_width = 16 - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL32 for higher command bandwidth - burst_length = 32 - - # size of device in bytes - device_size = '1GB' - - # 2kB page with BG mode - device_rowbuffer_size = '2kB' - - # Use a 1x16 configuration - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Initial configuration will have 16 banks with Bank Group Arch - # to maximim resources and enable higher data rates - banks_per_rank = 16 - bank_groups_per_rank = 4 - - # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.455ns' - - # Greater of 2 CK or 18ns - tRCD = '18ns' - - # Base RL is 16 CK @ 687.5 MHz = 23.28ns - tCL = '23.280ns' - - # Greater of 2 CK or 18ns - tRP = '18ns' - - # Greater of 3 CK or 42ns - tRAS = '42ns' - - # Greater of 3 CK or 34ns - tWR = '34ns' - - # active powerdown and precharge powerdown exit time - # Greater of 3 CK or 7ns - tXP = '7ns' - - # self refresh exit time (tRFCab + 7.5ns) - tXS = '217.5ns' - - # Greater of 2 CK or 7.5 ns minus 2 CK - tRTP = '4.59ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. 
- # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz - tBURST = '8.73ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - tBURST_MIN = '2.91ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 687.5 MHz - tBURST_MAX = '11.64ns' - - # 8 CK @ 687.5 MHz - tCCD_L = "11.64ns" - - # LPDDR5, 8 Gbit/channel for 280ns tRFCab - tRFC = '210ns' - tREFI = '3.9us' - - # Greater of 4 CK or 6.25 ns - tWTR = '6.25ns' - # Greater of 4 CK or 12 ns - tWTR_L = '12ns' - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.455ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns - tCS = '2.91ns' - - # 2 CK - tPPD = '2.91ns' - - # Greater of 2 CK or 5 ns - tRRD = '5ns' - tRRD_L = '5ns' - - # With Bank Group Arch mode tFAW is 20 ns - tXAW = '20ns' - activation_limit = 4 - - # at 5Gbps, 4:1 WCK to CK ratio required - # 2 data beats per WCK (DDR) -> 8 per CK - beats_per_clock = 8 - - # 2 cycles required to send activate command - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - two_cycle_activate = True - tAAD = '11.640ns' - - data_clock_sync = True - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '2.91ns' - tBURST_MIN = '2.91ns' - # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST_MAX = '5.82ns' - - # 4 CK @ 687.5 MHz - tCCD_L = "5.82ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '5.82ns' - tBURST_MIN = '5.82ns' - tBURST_MAX = '5.82ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 
5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.25ns' - - # Base RL is 17 CK @ 800 MHz = 21.25ns - tCL = '21.25ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. - # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz - tBURST = '7.5ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.3 GHz or 2 CK @ 800 MHz - tBURST_MIN = '2.5ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 800 MHz - tBURST_MAX = '10ns' - - # 8 CK @ 800 MHz - tCCD_L = "10ns" - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.25ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.5 ns - tCS = '2.5ns' - - # 2 CK - tPPD = '2.5ns' - - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - tAAD = '10ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on initial -# JEDEC specifcation -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio - tBURST = '2.5ns' - tBURST_MIN = '2.5ns' - # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST_MAX = '5ns' - - # 4 CK @ 800 MHz - tCCD_L = "5ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST = '5ns' - tBURST_MIN = '5ns' - tBURST_MAX = '5ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py new file mode 100644 index 0000000..35bf8a3 --- /dev/null +++ b/src/mem/DRAMInterface.py @@ -0,0 +1,1483 @@ +# Copyright (c) 2012-2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. 
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Copyright (c) 2013 Amin Farmahini-Farahani
+# Copyright (c) 2015 University of Kaiserslautern
+# Copyright (c) 2015 The University of Bologna
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from AbstractMemory import AbstractMemory
+from DRAMCtrl import *
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']
+
+class DRAMInterface(AbstractMemory):
+    type = 'DRAMInterface'
+    cxx_header = "mem/dram_ctrl.hh"
+
+    # scheduler, address map and page policy
+    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
+    page_policy = Param.PageManage('open_adaptive', "Page management policy")
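
As a quick illustration of how these two policies might be overridden per interface (a minimal sketch, not part of the patch; it assumes the DDR3_1600_8x8 class defined further down in this file and the per-interface 'range' inherited from AbstractMemory):

    # Hypothetical config-script fragment: select a mapping and page
    # policy suited to sequential, row-hit-friendly traffic.
    from m5.objects import DDR3_1600_8x8, AddrRange

    dram = DDR3_1600_8x8(range=AddrRange('512MB'))
    dram.addr_mapping = 'RoRaBaChCo'    # open-page oriented mapping
    dram.page_policy = 'open_adaptive'  # keep rows open across hits
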
+    # Allow the interface to set required controller buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
+    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
+    # the cacheline size or request/packet size
+    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
+    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+
+    # enforce a limit on the number of accesses per row
+    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
+                                          "closing");
+
+    # size of DRAM Chip in Bytes
+    device_size = Param.MemorySize("Size of DRAM chip")
+    # the physical organisation of the DRAM
+    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
+                                      "device/chip")
+    burst_length = Param.Unsigned("Burst length (BL) in beats")
+    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
+                                           "device/chip")
+    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
+
+    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
+    # default to 0 bank groups per rank, indicating bank group architecture
+    # is not used
+    # update per memory class when bank group architecture is supported
+    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
+    banks_per_rank = Param.Unsigned("Number of banks per rank")
+
+    # Enable DRAM powerdown states if True. This is False by default due to
+    # performance being lower when enabled
+    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
+
+    # For power modelling we need to know if the DRAM has a DLL or not
+    dll = Param.Bool(True, "DRAM has DLL or not")
+
+    # DRAMPower provides in addition to the core power, the possibility to
+    # include RD/WR termination and IO power. This calculation assumes some
+    # default values. The integration of DRAMPower with gem5 does not include
+    # IO and RD/WR termination power by default. This might be added as an
+    # additional feature in the future.
+
+    # timing behaviour and constraints - all in nanoseconds
+
+    # the base clock period of the DRAM
+    tCK = Param.Latency("Clock period")
+
+    # rank-to-rank bus delay penalty
+    # this does not correlate to a memory timing parameter and encompasses:
+    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
+    # different rank bus delay
+    tCS = Param.Latency("Rank to rank switching time")
+
+    # the amount of time in nanoseconds from issuing an activate command
+    # to the data being available in the row buffer for a read/write
+    tRCD = Param.Latency("RAS to CAS delay")
+
+    # the time from issuing a read/write command to seeing the actual data
+    tCL = Param.Latency("CAS latency")
+
+    # minimum time between a precharge and subsequent activate
+    tRP = Param.Latency("Row precharge time")
+
+    # minimum time between an activate and a precharge to the same row
+    tRAS = Param.Latency("ACT to PRE delay")
+
+    # minimum time between a write data transfer and a precharge
+    tWR = Param.Latency("Write recovery time")
+
+    # minimum time between a read and precharge command
+    tRTP = Param.Latency("Read to precharge")
+
+    # time to complete a burst transfer, typically the burst length
+    # divided by two due to the DDR bus, but by making it a parameter
+    # it is easier to also evaluate SDR memories like WideIO.
+    # This parameter has to account for burst length.
+    # Read/Write requests with data size larger than one full burst are broken
+    # down into multiple requests in the controller
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = Param.Latency("Burst duration "
+                           "(typically burst length / 2 cycles)")
+
+    # tBURST_MAX is the column array cycle delay required before next access,
+    # which could be greater than tBURST when the memory access time is
+    # greater than tBURST
+    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
+
+    # tBURST_MIN is the minimum delay between bursts, which could be less than
+    # tBURST when interleaving is supported
+    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
+
+    # CAS-to-CAS delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
+
+    # Write-to-Write delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # This will be used to enable different same bank group delays
+    # for writes versus reads
+    tCCD_L_WR = Param.Latency(Self.tCCD_L, "Same bank group Write to Write "\
+                                  "delay")
+
+    # time taken to complete one refresh cycle (N rows in all banks)
+    tRFC = Param.Latency("Refresh cycle time")
+
+    # refresh command interval, how often a "ref" command needs
+    # to be sent. It is 7.8 us for a 64ms refresh requirement
+    tREFI = Param.Latency("Refresh command interval")
+
+    # write-to-read, same rank turnaround penalty
+    tWTR = Param.Latency("Write to read, same rank switching time")
+
+    # write-to-read, same rank turnaround penalty for same bank group
+    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
+                           "time, same bank group")
+
+    # read-to-write, same rank turnaround penalty
+    tRTW = Param.Latency("Read to write, same rank switching time")
+
+    # minimum precharge to precharge delay time
+    tPPD = Param.Latency("0ns", "PRE to PRE delay")
+
+    # maximum delay between two-cycle ACT command phases
+    tAAD = Param.Latency(Self.tCK,
+                         "Maximum delay between two-cycle ACT commands")
+
+    two_cycle_activate = Param.Bool(False,
+                         "Two cycles required to send activate")
+
+    # minimum row activate to row activate delay time
+    tRRD = Param.Latency("ACT to ACT delay")
+
+    # only utilized with bank group architectures; set to 0 for default case
+    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
+
+    # time window in which a maximum number of activates are allowed
+    # to take place, set to 0 to disable
+    tXAW = Param.Latency("X activation window")
+    activation_limit = Param.Unsigned("Max number of activates in window")
+
+    # time to exit power-down mode
+    # Exit power-down to next valid command delay
+    tXP = Param.Latency("0ns", "Power-up Delay")
+
+    # Exit Powerdown to commands requiring a locked DLL
+    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
+
+    # time to exit self-refresh mode
+    tXS = Param.Latency("0ns", "Self-refresh exit latency")
+
+    # time to exit self-refresh mode with locked DLL
+    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
+
+    # number of data beats per clock. with DDR, default is 2, one per edge
+    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
+
+    data_clock_sync = Param.Bool(False, "Synchronization commands required")
+
+    # Currently rolled into other params
+    ######################################################################
+
+    # tRC  - assumed to be tRAS + tRP
+
+    # Power Behaviour and Constraints
+    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
+    # defined as VDD and VDD2. Each current is defined for each voltage domain
+    # separately. For example, current IDD0 is active-precharge current for
+    # voltage domain VDD and current IDD02 is active-precharge current for
+    # voltage domain VDD2.
+    # By default all currents are set to 0mA. Users who are only interested in
+    # the performance of DRAMs can leave them at 0.
+
+    # Operating 1 Bank Active-Precharge current
+    IDD0 = Param.Current("0mA", "Active precharge current")
+
+    # Operating 1 Bank Active-Precharge current multiple voltage Range
+    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
+
+    # Precharge Power-down Current: Slow exit
+    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
+
+    # Precharge Power-down Current: Slow exit multiple voltage Range
+    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
+
+    # Precharge Power-down Current: Fast exit
+    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
+
+    # Precharge Power-down Current: Fast exit multiple voltage Range
+    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
+
+    # Precharge Standby current
+    IDD2N = Param.Current("0mA", "Precharge Standby current")
+
+    # Precharge Standby current multiple voltage range
+    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
+
+    # Active Power-down current: slow exit
+    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
+
+    # Active Power-down current: slow exit multiple voltage range
+    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
+
+    # Active Power-down current : fast exit
+    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
+
+    # Active Power-down current : fast exit multiple voltage range
+    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
+
+    # Active Standby current
+    IDD3N = Param.Current("0mA", "Active Standby current")
+
+    # Active Standby current multiple voltage range
+    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
+
+    # Burst Read Operating Current
+    IDD4R = Param.Current("0mA", "READ current")
+
+    # Burst Read Operating Current multiple voltage range
+    IDD4R2 = Param.Current("0mA", "READ current VDD2")
+
+    # Burst Write Operating Current
+    IDD4W = Param.Current("0mA", "WRITE current")
+
+    # Burst Write Operating Current multiple voltage range
+    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
+
+    # Refresh Current
+    IDD5 = Param.Current("0mA", "Refresh current")
+
+    # Refresh Current multiple voltage range
+    IDD52 = Param.Current("0mA", "Refresh current VDD2")
+
+    # Self-Refresh Current
+    IDD6 = Param.Current("0mA", "Self-refresh Current")
+
+    # Self-Refresh Current multiple voltage range
+    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
+
+    # Main voltage range of the DRAM
+    VDD = Param.Voltage("0V", "Main Voltage Range")
+
+    # Second voltage range defined by some DRAMs
+    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
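
Since the controller now takes the interface as a parameter (per the commit message above), a configuration script pairs the two objects along these lines (a sketch, not part of the patch; the parameter name 'dram' matches the p->dram accesses in the dram_ctrl.cc hunks below):

    # Hypothetical wiring of a controller to its media interface.
    from m5.objects import DRAMCtrl, DDR3_1600_8x8, AddrRange

    ctrl = DRAMCtrl()
    # The address range now lives on the interface, not the controller.
    ctrl.dram = DDR3_1600_8x8(range=AddrRange('512MB'))
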
+
+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):
+    # size of device in bytes
+    device_size = '512MB'
+
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # DDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # Use two ranks
+    ranks_per_channel = 2
+
+    # DDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
+    tBURST = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # DDR3-1600 11-11-11
+    tRCD = '13.75ns'
+    tCL = '13.75ns'
+    tRP = '13.75ns'
+    tRAS = '35ns'
+    tRRD = '6ns'
+    tXAW = '30ns'
+    activation_limit = 4
+    tRFC = '260ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    tXS = '270ns'
+
+    # Current values from datasheet Die Rev E,J
+    IDD0 = '55mA'
+    IDD2N = '32mA'
+    IDD3N = '38mA'
+    IDD4W = '125mA'
+    IDD4R = '157mA'
+    IDD5 = '235mA'
+    IDD3P1 = '38mA'
+    IDD2P1 = '32mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'
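
The theoretical peak bandwidth that the relocated peakBW/busUtil stats (see the dram_ctrl.cc hunks below) derive from these parameters can be sanity-checked by hand; a back-of-the-envelope sketch for this DDR3-1600 class:

    # 8 devices x 8 bits each = 64-bit data bus, i.e. 8 bytes per beat.
    bus_width_bytes = 8 * 8 // 8
    # DDR: 2 beats per 1.25 ns clock -> 1600 MT/s.
    beats_per_sec = 2 * 800e6
    peak_bw = bus_width_bytes * beats_per_sec
    print(peak_bw / 1e9)   # 12.8 GB/s per channel
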
+
+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart memory
+# cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4
+# layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM generations
+# for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer ( 1 for HMC)
+# The parameters for which no input is available are inherited from the DDR3
+# configuration.
+# This configuration includes the latencies from the DRAM to the logic layer
+# of the HMC
+class HMC_2500_1x32_Interface(DDR3_1600_8x8):
+    # A single HMC-2500 x32 controller
+    # The buffer parameters do not directly correlate with buffer_size in
+    # real hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32
+
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz =
+    # 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed page
+    # [2]; nevertheless the 'close' policy opens and closes the row multiple
+    # times for bursts larger than 32 bytes. For this reason we use
+    # 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'
+
+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):
+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):
+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns';
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns';
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'
+
+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):
+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):
+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information
+
+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):
+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR ( data runs at 4000 MHz )
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns';
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'
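
As the comment above notes, a full stack is modelled by instantiating one of these interfaces per channel. In a standard configuration script that might look roughly like this (a sketch; it assumes the usual --mem-type/--mem-channels options consumed by the MemConfig helpers):

    # e.g. on the se.py/fs.py command line:
    #   --mem-type=HBM_1000_4H_1x128 --mem-channels=8
    # or directly on an options object before config_mem() runs:
    options.mem_type = 'HBM_1000_4H_1x128'
    options.mem_channels = 8
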
+
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 280ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # Result is an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # Result is an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
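
All of the LPDDR5 values above derive from the CK period, which is easier to audit programmatically than by eye. A small sketch using the 5500 MT/s numbers:

    tCK = 1.455e-9                # ~687.5 MHz CK for 5.5 Gb/s data
    print(8 / tCK / 1e9)          # 8 beats per CK -> ~5.5 Gb/s per pin
    print(6 * tCK * 1e9)          # tBURST: 6 CK = 8.73 ns
    print(2 * tCK * 1e9)          # tBURST_MIN: 2 CK = 2.91 ns
    print(8 * tCK * 1e9)          # tBURST_MAX / tCCD_L: 8 CK = 11.64 ns
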
#include "debug/Drain.hh" #include "debug/QOS.hh" +#include "params/DRAMInterface.hh" #include "sim/system.hh" using namespace std; @@ -58,12 +59,13 @@ retryRdReq(false), retryWrReq(false), nextReqEvent([this]{ processNextReqEvent(); }, name()), respondEvent([this]{ processRespondEvent(); }, name()), + dram(p->dram), readBufferSize(p->read_buffer_size), writeBufferSize(p->write_buffer_size), writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0), writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0), minWritesPerSwitch(p->min_writes_per_switch), - writesThisTime(0), readsThisTime(0), tCS(p->tCS), + writesThisTime(0), readsThisTime(0), memSchedPolicy(p->mem_sched_policy), frontendLatency(p->static_frontend_latency), backendLatency(p->static_backend_latency), @@ -75,37 +77,23 @@ readQueue.resize(p->qos_priorities); writeQueue.resize(p->qos_priorities); + dram->setCtrl(this); + // perform a basic check of the write thresholds if (p->write_low_thresh_perc >= p->write_high_thresh_perc) fatal("Write buffer low threshold %d must be smaller than the " "high threshold %d\n", p->write_low_thresh_perc, p->write_high_thresh_perc); - - // determine the rows per bank by looking at the total capacity - uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); - - DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, - AbstractMemory::size()); - - // create a DRAM interface - // will only populate the ranks if DRAM is configured - dram = new DRAMInterface(*this, p, capacity, range); - DPRINTF(DRAM, "Created DRAM interface \n"); } void DRAMCtrl::init() { - MemCtrl::init(); - if (!port.isConnected()) { fatal("DRAMCtrl %s is unconnected!\n", name()); } else { port.sendRangeChange(); } - - dram->init(range); - } void @@ -115,8 +103,6 @@ isTimingMode = system()->isTimingMode(); if (isTimingMode) { - dram->startup(); - // shift the bus busy time sufficiently far ahead that we never // have to worry about negative values when computing the time for // the next request, this will add an insignificant bubble at the @@ -134,7 +120,7 @@ "is responding"); // do the actual memory access and turn the packet into a response - access(pkt); + dram->access(pkt); Tick latency = 0; if (pkt->hasData()) { @@ -264,7 +250,7 @@ // address of first DRAM packet is kept unaliged. Subsequent DRAM packets // are aligned to burst size boundaries. This is to ensure we accurately // check read packets against packets in write queue. - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; unsigned pktsServicedByWrQ = 0; BurstHelper* burst_helper = NULL; @@ -364,7 +350,7 @@ // if the request size is larger than burst size, the pkt is split into // multiple DRAM packets - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; uint32_t burstSize = dram->bytesPerBurst(); for (int cnt = 0; cnt < pktCount; ++cnt) { @@ -527,7 +513,7 @@ DRAMPacket* dram_pkt = respQueue.front(); // media specific checks and functions when read response is complete - dram->respondEventDRAM(dram_pkt->rank); + dram->respondEvent(dram_pkt->rank); if (dram_pkt->burstHelper) { // it is a split packet @@ -726,12 +712,12 @@ void DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency) { - DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr()); + DPRINTF(DRAM, "Responding to Address %lld.. 
\n",pkt->getAddr()); bool needsResponse = pkt->needsResponse(); // do the actual memory access which also turns the packet into a // response - access(pkt); + dram->access(pkt); // turn packet around to go back to requester if response expected if (needsResponse) { @@ -876,9 +862,9 @@ // if not, shift to next burst window Tick act_at; if (twoCycleActivate) - act_at = ctrl.verifyMultiCmd(act_tick, tAAD); + act_at = ctrl->verifyMultiCmd(act_tick, tAAD); else - act_at = ctrl.verifySingleCmd(act_tick); + act_at = ctrl->verifySingleCmd(act_tick); DPRINTF(DRAM, "Activate at tick %d\n", act_at); @@ -996,7 +982,7 @@ // Issuing an explicit PRE command // Verify that we have command bandwidth to issue the precharge // if not, shift to next burst window - pre_at = ctrl.verifySingleCmd(pre_tick); + pre_at = ctrl->verifySingleCmd(pre_tick); // enforce tPPD for (int i = 0; i < banksPerRank; i++) { rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD, @@ -1046,7 +1032,7 @@ // first clean up the burstTick set, removing old entries // before adding new entries for next burst - ctrl.pruneBurstTick(); + ctrl->pruneBurstTick(); // get the rank Rank& rank_ref = *ranks[dram_pkt->rank]; @@ -1098,9 +1084,9 @@ // verify that we have command bandwidth to issue the burst // if not, shift to next burst window if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay)) - cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK); + cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK); else - cmd_at = ctrl.verifySingleCmd(cmd_at); + cmd_at = ctrl->verifySingleCmd(cmd_at); // if we are interleaving bursts, ensure that // 1) we don't double interleave on next burst issue @@ -1200,9 +1186,9 @@ // either look at the read queue or write queue const std::vector<DRAMPacketQueue>& queue = - ctrl.selQueue(dram_pkt->isRead()); + ctrl->selQueue(dram_pkt->isRead()); - for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) { + for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) { auto p = queue[i].begin(); // keep on looking until we find a hit or reach the end of the // queue @@ -1273,6 +1259,7 @@ // Update latency stats stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime; stats.totQLat += cmd_at - dram_pkt->entryTime; + stats.totBusLat += tBURST; } else { // Schedule write done event to decrement event count // after the readyTime has been reached @@ -1338,13 +1325,9 @@ // Update latency stats stats.masterReadTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; - - stats.bytesRead += dram->bytesPerBurst(); - stats.totBusLat += dram->burstDly(); stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size; } else { ++writesThisTime; - stats.bytesWritten += dram->bytesPerBurst(); stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size; stats.masterWriteTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; @@ -1446,8 +1429,9 @@ // Figure out which read request goes next // If we are changing command type, incorporate the minimum - // bus turnaround delay which will be tCS (different rank) case - to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0); + // bus turnaround delay which will be rank to rank delay + to_read = chooseNext((*queue), switched_cmd_type ? + dram->rankDelay() : 0); if (to_read != queue->end()) { // candidate read found @@ -1526,7 +1510,8 @@ // If we are changing command type, incorporate the minimum // bus turnaround delay to_write = chooseNext((*queue), - switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0); + switched_cmd_type ? 
std::min(dram->minRdToWr(), + dram->rankDelay()) : 0); if (to_write != queue->end()) { write_found = true; @@ -1599,11 +1584,8 @@ } } -DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl, - const DRAMCtrlParams* _p, - const uint64_t capacity, - const AddrRange range) - : SimObject(_p), ctrl(_ctrl), +DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p) + : AbstractMemory(_p), addrMapping(_p->addr_mapping), burstSize((_p->devices_per_rank * _p->burst_length * _p->device_bus_width) / 8), @@ -1618,7 +1600,7 @@ bankGroupsPerRank(_p->bank_groups_per_rank), bankGroupArch(_p->bank_groups_per_rank > 0), banksPerRank(_p->banks_per_rank), rowsPerBank(0), - tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST), + tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST), tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW), tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD), tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP), @@ -1634,12 +1616,12 @@ wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW), wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L), rdToWrDlySameBG(tRTW + _p->tBURST_MAX), - rankToRankDly(ctrl.rankDelay() + tBURST), + rankToRankDly(tCS + tBURST), pageMgmt(_p->page_policy), maxAccessesPerRow(_p->max_accesses_per_row), timeStampOffset(0), activeRank(0), enableDRAMPowerdown(_p->enable_dram_powerdown), - stats(_ctrl, *this) + stats(*this) { fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, " "must be a power of two\n", burstSize); @@ -1651,7 +1633,7 @@ for (int i = 0; i < ranksPerChannel; i++) { DPRINTF(DRAM, "Creating DRAM rank %d \n", i); - Rank* rank = new Rank(ctrl, _p, i, *this); + Rank* rank = new Rank(_p, i, *this); ranks.push_back(rank); } @@ -1659,6 +1641,11 @@ uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank * ranksPerChannel; + uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); + + DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, + AbstractMemory::size()); + // if actual DRAM size does not match memory capacity in system warn! if (deviceCapacity != capacity / (1024 * 1024)) warn("DRAM device capacity (%d Mbytes) does not match the " @@ -1713,8 +1700,10 @@ } void -DRAMInterface::init(AddrRange range) +DRAMInterface::init() { + AbstractMemory::init(); + // a bit of sanity checks on the interleaving, save it for here to // ensure that the system pointer is initialised if (range.interleaved()) { @@ -1736,7 +1725,7 @@ // channel striping has to be done at a granularity that // is equal or larger to a cache line - if (ctrl.system()->cacheLineSize() > range.granularity()) { + if (system()->cacheLineSize() > range.granularity()) { fatal("Channel interleaving of %s must be at least as large " "as the cache line size\n", name()); } @@ -1755,8 +1744,10 @@ void DRAMInterface::startup() { - // timestamp offset should be in clock cycles for DRAMPower - timeStampOffset = divCeil(curTick(), tCK); + if (system()->isTimingMode()) { + // timestamp offset should be in clock cycles for DRAMPower + timeStampOffset = divCeil(curTick(), tCK); + } for (auto r : ranks) { r->startup(curTick() + tREFI - tRP); @@ -1802,7 +1793,7 @@ } void -DRAMInterface::respondEventDRAM(uint8_t rank) +DRAMInterface::respondEvent(uint8_t rank) { Rank& rank_ref = *ranks[rank]; @@ -1943,7 +1934,7 @@ std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP; // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl.inReadBusState(false) ? 
+ const Tick col_allowed_at = ctrl->inReadBusState(false) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1983,9 +1974,15 @@ return make_pair(bank_mask, hidden_bank_prep); } -DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, - DRAMInterface& _dram) - : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram), +DRAMInterface* +DRAMInterfaceParams::create() +{ + return new DRAMInterface(this); +} + +DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p, + int _rank, DRAMInterface& _dram) + : EventManager(&_dram), dram(_dram), pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE), pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE), refreshState(REF_IDLE), inLowPowerState(false), rank(_rank), @@ -1998,7 +1995,7 @@ refreshEvent([this]{ processRefreshEvent(); }, name()), powerEvent([this]{ processPowerEvent(); }, name()), wakeUpEvent([this]{ processWakeUpEvent(); }, name()), - stats(_ctrl, *this) + stats(_dram, *this) { for (int b = 0; b < _p->banks_per_rank; b++) { banks[b].bank = b; @@ -2049,8 +2046,10 @@ DRAMInterface::Rank::isQueueEmpty() const { // check commmands in Q based on current bus direction - bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0)) - || (!ctrl.inReadBusState(true) && (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && + (readEntries == 0)) + || (!dram.ctrl->inReadBusState(true) && + (writeEntries == 0)); return no_queued_cmds; } @@ -2174,7 +2173,7 @@ // if a request is at the moment being handled and this request is // accessing the current rank then wait for it to finish if ((rank == dram.activeRank) - && (ctrl.nextReqEvent.scheduled())) { + && (dram.ctrl->nextReqEvent.scheduled())) { // hand control over to the request loop until it is // evaluated next DPRINTF(DRAM, "Refresh awaiting draining\n"); @@ -2249,7 +2248,7 @@ // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled // should have outstanding precharge or read response event assert(prechargeEvent.scheduled() || - ctrl.respondEvent.scheduled()); + dram.ctrl->respondEvent.scheduled()); // will start refresh when pwrState transitions to IDLE } @@ -2309,8 +2308,8 @@ assert(!powerEvent.scheduled()); - if ((ctrl.drainState() == DrainState::Draining) || - (ctrl.drainState() == DrainState::Drained)) { + if ((dram.ctrl->drainState() == DrainState::Draining) || + (dram.ctrl->drainState() == DrainState::Drained)) { // if draining, do not re-enter low-power mode. 
// simply go to IDLE and wait schedulePowerEvent(PWR_IDLE, curTick()); @@ -2535,10 +2534,10 @@ } // completed refresh event, ensure next request is scheduled - if (!ctrl.nextReqEvent.scheduled()) { + if (!dram.ctrl->nextReqEvent.scheduled()) { DPRINTF(DRAM, "Scheduling next request after refreshing" " rank %d\n", rank); - schedule(ctrl.nextReqEvent, curTick()); + schedule(dram.ctrl->nextReqEvent, curTick()); } } @@ -2597,8 +2596,8 @@ // bypass auto-refresh and go straight to SREF, where memory // will issue refresh immediately upon entry if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() && - (ctrl.drainState() != DrainState::Draining) && - (ctrl.drainState() != DrainState::Drained) && + (dram.ctrl->drainState() != DrainState::Draining) && + (dram.ctrl->drainState() != DrainState::Drained) && dram.enableDRAMPowerdown) { DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning " "to self refresh at %11u tick\n", rank, curTick()); @@ -2669,7 +2668,7 @@ // power (mW) = ----------- * ---------- // time (tick) tick_frequency stats.averagePower = (stats.totalEnergy.value() / - (curTick() - ctrl.lastStatsResetTick)) * + (curTick() - dram.ctrl->lastStatsResetTick)) * (SimClock::Frequency / 1000000000.0); } @@ -2699,7 +2698,7 @@ bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (!ctrl.inReadBusState(true) && (writeEntries != 0)); + (!dram.ctrl->inReadBusState(true) && (writeEntries != 0)); } DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl) @@ -2710,15 +2709,15 @@ ADD_STAT(writeReqs, "Number of write requests accepted"), ADD_STAT(readBursts, - "Number of DRAM read bursts, " + "Number of controller read bursts, " "including those serviced by the write queue"), ADD_STAT(writeBursts, - "Number of DRAM write bursts, " + "Number of controller write bursts, " "including those merged in the write queue"), ADD_STAT(servicedByWrQ, - "Number of DRAM read bursts serviced by the write queue"), + "Number of controller read bursts serviced by the write queue"), ADD_STAT(mergedWrBursts, - "Number of DRAM write bursts merged with an existing one"), + "Number of controller write bursts merged with an existing one"), ADD_STAT(neitherReadNorWriteReqs, "Number of requests that are neither read nor write"), @@ -2726,9 +2725,6 @@ ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"), ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"), - ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), - ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), - ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"), ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"), @@ -2743,22 +2739,13 @@ ADD_STAT(wrPerTurnAround, "Writes before turning the bus around for reads"), - ADD_STAT(bytesRead, "Total number of bytes read from memory"), ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"), - ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"), ADD_STAT(bytesWrittenSys, "Total written bytes from the system interface side"), - ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"), - ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"), ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"), ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"), - ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), - - ADD_STAT(busUtil, "Data bus utilization in percentage"), - 
ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), - ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), ADD_STAT(totGap, "Total gap between requests"), ADD_STAT(avgGap, "Average gap between requests"), @@ -2790,12 +2777,11 @@ { using namespace Stats; - assert(ctrl._system); - const auto max_masters = ctrl._system->maxMasters(); + assert(ctrl.system()); + const auto max_masters = ctrl.system()->maxMasters(); avgRdQLen.precision(2); avgWrQLen.precision(2); - avgBusLat.precision(2); readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); @@ -2810,14 +2796,9 @@ .init(ctrl.writeBufferSize) .flags(nozero); - avgRdBW.precision(2); - avgWrBW.precision(2); avgRdBWSys.precision(2); avgWrBWSys.precision(2); - peakBW.precision(2); - busUtil.precision(2); avgGap.precision(2); - busUtilWrite.precision(2); // per-master bytes read and written to memory masterReadBytes @@ -2849,9 +2830,6 @@ .flags(nonan) .precision(2); - busUtilRead - .precision(2); - masterWriteRate .flags(nozero | nonan) .precision(12); @@ -2865,7 +2843,7 @@ .precision(2); for (int i = 0; i < max_masters; i++) { - const std::string master = ctrl._system->getMasterName(i); + const std::string master = ctrl.system()->getMasterName(i); masterReadBytes.subname(i, master); masterReadRate.subname(i, master); masterWriteBytes.subname(i, master); @@ -2879,22 +2857,11 @@ } // Formula stats - avgBusLat = totBusLat / (readBursts - servicedByWrQ); - - avgRdBW = (bytesRead / 1000000) / simSeconds; - avgWrBW = (bytesWritten / 1000000) / simSeconds; avgRdBWSys = (bytesReadSys / 1000000) / simSeconds; avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds; - peakBW = (SimClock::Frequency / ctrl.dram->burstDataDly()) * - ctrl.dram->bytesPerBurst() / 1000000; - - busUtil = (avgRdBW + avgWrBW) / peakBW * 100; avgGap = totGap / (readReqs + writeReqs); - busUtilRead = avgRdBW / peakBW * 100; - busUtilWrite = avgWrBW / peakBW * 100; - masterReadRate = masterReadBytes / simSeconds; masterWriteRate = masterWriteBytes / simSeconds; masterReadAvgLat = masterReadTotalLat / masterReadAccesses; @@ -2907,8 +2874,8 @@ ctrl.lastStatsResetTick = curTick(); } -DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram) - : Stats::Group(&_ctrl, csprintf("dram").c_str()), +DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram) + : Stats::Group(&_dram), dram(_dram), ADD_STAT(readBursts, "Number of DRAM read bursts"), @@ -2918,10 +2885,13 @@ ADD_STAT(perBankWrBursts, "Per bank write bursts"), ADD_STAT(totQLat, "Total ticks spent queuing"), + ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), ADD_STAT(totMemAccLat, "Total ticks spent from burst creation until serviced " "by the DRAM"), + ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"), + ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"), ADD_STAT(readRowHits, "Number of row buffer hits during reads"), @@ -2934,6 +2904,12 @@ ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"), ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"), + ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), + + ADD_STAT(busUtil, "Data bus utilization in percentage"), + ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), + ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), + ADD_STAT(pageHitRate, 
"Row buffer hit rate, read and write combined") { @@ -2945,6 +2921,7 @@ using namespace Stats; avgQLat.precision(2); + avgBusLat.precision(2); avgMemAccLat.precision(2); readRowHitRate.precision(2); @@ -2958,10 +2935,16 @@ dram.maxAccessesPerRow : dram.rowBufferSize) .flags(nozero); + peakBW.precision(2); + busUtil.precision(2); + busUtilWrite.precision(2); + busUtilRead.precision(2); + pageHitRate.precision(2); // Formula stats avgQLat = totQLat / readBursts; + avgBusLat = totBusLat / readBursts; avgMemAccLat = totMemAccLat / readBursts; readRowHitRate = (readRowHits / readBursts) * 100; @@ -2969,13 +2952,19 @@ avgRdBW = (bytesRead / 1000000) / simSeconds; avgWrBW = (bytesWritten / 1000000) / simSeconds; + peakBW = (SimClock::Frequency / dram.burstDataDly()) * + dram.bytesPerBurst() / 1000000; + + busUtil = (avgRdBW + avgWrBW) / peakBW * 100; + busUtilRead = avgRdBW / peakBW * 100; + busUtilWrite = avgWrBW / peakBW * 100; pageHitRate = (writeRowHits + readRowHits) / (writeBursts + readBursts) * 100; } -DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank) - : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()), +DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank) + : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()), rank(_rank), ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"), @@ -3034,7 +3023,7 @@ DRAMCtrl::recvFunctional(PacketPtr pkt) { // rely on the abstract memory - functionalAccess(pkt); + dram->functionalAccess(pkt); } Port & @@ -3099,7 +3088,7 @@ DRAMCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(ctrl.getAddrRange()); + ranges.push_back(ctrl.dram->getAddrRange()); return ranges; } diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index 4464f7a..1b6d8b5 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -56,12 +56,15 @@ #include "enums/AddrMap.hh" #include "enums/MemSched.hh" #include "enums/PageManage.hh" +#include "mem/abstract_mem.hh" #include "mem/drampower.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/DRAMCtrl.hh" #include "sim/eventq.hh" +class DRAMInterfaceParams; + /** * A basic class to track the bank state, i.e. what row is * currently open (if any), when is the bank free to accept a new @@ -243,7 +246,7 @@ * The DRAMInterface includes a class for individual ranks * and per rank functions. 
*/ -class DRAMInterface : public SimObject +class DRAMInterface : public AbstractMemory { private: /** @@ -340,7 +343,7 @@ class Rank; struct RankStats : public Stats::Group { - RankStats(DRAMCtrl &ctrl, Rank &rank); + RankStats(DRAMInterface &dram, Rank &rank); void regStats() override; void resetStats() override; @@ -406,13 +409,6 @@ */ class Rank : public EventManager { - protected: - - /** - * A reference to the parent DRAMCtrl instance - */ - DRAMCtrl& ctrl; - private: /** @@ -532,10 +528,10 @@ */ Tick lastBurstTick; - Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, + Rank(const DRAMInterfaceParams* _p, int _rank, DRAMInterface& _dram); - const std::string name() const { return csprintf("dram_%d", rank); } + const std::string name() const { return csprintf("%d", rank); } /** * Kick off accounting for power and refresh states and @@ -662,9 +658,9 @@ }; /** - * A reference to the parent DRAMCtrl instance + * A pointer to the parent DRAMCtrl instance */ - DRAMCtrl& ctrl; + DRAMCtrl* ctrl; /** * Memory controller configuration initialized based on parameter @@ -695,6 +691,7 @@ * DRAM timing requirements */ const Tick M5_CLASS_VAR_USED tCK; + const Tick tCS; const Tick tCL; const Tick tBURST; const Tick tBURST_MIN; @@ -774,7 +771,7 @@ bool trace = true); struct DRAMStats : public Stats::Group { - DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram); + DRAMStats(DRAMInterface &dram); void regStats() override; @@ -790,10 +787,12 @@ // Latencies summed over all requests Stats::Scalar totQLat; + Stats::Scalar totBusLat; Stats::Scalar totMemAccLat; // Average latencies per request Stats::Formula avgQLat; + Stats::Formula avgBusLat; Stats::Formula avgMemAccLat; // Row hit count and rate @@ -809,6 +808,11 @@ // Average bandwidth Stats::Formula avgRdBW; Stats::Formula avgWrBW; + Stats::Formula peakBW; + // bus utilization + Stats::Formula busUtil; + Stats::Formula busUtilRead; + Stats::Formula busUtilWrite; Stats::Formula pageHitRate; }; @@ -820,11 +824,16 @@ std::vector<Rank*> ranks; public: + /** Setting a pointer to the controller */ + void setCtrl(DRAMCtrl* _ctrl) + { + ctrl = _ctrl; + }; + /** * Initialize the DRAM interface and verify parameters - * @param range is the address range for this interface */ - void init(AddrRange range); + void init(); /** * Iterate through dram ranks and instantiate per rank startup routine @@ -853,6 +862,20 @@ void suspend(); /** + * Get an address in a dense range which starts from 0. The input + * address is the physical address of the request in an address + * space that contains other SimObjects apart from this + * controller. 
+ * + * @param addr The intput address which should be in the addrRange + * @return An address in the continues range [0, max) + */ + Addr getCtrlAddr(Addr addr) + { + return range.getOffset(addr); + } + + /** * @return number of bytes in a burst for this interface */ uint32_t bytesPerBurst () { return burstSize; }; @@ -887,6 +910,13 @@ */ Tick minRdToWr () { return tRTW; }; + /** + * Determine the required delay for an access to a different rank + * + * @return required rank to rank delay + */ + Tick rankDelay() { return tCS; }; + /* * Function to calulate RAS cycle time for use within and * outside of this class @@ -968,7 +998,7 @@ * * @param rank Specifies rank associated with read burst */ - void respondEventDRAM(uint8_t rank); + void respondEvent(uint8_t rank); /** * Check the refresh state to determine if refresh needs @@ -1004,8 +1034,7 @@ virtual void process() { rank->resetStats(); }; }; - DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, - uint64_t capacity, AddrRange range); + DRAMInterface(const DRAMInterfaceParams* _p); }; /** @@ -1170,20 +1199,6 @@ void accessAndRespond(PacketPtr pkt, Tick static_latency); /** - * Get an address in a dense range which starts from 0. The input - * address is the physical address of the request in an address - * space that contains other SimObjects apart from this - * controller. - * - * @param addr The intput address which should be in the addrRange - * @return An address in the continues range [0, max) - */ - Addr getCtrlAddr(Addr addr) - { - return range.getOffset(addr); - } - - /** * The memory schduler/arbiter - picks which request needs to * go next, based on the specified policy such as FCFS or FR-FCFS * and moves it to the head of the queue. @@ -1265,6 +1280,11 @@ std::unordered_multiset<Tick> burstTicks; /** + * Create pointer to interface of the actual dram media + */ + DRAMInterface* const dram; + + /** * The following are basic design parameters of the memory * controller, and are initialized based on parameter values. * The rowsPerBank is determined based on the capacity, number of @@ -1279,12 +1299,6 @@ uint32_t readsThisTime; /** - * Basic memory timing parameters initialized based on parameter - * values. These will be used across memory interfaces. - */ - const Tick tCS; - - /** * Memory controller configuration initialized based on parameter * values. 
*/ @@ -1338,10 +1352,6 @@ // Average queue lengths Stats::Average avgRdQLen; Stats::Average avgWrQLen; - // Latencies summed over all requests - Stats::Scalar totBusLat; - // Average latencies per request - Stats::Formula avgBusLat; Stats::Scalar numRdRetry; Stats::Scalar numWrRetry; @@ -1352,21 +1362,12 @@ Stats::Histogram rdPerTurnAround; Stats::Histogram wrPerTurnAround; - Stats::Scalar bytesRead; Stats::Scalar bytesReadWrQ; - Stats::Scalar bytesWritten; Stats::Scalar bytesReadSys; Stats::Scalar bytesWrittenSys; // Average bandwidth - Stats::Formula avgRdBW; - Stats::Formula avgWrBW; Stats::Formula avgRdBWSys; Stats::Formula avgWrBWSys; - Stats::Formula peakBW; - // bus utilization - Stats::Formula busUtil; - Stats::Formula busUtilRead; - Stats::Formula busUtilWrite; Stats::Scalar totGap; Stats::Formula avgGap; @@ -1405,11 +1406,6 @@ /** The time when stats were last reset used to calculate average power */ Tick lastStatsResetTick; - /** - * Create pointer to interfasce to the actual media - */ - DRAMInterface* dram; - DRAMCtrl(const DRAMCtrlParams* p); DrainState drain() override; @@ -1458,13 +1454,6 @@ }; /** - * Determine the required delay for an access to a different rank - * - * @return required rank to rank delay - */ - Tick rankDelay() { return tCS; }; - - /** * Check the current direction of the memory channel * * @param next_state Check either the current or next bus state diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc index f506928..7a44aa1 100644 --- a/src/mem/drampower.cc +++ b/src/mem/drampower.cc @@ -40,13 +40,13 @@ #include "base/intmath.hh" #include "sim/core.hh" -DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) : +DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) : powerlib(libDRAMPower(getMemSpec(p), include_io)) { } Data::MemArchitectureSpec -DRAMPower::getArchParams(const DRAMCtrlParams* p) +DRAMPower::getArchParams(const DRAMInterfaceParams* p) { Data::MemArchitectureSpec archSpec; archSpec.burstLength = p->burst_length; @@ -68,7 +68,7 @@ } Data::MemTimingSpec -DRAMPower::getTimingParams(const DRAMCtrlParams* p) +DRAMPower::getTimingParams(const DRAMInterfaceParams* p) { // Set the values that are used for power calculations and ignore // the ones only used by the controller functionality in DRAMPower @@ -100,7 +100,7 @@ } Data::MemPowerSpec -DRAMPower::getPowerParams(const DRAMCtrlParams* p) +DRAMPower::getPowerParams(const DRAMInterfaceParams* p) { // All DRAMPower currents are in mA Data::MemPowerSpec powerSpec; @@ -132,7 +132,7 @@ } Data::MemorySpecification -DRAMPower::getMemSpec(const DRAMCtrlParams* p) +DRAMPower::getMemSpec(const DRAMInterfaceParams* p) { Data::MemorySpecification memSpec; memSpec.memArchSpec = getArchParams(p); @@ -142,13 +142,13 @@ } bool -DRAMPower::hasTwoVDD(const DRAMCtrlParams* p) +DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p) { return p->VDD2 == 0 ? 
false : true; } uint8_t -DRAMPower::getDataRate(const DRAMCtrlParams* p) +DRAMPower::getDataRate(const DRAMInterfaceParams* p) { uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK); uint8_t data_rate = p->burst_length / burst_cycles; diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh index ed47476..da68a78 100644 --- a/src/mem/drampower.hh +++ b/src/mem/drampower.hh @@ -44,7 +44,7 @@ #define __MEM_DRAM_POWER_HH__ #include "libdrampower/LibDRAMPower.h" -#include "params/DRAMCtrl.hh" +#include "params/DRAMInterface.hh" /** * DRAMPower is a standalone tool which calculates the power consumed by a @@ -57,43 +57,44 @@ /** * Transform the architechture parameters defined in - * DRAMCtrlParams to the memSpec of DRAMPower + * DRAMInterfaceParams to the memSpec of DRAMPower */ - static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p); + static Data::MemArchitectureSpec getArchParams( + const DRAMInterfaceParams* p); /** - * Transforms the timing parameters defined in DRAMCtrlParams to + * Transforms the timing parameters defined in DRAMInterfaceParams to * the memSpec of DRAMPower */ - static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p); + static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p); /** * Transforms the power and current parameters defined in - * DRAMCtrlParam to the memSpec of DRAMPower + * DRAMInterfaceParams to the memSpec of DRAMPower */ - static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p); + static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p); /** * Determine data rate, either one or two. */ - static uint8_t getDataRate(const DRAMCtrlParams* p); + static uint8_t getDataRate(const DRAMInterfaceParams* p); /** * Determine if DRAM has two voltage domains (or one) */ - static bool hasTwoVDD(const DRAMCtrlParams* p); + static bool hasTwoVDD(const DRAMInterfaceParams* p); /** - * Return an instance of MemSpec based on the DRAMCtrlParams + * Return an instance of MemSpec based on the DRAMInterfaceParams */ - static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p); + static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p); public: // Instance of DRAMPower Library libDRAMPower powerlib; - DRAMPower(const DRAMCtrlParams* p, bool include_io); + DRAMPower(const DRAMInterfaceParams* p, bool include_io); }; diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py index 1cd3f0b..f55105b 100644 --- a/src/mem/qos/QoSMemCtrl.py +++ b/src/mem/qos/QoSMemCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -34,18 +34,21 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from m5.params import * -from m5.objects.AbstractMemory import AbstractMemory +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject from m5.objects.QoSTurnaround import * # QoS Queue Selection policy used to select packets among same-QoS queues class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"] -class QoSMemCtrl(AbstractMemory): +class QoSMemCtrl(ClockedObject): type = 'QoSMemCtrl' cxx_header = "mem/qos/mem_ctrl.hh" cxx_class = 'QoS::MemCtrl' abstract = True + system = Param.System(Parent.any, "System that the controller belongs to.") + ##### QoS support parameters #### # Number of priorities in the system diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py index 572cad5..03a988a 100644 --- a/src/mem/qos/QoSMemSinkCtrl.py +++ b/src/mem/qos/QoSMemSinkCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ from m5.params import * from m5.objects.QoSMemCtrl import * +from QoSMemSinkInterface import * class QoSMemSinkCtrl(QoSMemCtrl): type = 'QoSMemSinkCtrl' @@ -44,6 +45,10 @@ cxx_class = "QoS::MemSinkCtrl" port = SlavePort("Slave ports") + + intf = Param.QoSMemSinkInterface(QoSMemSinkInterface(), "Interface to "\ + "memory") + # the basic configuration of the controller architecture, note # that each entry corresponds to a burst for the specific DRAM # configuration (e.g. x32 with burst length 8 is 32 bytes) and not @@ -59,5 +64,3 @@ # response latency - time to issue a response once a request is serviced response_latency = Param.Latency("20ns", "Memory response latency") - - diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py new file mode 100644 index 0000000..fd8254f --- /dev/null +++ b/src/mem/qos/QoSMemSinkInterface.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Matteo Andreozzi +# Wendy Elsasser + +from AbstractMemory import AbstractMemory + +class QoSMemSinkInterface(AbstractMemory): + type = 'QoSMemSinkInterface' + cxx_header = "mem/qos/mem_sink.hh" diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript index f8601b6..1d90f9c 100644 --- a/src/mem/qos/SConscript +++ b/src/mem/qos/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ SimObject('QoSMemCtrl.py') SimObject('QoSMemSinkCtrl.py') +SimObject('QoSMemSinkInterface.py') SimObject('QoSPolicy.py') SimObject('QoSTurnaround.py') diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc index 50e6035..190960b 100644 --- a/src/mem/qos/mem_ctrl.cc +++ b/src/mem/qos/mem_ctrl.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited + * Copyright (c) 2017-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -42,7 +42,7 @@ namespace QoS { MemCtrl::MemCtrl(const QoSMemCtrlParams * p) - : AbstractMemory(p), + : ClockedObject(p), policy(p->qos_policy), turnPolicy(p->qos_turnaround_policy), queuePolicy(QueuePolicy::create(p)), @@ -51,7 +51,8 @@ qosSyncroScheduler(p->qos_syncro_scheduler), totalReadQueueSize(0), totalWriteQueueSize(0), busState(READ), busStateNext(READ), - stats(*this) + stats(*this), + _system(p->system) { // Set the priority policy if (policy) { @@ -77,12 +78,6 @@ {} void -MemCtrl::init() -{ - AbstractMemory::init(); -} - -void MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos, Addr addr, uint64_t entries) { diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh index 0e29fcc..50ddc94 100644 --- a/src/mem/qos/mem_ctrl.hh +++ b/src/mem/qos/mem_ctrl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited + * Copyright (c) 2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -36,9 +36,9 @@ */ #include "debug/QOS.hh" -#include "mem/abstract_mem.hh" -#include "mem/qos/q_policy.hh" +#include "mem/mem_object.hh" #include "mem/qos/policy.hh" +#include "mem/qos/q_policy.hh" #include "params/QoSMemCtrl.hh" #include "sim/system.hh" @@ -49,6 +49,8 @@ #ifndef __MEM_QOS_MEM_CTRL_HH__ #define __MEM_QOS_MEM_CTRL_HH__ +class System; + namespace QoS { /** @@ -56,7 +58,7 @@ * which support QoS - it provides access to a set of QoS * scheduling policies */ -class MemCtrl: public AbstractMemory +class MemCtrl: public ClockedObject { public: /** Bus Direction */ @@ -151,6 +153,9 @@ Stats::Scalar numStayWriteState; } stats; + /** Pointer to the System object */ + System* _system; + /** * Initializes dynamically counters and * statistics for a given Master @@ -266,11 +271,6 @@ virtual ~MemCtrl(); /** - * Initializes this object - */ - void init() override; - - /** * Gets the current bus state * * @return current bus state @@ -346,6 +346,10 @@ * @return 
total number of priority levels */ uint8_t numPriorities() const { return _numPriorities; } + + /** read the system pointer + * @return pointer to the system object */ + System* system() const { return _system; } }; template<typename Queues> diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc index 1f104e4..fb06b9d 100644 --- a/src/mem/qos/mem_sink.cc +++ b/src/mem/qos/mem_sink.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited + * Copyright (c) 2018-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -40,6 +40,7 @@ #include "debug/Drain.hh" #include "debug/QOS.hh" #include "mem_sink.hh" +#include "params/QoSMemSinkInterface.hh" #include "sim/system.hh" namespace QoS { @@ -50,12 +51,15 @@ memoryPacketSize(p->memory_packet_size), readBufferSize(p->read_buffer_size), writeBufferSize(p->write_buffer_size), port(name() + ".port", *this), + intf(p->intf), retryRdReq(false), retryWrReq(false), nextRequest(0), nextReqEvent(this) { // Resize read and write queue to allocate space // for configured QoS priorities readQueue.resize(numPriorities()); writeQueue.resize(numPriorities()); + + intf->setMemCtrl(this); } MemSinkCtrl::~MemSinkCtrl() @@ -92,7 +96,7 @@ "%s Should not see packets where cache is responding\n", __func__); - access(pkt); + intf->access(pkt); return responseLatency; } @@ -101,7 +105,7 @@ { pkt->pushLabel(name()); - functionalAccess(pkt); + intf->functionalAccess(pkt); pkt->popLabel(); } @@ -279,7 +283,7 @@ // Do the actual memory access which also turns the packet // into a response - access(pkt); + intf->access(pkt); // Log the response logResponse(pkt->isRead()? READ : WRITE, @@ -351,7 +355,7 @@ MemSinkCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(memory.getAddrRange()); + ranges.push_back(memory.intf->getAddrRange()); return ranges; } @@ -390,3 +394,19 @@ return new QoS::MemSinkCtrl(this); } +QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p) + : AbstractMemory(_p) +{ +} + +void +QoSMemSinkInterface::init() +{ + AbstractMemory::init(); +} + +QoSMemSinkInterface* +QoSMemSinkInterfaceParams::create() +{ + return new QoSMemSinkInterface(this); +} diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh index 9a51269..3b10abd 100644 --- a/src/mem/qos/mem_sink.hh +++ b/src/mem/qos/mem_sink.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited + * Copyright (c) 2018-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -41,10 +41,14 @@ #ifndef __MEM_QOS_MEM_SINK_HH__ #define __MEM_QOS_MEM_SINK_HH__ +#include "mem/abstract_mem.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/QoSMemSinkCtrl.hh" +class QoSMemSinkInterfaceParams; +class QoSMemSinkInterface; + namespace QoS { /** @@ -163,6 +167,11 @@ /** Memory slave port */ MemoryPort port; + /** + * Create pointer to interface of actual media + */ + QoSMemSinkInterface* const intf; + /** Read request pending */ bool retryRdReq; @@ -244,4 +253,23 @@ } // namespace QoS +class QoSMemSinkInterface : public AbstractMemory +{ + public: + /** Initialize the memory interface */ + void init(); + + /** Setting a pointer to the interface */ + void setMemCtrl(QoS::MemSinkCtrl* _ctrl) + { + ctrl = _ctrl; + }; + + /** Pointer to the controller */ + QoS::MemSinkCtrl* ctrl; + + QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p); +}; + + #endif /* __MEM_QOS_MEM_SINK_HH__ */ diff --git 
a/tests/configs/base_config.py b/tests/configs/base_config.py index 0f79938..e2d3851 100644 --- a/tests/configs/base_config.py +++ b/tests/configs/base_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012-2013, 2017-2018 ARM Limited +# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -221,7 +221,12 @@ super(BaseSESystem, self).init_system(system) def create_system(self): - system = System(physmem = self.mem_class(), + if issubclass(self.mem_class, m5.objects.DRAMInterface): + mem_ctrl = DRAMCtrl() + mem_ctrl.dram = self.mem_class() + else: + mem_ctrl = self.mem_class() + system = System(physmem = mem_ctrl, membus = SystemXBar(), mem_mode = self.mem_mode, multi_thread = (self.num_threads > 1)) @@ -275,6 +280,16 @@ # the physmem name to avoid bumping all the reference stats system.physmem = [self.mem_class(range = r) for r in system.mem_ranges] + if issubclass(self.mem_class, m5.objects.DRAMInterface): + mem_ctrls = [] + for r in system.mem_ranges: + mem_ctrl = DRAMCtrl() + mem_ctrl.dram = self.mem_class(range = r) + mem_ctrls.append(mem_ctrl) + system.physmem = mem_ctrls + else: + system.physmem = [self.mem_class(range = r) + for r in system.mem_ranges] for i in range(len(system.physmem)): system.physmem[i].port = system.membus.master -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8 Gerrit-Change-Number: 28968 Gerrit-PatchSet: 1 Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com> Gerrit-MessageType: newchange
JL
Jason Lowe-Power (Gerrit)
Tue, Sep 8, 2020 4:38 PM

Jason Lowe-Power has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make MemCtrl a ClockedObject
......................................................................

mem: Make MemCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter to the
interface, which is setup when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28968
Reviewed-by: Jason Lowe-Power <power.jg(a)gmail.com>
Maintainer: Jason Lowe-Power <power.jg(a)gmail.com>
Tested-by: kokoro <noreply+kokoro(a)google.com>

M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/example/memcheck.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M configs/ruby/Ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/gem5/configs/base_config.py
26 files changed, 1,913 insertions(+), 1,736 deletions(-)

Approvals:
Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
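
The file-by-file diff follows. As orientation for readers: after this change a
config instantiates two objects where it previously instantiated one. A minimal
sketch of the new hookup, assuming the stock DDR3_1600_8x8 interface and a
single memory range (clock/voltage domains, a CPU and workload setup are
omitted here):

    import m5
    from m5.objects import *

    system = System()
    system.mem_ranges = [AddrRange('512MB')]
    system.membus = SystemXBar()

    # The controller is now a ClockedObject that owns the queues and
    # request scheduling...
    system.mem_ctrl = DRAMCtrl()
    # ...while the interface is an AbstractMemory that owns the address
    # range and the per-device timing/architecture parameters.
    system.mem_ctrl.dram = DDR3_1600_8x8()
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master

This is the pattern the learning_gem5 scripts below are updated to.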

diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py
index b530145..1ace875 100644
--- a/configs/common/MemConfig.py
+++ b/configs/common/MemConfig.py
@@ -40,7 +40,7 @@
from common import ObjectList
from common import HMC

-def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size,
+def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size,
xor_low_bit):
"""
Helper function for creating a single memory controller from the given
@@ -63,32 +63,32 @@

     # Create an instance so we can figure out the address
     # mapping and row-buffer size
-    ctrl = cls()
+    interface = intf()

     # Only do this for DRAMs
-    if issubclass(cls, m5.objects.DRAMCtrl):
+    if issubclass(intf, m5.objects.DRAMInterface):
         # If the channel bits are appearing after the column
         # bits, we need to add the appropriate number of bits
         # for the row buffer size
-        if ctrl.addr_mapping.value == 'RoRaBaChCo':
+        if interface.addr_mapping.value == 'RoRaBaChCo':
             # This computation only really needs to happen
             # once, but as we rely on having an instance we
             # end up having to repeat it for each and every
             # one
-            rowbuffer_size = ctrl.device_rowbuffer_size.value * \
-                ctrl.devices_per_rank.value
+            rowbuffer_size = interface.device_rowbuffer_size.value * \
+                interface.devices_per_rank.value

             intlv_low_bit = int(math.log(rowbuffer_size, 2))

     # We got all we need to configure the appropriate address
     # range
-    ctrl.range = m5.objects.AddrRange(r.start, size = r.size(),
+    interface.range = m5.objects.AddrRange(r.start, size = r.size(),
                                       intlvHighBit =
                                           intlv_low_bit + intlv_bits - 1,
                                       xorHighBit = xor_high_bit,
                                       intlvBits = intlv_bits,
                                       intlvMatch = i)
-    return ctrl
+    return interface

 def config_mem(options, system):
     """
@@ -148,10 +148,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")

-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []
-    if opt_elastic_trace_en and not issubclass(cls,
-            m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf,
+            m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
               "simple-mem.")

@@ -162,36 +162,53 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)

     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
    for r in system.mem_ranges:
        for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
-                                       intlv_size, opt_xor_low_bit)
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
+                                        intlv_size, opt_xor_low_bit)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks

             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown

             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")

+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)

+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
+        if opt_mem_type == "HMC_2500_1x32":
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
+            # Set memory device size. There is an independent controller for
+            # each vault. All vaults are same size.
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
+        else:
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
     subsystem.mem_ctrls = mem_ctrls

-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
-        if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
-            # Set memory device size. There is an independent controller for
-            # each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
-        else:
-            subsystem.mem_ctrls[i].port = xbar.master
    

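The interleaving logic in create_mem_intf is unchanged; it is simply applied to
the interface rather than the controller, since the interface now owns the
address range. To make the per-channel AddrRange construction concrete, a
sketch with hypothetical numbers (two channels interleaved at a 64-byte
granularity; the XOR-hashing bits are left at their defaults here):

    import m5.objects

    r = m5.objects.AddrRange('1GB')   # hypothetical full memory range
    intlv_bits = 1                    # log2(number of channels) = 1
    intlv_low_bit = 6                 # 64-byte interleaving granularity

    # Channel i owns every other 64-byte block, selected by address bit 6
    channel_ranges = [
        m5.objects.AddrRange(r.start, size = r.size(),
                             intlvHighBit = intlv_low_bit + intlv_bits - 1,
                             intlvBits = intlv_bits,
                             intlvMatch = i)
        for i in range(2)
    ]

Each such range would be assigned to one DRAM interface, and the matching
controller advertises it through its port.
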
diff --git a/configs/dram/low_power_sweep.py b/configs/dram/low_power_sweep.py
index 9a62393..0da2b93 100644
--- a/configs/dram/low_power_sweep.py
+++ b/configs/dram/low_power_sweep.py
@@ -111,14 +111,19 @@

 # Sanity check for memory controller class.
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # There is no point slowing things down by saving any data.
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
+
+# enable DRAM low power states
+system.mem_ctrls[0].dram.enable_dram_powerdown = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = args.addr_map
-system.mem_ctrls[0].page_policy = args.page_policy
+system.mem_ctrls[0].dram.addr_mapping = args.addr_map
+system.mem_ctrls[0].dram.page_policy = args.page_policy

 # We create a traffic generator state for each param combination we want to
 # test. Each traffic generator state is specified in the config file and the
@@ -132,22 +137,22 @@
 cfg_file = open(cfg_file_path, 'w')

 # Get the number of banks
-nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value)
+nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value)

 # determine the burst size in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes (the rowbuffer size is already in bytes)
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # Inter-request delay should be such that we can hit as many transitions
 # to/from low power states as possible to. We provide a min and max itt to the
 # traffic generator and it randomises in the range. The parameter is in
 # seconds and we need it in ticks (ps).
-itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000
+itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000

 #The itt value when set to (tRAS + tRP + tCK) covers the case where
 # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the
@@ -155,9 +160,9 @@
 # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK.
 # As we use this delay as a unit and create multiples of it as bigger delays
 # for the sweep, this parameter works for reads, writes and mix of them.
-pd_entry_time = (system.mem_ctrls[0].tRAS.value +
-                 system.mem_ctrls[0].tRP.value +
-                 system.mem_ctrls[0].tCK.value) * 1000000000000
+pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value +
+                 system.mem_ctrls[0].dram.tRP.value +
+                 system.mem_ctrls[0].dram.tCK.value) * 1000000000000

 # We sweep itt max using the multipliers specified by the user.
 itt_max_str = args.itt_list.strip().split()
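
For a sense of the derived quantities above, take the DDR3_1600_8x8
organisation: 8 devices per rank, x8 devices, burst length 8 and a 1kB row
buffer per device. A standalone sketch of the same arithmetic:

    devices_per_rank = 8          # DDR3_1600_8x8 organisation
    device_bus_width = 8          # bits per device
    burst_length = 8              # beats per burst
    device_rowbuffer_size = 1024  # bytes per device

    # 8 devices x 8 bits x 8 beats / 8 bits-per-byte = 64 bytes per burst
    burst_size = (devices_per_rank * device_bus_width * burst_length) // 8

    # 8 devices x 1kB row buffer = 8kB page
    page_size = devices_per_rank * device_rowbuffer_size

So every burst moves 64 bytes, and a page (row) covers 8kB of the address
space, which is what the sweep uses to stride through banks and rows.
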
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index a340b46..a771c5c 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@

 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map

 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated

 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value

 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt =  getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-               system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt =  getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+               system.mem_ctrls[0].dram.tBURST.value) * 1000000000000

 # assume we start at 0
 max_addr = mem_range.end
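
A note on the getattr above: tBURST_MIN defaults to Self.tBURST (a proxy, as
the DRAMCtrl.py hunk further down shows), so an interface that never overrides
it may not expose a plain .value, and the script then falls back to tBURST.
The multiplication converts seconds to ticks, since gem5 ticks are
picoseconds. The idiom in isolation, with a hypothetical latency object:

    class FakeLatency(object):    # stand-in for a resolved latency param
        value = 5e-9              # hypothetical tBURST of 5 ns

    # use .value when present, otherwise the supplied fallback
    itt = getattr(FakeLatency, 'value', 5e-9) * 1000000000000
    print(itt)                    # 5000.0 ticks (ps)
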
    diff --git a/configs/example/memcheck.py b/configs/example/memcheck.py
    index 6d80d60..6bccd54 100644
    --- a/configs/example/memcheck.py
    +++ b/configs/example/memcheck.py
    @@ -217,7 +217,7 @@
    proto_tester = TrafficGen(config_file = cfg_file_path)

# Set up the system along with a DRAM controller

-system = System(physmem = DDR3_1600_8x8())
+system = System(physmem = DRAMCtrl(dram = DDR3_1600_8x8()))

system.voltage_domain = VoltageDomain(voltage = '1V')

diff --git a/configs/learning_gem5/part1/simple.py
b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part1/two_level.py
b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
system.system_port = system.membus.slave

# Create a DDR3 memory controller

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Create a process for a simple "Hello World" application

diff --git a/configs/learning_gem5/part2/simple_cache.py
b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part2/simple_memobj.py
b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part3/simple_ruby.py
b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
system.cpu = [TimingSimpleCPU() for i in range(2)]

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]

# create the interrupt controller for the CPU and connect to the membus

for cpu in system.cpu:
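
The learning_gem5 updates above all apply the same three-line change. Scripts
that instantiate one channel per memory range can wrap the pattern in a loop,
much as the updated base_config.py in this change does; a sketch using the
same object names as the configs above:

    system.mem_ctrls = []
    for r in system.mem_ranges:
        ctrl = DRAMCtrl()
        ctrl.dram = DDR3_1600_8x8(range = r)
        ctrl.port = system.membus.master
        system.mem_ctrls.append(ctrl)
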
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 9bceaa3..9f400a8 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -130,15 +130,16 @@
         dir_ranges = []
         for r in system.mem_ranges:
             mem_type = ObjectList.mem_list.get(options.mem_type)
-            mem_ctrl = MemConfig.create_mem_ctrl(mem_type, r, index,
+            dram_intf = MemConfig.create_mem_intf(mem_type, r, index,
                 options.num_dirs, int(math.log(options.num_dirs, 2)),
                 intlv_size, options.xor_low_bit)
+            mem_ctrl = m5.objects.DRAMCtrl(dram = dram_intf)

             if options.access_backing_store:
                 mem_ctrl.kvm_map=False

             mem_ctrls.append(mem_ctrl)

-            dir_ranges.append(mem_ctrl.range)
+            dir_ranges.append(mem_ctrl.dram.range)

             if crossbar != None:
                 mem_ctrl.port = crossbar.master
    

diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..b7b43dc 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@

from m5.params import *
from m5.proxy import *
-from m5.objects.AbstractMemory import *
from m5.objects.QoSMemCtrl import *

 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']

-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']

-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']

 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM

@@ -72,12 +58,11 @@
# bus in front of the controller for multiple ports
port = SlavePort("Slave port")

-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
-    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
-    # the cacheline size or request/packet size
-    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
-    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface("DRAM interface")
+
+    # read and write buffer depths are set in the interface
+    # the controller will read these values when instantiated

     # threshold in percent for when to forcefully trigger writes and
     # start emptying the write buffer

@@ -93,15 +78,6 @@

     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")
-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")

     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +85,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")

-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst length (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-
-    # DRAMPower provides, in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
-
-    # timing behaviour and constraints - all in nanoseconds
-
-    # the base clock period of the DRAM
-    tCK = Param.Latency("Clock period")
-    # the amount of time in nanoseconds from issuing an activate command
-    # to the data being available in the row buffer for a read/write
-    tRCD = Param.Latency("RAS to CAS delay")
-    # the time from issuing a read/write command to seeing the actual data
-    tCL = Param.Latency("CAS latency")
-    # minimum time between a precharge and subsequent activate
-    tRP = Param.Latency("Row precharge time")
-    # minimum time between an activate and a precharge to the same row
-    tRAS = Param.Latency("ACT to PRE delay")
-    # minimum time between a write data transfer and a precharge
-    tWR = Param.Latency("Write recovery time")
-    # minimum time between a read and precharge command
-    tRTP = Param.Latency("Read to precharge")
-
-    # time to complete a burst transfer, typically the burst length
-    # divided by two due to the DDR bus, but by making it a parameter
-    # it is easier to also evaluate SDR memories like WideIO.
-    # This parameter has to account for burst length.
-    # Read/Write requests with data size larger than one full burst are broken
-    # down into multiple requests in the controller
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = Param.Latency("Burst duration "
-                           "(typically burst length / 2 cycles)")
-
-    # tBURST_MAX is the column array cycle delay required before next access,
-    # which could be greater than tBURST when the memory access time is
-    # greater than tBURST
-    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
-
-    # tBURST_MIN is the minimum delay between bursts, which could be less
-    # than tBURST when interleaving is supported
-    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
-
-    # CAS-to-CAS delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
-
-    # Write-to-Write delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # This will be used to enable different same bank group delays
-    # for writes versus reads
-    tCCD_L_WR = Param.Latency(Self.tCCD_L,
-                              "Same bank group Write to Write delay")
-
-    # time taken to complete one refresh cycle (N rows in all banks)
-    tRFC = Param.Latency("Refresh cycle time")
-
-    # refresh command interval, how often a "ref" command needs
-    # to be sent. It is 7.8 us for a 64ms refresh requirement
-    tREFI = Param.Latency("Refresh command interval")
-
-    # write-to-read, same rank turnaround penalty
-    tWTR = Param.Latency("Write to read, same rank switching time")
-
-    # write-to-read, same rank turnaround penalty for same bank group
-    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
-                           "time, same bank group")
-
-    # read-to-write, same rank turnaround penalty
-    tRTW = Param.Latency("Read to write, same rank switching time")
-
-    # rank-to-rank bus delay penalty
-    # this does not correlate to a memory timing parameter and encompasses:
-    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
-    # different rank bus delay
-    tCS = Param.Latency("Rank to rank switching time")
-
-    # minimum precharge to precharge delay time
-    tPPD = Param.Latency("0ns", "PRE to PRE delay")
-
-    # maximum delay between two-cycle ACT command phases
-    tAAD = Param.Latency(Self.tCK,
-                         "Maximum delay between two-cycle ACT commands")
-
-    two_cycle_activate = Param.Bool(False,
-                                    "Two cycles required to send activate")
-
-    # minimum row activate to row activate delay time
-    tRRD = Param.Latency("ACT to ACT delay")
-    # only utilized with bank group architectures; set to 0 for default case
-    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
-
-    # time window in which a maximum number of activates are allowed
-    # to take place, set to 0 to disable
-    tXAW = Param.Latency("X activation window")
-    activation_limit = Param.Unsigned("Max number of activates in window")
-
-    # time to exit power-down mode
-    # Exit power-down to next valid command delay
-    tXP = Param.Latency("0ns", "Power-up Delay")
-    # Exit Powerdown to commands requiring a locked DLL
-    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
-    # time to exit self-refresh mode
-    tXS = Param.Latency("0ns", "Self-refresh exit latency")
-    # time to exit self-refresh mode with locked DLL
-    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
-
-    # number of data beats per clock. with DDR, default is 2, one per edge
-    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
-
-    data_clock_sync = Param.Bool(False, "Synchronization commands required")
-
-    # Currently rolled into other params
-    ######################################################################
-    # tRC  - assumed to be tRAS + tRP
-
-    # Power Behaviour and Constraints
-    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
-    # defined as VDD and VDD2. Each current is defined for each voltage domain
-    # separately. For example, current IDD0 is active-precharge current for
-    # voltage domain VDD and current IDD02 is active-precharge current for
-    # voltage domain VDD2.
-    # By default all currents are set to 0mA. Users who are only interested in
-    # the performance of DRAMs can leave them at 0.
-
-    # Operating 1 Bank Active-Precharge current
-    IDD0 = Param.Current("0mA", "Active precharge current")
-    # Operating 1 Bank Active-Precharge current multiple voltage Range
-    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
-    # Precharge Power-down Current: Slow exit
-    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
-    # Precharge Power-down Current: Slow exit multiple voltage Range
-    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
-    # Precharge Power-down Current: Fast exit
-    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
-    # Precharge Power-down Current: Fast exit multiple voltage Range
-    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
-    # Precharge Standby current
-    IDD2N = Param.Current("0mA", "Precharge Standby current")
-    # Precharge Standby current multiple voltage range
-    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
-    # Active Power-down current: slow exit
-    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
-    # Active Power-down current: slow exit multiple voltage range
-    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
-    # Active Power-down current: fast exit
-    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
-    # Active Power-down current: fast exit multiple voltage range
-    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
-    # Active Standby current
-    IDD3N = Param.Current("0mA", "Active Standby current")
-    # Active Standby current multiple voltage range
-    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
-    # Burst Read Operating Current
-    IDD4R = Param.Current("0mA", "READ current")
-    # Burst Read Operating Current multiple voltage range
-    IDD4R2 = Param.Current("0mA", "READ current VDD2")
-    # Burst Write Operating Current
-    IDD4W = Param.Current("0mA", "WRITE current")
-    # Burst Write Operating Current multiple voltage range
-    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
-    # Refresh Current
-    IDD5 = Param.Current("0mA", "Refresh current")
-    # Refresh Current multiple voltage range
-    IDD52 = Param.Current("0mA", "Refresh current VDD2")
-    # Self-Refresh Current
-    IDD6 = Param.Current("0mA", "Self-refresh Current")
-    # Self-Refresh Current multiple voltage range
-    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
-    # Main voltage range of the DRAM
-    VDD = Param.Voltage("0V", "Main Voltage Range")
-    # Second voltage range defined by some DRAMs
-    VDD2 = Param.Voltage("0V", "2nd Voltage Range")

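One way to read the timing parameters above: tBURST follows directly from
burst_length, the number of beats per clock, and tCK, exactly as the
per-class comments below spell out. A quick sanity check against the
DDR3-1600 values (plain Python, not part of this change):

    # Illustrative only: tBURST = (burst_length / beats_per_clock) * tCK
    tCK_ns = 1.25        # DDR3-1600: 800 MHz clock
    burst_length = 8     # BL8 device
    beats_per_clock = 2  # DDR: one beat per clock edge
    print(burst_length / beats_per_clock * tCK_ns)
    # 5.0 -> matches tBURST = '5ns' in DDR3_1600_8x8 below
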
-# A single DDR3-1600 x64 channel (one command and address bus), with
-# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
-# an 8x8 configuration.
-class DDR3_1600_8x8(DRAMCtrl):

-    # size of device in bytes
-    device_size = '512MB'
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # DDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # Use two ranks
-    ranks_per_channel = 2
-    # DDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
-    tBURST = '5ns'
-    # DDR3-1600 11-11-11
-    tRCD = '13.75ns'
-    tCL = '13.75ns'
-    tRP = '13.75ns'
-    tRAS = '35ns'
-    tRRD = '6ns'
-    tXAW = '30ns'
-    activation_limit = 4
-    tRFC = '260ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns
-    tWTR = '7.5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    tXS = '270ns'
-    # Current values from datasheet Die Rev E,J
-    IDD0 = '55mA'
-    IDD2N = '32mA'
-    IDD3N = '38mA'
-    IDD4W = '125mA'
-    IDD4R = '157mA'
-    IDD5 = '235mA'
-    IDD3P1 = '38mA'
-    IDD2P1 = '32mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single HMC-2500 x32 model based on:
-# [1] DRAMSpec: a high-level DRAM bank modelling tool
-# developed at the University of Kaiserslautern. This high level tool
-# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
-# estimate the DRAM bank latency and power numbers.
-# [2] High performance AXI-4.0 based interconnect for extensible smart memory
-# cubes (E. Azarkhish et. al)
-# Assumed for the HMC model is a 30 nm technology node.
-# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory
-# (4 layers).
-# Each layer has 16 vaults and each vault consists of 2 banks per layer.
-# In order to be able to use the same controller used for 2D DRAM generations
-# for HMC, the following analogy is done:
-# Channel (DDR) => Vault (HMC)
-# device_size (DDR) => size of a single layer in a vault
-# ranks per channel (DDR) => number of layers
-# banks per rank (DDR) => banks per layer
-# devices per rank (DDR) => devices per layer (1 for HMC).
-# The parameters for which no input is available are inherited from the DDR3
-# configuration.
-# This configuration includes the latencies from the DRAM to the logic layer
-# of the HMC
-class HMC_2500_1x32(DDR3_1600_8x8):
-    # size of device
-    # two banks per device with each bank 4MB [2]
-    device_size = '8MB'
-    # 1x32 configuration, 1 device with 32 TSVs [2]
-    device_bus_width = 32
-    # HMC is a BL8 device [2]
-    burst_length = 8
-    # Each device has a page (row buffer) size of 256 bytes [2]
-    device_rowbuffer_size = '256B'
-    # 1x32 configuration, so 1 device [2]
-    devices_per_rank = 1
-    # 4 layers so 4 ranks [2]
-    ranks_per_channel = 4
-    # HMC has 2 banks per layer [2]
-    # Each layer represents a rank. With 4 layers and 8 banks in total, each
-    # layer has 2 banks; thus 2 banks per rank.
-    banks_per_rank = 2
-    # 1250 MHz [2]
-    tCK = '0.8ns'
-    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
-    tBURST = '3.2ns'
-    # Values using DRAMSpec HMC model [1]
-    tRCD = '10.2ns'
-    tCL = '9.9ns'
-    tRP = '7.7ns'
-    tRAS = '21.6ns'
-    # tRRD depends on the power supply network for each vendor.
-    # We assume a tRRD of a double bank approach to be equal to 4 clock
-    # cycles (Assumption)
-    tRRD = '3.2ns'
-    # activation limit is set to 0 since there are only 2 banks per vault
-    # layer.
-    activation_limit = 0
-    # Values using DRAMSpec HMC model [1]
-    tRFC = '59ns'
-    tWR = '8ns'
-    tRTP = '4.9ns'
-    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
-    # = 0.8 ns (Assumption)
-    tCS = '0.8ns'
-    # Value using DRAMSpec HMC model [1]
-    tREFI = '3.9us'
-    # The default page policy in the vault controllers is simple closed page
-    # [2]; nevertheless the 'close' policy opens and closes the row multiple
-    # times for bursts larger than 32 bytes. For this reason we use
-    # 'close_adaptive'
-    page_policy = 'close_adaptive'
-    # RoCoRaBaCh resembles the default address mapping in HMC
-    addr_mapping = 'RoCoRaBaCh'
-    min_writes_per_switch = 8
-    # These parameters do not directly correlate with buffer_size in real
-    # hardware. Nevertheless, their value has been tuned to achieve a
-    # bandwidth similar to the cycle-accurate model in [2]
-    write_buffer_size = 32
-    read_buffer_size = 32
-    # The static latency of the vault controllers is estimated to be smaller
-    # than a full DRAM channel controller
-    static_backend_latency = '4ns'
-    static_frontend_latency = '4ns'

-# A single DDR3-2133 x64 channel refining a selected subset of the
-# options for the DDR-1600 configuration, based on the same DDR3-1600
-# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
-# consistent across the two configurations.
-class DDR3_2133_8x8(DDR3_1600_8x8):
-    # 1066 MHz
-    tCK = '0.938ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
-    tBURST = '3.752ns'
-    # DDR3-2133 14-14-14
-    tRCD = '13.09ns'
-    tCL = '13.09ns'
-    tRP = '13.09ns'
-    tRAS = '33ns'
-    tRRD = '5ns'
-    tXAW = '25ns'
-    # Current values from datasheet
-    IDD0 = '70mA'
-    IDD2N = '37mA'
-    IDD3N = '44mA'
-    IDD4W = '157mA'
-    IDD4R = '191mA'
-    IDD5 = '250mA'
-    IDD3P1 = '44mA'
-    IDD2P1 = '43mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
-# in a 16x4 configuration.
-# Total channel capacity is 32GB
-# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
-class DDR4_2400_16x4(DRAMCtrl):
-    # size of device
-    device_size = '1GB'
-    # 16x4 configuration, 16 devices each with a 4-bit interface
-    device_bus_width = 4
-    # DDR4 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 512 bytes (1K columns x4)
-    device_rowbuffer_size = '512B'
-    # 16x4 configuration, so 16 devices
-    devices_per_rank = 16
-    # Match our DDR3 configurations, which are dual rank
-    ranks_per_channel = 2
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 4 for x4 case
-    bank_groups_per_rank = 4
-    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 16
-    # override the default buffer sizes and go for something larger to
-    # accommodate the larger bank count
-    write_buffer_size = 128
-    read_buffer_size = 64
-    # 1200 MHz
-    tCK = '0.833ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '3.332ns'
-    # @2400 data rate, tCCD_L is 6 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '5ns'
-    # DDR4-2400 17-17-17
-    tRCD = '14.16ns'
-    tCL = '14.16ns'
-    tRP = '14.16ns'
-    tRAS = '32ns'
-    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
-    tRRD = '3.332ns'
-    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns'
-    # tFAW for 512B page is MAX(16 CK, 13ns)
-    tXAW = '13.328ns'
-    activation_limit = 4
-    # tRFC is 350ns
-    tRFC = '350ns'
-    tWR = '15ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
-    tRTW = '1.666ns'
-    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
-    tCS = '1.666ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
-    # tRFC + 10ns = 340ns
-    tXS = '340ns'
-    # Current values from datasheet
-    IDD0 = '43mA'
-    IDD02 = '3mA'
-    IDD2N = '34mA'
-    IDD3N = '38mA'
-    IDD3N2 = '3mA'
-    IDD4W = '103mA'
-    IDD4R = '110mA'
-    IDD5 = '250mA'
-    IDD3P1 = '32mA'
-    IDD2P1 = '25mA'
-    IDD6 = '30mA'
-    VDD = '1.2V'
-    VDD2 = '2.5V'

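The bank-group timings in DDR4_2400_16x4 above can be checked the same
way: the comment quotes tCCD_L as 6 CK at the 2400 data rate. A quick
check (plain Python, not part of this change):

    # Illustrative only: 6 CK at tCK = 0.833 ns
    print(round(6 * 0.833, 2))  # 5.0 -> matches tCCD_L = '5ns'
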
-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
-# in an 8x8 configuration.
-# Total channel capacity is 16GB
-# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
-class DDR4_2400_8x8(DDR4_2400_16x4):
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns'
-    tXAW = '21ns'
-    # Current values from datasheet
-    IDD0 = '48mA'
-    IDD3N = '43mA'
-    IDD4W = '123mA'
-    IDD4R = '135mA'
-    IDD3P1 = '37mA'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
-# in a 4x16 configuration.
-# Total channel capacity is 4GB
-# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
-class DDR4_2400_4x16(DDR4_2400_16x4):
-    # 4x16 configuration, 4 devices each with a 16-bit interface
-    device_bus_width = 16
-    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
-    device_rowbuffer_size = '2kB'
-    # 4x16 configuration, so 4 devices
-    devices_per_rank = 4
-    # Single rank for x16
-    ranks_per_channel = 1
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 2 for x16 case
-    bank_groups_per_rank = 2
-    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 8
-    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
-    tRRD = '5.3ns'
-    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
-    tRRD_L = '6.4ns'
-    tXAW = '30ns'
-    # Current values from datasheet
-    IDD0 = '80mA'
-    IDD02 = '4mA'
-    IDD2N = '34mA'
-    IDD3N = '47mA'
-    IDD4W = '228mA'
-    IDD4R = '243mA'
-    IDD5 = '280mA'
-    IDD3P1 = '41mA'

-# A single LPDDR2-S4 x32 interface (one command/address bus), with
-# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
-# in a 1x32 configuration.
-class LPDDR2_S4_1066_1x32(DRAMCtrl):
-    # No DLL in LPDDR2
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR2_S4 is a BL4 and BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '1kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR2-S4 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 533 MHz
-    tCK = '1.876ns'
-    # Fixed at 15 ns
-    tRCD = '15ns'
-    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
-    tCL = '15ns'
-    # Pre-charge one bank 15 ns (all banks 18 ns)
-    tRP = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    tRTP = '7.5ns'
-    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '7.5ns'
-    # LPDDR2-S4, 4 Gbit
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
-    tRTW = '3.75ns'
-    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
-    tCS = '3.75ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of density, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '15mA'
-    IDD02 = '70mA'
-    IDD2N = '2mA'
-    IDD2N2 = '30mA'
-    IDD3N = '2.5mA'
-    IDD3N2 = '30mA'
-    IDD4W = '10mA'
-    IDD4W2 = '190mA'
-    IDD4R = '3mA'
-    IDD4R2 = '220mA'
-    IDD5 = '40mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.2mA'
-    IDD3P12 = '8mA'
-    IDD2P1 = '0.6mA'
-    IDD2P12 = '0.8mA'
-    IDD6 = '1mA'
-    IDD62 = '3.2mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single WideIO x128 interface (one command and address bus), with
-# default timings based on an estimated WIO-200 8 Gbit part.
-class WideIO_200_1x128(DRAMCtrl):
-    # No DLL for WideIO
-    dll = False
-    # size of device
-    device_size = '1024MB'
-    # 1x128 configuration, 1 device with a 128-bit interface
-    device_bus_width = 128
-    # This is a BL4 device
-    burst_length = 4
-    # Each device has a page (row buffer) size of 4KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '4kB'
-    # 1x128 configuration, so 1 device
-    devices_per_rank = 1
-    # Use one rank for a one-high die stack
-    ranks_per_channel = 1
-    # WideIO has 4 banks in all configurations
-    banks_per_rank = 4
-    # 200 MHz
-    tCK = '5ns'
-    # WIO-200
-    tRCD = '18ns'
-    tCL = '18ns'
-    tRP = '18ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Read to precharge is same as the burst
-    tRTP = '20ns'
-    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
-    # Note this is a BL4 SDR device.
-    tBURST = '20ns'
-    # WIO 8 Gb
-    tRFC = '210ns'
-    # WIO 8 Gb, <=85C, half for >85C
-    tREFI = '3.9us'
-    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
-    tWTR = '15ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
-    tRTW = '10ns'
-    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
-    tCS = '10ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Two instead of four activation window
-    tXAW = '50ns'
-    activation_limit = 2
-    # The WideIO specification does not provide current information

-# A single LPDDR3 x32 interface (one command/address bus), with
-# default timings based on a LPDDR3-1600 4 Gbit part (Micron
-# EDF8132A1MC) in a 1x32 configuration.
-class LPDDR3_1600_1x32(DRAMCtrl):
-    # No DLL for LPDDR3
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 4KB
-    device_rowbuffer_size = '4kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Technically the datasheet is a dual-rank package, but for
-    # comparison with the LPDDR2 config we stick to a single rank
-    ranks_per_channel = 1
-    # LPDDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    tRCD = '18ns'
-    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
-    tCL = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
-    tRTP = '7.5ns'
-    # Pre-charge one bank 18 ns (all banks 21 ns)
-    tRP = '18ns'
-    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '5ns'
-    # LPDDR3, 4 Gb
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of size, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '8mA'
-    IDD02 = '60mA'
-    IDD2N = '0.8mA'
-    IDD2N2 = '26mA'
-    IDD3N = '2mA'
-    IDD3N2 = '34mA'
-    IDD4W = '2mA'
-    IDD4W2 = '190mA'
-    IDD4R = '2mA'
-    IDD4R2 = '230mA'
-    IDD5 = '28mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.4mA'
-    IDD3P12 = '11mA'
-    IDD2P1 = '0.8mA'
-    IDD2P12 = '1.8mA'
-    IDD6 = '0.5mA'
-    IDD62 = '1.8mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single GDDR5 x64 interface, with
-# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
-# H5GQ1H24AFR) in a 2x32 configuration.
-class GDDR5_4000_2x32(DRAMCtrl):
-    # size of device
-    device_size = '128MB'
-    # 2x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # GDDR5 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 2Kbits (256 bytes)
-    device_rowbuffer_size = '256B'
-    # 2x32 configuration, so 2 devices
-    devices_per_rank = 2
-    # assume single rank
-    ranks_per_channel = 1
-    # GDDR5 has 4 bank groups
-    bank_groups_per_rank = 4
-    # GDDR5 has 16 banks with 4 bank groups
-    banks_per_rank = 16
-    # 1000 MHz
-    tCK = '1ns'
-    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
-    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
-    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '2ns'
-    # @1000MHz data rate, tCCD_L is 3 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '3ns'
-    tRCD = '12ns'
-    # tCL is not directly found in datasheet and assumed equal tRCD
-    tCL = '12ns'
-    tRP = '12ns'
-    tRAS = '28ns'
-    # RRD_S (different bank group)
-    # RRD_S is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD = '6ns'
-    # RRD_L (same bank group)
-    # RRD_L is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD_L = '6ns'
-    tXAW = '23ns'
-    # tXAW < 4 x tRRD.
-    # Therefore, activation limit is set to 0
-    activation_limit = 0
-    tRFC = '65ns'
-    tWR = '12ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Read-to-Precharge 2 CK
-    tRTP = '2ns'
-    # Assume 2 cycles
-    tRTW = '2ns'

-# A single HBM x128 interface (one command and address bus), with
-# default timings based on data publicly released
-# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
-# IDD measurement values, and by extrapolating data from other classes.
-# Architecture values based on published HBM spec
-# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
-class HBM_1000_4H_1x128(DRAMCtrl):
-    # HBM gen1 supports up to 8 128-bit physical channels
-    # Configuration defines a single channel, with the capacity
-    # set to (full_stack_capacity / 8) based on 2Gb dies
-    # To use all 8 channels, set 'channels' parameter to 8 in
-    # system configuration
-
-    # 128-bit interface legacy mode
-    device_bus_width = 128
-    # HBM supports BL4 and BL2 (legacy mode only)
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
-    # with 8 channels, 128MB per channel
-    device_size = '128MB'
-    device_rowbuffer_size = '2kB'
-    # 1x128 configuration
-    devices_per_rank = 1
-    # HBM does not have a CS pin; set rank to 1
-    ranks_per_channel = 1
-    # HBM has 8 or 16 banks depending on capacity
-    # 2Gb dies have 8 banks
-    banks_per_rank = 8
-    # depending on frequency, bank groups may be required
-    # will always have 4 bank groups when enabled
-    # current specifications do not define the minimum frequency for
-    # bank group architecture
-    # setting bank_groups_per_rank to 0 to disable until range is defined
-    bank_groups_per_rank = 0
-    # 500 MHz for 1Gbps DDR data rate
-    tCK = '2ns'
-    # use values from IDD measurement in JEDEC spec
-    # use tRP value for tRCD and tCL similar to other classes
-    tRP = '15ns'
-    tRCD = '15ns'
-    tCL = '15ns'
-    tRAS = '33ns'
-    # BL2 and BL4 supported, default to BL4
-    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
-    tBURST = '4ns'
-    # value for 2Gb device from JEDEC spec
-    tRFC = '160ns'
-    # value for 2Gb device from JEDEC spec
-    tREFI = '3.9us'
-    # extrapolate the following from LPDDR configs, using ns values
-    # to minimize burst length, prefetch differences
-    tWR = '18ns'
-    tRTP = '7.5ns'
-    tWTR = '10ns'
-    # start with 2 cycles turnaround, similar to other memory classes
-    # could be more with variations across the stack
-    tRTW = '4ns'
-    # single rank device, set to 0
-    tCS = '0ns'
-    # from MemCon example, tRRD is 4ns with 2ns tCK
-    tRRD = '4ns'
-    # from MemCon example, tFAW is 30ns with 2ns tCK
-    tXAW = '30ns'
-    activation_limit = 4
-    # 4tCK
-    tXP = '8ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '168ns'

-# A single HBM x64 interface (one command and address bus), with
-# default timings based on HBM gen1 and data publicly released
-# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
-# Note: This defines a pseudo-channel with a unique controller
-# instantiated per pseudo-channel
-# Stay at same IO rate (1Gbps) to maintain timing relationship with
-# HBM gen1 class (HBM_1000_4H_x128) where possible
-class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
-    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
-    # Configuration defines a single pseudo channel, with the capacity
-    # set to (full_stack_capacity / 16) based on 8Gb dies
-    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
-    # system configuration
-
-    # 64-bit pseudo-channel interface
-    device_bus_width = 64
-    # HBM pseudo-channel only supports BL4
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
-    # with 16 channels, 256MB per channel
-    device_size = '256MB'
-    # page size is halved with pseudo-channel; maintaining the same number
-    # of rows per pseudo-channel with 2X banks across 2 channels
-    device_rowbuffer_size = '1kB'
-    # HBM has 8 or 16 banks depending on capacity
-    # Starting with 4Gb dies, 16 banks are defined
-    banks_per_rank = 16
-    # reset tRFC for larger, 8Gb device
-    # use HBM1 4Gb value as a starting point
-    tRFC = '260ns'
-    # start with tRFC + tXP -> 260ns + 8ns = 268ns
-    tXS = '268ns'
-    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
-    tCS = '2ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '10ns'
-    # self refresh exit time
-    tXS = '65ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl):
-    # Increase buffer size to account for more bank resources
-    read_buffer_size = 64
-    # Set page policy to better suit DMC Huxley
-    page_policy = 'close_adaptive'
-    # 16-bit channel interface
-    device_bus_width = 16
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL32 for higher command bandwidth
-    burst_length = 32
-    # size of device in bytes
-    device_size = '1GB'
-    # 2kB page with BG mode
-    device_rowbuffer_size = '2kB'
-    # Use a 1x16 configuration
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Initial configuration will have 16 banks with Bank Group Arch
-    # to maximize resources and enable higher data rates
-    banks_per_rank = 16
-    bank_groups_per_rank = 4
-    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
-    tCK = '1.455ns'
-    # Greater of 2 CK or 18ns
-    tRCD = '18ns'
-    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
-    tCL = '23.280ns'
-    # Greater of 2 CK or 18ns
-    tRP = '18ns'
-    # Greater of 3 CK or 42ns
-    tRAS = '42ns'
-    # Greater of 3 CK or 34ns
-    tWR = '34ns'
-    # active powerdown and precharge powerdown exit time
-    # Greater of 3 CK or 7ns
-    tXP = '7ns'
-    # self refresh exit time (tRFCab + 7.5ns)
-    tXS = '217.5ns'
-    # Greater of 2 CK or 7.5 ns minus 2 CK
-    tRTP = '4.59ns'
-    # With BG architecture, burst of 32 transferred in two 16-beat
-    # sub-bursts, with a 16-beat gap in between.
-    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
-    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
-    tBURST = '8.73ns'
-    # can interleave a Bstof32 from another bank group at tBURST_MIN
-    # 16 beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
-    tBURST_MIN = '2.91ns'
-    # tBURST_MAX is the maximum burst delay for same bank group timing
-    # this is 8 CK @ 687.5 MHz
-    tBURST_MAX = '11.64ns'
-    # 8 CK @ 687.5 MHz
-    tCCD_L = '11.64ns'
-    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
-    tRFC = '210ns'
-    tREFI = '3.9us'
-    # Greater of 4 CK or 6.25 ns
-    tWTR = '6.25ns'
-    # Greater of 4 CK or 12 ns
-    tWTR_L = '12ns'
-    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
-    # tWCKDQ0/tCK will be 1 CK for most cases
-    # For gem5 RL = WL and BL/n is already accounted for with tBURST
-    # The result is that an additional 1 CK is required
-    tRTW = '1.455ns'
-    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
-    tCS = '2.91ns'
-    # 2 CK
-    tPPD = '2.91ns'
-    # Greater of 2 CK or 5 ns
-    tRRD = '5ns'
-    tRRD_L = '5ns'
-    # With Bank Group Arch mode tFAW is 20 ns
-    tXAW = '20ns'
-    activation_limit = 4
-    # at 5Gbps, 4:1 WCK to CK ratio required
-    # 2 data beats per WCK (DDR) -> 8 per CK
-    beats_per_clock = 8
-    # 2 cycles required to send activate command
-    # 2 command phases can be sent back-to-back or
-    # with a gap up to tAAD = 8 CK
-    two_cycle_activate = True
-    tAAD = '11.640ns'
-    data_clock_sync = True

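The LPDDR5 burst numbers above follow the same CK arithmetic: a Bstof32
completes in 6 CK, can be interleaved from another bank group after 2 CK,
and is bounded by 8 CK for the same bank group. A quick check (plain
Python, not part of this change):

    # Illustrative only: LPDDR5-5500 BG mode, tCK = 1.455 ns
    tCK_ns = 1.455
    print(round(6 * tCK_ns, 2))  # 8.73  -> tBURST     ('8.73ns')
    print(round(2 * tCK_ns, 2))  # 2.91  -> tBURST_MIN ('2.91ns')
    print(round(8 * tCK_ns, 2))  # 11.64 -> tBURST_MAX ('11.64ns')
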
-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL16 for smaller access granularity
-    burst_length = 16
-    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST = '2.91ns'
-    tBURST_MIN = '2.91ns'
-    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST_MAX = '5.82ns'
-    # 4 CK @ 687.5 MHz
-    tCCD_L = '5.82ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
-    # 4kB page with 8B mode
-    device_rowbuffer_size = '4kB'
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Select 8B
-    banks_per_rank = 8
-    bank_groups_per_rank = 0
-    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST = '5.82ns'
-    tBURST_MIN = '5.82ns'
-    tBURST_MAX = '5.82ns'
-    # Greater of 4 CK or 12 ns
-    tWTR = '12ns'
-    # Greater of 2 CK or 10 ns
-    tRRD = '10ns'
-    # With 8B mode tFAW is 40 ns
-    tXAW = '40ns'
-    activation_limit = 4
-    # Reset BG arch timing for 8B mode
-    tCCD_L = '0ns'
-    tRRD_L = '0ns'
-    tWTR_L = '0ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
-    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
-    tCK = '1.25ns'
-    # Base RL is 17 CK @ 800 MHz = 21.25ns
-    tCL = '21.25ns'
-    # With BG architecture, burst of 32 transferred in two 16-beat
-    # sub-bursts, with a 16-beat gap in between.
-    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
-    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
-    tBURST = '7.5ns'
-    # can interleave a Bstof32 from another bank group at tBURST_MIN
-    # 16 beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
-    tBURST_MIN = '2.5ns'
-    # tBURST_MAX is the maximum burst delay for same bank group timing
-    # this is 8 CK @ 800 MHz
-    tBURST_MAX = '10ns'
-    # 8 CK @ 800 MHz
-    tCCD_L = '10ns'
-    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
-    # tWCKDQ0/tCK will be 1 CK for most cases
-    # For gem5 RL = WL and BL/n is already accounted for with tBURST
-    # The result is that an additional 1 CK is required
-    tRTW = '1.25ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # 2 CK
-    tPPD = '2.5ns'
-    # 2 command phases can be sent back-to-back or
-    # with a gap up to tAAD = 8 CK
-    tAAD = '10ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on initial
-# JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL16 for smaller access granularity
-    burst_length = 16
-    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
-    tBURST = '2.5ns'
-    tBURST_MIN = '2.5ns'
-    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
-    tBURST_MAX = '5ns'
-    # 4 CK @ 800 MHz
-    tCCD_L = '5ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
-    # 4kB page with 8B mode
-    device_rowbuffer_size = '4kB'
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Select 8B
-    banks_per_rank = 8
-    bank_groups_per_rank = 0
-    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
-    tBURST = '5ns'
-    tBURST_MIN = '5ns'
-    tBURST_MAX = '5ns'
-    # Greater of 4 CK or 12 ns
-    tWTR = '12ns'
-    # Greater of 2 CK or 10 ns
-    tRRD = '10ns'
-    # With 8B mode tFAW is 40 ns
-    tXAW = '40ns'
-    activation_limit = 4
-    # Reset BG arch timing for 8B mode
-    tCCD_L = '0ns'
-    tRRD_L = '0ns'
-    tWTR_L = '0ns'
diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py
new file mode 100644
index 0000000..f571920
--- /dev/null
+++ b/src/mem/DRAMInterface.py
@@ -0,0 +1,1473 @@
    +# Copyright (c) 2012-2020 ARM Limited
    +# All rights reserved.
    +#
    +# The license below extends only to copyright in the software and shall
    +# not be construed as granting a license to any other intellectual
    +# property including but not limited to intellectual property relating
    +# to a hardware implementation of the functionality of the software
    +# licensed hereunder.  You may use the software subject to the license
    +# terms below provided that you ensure that this notice is replicated
    +# unmodified and in its entirety in all distributions of the software,
    +# modified or unmodified, in source code or in binary form.
    +#
    +# Copyright (c) 2013 Amin Farmahini-Farahani
    +# Copyright (c) 2015 University of Kaiserslautern
    +# Copyright (c) 2015 The University of Bologna
    +# All rights reserved.
    +#
    +# Redistribution and use in source and binary forms, with or without
    +# modification, are permitted provided that the following conditions are
    +# met: redistributions of source code must retain the above copyright
    +# notice, this list of conditions and the following disclaimer;
    +# redistributions in binary form must reproduce the above copyright
    +# notice, this list of conditions and the following disclaimer in the
    +# documentation and/or other materials provided with the distribution;
    +# neither the name of the copyright holders nor the names of its
    +# contributors may be used to endorse or promote products derived from
    +# this software without specific prior written permission.
    +#
    +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.AbstractMemory import AbstractMemory
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']
    

+class DRAMInterface(AbstractMemory):

  • type = 'DRAMInterface'
  • cxx_header = "mem/dram_ctrl.hh"
  • Allow the interface to set required controller buffer sizes

  • each entry corresponds to a burst for the specific DRAM

  • configuration (e.g. x32 with burst length 8 is 32 bytes) and not

  • the cacheline size or request/packet size

  • write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
  • read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
  • scheduler, address map and page policy

  • addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
  • page_policy = Param.PageManage('open_adaptive', "Page management
    policy")
  • enforce a limit on the number of accesses per row

  • max_accesses_per_row = Param.Unsigned(16, "Max accesses per row
    before "
  •                                      "closing");
    
  • size of DRAM Chip in Bytes

  • device_size = Param.MemorySize("Size of DRAM chip")
  • the physical organisation of the DRAM

  • device_bus_width = Param.Unsigned("data bus width in bits for each
    DRAM "\
  •                                  "device/chip")
    
  • burst_length = Param.Unsigned("Burst lenght (BL) in beats")
  • device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
  •                                       "device/chip")
    
  • devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
  • ranks_per_channel = Param.Unsigned("Number of ranks per channel")
  • default to 0 bank groups per rank, indicating bank group architecture

  • is not used

  • update per memory class when bank group architecture is supported

  • bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per
    rank")
  • banks_per_rank = Param.Unsigned("Number of banks per rank")
  • Enable DRAM powerdown states if True. This is False by default due to

  • performance being lower when enabled

  • enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
  • For power modelling we need to know if the DRAM has a DLL or not

  • dll = Param.Bool(True, "DRAM has DLL or not")
  • DRAMPower provides in addition to the core power, the possibility to

  • include RD/WR termination and IO power. This calculation assumes some

  • default values. The integration of DRAMPower with gem5 does not

include

  • IO and RD/WR termination power by default. This might be added as an

  • additional feature in the future.

  • timing behaviour and constraints - all in nanoseconds

  • the base clock period of the DRAM

  • tCK = Param.Latency("Clock period")
  • the amount of time in nanoseconds from issuing an activate command

  • to the data being available in the row buffer for a read/write

  • tRCD = Param.Latency("RAS to CAS delay")
  • the time from issuing a read/write command to seeing the actual data

  • tCL = Param.Latency("CAS latency")
  • minimum time between a precharge and subsequent activate

  • tRP = Param.Latency("Row precharge time")
  • minimum time between an activate and a precharge to the same row

  • tRAS = Param.Latency("ACT to PRE delay")
  • minimum time between a write data transfer and a precharge

  • tWR = Param.Latency("Write recovery time")
  • minimum time between a read and precharge command

  • tRTP = Param.Latency("Read to precharge")
  • time to complete a burst transfer, typically the burst length

  • divided by two due to the DDR bus, but by making it a parameter

  • it is easier to also evaluate SDR memories like WideIO.

  • This parameter has to account for burst length.

  • Read/Write requests with data size larger than one full burst are

broken

  • down into multiple requests in the controller

  • tBURST is equivalent to the CAS-to-CAS delay (tCCD)

  • With bank group architectures, tBURST represents the CAS-to-CAS

  • delay for bursts to different bank groups (tCCD_S)

  • tBURST = Param.Latency("Burst duration "
  •                       "(typically burst length / 2 cycles)")
    
  • tBURST_MAX is the column array cycle delay required before next

access,

  • which could be greater than tBURST when the memory access time is

greater

  • than tBURST

  • tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
  • tBURST_MIN is the minimum delay between bursts, which could be less

than

  • tBURST when interleaving is supported

  • tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts")
  • CAS-to-CAS delay for bursts to the same bank group

  • only utilized with bank group architectures; set to 0 for default

case

  • tBURST is equivalent to tCCD_S; no explicit parameter required

  • for CAS-to-CAS delay for bursts to different bank groups

  • tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
  • Write-to-Write delay for bursts to the same bank group

  • only utilized with bank group architectures; set to 0 for default

case

  • This will be used to enable different same bank group delays

  • for writes versus reads

  • tCCD_L_WR = Param.Latency(Self.tCCD_L,
  •  "Same bank group Write to Write delay")
    
  • time taken to complete one refresh cycle (N rows in all banks)

  • tRFC = Param.Latency("Refresh cycle time")
  • refresh command interval, how often a "ref" command needs

  • to be sent. It is 7.8 us for a 64ms refresh requirement

  • tREFI = Param.Latency("Refresh command interval")
  • write-to-read, same rank turnaround penalty

  • tWTR = Param.Latency("Write to read, same rank switching time")
  • write-to-read, same rank turnaround penalty for same bank group

  • tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
  •                       "time, same bank group")
    
  • read-to-write, same rank turnaround penalty

  • tRTW = Param.Latency("Read to write, same rank switching time")
  • rank-to-rank bus delay penalty

  • this does not correlate to a memory timing parameter and encompasses:

  • 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD

  • different rank bus delay

  • tCS = Param.Latency("Rank to rank switching time")
  • minimum precharge to precharge delay time

  • tPPD = Param.Latency("0ns", "PRE to PRE delay")
  • maximum delay between two-cycle ACT command phases

  • tAAD = Param.Latency(Self.tCK,
  •                     "Maximum delay between two-cycle ACT commands")
    
  • two_cycle_activate = Param.Bool(False,
  •                     "Two cycles required to send activate")
    
  • minimum row activate to row activate delay time

  • tRRD = Param.Latency("ACT to ACT delay")
  • only utilized with bank group architectures; set to 0 for default

case

  • tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
  • time window in which a maximum number of activates are allowed

  • to take place, set to 0 to disable

  • tXAW = Param.Latency("X activation window")
  • activation_limit = Param.Unsigned("Max number of activates in window")
  • time to exit power-down mode

  • Exit power-down to next valid command delay

  • tXP = Param.Latency("0ns", "Power-up Delay")
  • Exit Powerdown to commands requiring a locked DLL

  • tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
  • time to exit self-refresh mode

  • tXS = Param.Latency("0ns", "Self-refresh exit latency")
  • time to exit self-refresh mode with locked DLL

  • tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
  • number of data beats per clock. with DDR, default is 2, one per edge

  • beats_per_clock = Param.Unsigned(2, "Data beats per clock")
  • data_clock_sync = Param.Bool(False, "Synchronization commands
    required")
  • Currently rolled into other params

  • ######################################################################
  • tRC  - assumed to be tRAS + tRP

  • Power Behaviour and Constraints

  • DRAMs like LPDDR and WideIO have 2 external voltage domains. These

are

  • defined as VDD and VDD2. Each current is defined for each voltage

domain

  • separately. For example, current IDD0 is active-precharge current for

  • voltage domain VDD and current IDD02 is active-precharge current for

  • voltage domain VDD2.

  • By default all currents are set to 0mA. Users who are only

interested in

  • the performance of DRAMs can leave them at 0.

  • Operating 1 Bank Active-Precharge current

  • IDD0 = Param.Current("0mA", "Active precharge current")
  • Operating 1 Bank Active-Precharge current multiple voltage Range

  • IDD02 = Param.Current("0mA", "Active precharge current VDD2")
  • Precharge Power-down Current: Slow exit

  • IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
  • Precharge Power-down Current: Slow exit multiple voltage Range

  • IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
  • Precharge Power-down Current: Fast exit

  • IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
  • Precharge Power-down Current: Fast exit multiple voltage Range

  • IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
  • Precharge Standby current

  • IDD2N = Param.Current("0mA", "Precharge Standby current")
  • Precharge Standby current multiple voltage range

  • IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
  • Active Power-down current: slow exit

  • IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
  • Active Power-down current: slow exit multiple voltage range

  • IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
  • Active Power-down current : fast exit

  • IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
  • Active Power-down current : fast exit multiple voltage range

  • IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
  • Active Standby current

  • IDD3N = Param.Current("0mA", "Active Standby current")
  • Active Standby current multiple voltage range

  • IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
  • Burst Read Operating Current

  • IDD4R = Param.Current("0mA", "READ current")
  • Burst Read Operating Current multiple voltage range

  • IDD4R2 = Param.Current("0mA", "READ current VDD2")
  • Burst Write Operating Current

  • IDD4W = Param.Current("0mA", "WRITE current")
  • Burst Write Operating Current multiple voltage range

  • IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
  • Refresh Current

  • IDD5 = Param.Current("0mA", "Refresh current")
  • Refresh Current multiple voltage range

  • IDD52 = Param.Current("0mA", "Refresh current VDD2")
  • Self-Refresh Current

  • IDD6 = Param.Current("0mA", "Self-refresh Current")
  • Self-Refresh Current multiple voltage range

  • IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
  • Main voltage range of the DRAM

  • VDD = Param.Voltage("0V", "Main Voltage Range")
  • Second voltage range defined by some DRAMs

  • VDD2 = Param.Voltage("0V", "2nd Voltage Range")

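With DRAMInterface defined as an AbstractMemory, a config script now
instantiates the controller and the interface separately and assigns the
address range on the interface. A minimal wiring sketch (hypothetical;
the controller-side parameter name 'dram' is an assumption based on the
commit message, not something shown in this hunk):

    # Hypothetical sketch of the new controller/interface split
    from m5.objects import *

    system = System()
    system.mem_ranges = [AddrRange('512MB')]

    system.mem_ctrl = DRAMCtrl()            # now a ClockedObject
    system.mem_ctrl.dram = DDR3_1600_8x8()  # media-specific interface
    # address ranges now live on the interface, not the controller
    system.mem_ctrl.dram.range = system.mem_ranges[0]
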
+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):

  • size of device in bytes

  • device_size = '512MB'
  • 8x8 configuration, 8 devices each with an 8-bit interface

  • device_bus_width = 8
  • DDR3 is a BL8 device

  • burst_length = 8
  • Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)

  • device_rowbuffer_size = '1kB'
  • 8x8 configuration, so 8 devices

  • devices_per_rank = 8
  • Use two ranks

  • ranks_per_channel = 2
  • DDR3 has 8 banks in all configurations

  • banks_per_rank = 8
  • 800 MHz

  • tCK = '1.25ns'
  • 8 beats across an x64 interface translates to 4 clocks @ 800 MHz

  • tBURST = '5ns'
  • DDR3-1600 11-11-11

  • tRCD = '13.75ns'
  • tCL = '13.75ns'
  • tRP = '13.75ns'
  • tRAS = '35ns'
  • tRRD = '6ns'
  • tXAW = '30ns'
  • activation_limit = 4
  • tRFC = '260ns'
  • tWR = '15ns'
  • Greater of 4 CK or 7.5 ns

  • tWTR = '7.5ns'
  • Greater of 4 CK or 7.5 ns

  • tRTP = '7.5ns'
  • Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns

  • tRTW = '2.5ns'
  • Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns

  • tCS = '2.5ns'
  • <=85C, half for >85C

  • tREFI = '7.8us'
  • active powerdown and precharge powerdown exit time

  • tXP = '6ns'
  • self refresh exit time

  • tXS = '270ns'
  • Current values from datasheet Die Rev E,J

  • IDD0 = '55mA'
  • IDD2N = '32mA'
  • IDD3N = '38mA'
  • IDD4W = '125mA'
  • IDD4R = '157mA'
  • IDD5 = '235mA'
  • IDD3P1 = '38mA'
  • IDD2P1 = '32mA'
  • IDD6 = '20mA'
  • VDD = '1.5V'

+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart
+# memory cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory
+# (4 layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM
+# generations for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer (1 for HMC)
+# The parameters for which no input is available are inherited from the
+# DDR3 configuration.
+# This configuration includes the latencies from the DRAM to the logic
+# layer of the HMC
+class HMC_2500_1x32(DDR3_1600_8x8):
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
+    # = 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed
+    # page [2]; nevertheless, the 'close' policy opens and closes the row
+    # multiple times for bursts larger than 32Bytes. For this reason we
+    # use 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'
+
+    # These parameters do not directly correlate with buffer_size in real
+    # hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32

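A configuration sketch for the vault analogy above (illustrative;
assumes the common Options/MemConfig flow, which is not part of this
change): one HMC stack exposes its 16 vaults as 16 single-vault
channels.

    options.mem_type = 'HMC_2500_1x32'
    options.mem_channels = 16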
+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR3-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):
+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

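As a cross-check of the tBURST values in these DDR3 classes, a BL8 burst
occupies burst_length / 2 clocks on a DDR bus (plain Python,
illustrative only):

    tCK_ns = 0.938
    burst_length = 8
    tBURST_ns = (burst_length / 2) * tCK_ns   # 4 clocks -> 3.752 ns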
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns'
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns'
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'

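The channel capacity in the comment above follows directly from the
geometry (illustrative arithmetic):

    devices_per_rank = 16
    ranks_per_channel = 2
    device_size_GB = 1
    channel_GB = devices_per_rank * ranks_per_channel * device_size_GB  # 32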
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns'
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns'
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'

+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):
+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

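The 32-byte limit mentioned in the tBURST comment is the interface burst
size, computed with the same formula the C++ constructor uses
(devices_per_rank * burst_length * device_bus_width / 8); an
illustrative check:

    burst_bytes = (1 * 8 * 32) // 8   # = 32 bytes per burst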
+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):
+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information

+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):
+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns'
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'

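The activation_limit = 0 setting above is a pure timing argument: four
back-to-back activates already span 4 * tRRD = 24 ns, which exceeds the
tXAW = 23 ns window, so the four-activate limit can never bind
(illustrative check):

    tRRD_ns, tXAW_ns = 6, 23
    assert 4 * tRRD_ns > tXAW_ns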
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'

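A configuration sketch for the note about using all eight channels
(illustrative; assumes the common Options/MemConfig flow, which is not
part of this change):

    options.mem_type = 'HBM_1000_4H_1x128'
    options.mem_channels = 8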
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # self refresh exit time
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'

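Capacity check for the pseudo-channel split above (illustrative
arithmetic): a 4-high stack of 8Gb dies is 4GB, and dividing it across
16 pseudo-channels yields the 256MB device_size:

    stack_MB = 4 * 8 * 1024 // 8      # 4 dies x 8 Gbit = 4096 MB
    per_channel_MB = stack_MB // 16   # = 256 MB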
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 280ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True

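The LPDDR5 clock numbers above derive mechanically from the pin rate:
data toggles on both WCK edges and WCK runs at 4x CK (illustrative
derivation):

    data_rate_mbps = 5500
    ck_mhz = data_rate_mbps / 2 / 4     # 687.5 MHz
    tck_ns = 1000 / ck_mhz              # ~1.455 ns
    tburst_ns = 6 * tck_ns              # ~8.73 ns for a Bstof32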
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
diff --git a/src/mem/SConscript b/src/mem/SConscript
index 2fe179d..ceeed98 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-

-# Copyright (c) 2018-2019 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -47,6 +47,7 @@
 SimObject('AddrMapper.py')
 SimObject('Bridge.py')
 SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
 SimObject('ExternalMaster.py')
 SimObject('ExternalSlave.py')
 SimObject('MemObject.py')
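With DRAMInterface.py registered as a SimObject, the interface classes
above can be instantiated from any config script (illustrative sketch):

    import m5.objects

    intf = m5.objects.DDR3_1600_8x8(range=m5.objects.AddrRange('512MB'))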
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index b646581..4055505 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
+#include "params/DRAMInterface.hh"
 #include "sim/system.hh"

 using namespace std;
@@ -58,12 +59,13 @@
     retryRdReq(false), retryWrReq(false),
     nextReqEvent([this]{ processNextReqEvent(); }, name()),
     respondEvent([this]{ processRespondEvent(); }, name()),
-    readBufferSize(p->read_buffer_size),
-    writeBufferSize(p->write_buffer_size),
+    dram(p->dram),
+    readBufferSize(dram->readBufferSize),
+    writeBufferSize(dram->writeBufferSize),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
-    writesThisTime(0), readsThisTime(0), tCS(p->tCS),
+    writesThisTime(0), readsThisTime(0),
     memSchedPolicy(p->mem_sched_policy),
     frontendLatency(p->static_frontend_latency),
     backendLatency(p->static_backend_latency),
@@ -74,37 +76,23 @@
     readQueue.resize(p->qos_priorities);
     writeQueue.resize(p->qos_priorities);

+    dram->setCtrl(this);
+
     // perform a basic check of the write thresholds
     if (p->write_low_thresh_perc >= p->write_high_thresh_perc)
         fatal("Write buffer low threshold %d must be smaller than the "
               "high threshold %d\n", p->write_low_thresh_perc,
               p->write_high_thresh_perc);
-
-    // determine the rows per bank by looking at the total capacity
-    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
-
-    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
-            AbstractMemory::size());
-
-    // create a DRAM interface
-    // will only populate the ranks if DRAM is configured
-    dram = new DRAMInterface(*this, p, capacity, range);
-    DPRINTF(DRAM, "Created DRAM interface \n");
 }

 void
 DRAMCtrl::init()
 {
-    MemCtrl::init();
-
     if (!port.isConnected()) {
         fatal("DRAMCtrl %s is unconnected!\n", name());
     } else {
         port.sendRangeChange();
     }
-
-    dram->init(range);
 }
@@ -114,8 +102,6 @@
     isTimingMode = system()->isTimingMode();

     if (isTimingMode) {
-        dram->startupRanks();
-
         // shift the bus busy time sufficiently far ahead that we never
         // have to worry about negative values when computing the time for
         // the next request, this will add an insignificant bubble at the
@@ -133,7 +119,7 @@
              "is responding");

     // do the actual memory access and turn the packet into a response
-    access(pkt);
+    dram->access(pkt);

     Tick latency = 0;
     if (pkt->hasData()) {
@@ -263,7 +249,7 @@
     // address of first DRAM packet is kept unaligned. Subsequent DRAM packets
     // are aligned to burst size boundaries. This is to ensure we accurately
     // check read packets against packets in write queue.
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     unsigned pktsServicedByWrQ = 0;
     BurstHelper* burst_helper = NULL;
@@ -363,7 +349,7 @@
     // if the request size is larger than burst size, the pkt is split into
     // multiple DRAM packets
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     uint32_t burstSize = dram->bytesPerBurst();
     for (int cnt = 0; cnt < pktCount; ++cnt) {
@@ -526,7 +512,7 @@
     DRAMPacket* dram_pkt = respQueue.front();

     // media specific checks and functions when read response is complete
-    dram->respondEventDRAM(dram_pkt->rank);
+    dram->respondEvent(dram_pkt->rank);

     if (dram_pkt->burstHelper) {
         // it is a split packet
@@ -727,12 +713,12 @@
 void
 DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency)
 {
-    DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
+    DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr());

     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
     // response
-    access(pkt);
+    dram->access(pkt);

     // turn packet around to go back to requester if response expected
     if (needsResponse) {
@@ -877,9 +863,9 @@
     // if not, shift to next burst window
     Tick act_at;
     if (twoCycleActivate)
-        act_at = ctrl.verifyMultiCmd(act_tick, tAAD);
+        act_at = ctrl->verifyMultiCmd(act_tick, tAAD);
     else
-        act_at = ctrl.verifySingleCmd(act_tick);
+        act_at = ctrl->verifySingleCmd(act_tick);

     DPRINTF(DRAM, "Activate at tick %d\n", act_at);
@@ -997,7 +983,7 @@
         // Issuing an explicit PRE command
         // Verify that we have command bandwidth to issue the precharge
         // if not, shift to next burst window
-        pre_at = ctrl.verifySingleCmd(pre_tick);
+        pre_at = ctrl->verifySingleCmd(pre_tick);
         // enforce tPPD
         for (int i = 0; i < banksPerRank; i++) {
             rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD,
@@ -1096,9 +1082,9 @@
     // verify that we have command bandwidth to issue the burst
     // if not, shift to next burst window
     if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay))
-        cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK);
+        cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK);
     else
-        cmd_at = ctrl.verifySingleCmd(cmd_at);
+        cmd_at = ctrl->verifySingleCmd(cmd_at);

     // if we are interleaving bursts, ensure that
     // 1) we don't double interleave on next burst issue
    

@@ -1196,7 +1182,7 @@
             bool got_more_hits = false;
             bool got_bank_conflict = false;

-            for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) {
+            for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) {
                 auto p = queue[i].begin();
                 // keep on looking until we find a hit or reach the end of the
                 // queue
@@ -1267,6 +1253,7 @@
         // Update latency stats
         stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime;
         stats.totQLat += cmd_at - dram_pkt->entryTime;
+        stats.totBusLat += tBURST;
     } else {
         // Schedule write done event to decrement event count
         // after the readyTime has been reached
@@ -1350,13 +1337,9 @@
         // Update latency stats
         stats.masterReadTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
-        stats.bytesRead += dram->bytesPerBurst();
-        stats.totBusLat += dram->burstDelay();
         stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size;
     } else {
         ++writesThisTime;
-        stats.bytesWritten += dram->bytesPerBurst();
         stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size;
         stats.masterWriteTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
@@ -1458,8 +1441,9 @@
                 // Figure out which read request goes next
                 // If we are changing command type, incorporate the minimum
-                // bus turnaround delay which will be tCS (different rank) case
-                to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0);
+                // bus turnaround delay which will be rank to rank delay
+                to_read = chooseNext((*queue), switched_cmd_type ?
+                                               dram->rankDelay() : 0);

                 if (to_read != queue->end()) {
                     // candidate read found
@@ -1538,7 +1522,8 @@
             // If we are changing command type, incorporate the minimum
             // bus turnaround delay
             to_write = chooseNext((*queue),
-                     switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0);
+                     switched_cmd_type ? std::min(dram->minRdToWr(),
+                                                  dram->rankDelay()) : 0);

             if (to_write != queue->end()) {
                 write_found = true;
    

@@ -1611,11 +1596,8 @@
     }
 }

-DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl,
-                             const DRAMCtrlParams* _p,
-                             const uint64_t capacity,
-                             const AddrRange range)
-    : SimObject(_p), ctrl(_ctrl),
+DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p)
+    : AbstractMemory(_p),
       addrMapping(_p->addr_mapping),
       burstSize((_p->devices_per_rank * _p->burst_length *
                  _p->device_bus_width) / 8),
@@ -1630,7 +1612,7 @@
       bankGroupsPerRank(_p->bank_groups_per_rank),
       bankGroupArch(_p->bank_groups_per_rank > 0),
      banksPerRank(_p->banks_per_rank), rowsPerBank(0),
-      tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST),
+      tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST),
       tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW),
       tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD),
       tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP),
@@ -1646,13 +1628,15 @@
       wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW),
       wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L),
       rdToWrDlySameBG(tRTW + _p->tBURST_MAX),
-      rankToRankDly(ctrl.rankDelay() + tBURST),
+      rankToRankDly(tCS + tBURST),
       pageMgmt(_p->page_policy),
       maxAccessesPerRow(_p->max_accesses_per_row),
       timeStampOffset(0), activeRank(0),
       enableDRAMPowerdown(_p->enable_dram_powerdown),
       lastStatsResetTick(0),
-      stats(_ctrl, *this)
+      stats(*this),
+      readBufferSize(_p->read_buffer_size),
+      writeBufferSize(_p->write_buffer_size)
 {
     fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, "
              "must be a power of two\n", burstSize);
@@ -1664,7 +1648,7 @@
     for (int i = 0; i < ranksPerChannel; i++) {
         DPRINTF(DRAM, "Creating DRAM rank %d \n", i);
-        Rank* rank = new Rank(ctrl, _p, i, *this);
+        Rank* rank = new Rank(_p, i, *this);
         ranks.push_back(rank);
     }
@@ -1672,6 +1656,11 @@
     uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank *
                               ranksPerChannel;

+    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
+
+    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
+            AbstractMemory::size());
+
     // if actual DRAM size does not match memory capacity in system warn!
     if (deviceCapacity != capacity / (1024 * 1024))
         warn("DRAM device capacity (%d Mbytes) does not match the "
@@ -1726,8 +1715,10 @@
 }

 void
-DRAMInterface::init(AddrRange range)
+DRAMInterface::init()
 {
+    AbstractMemory::init();
+
     // a bit of sanity checks on the interleaving, save it for here to
     // ensure that the system pointer is initialised
     if (range.interleaved()) {
@@ -1749,7 +1740,7 @@
             // channel striping has to be done at a granularity that
             // is equal or larger to a cache line
-            if (ctrl.system()->cacheLineSize() > range.granularity()) {
+            if (system()->cacheLineSize() > range.granularity()) {
                 fatal("Channel interleaving of %s must be at least as large "
                       "as the cache line size\n", name());
             }
@@ -1766,10 +1757,12 @@
 }

 void
-DRAMInterface::startupRanks()
+DRAMInterface::startup()
 {
-    // timestamp offset should be in clock cycles for DRAMPower
-    timeStampOffset = divCeil(curTick(), tCK);
+    if (system()->isTimingMode()) {
+        // timestamp offset should be in clock cycles for DRAMPower
+        timeStampOffset = divCeil(curTick(), tCK);
+    }

     for (auto r : ranks) {
         r->startup(curTick() + tREFI - tRP);

@@ -1815,7 +1808,7 @@
}

void
-DRAMInterface::respondEventDRAM(uint8_t rank)
+DRAMInterface::respondEvent(uint8_t rank)
{
Rank& rank_ref = *ranks[rank];

@@ -1956,7 +1949,7 @@
std::max(ranks[i]->banks[j].preAllowedAt, curTick()) +
tRP;

              // When is the earliest the R/W burst can issue?
  •            const Tick col_allowed_at = ctrl.inReadBusState(false) ?
    
  •            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
    

ranks[i]->banks[j].rdAllowedAt :

ranks[i]->banks[j].wrAllowedAt;
Tick col_at = std::max(col_allowed_at, act_at + tRCD);
@@ -1996,9 +1989,15 @@
return make_pair(bank_mask, hidden_bank_prep);
}

-DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int
_rank,

  •                     DRAMInterface& _dram)
    
  • : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram),
    +DRAMInterface*
    +DRAMInterfaceParams::create()
    +{
  • return new DRAMInterface(this);
    +}

+DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p,

  •                     int _rank, DRAMInterface& _dram)
    
  • : EventManager(&_dram), dram(_dram),
    pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE),
    pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE),
    refreshState(REF_IDLE), inLowPowerState(false), rank(_rank),
    @@ -2011,7 +2010,7 @@
    refreshEvent([this]{ processRefreshEvent(); }, name()),
    powerEvent([this]{ processPowerEvent(); }, name()),
    wakeUpEvent([this]{ processWakeUpEvent(); }, name()),
  •  stats(_ctrl, *this)
    
  •  stats(_dram, *this)
    
    {
    for (int b = 0; b < _p->banks_per_rank; b++) {
    banks[b].bank = b;
    @@ -2062,8 +2061,10 @@
    DRAMInterface::Rank::isQueueEmpty() const
    {
    // check commmands in Q based on current bus direction
  • bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0))
  •                   || (ctrl.inWriteBusState(true) && (writeEntries ==  
    

0));

  • bool no_queued_cmds = (dram.ctrl->inReadBusState(true) &&
  •                      (readEntries == 0))
    
  •                   || (dram.ctrl->inWriteBusState(true) &&
    
  •                      (writeEntries == 0));
    return no_queued_cmds;
    
    }

@@ -2187,7 +2188,7 @@
     // if a request is at the moment being handled and this request is
     // accessing the current rank then wait for it to finish
     if ((rank == dram.activeRank)
-        && (ctrl.requestEventScheduled())) {
+        && (dram.ctrl->requestEventScheduled())) {
         // hand control over to the request loop until it is
         // evaluated next
         DPRINTF(DRAM, "Refresh awaiting draining\n");
@@ -2262,7 +2263,7 @@
             // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled
             // should have outstanding precharge or read response event
             assert(prechargeEvent.scheduled() ||
-                   ctrl.respondEventScheduled());
+                   dram.ctrl->respondEventScheduled());
             // will start refresh when pwrState transitions to IDLE
         }
@@ -2322,8 +2323,8 @@
         assert(!powerEvent.scheduled());

-        if ((ctrl.drainState() == DrainState::Draining) ||
-            (ctrl.drainState() == DrainState::Drained)) {
+        if ((dram.ctrl->drainState() == DrainState::Draining) ||
+            (dram.ctrl->drainState() == DrainState::Drained)) {
            // if draining, do not re-enter low-power mode.
            // simply go to IDLE and wait
            schedulePowerEvent(PWR_IDLE, curTick());
@@ -2548,10 +2549,10 @@
         }

         // completed refresh event, ensure next request is scheduled
-        if (!ctrl.requestEventScheduled()) {
+        if (!dram.ctrl->requestEventScheduled()) {
             DPRINTF(DRAM, "Scheduling next request after refreshing"
                            " rank %d\n", rank);
-            ctrl.restartScheduler(curTick());
+            dram.ctrl->restartScheduler(curTick());
         }
     }
@@ -2610,8 +2611,8 @@
             // bypass auto-refresh and go straight to SREF, where memory
             // will issue refresh immediately upon entry
             if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() &&
-                (ctrl.drainState() != DrainState::Draining) &&
-                (ctrl.drainState() != DrainState::Drained) &&
+                (dram.ctrl->drainState() != DrainState::Draining) &&
+                (dram.ctrl->drainState() != DrainState::Drained) &&
                 dram.enableDRAMPowerdown) {
                 DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning "
                         "to self refresh at %11u tick\n", rank, curTick());
@@ -2712,7 +2713,7 @@
 bool
 DRAMInterface::Rank::forceSelfRefreshExit() const {
     return (readEntries != 0) ||
-           (ctrl.inWriteBusState(true) && (writeEntries != 0));
+           (dram.ctrl->inWriteBusState(true) && (writeEntries != 0));
 }

 DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl)
@@ -2723,15 +2724,15 @@
       ADD_STAT(writeReqs, "Number of write requests accepted"),

       ADD_STAT(readBursts,
-               "Number of DRAM read bursts, "
+               "Number of controller read bursts, "
                "including those serviced by the write queue"),
       ADD_STAT(writeBursts,
-               "Number of DRAM write bursts, "
+               "Number of controller write bursts, "
                "including those merged in the write queue"),
       ADD_STAT(servicedByWrQ,
-               "Number of DRAM read bursts serviced by the write queue"),
+               "Number of controller read bursts serviced by the write queue"),
       ADD_STAT(mergedWrBursts,
-               "Number of DRAM write bursts merged with an existing one"),
+               "Number of controller write bursts merged with an existing one"),
       ADD_STAT(neitherReadNorWriteReqs,
                "Number of requests that are neither read nor write"),
@@ -2739,9 +2740,6 @@
       ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"),
       ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"),

-      ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
-      ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
-
       ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"),
       ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"),
@@ -2756,22 +2754,13 @@
       ADD_STAT(wrPerTurnAround,
                "Writes before turning the bus around for reads"),

-      ADD_STAT(bytesRead, "Total number of bytes read from memory"),
       ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"),
-      ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
       ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"),
       ADD_STAT(bytesWrittenSys,
                "Total written bytes from the system interface side"),

-      ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"),
-      ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"),
       ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"),
       ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"),
-      ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
-
-      ADD_STAT(busUtil, "Data bus utilization in percentage"),
-      ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
-      ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),

       ADD_STAT(totGap, "Total gap between requests"),
       ADD_STAT(avgGap, "Average gap between requests"),
@@ -2803,12 +2792,11 @@
 {
     using namespace Stats;

-    assert(ctrl._system);
-    const auto max_masters = ctrl._system->maxMasters();
+    assert(ctrl.system());
+    const auto max_masters = ctrl.system()->maxMasters();

     avgRdQLen.precision(2);
     avgWrQLen.precision(2);
-    avgBusLat.precision(2);

     readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
     writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
@@ -2823,14 +2811,9 @@
         .init(ctrl.writeBufferSize)
         .flags(nozero);

-    avgRdBW.precision(2);
-    avgWrBW.precision(2);
     avgRdBWSys.precision(2);
     avgWrBWSys.precision(2);
-    peakBW.precision(2);
-    busUtil.precision(2);
     avgGap.precision(2);
-    busUtilWrite.precision(2);

     // per-master bytes read and written to memory
     masterReadBytes
@@ -2862,9 +2845,6 @@
         .flags(nonan)
         .precision(2);

-    busUtilRead
-        .precision(2);
-
     masterWriteRate
         .flags(nozero | nonan)
         .precision(12);
@@ -2878,7 +2858,7 @@
         .precision(2);

     for (int i = 0; i < max_masters; i++) {
-        const std::string master = ctrl._system->getMasterName(i);
+        const std::string master = ctrl.system()->getMasterName(i);
         masterReadBytes.subname(i, master);
         masterReadRate.subname(i, master);
         masterWriteBytes.subname(i, master);
@@ -2892,22 +2872,11 @@
     }

     // Formula stats
-    avgBusLat = totBusLat / (readBursts - servicedByWrQ);
-
-    avgRdBW = (bytesRead / 1000000) / simSeconds;
-    avgWrBW = (bytesWritten / 1000000) / simSeconds;
     avgRdBWSys = (bytesReadSys / 1000000) / simSeconds;
     avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds;
-    peakBW = (SimClock::Frequency / ctrl.dram->burstDataDelay()) *
-              ctrl.dram->bytesPerBurst() / 1000000;
-
-    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;

     avgGap = totGap / (readReqs + writeReqs);

-    busUtilRead = avgRdBW / peakBW * 100;
-    busUtilWrite = avgWrBW / peakBW * 100;
-
     masterReadRate = masterReadBytes / simSeconds;
     masterWriteRate = masterWriteBytes / simSeconds;
     masterReadAvgLat = masterReadTotalLat / masterReadAccesses;
    

@@ -2920,8 +2889,8 @@
     dram.lastStatsResetTick = curTick();
 }

-DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram)
-    : Stats::Group(&_ctrl, csprintf("dram").c_str()),
+DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram)
+    : Stats::Group(&_dram),
       dram(_dram),

       ADD_STAT(readBursts, "Number of DRAM read bursts"),
@@ -2931,10 +2900,13 @@
       ADD_STAT(perBankWrBursts, "Per bank write bursts"),

       ADD_STAT(totQLat, "Total ticks spent queuing"),
+      ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
       ADD_STAT(totMemAccLat,
                "Total ticks spent from burst creation until serviced "
                "by the DRAM"),

       ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"),
+      ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
       ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"),

       ADD_STAT(readRowHits, "Number of row buffer hits during reads"),
@@ -2947,6 +2919,12 @@
       ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
       ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"),
       ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"),
+      ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
+
+      ADD_STAT(busUtil, "Data bus utilization in percentage"),
+      ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
+      ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),
+
       ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined")
 {
@@ -2958,6 +2936,7 @@
     using namespace Stats;

     avgQLat.precision(2);
+    avgBusLat.precision(2);
     avgMemAccLat.precision(2);

     readRowHitRate.precision(2);
@@ -2971,10 +2950,16 @@
              dram.maxAccessesPerRow : dram.rowBufferSize)
         .flags(nozero);

+    peakBW.precision(2);
+    busUtil.precision(2);
+    busUtilWrite.precision(2);
+    busUtilRead.precision(2);
+
     pageHitRate.precision(2);

     // Formula stats
     avgQLat = totQLat / readBursts;
+    avgBusLat = totBusLat / readBursts;
     avgMemAccLat = totMemAccLat / readBursts;

     readRowHitRate = (readRowHits / readBursts) * 100;
@@ -2982,13 +2967,19 @@
     avgRdBW = (bytesRead / 1000000) / simSeconds;
     avgWrBW = (bytesWritten / 1000000) / simSeconds;
+    peakBW = (SimClock::Frequency / dram.burstDataDelay()) *
+              dram.bytesPerBurst() / 1000000;
+
+    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;
+    busUtilRead = avgRdBW / peakBW * 100;
+    busUtilWrite = avgWrBW / peakBW * 100;
+
     pageHitRate = (writeRowHits + readRowHits) /
                   (writeBursts + readBursts) * 100;
 }

-DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank)
-    : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()),
+DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank)
+    : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()),
       rank(_rank),

       ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"),
@@ -3047,7 +3038,7 @@
 DRAMCtrl::recvFunctional(PacketPtr pkt)
 {
     // rely on the abstract memory
-    functionalAccess(pkt);
+    dram->functionalAccess(pkt);
 }

 Port &
@@ -3093,6 +3084,7 @@
         // if we switched to timing mode, kick things into action,
         // and behave as if we restored from a checkpoint
         startup();
+        dram->startup();
     } else if (isTimingMode && !system()->isTimingMode()) {
         // if we switch from timing mode, stop the refresh events to
         // not cause issues with KVM
@@ -3112,7 +3104,7 @@
 DRAMCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(ctrl.getAddrRange());
+    ranges.push_back(ctrl.dram->getAddrRange());
     return ranges;
 }

diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
index dc030b1..417e935 100644
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -55,12 +55,15 @@
#include "enums/AddrMap.hh"
#include "enums/MemSched.hh"
#include "enums/PageManage.hh"
+#include "mem/abstract_mem.hh"
#include "mem/drampower.hh"
#include "mem/qos/mem_ctrl.hh"
#include "mem/qport.hh"
#include "params/DRAMCtrl.hh"
#include "sim/eventq.hh"

+class DRAMInterfaceParams;
+
/**

  • A basic class to track the bank state, i.e. what row is
  • currently open (if any), when is the bank free to accept a new
    @@ -242,7 +245,7 @@
  • The DRAMInterface includes a class for individual ranks
  • and per rank functions.
    /
    -class DRAMInterface : public SimObject
    +class DRAMInterface : public AbstractMemory
    {
    private:
    /
    *
    @@ -342,7 +345,7 @@
    class Rank;
    struct RankStats : public Stats::Group
    {
  •    RankStats(DRAMCtrl &ctrl, Rank &rank);
    
  •    RankStats(DRAMInterface &dram, Rank &rank);
    
        void regStats() override;
        void resetStats() override;
    

@@ -408,13 +411,6 @@
*/
class Rank : public EventManager
{

  •  protected:
    
  •    /**
    
  •     * A reference to the parent DRAMCtrl instance
    
  •     */
    
  •    DRAMCtrl& ctrl;
    
  •   private:
    
        /**
    

@@ -534,10 +530,10 @@
*/
Tick lastBurstTick;

  •    Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
    
  •    Rank(const DRAMInterfaceParams* _p, int _rank,
             DRAMInterface& _dram);
    
  •    const std::string name() const { return csprintf("dram_%d", rank);  
    

}

  •    const std::string name() const { return csprintf("%d", rank); }
    
        /**
         * Kick off accounting for power and refresh states and
    

@@ -659,15 +655,16 @@
* @param next Memory Command
* @return true if timeStamp of Command 1 < timeStamp of Command 2
*/

  • static bool sortTime(const Command& cmd, const Command& cmd_next)
  • static bool
  • sortTime(const Command& cmd, const Command& cmd_next)
    {
    return cmd.timeStamp < cmd_next.timeStamp;
  • };
  • }

    /**
    
  • * A reference to the parent DRAMCtrl instance
    
  • * A pointer to the parent DRAMCtrl instance
     */
    
  • DRAMCtrl& ctrl;
  • DRAMCtrl* ctrl;

    /**

    • Memory controller configuration initialized based on parameter
      @@ -698,6 +695,7 @@
    • DRAM timing requirements
      */
      const Tick M5_CLASS_VAR_USED tCK;
  • const Tick tCS;
    const Tick tCL;
    const Tick tBURST;
    const Tick tBURST_MIN;
    @@ -781,7 +779,7 @@

    struct DRAMStats : public Stats::Group
    {

  •    DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram);
    
  •    DRAMStats(DRAMInterface &dram);
    
        void regStats() override;
        void resetStats() override;
    

@@ -798,10 +796,12 @@

        // Latencies summed over all requests
        Stats::Scalar totQLat;
+        Stats::Scalar totBusLat;
        Stats::Scalar totMemAccLat;

        // Average latencies per request
        Stats::Formula avgQLat;
+        Stats::Formula avgBusLat;
        Stats::Formula avgMemAccLat;

        // Row hit count and rate

@@ -817,6 +817,11 @@
        // Average bandwidth
        Stats::Formula avgRdBW;
        Stats::Formula avgWrBW;
+        Stats::Formula peakBW;
+        // bus utilization
+        Stats::Formula busUtil;
+        Stats::Formula busUtilRead;
+        Stats::Formula busUtilWrite;
        Stats::Formula pageHitRate;
    };
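
Moving these formulas into the per-interface stats lets utilization be
reported against each interface's own peak bandwidth. Illustratively, the
generic relationship looks like the sketch below (an assumed, simplified
form, not necessarily the model's exact formula):

    def bus_util_pct(avg_rd_bw, avg_wr_bw, peak_bw):
        # utilization = achieved bandwidth / theoretical peak, in percent
        return 100.0 * (avg_rd_bw + avg_wr_bw) / peak_bw

    print(bus_util_pct(6.4e9, 3.2e9, 12.8e9))  # 75.0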
    

@@ -828,16 +833,28 @@
    std::vector<Rank*> ranks;

  public:
+    /**
+     * Buffer sizes for read and write queues in the controller
+     * These are passed to the controller on instantiation
+     * Defining them here allows for buffers to be resized based
+     * on memory type / configuration.
+     */
+    const uint32_t readBufferSize;
+    const uint32_t writeBufferSize;
+
+    /** Setting a pointer to the controller */
+    void setCtrl(DRAMCtrl* _ctrl) { ctrl = _ctrl; }
+
+    /**
     * Initialize the DRAM interface and verify parameters
-     * @param range is the address range for this interface
     */
-    void init(AddrRange range);
+    void init() override;

    /**
     * Iterate through dram ranks and instantiate per rank startup routine
     */
-    void startupRanks();
+    void startup() override;

    /**
     * Iterate through dram ranks to exit self-refresh in order to drain
@@ -861,15 +878,26 @@
    void suspend();

    /**
+     * Get an address in a dense range which starts from 0. The input
+     * address is the physical address of the request in an address
+     * space that contains other SimObjects apart from this
+     * controller.
+     *
+     * @param addr The input address which should be in the addrRange
+     * @return An address in the continuous range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr) { return range.getOffset(addr); }
+
+    /**
     * @return number of bytes in a burst for this interface
     */
-    uint32_t bytesPerBurst() const { return burstSize; };
+    uint32_t bytesPerBurst() const { return burstSize; }

    /**
     * @return number of ranks per channel for this interface
     */
-    uint32_t numRanks() const { return ranksPerChannel; };
+    uint32_t numRanks() const { return ranksPerChannel; }

    /**
     * @return time to send a burst of data
@@ -879,7 +907,8 @@
    /**
     * @return time to send a burst of data without gaps
     */
-    Tick burstDataDelay() const
+    Tick
+    burstDataDelay() const
    {
        return (burstInterleave ? tBURST_MAX / 2 : tBURST);
    }
@@ -893,7 +922,14 @@
    /**
     * @return additional bus turnaround required for read-to-write
     */
-    Tick minRdToWr() const { return tRTW; };
+    Tick minRdToWr() const { return tRTW; }
+
+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() const { return tCS; }

    /**
     * Function to calculate RAS cycle time for use within and
@@ -957,7 +993,8 @@
     *                     This requires the DRAM to be in the
     *                     REF IDLE state
     */
-    bool burstReady(uint8_t rank) const
+    bool
+    burstReady(uint8_t rank) const
    {
        return ranks[rank]->inRefIdleState();
    }
@@ -979,7 +1016,7 @@
    /**
     * @param rank Specifies rank associated with read burst
     */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);

    /**
     * Check the refresh state to determine if refresh needs
@@ -989,8 +1026,7 @@
     */
    void checkRefreshState(uint8_t rank);

-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
};

/**
@@ -1141,20 +1177,6 @@
    void accessAndRespond(PacketPtr pkt, Tick static_latency);

    /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The input address which should be in the addrRange
-     * @return An address in the continuous range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
     * The memory scheduler/arbiter - picks which request needs to
     * go next, based on the specified policy such as FCFS or FR-FCFS
     * and moves it to the head of the queue.
@@ -1237,6 +1259,11 @@
    std::unordered_multiset<Tick> burstTicks;

    /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
     * The following are basic design parameters of the memory
     * controller, and are initialized based on parameter values.
     * The rowsPerBank is determined based on the capacity, number of
@@ -1251,12 +1278,6 @@
    uint32_t readsThisTime;

    /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
     * Memory controller configuration initialized based on parameter
     * values.
     */
@@ -1310,10 +1331,6 @@
        // Average queue lengths
        Stats::Average avgRdQLen;
        Stats::Average avgWrQLen;
-        // Latencies summed over all requests
-        Stats::Scalar totBusLat;
-        // Average latencies per request
-        Stats::Formula avgBusLat;

        Stats::Scalar numRdRetry;
        Stats::Scalar numWrRetry;

@@ -1324,21 +1341,12 @@
        Stats::Histogram rdPerTurnAround;
        Stats::Histogram wrPerTurnAround;

-        Stats::Scalar bytesRead;
        Stats::Scalar bytesReadWrQ;
-        Stats::Scalar bytesWritten;
        Stats::Scalar bytesReadSys;
        Stats::Scalar bytesWrittenSys;
        // Average bandwidth
-        Stats::Formula avgRdBW;
-        Stats::Formula avgWrBW;
        Stats::Formula avgRdBWSys;
        Stats::Formula avgWrBWSys;
-        Stats::Formula peakBW;
-        // bus utilization
-        Stats::Formula busUtil;
-        Stats::Formula busUtilRead;
-        Stats::Formula busUtilWrite;

        Stats::Scalar totGap;
        Stats::Formula avgGap;

@@ -1367,11 +1375,6 @@
    CtrlStats stats;

    /**
-     * Create pointer to interface to the actual media
-     */
-    DRAMInterface* dram;
-
-    /**
     * Upstream caches need this packet until true is returned, so
     * hold it for deletion until a subsequent call
     */
@@ -1449,13 +1452,6 @@
    void restartScheduler(Tick tick) { schedule(nextReqEvent, tick); }

    /**
-     * Determine the required delay for an access to a different rank
-     *
-     * @return required rank to rank delay
-     */
-    Tick rankDelay() const { return tCS; }
-
-    /**
     * Check the current direction of the memory channel
     * @param next_state Check either the current or next bus state

diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc
index 13551a0..96dcb55 100644
--- a/src/mem/drampower.cc
+++ b/src/mem/drampower.cc
@@ -40,13 +40,13 @@
#include "base/intmath.hh"
#include "sim/core.hh"

-DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) :
+DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) :
powerlib(libDRAMPower(getMemSpec(p), include_io))
{
}

Data::MemArchitectureSpec
-DRAMPower::getArchParams(const DRAMCtrlParams* p)
+DRAMPower::getArchParams(const DRAMInterfaceParams* p)
{
Data::MemArchitectureSpec archSpec;
archSpec.burstLength = p->burst_length;
@@ -68,7 +68,7 @@
}

Data::MemTimingSpec
-DRAMPower::getTimingParams(const DRAMCtrlParams* p)
+DRAMPower::getTimingParams(const DRAMInterfaceParams* p)
{
// Set the values that are used for power calculations and ignore
// the ones only used by the controller functionality in DRAMPower
@@ -100,7 +100,7 @@
}

Data::MemPowerSpec
-DRAMPower::getPowerParams(const DRAMCtrlParams* p)
+DRAMPower::getPowerParams(const DRAMInterfaceParams* p)
{
// All DRAMPower currents are in mA
Data::MemPowerSpec powerSpec;
@@ -132,7 +132,7 @@
}

Data::MemorySpecification
-DRAMPower::getMemSpec(const DRAMCtrlParams* p)
+DRAMPower::getMemSpec(const DRAMInterfaceParams* p)
{
Data::MemorySpecification memSpec;
memSpec.memArchSpec = getArchParams(p);
@@ -142,7 +142,18 @@
}

bool
-DRAMPower::hasTwoVDD(const DRAMCtrlParams* p)
+DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p)
{
return p->VDD2 == 0 ? false : true;
}
+
+uint8_t
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
+{
+    uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
+    uint8_t data_rate = p->burst_length / burst_cycles;
+    // 4 for GDDR5
+    if (data_rate != 1 && data_rate != 2 && data_rate != 4 && data_rate != 8)
+        fatal("Got unexpected data rate %d, should be 1 or 2 or 4 or 8\n",
+              data_rate);
+    return data_rate;
+}
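
As a sanity check of this computation, take the stock DDR3-1600 interface
from this change: tCK = 1.25 ns, tBURST (and thus tBURST_MAX) = 5 ns, burst
length 8. That gives divCeil(5, 1.25) = 4 bus clocks and 8 / 4 = 2 beats per
clock, i.e. a DDR device. The same arithmetic in a minimal Python sketch
(the function name is illustrative, not part of the change):

    import math

    def data_rate(tBURST_MAX_ns, tCK_ns, burst_length):
        # Beats per bus clock, mirroring DRAMPower::getDataRate
        burst_cycles = math.ceil(tBURST_MAX_ns / tCK_ns)
        rate = burst_length // burst_cycles
        assert rate in (1, 2, 4, 8), "unexpected data rate"
        return rate

    print(data_rate(5.0, 1.25, 8))  # DDR3-1600: 4 cycles, 8 beats -> 2 (DDR)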
    diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
    index da24bca..da68a78 100644
    --- a/src/mem/drampower.hh
    +++ b/src/mem/drampower.hh
    @@ -44,7 +44,7 @@
    #define MEM_DRAM_POWER_HH

#include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"

/**
 * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,38 +57,44 @@

    /**
     * Transform the architecture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
     */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+                                     const DRAMInterfaceParams* p);

    /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
     * the memSpec of DRAMPower
     */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);

    /**
     * Transforms the power and current parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
     */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);

+    /**
+     * Determine data rate, either one or two.
+     */
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);
+
    /**
     * Determine if DRAM has two voltage domains (or one)
     */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);

    /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
     */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);

  public:

    // Instance of DRAMPower Library
    libDRAMPower powerlib;

-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);

};

diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -34,18 +34,21 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from m5.params import *
-from m5.objects.AbstractMemory import AbstractMemory
+from m5.proxy import *
+from m5.objects.ClockedObject import ClockedObject
from m5.objects.QoSTurnaround import *

# QoS Queue Selection policy used to select packets among same-QoS queues
class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"]

-class QoSMemCtrl(AbstractMemory):
+class QoSMemCtrl(ClockedObject):
    type = 'QoSMemCtrl'
    cxx_header = "mem/qos/mem_ctrl.hh"
    cxx_class = 'QoS::MemCtrl'
    abstract = True

+    system = Param.System(Parent.any,
+                          "System that the controller belongs to.")
+
    ##### QoS support parameters ####

    # Number of priorities in the system
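
Because the controller is no longer an AbstractMemory, it now picks up its
System through this explicit proxy parameter; the default Parent.any
resolves when the object hierarchy is instantiated, so config scripts need
no change. A minimal, illustrative sketch:

    from m5.objects import *

    system = System()
    # Any QoSMemCtrl subclass placed under 'system' resolves its
    # 'system' parameter to this System object via Parent.any.
    system.mem_ctrl = QoSMemSinkCtrl()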
    

diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py
index 6c4f263..fafac64 100644
--- a/src/mem/qos/QoSMemSinkCtrl.py
+++ b/src/mem/qos/QoSMemSinkCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -37,6 +37,7 @@

from m5.params import *
from m5.objects.QoSMemCtrl import *
+from m5.objects.QoSMemSinkInterface import *

class QoSMemSinkCtrl(QoSMemCtrl):
    type = 'QoSMemSinkCtrl'
@@ -44,6 +45,10 @@
    cxx_class = "QoS::MemSinkCtrl"
    port = ResponsePort("Response ports")

+    interface = Param.QoSMemSinkInterface(QoSMemSinkInterface(),
+                                          "Interface to memory")
+
    # the basic configuration of the controller architecture, note
    # that each entry corresponds to a burst for the specific DRAM
    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
    # the cacheline size or request/packet size

@@ -59,5 +64,3 @@

    # response latency - time to issue a response once a request is
    # serviced
    response_latency = Param.Latency("20ns", "Memory response latency")
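
With the interface exposed as a parameter, the backing memory of the sink is
sized on the interface (an AbstractMemory) rather than on the controller. A
hypothetical override in a config script:

    from m5.objects import *

    ctrl = QoSMemSinkCtrl()
    # The address range now lives on the QoSMemSinkInterface, which
    # inherits it from AbstractMemory.
    ctrl.interface = QoSMemSinkInterface(range = AddrRange('512MB'))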

diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py
new file mode 100644
index 0000000..5c79f64
--- /dev/null
+++ b/src/mem/qos/QoSMemSinkInterface.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from m5.objects.AbstractMemory import AbstractMemory
+
+class QoSMemSinkInterface(AbstractMemory):
+    type = 'QoSMemSinkInterface'
+    cxx_header = "mem/qos/mem_sink.hh"

diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript
index f8601b6..1d90f9c 100644
--- a/src/mem/qos/SConscript
+++ b/src/mem/qos/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall

@@ -37,6 +37,7 @@

SimObject('QoSMemCtrl.py')
SimObject('QoSMemSinkCtrl.py')
+SimObject('QoSMemSinkInterface.py')
SimObject('QoSPolicy.py')
SimObject('QoSTurnaround.py')

diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc
index 50e6035..190960b 100644
--- a/src/mem/qos/mem_ctrl.cc
+++ b/src/mem/qos/mem_ctrl.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited
+ * Copyright (c) 2017-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -42,7 +42,7 @@
namespace QoS {

MemCtrl::MemCtrl(const QoSMemCtrlParams * p)
-    : AbstractMemory(p),
+    : ClockedObject(p),
      policy(p->qos_policy),
      turnPolicy(p->qos_turnaround_policy),
      queuePolicy(QueuePolicy::create(p)),
@@ -51,7 +51,8 @@
      qosSyncroScheduler(p->qos_syncro_scheduler),
      totalReadQueueSize(0), totalWriteQueueSize(0),
      busState(READ), busStateNext(READ),
-      stats(*this)
+      stats(*this),
+      _system(p->system)
{
    // Set the priority policy
    if (policy) {
@@ -77,12 +78,6 @@
{}

void
-MemCtrl::init()
-{
-    AbstractMemory::init();
-}
-
-void
MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos,
                    Addr addr, uint64_t entries)
{
diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh
index 0e29fcc..5d7c9d6 100644
--- a/src/mem/qos/mem_ctrl.hh
+++ b/src/mem/qos/mem_ctrl.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited
+ * Copyright (c) 2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -36,10 +36,10 @@
 */

    #include "debug/QOS.hh"
    -#include "mem/abstract_mem.hh"
    -#include "mem/qos/q_policy.hh"
    #include "mem/qos/policy.hh"
    +#include "mem/qos/q_policy.hh"
    #include "params/QoSMemCtrl.hh"
    +#include "sim/clocked_object.hh"
    #include "sim/system.hh"

    #include <unordered_map>
    @@ -56,7 +56,7 @@

    • which support QoS - it provides access to a set of QoS
    • scheduling policies
      /
      -class MemCtrl: public AbstractMemory
      +class MemCtrl : public ClockedObject
      {
      public:
      /
      * Bus Direction */
@@ -151,6 +151,9 @@
        Stats::Scalar numStayWriteState;
    } stats;

+    /** Pointer to the System object */
+    System* _system;
+
    /**
     * Initializes dynamically counters and
     * statistics for a given Master

@@ -266,11 +269,6 @@
    virtual ~MemCtrl();

    /**
-     * Initializes this object
-     */
-    void init() override;
-
-    /**
     * Gets the current bus state
     * @return current bus state
@@ -346,6 +344,10 @@
     * @return total number of priority levels
     */
    uint8_t numPriorities() const { return _numPriorities; }
+
+    /** Read the system pointer
+     * @return pointer to the system object */
+    System* system() const { return _system; }
};

template<typename Queues>
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..dbdf548 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
      #include "debug/Drain.hh"
      #include "debug/QOS.hh"
      #include "mem_sink.hh"
      +#include "params/QoSMemSinkInterface.hh"
      #include "sim/system.hh"

namespace QoS {
@@ -50,12 +51,15 @@
    memoryPacketSize(p->memory_packet_size),
    readBufferSize(p->read_buffer_size),
    writeBufferSize(p->write_buffer_size), port(name() + ".port", *this),
+    interface(p->interface),
    retryRdReq(false), retryWrReq(false), nextRequest(0),
    nextReqEvent(this)
{
    // Resize read and write queue to allocate space
    // for configured QoS priorities
    readQueue.resize(numPriorities());
    writeQueue.resize(numPriorities());
+
+    interface->setMemCtrl(this);
}

MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
             "%s Should not see packets where cache is responding\n",
             func);

-    access(pkt);
+    interface->access(pkt);
    return responseLatency;
}

@@ -101,7 +105,7 @@
{
    pkt->pushLabel(name());

-    functionalAccess(pkt);
+    interface->functionalAccess(pkt);

    pkt->popLabel();
}

@@ -279,7 +283,7 @@

    // Do the actual memory access which also turns the packet
    // into a response
-    access(pkt);
+    interface->access(pkt);

    // Log the response
    logResponse(pkt->isRead()? READ : WRITE,

@@ -351,7 +355,7 @@
MemSinkCtrl::MemoryPort::getAddrRanges() const
{
    AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.interface->getAddrRange());
    return ranges;
}

@@ -390,3 +394,13 @@
    return new QoS::MemSinkCtrl(this);
}

+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
    diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
    index 9a51269..5f6c1be 100644
    --- a/src/mem/qos/mem_sink.hh
    +++ b/src/mem/qos/mem_sink.hh
    @@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
      @@ -41,10 +41,14 @@
      #ifndef MEM_QOS_MEM_SINK_HH
      #define MEM_QOS_MEM_SINK_HH

+#include "mem/abstract_mem.hh"
#include "mem/qos/mem_ctrl.hh"
#include "mem/qport.hh"
#include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
namespace QoS {

/**
@@ -163,6 +167,11 @@
    /** Memory slave port */
    MemoryPort port;

+    /**
+     * Create pointer to interface of actual media
+     */
+    QoSMemSinkInterface* const interface;
+
    /** Read request pending */
    bool retryRdReq;
    

@@ -244,4 +253,17 @@

} // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Setting a pointer to the interface */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl) { ctrl = _ctrl; };
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
#endif /* MEM_QOS_MEM_SINK_HH */

diff --git a/tests/gem5/configs/base_config.py b/tests/gem5/configs/base_config.py
index b5bddf4..cbea768 100644
--- a/tests/gem5/configs/base_config.py
+++ b/tests/gem5/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -220,7 +220,12 @@
        super(BaseSESystem, self).init_system(system)

    def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                        membus = SystemXBar(),
                        mem_mode = self.mem_mode,
                        multi_thread = (self.num_threads > 1))
    

@@ -272,8 +277,16 @@
        else:
            # create the memory controllers and connect them, stick with
            # the physmem name to avoid bumping all the reference stats
-            system.physmem = [self.mem_class(range = r)
-                              for r in system.mem_ranges]
+            if issubclass(self.mem_class, m5.objects.DRAMInterface):
+                mem_ctrls = []
+                for r in system.mem_ranges:
+                    mem_ctrl = DRAMCtrl()
+                    mem_ctrl.dram = self.mem_class(range = r)
+                    mem_ctrls.append(mem_ctrl)
+                system.physmem = mem_ctrls
+            else:
+                system.physmem = [self.mem_class(range = r)
+                                  for r in system.mem_ranges]
            for i in range(len(system.physmem)):
                system.physmem[i].port = system.membus.master
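
The same pattern works in any script that builds one controller per memory
range; a standalone sketch of the new scheme (the helper name is
illustrative, not part of this change):

    from m5.objects import *

    def make_dram_ctrls(system, intf_class = DDR3_1600_8x8):
        # One DRAMCtrl per range, each driving its own DRAM interface
        ctrls = []
        for r in system.mem_ranges:
            ctrl = DRAMCtrl()
            ctrl.dram = intf_class(range = r)
            ctrls.append(ctrl)
        return ctrls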
    

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 13
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-Reviewer: Daniel Carvalho <odanrc(a)yahoo.com.br>
Gerrit-Reviewer: Jason Lowe-Power <power.jg(a)gmail.com>
Gerrit-Reviewer: John Alsop <johnathan.alsop(a)amd.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba(a)amd.com>
Gerrit-Reviewer: Nikos Nikoleris <nikos.nikoleris(a)arm.com>
Gerrit-Reviewer: Srikant Bharadwaj <srikant.bharadwaj(a)amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro(a)google.com>
Gerrit-MessageType: merged

Jason Lowe-Power has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make MemCtrl a ClockedObject
......................................................................

mem: Make MemCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter to the
interface, which is setup when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28968
Reviewed-by: Jason Lowe-Power <power.jg(a)gmail.com>
Maintainer: Jason Lowe-Power <power.jg(a)gmail.com>
Tested-by: kokoro <noreply+kokoro(a)google.com>
---
M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/example/memcheck.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M configs/ruby/Ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/gem5/configs/base_config.py
26 files changed, 1,913 insertions(+), 1,736 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass
[The merged patch set's full diff repeats the review diff above; only the hunks for the two config files new to this patch set are reproduced here.]

diff --git a/configs/example/memcheck.py b/configs/example/memcheck.py
index 6d80d60..6bccd54 100644
--- a/configs/example/memcheck.py
+++ b/configs/example/memcheck.py
@@ -217,7 +217,7 @@
proto_tester = TrafficGen(config_file = cfg_file_path)

# Set up the system along with a DRAM controller
-system = System(physmem = DDR3_1600_8x8())
+system = System(physmem = DRAMCtrl(dram = DDR3_1600_8x8()))

system.voltage_domain = VoltageDomain(voltage = '1V')

diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 9bceaa3..9f400a8 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -130,15 +130,16 @@
    dir_ranges = []
    for r in system.mem_ranges:
        mem_type = ObjectList.mem_list.get(options.mem_type)
-        mem_ctrl = MemConfig.create_mem_ctrl(mem_type, r, index,
+        dram_intf = MemConfig.create_mem_intf(mem_type, r, index,
            options.num_dirs, int(math.log(options.num_dirs, 2)),
            intlv_size, options.xor_low_bit)
+        mem_ctrl = m5.objects.DRAMCtrl(dram = dram_intf)

        if options.access_backing_store:
            mem_ctrl.kvm_map=False

        mem_ctrls.append(mem_ctrl)
-        dir_ranges.append(mem_ctrl.range)
+        dir_ranges.append(mem_ctrl.dram.range)

        if crossbar != None:
            mem_ctrl.port = crossbar.master
-# Total channel capacity is 4GB -# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel -class DDR4_2400_4x16(DDR4_2400_16x4): - # 4x16 configuration, 4 devices each with an 16-bit interface - device_bus_width = 16 - - # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) - device_rowbuffer_size = '2kB' - - # 4x16 configuration, so 4 devices - devices_per_rank = 4 - - # Single rank for x16 - ranks_per_channel = 1 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 2 for x16 case - bank_groups_per_rank = 2 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 8 - - # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) - tRRD = '5.3ns' - - # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) - tRRD_L = '6.4ns'; - - tXAW = '30ns' - - # Current values from datasheet - IDD0 = '80mA' - IDD02 = '4mA' - IDD2N = '34mA' - IDD3N = '47mA' - IDD4W = '228mA' - IDD4R = '243mA' - IDD5 = '280mA' - IDD3P1 = '41mA' - -# A single LPDDR2-S4 x32 interface (one command/address bus), with -# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) -# in a 1x32 configuration. -class LPDDR2_S4_1066_1x32(DRAMCtrl): - # No DLL in LPDDR2 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR2_S4 is a BL4 and BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1KB - # (this depends on the memory density) - device_rowbuffer_size = '1kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR2-S4 has 8 banks in all configurations - banks_per_rank = 8 - - # 533 MHz - tCK = '1.876ns' - - # Fixed at 15 ns - tRCD = '15ns' - - # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time - tCL = '15ns' - - # Pre-charge one bank 15 ns (all banks 18 ns) - tRP = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - tRTP = '7.5ns' - - # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. - # Note this is a BL8 DDR device. - # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '7.5ns' - - # LPDDR2-S4, 4 Gbit - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns - tRTW = '3.75ns' - - # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns - tCS = '3.75ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of density, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '15mA' - IDD02 = '70mA' - IDD2N = '2mA' - IDD2N2 = '30mA' - IDD3N = '2.5mA' - IDD3N2 = '30mA' - IDD4W = '10mA' - IDD4W2 = '190mA' - IDD4R = '3mA' - IDD4R2 = '220mA' - IDD5 = '40mA' - IDD52 = '150mA' - IDD3P1 = '1.2mA' - IDD3P12 = '8mA' - IDD2P1 = '0.6mA' - IDD2P12 = '0.8mA' - IDD6 = '1mA' - IDD62 = '3.2mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single WideIO x128 interface (one command and address bus), with -# default timings based on an estimated WIO-200 8 Gbit part. 
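Before the WideIO class, it may help to make the geometry parameters concrete: the controller's access granularity is the interface width times the burst length, and requests larger than one burst are split, as the LPDDR2 comment above notes. A small plain-Python sketch (the helper name is illustrative, not gem5 code):

def burst_size_bytes(device_bus_width, devices_per_rank, burst_length):
    # interface width in bytes, times beats per burst
    return (device_bus_width * devices_per_rank // 8) * burst_length

# DDR4_2400_4x16 above: 4 x16 devices -> x64 bus, BL8 -> 64 bytes per burst
print(burst_size_bytes(16, 4, 8))        # -> 64
# LPDDR2_S4_1066_1x32 above: one x32 device, BL8 -> 32 bytes per burst,
# so a 64-byte cache line is broken into two bursts by the controller
print(64 // burst_size_bytes(32, 1, 8))  # -> 2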
-class WideIO_200_1x128(DRAMCtrl): - # No DLL for WideIO - dll = False - - # size of device - device_size = '1024MB' - - # 1x128 configuration, 1 device with a 128-bit interface - device_bus_width = 128 - - # This is a BL4 device - burst_length = 4 - - # Each device has a page (row buffer) size of 4KB - # (this depends on the memory density) - device_rowbuffer_size = '4kB' - - # 1x128 configuration, so 1 device - devices_per_rank = 1 - - # Use one rank for a one-high die stack - ranks_per_channel = 1 - - # WideIO has 4 banks in all configurations - banks_per_rank = 4 - - # 200 MHz - tCK = '5ns' - - # WIO-200 - tRCD = '18ns' - tCL = '18ns' - tRP = '18ns' - tRAS = '42ns' - tWR = '15ns' - # Read to precharge is same as the burst - tRTP = '20ns' - - # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. - # Note this is a BL4 SDR device. - tBURST = '20ns' - - # WIO 8 Gb - tRFC = '210ns' - - # WIO 8 Gb, <=85C, half for >85C - tREFI = '3.9us' - - # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns - tWTR = '15ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns - tRTW = '10ns' - - # Default different rank bus delay to 2 CK, @200 MHz = 10 ns - tCS = '10ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Two instead of four activation window - tXAW = '50ns' - activation_limit = 2 - - # The WideIO specification does not provide current information - -# A single LPDDR3 x32 interface (one command/address bus), with -# default timings based on a LPDDR3-1600 4 Gbit part (Micron -# EDF8132A1MC) in a 1x32 configuration. -class LPDDR3_1600_1x32(DRAMCtrl): - # No DLL for LPDDR3 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 4KB - device_rowbuffer_size = '4kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Technically the datasheet is a dual-rank package, but for - # comparison with the LPDDR2 config we stick to a single rank - ranks_per_channel = 1 - - # LPDDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - tRCD = '18ns' - - # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time - tCL = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns - tRTP = '7.5ns' - - # Pre-charge one bank 18 ns (all banks 21 ns) - tRP = '18ns' - - # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. - # Note this is a BL8 DDR device. 
- # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '5ns' - - # LPDDR3, 4 Gb - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of size, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '8mA' - IDD02 = '60mA' - IDD2N = '0.8mA' - IDD2N2 = '26mA' - IDD3N = '2mA' - IDD3N2 = '34mA' - IDD4W = '2mA' - IDD4W2 = '190mA' - IDD4R = '2mA' - IDD4R2 = '230mA' - IDD5 = '28mA' - IDD52 = '150mA' - IDD3P1 = '1.4mA' - IDD3P12 = '11mA' - IDD2P1 = '0.8mA' - IDD2P12 = '1.8mA' - IDD6 = '0.5mA' - IDD62 = '1.8mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single GDDR5 x64 interface, with -# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix -# H5GQ1H24AFR) in a 2x32 configuration. -class GDDR5_4000_2x32(DRAMCtrl): - # size of device - device_size = '128MB' - - # 2x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # GDDR5 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 2Kbits (256Bytes) - device_rowbuffer_size = '256B' - - # 2x32 configuration, so 2 devices - devices_per_rank = 2 - - # assume single rank - ranks_per_channel = 1 - - # GDDR5 has 4 bank groups - bank_groups_per_rank = 4 - - # GDDR5 has 16 banks with 4 bank groups - banks_per_rank = 16 - - # 1000 MHz - tCK = '1ns' - - # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz - # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz ) - # 8 beats at 4000 MHz = 2 beats at 1000 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '2ns' - - # @1000MHz data rate, tCCD_L is 3 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '3ns'; - - tRCD = '12ns' - - # tCL is not directly found in datasheet and assumed equal tRCD - tCL = '12ns' - - tRP = '12ns' - tRAS = '28ns' - - # RRD_S (different bank group) - # RRD_S is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD = '6ns' - - # RRD_L (same bank group) - # RRD_L is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD_L = '6ns' - - tXAW = '23ns' - - # tXAW < 4 x tRRD. - # Therefore, activation limit is set to 0 - activation_limit = 0 - - tRFC = '65ns' - tWR = '12ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Read-to-Precharge 2 CK - tRTP = '2ns' - - # Assume 2 cycles - tRTW = '2ns' - -# A single HBM x128 interface (one command and address bus), with -# default timings based on data publically released -# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014), -# IDD measurement values, and by extrapolating data from other classes. -# Architecture values based on published HBM spec -# A 4H stack is defined, 2Gb per die for a total of 1GB of memory. 
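The GDDR5 class above is the clearest example of bank-group timing: consecutive column commands to the same bank group must be spaced by tCCD_L, while commands to different bank groups only pay tBURST (i.e. tCCD_S). A minimal plain-Python sketch of that selection using the GDDR5 numbers (an illustrative helper, not the controller's actual scheduling code):

def cas_to_cas_gap_ns(same_bank_group, tBURST_ns, tCCD_L_ns):
    # the same bank group pays the longer tCCD_L; different groups pay
    # tCCD_S, which the model expresses through tBURST
    return tCCD_L_ns if same_bank_group else tBURST_ns

print(cas_to_cas_gap_ns(True, 2.0, 3.0))    # same group  -> 3.0 ns
print(cas_to_cas_gap_ns(False, 2.0, 3.0))   # cross group -> 2.0 ns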
-class HBM_1000_4H_1x128(DRAMCtrl): - # HBM gen1 supports up to 8 128-bit physical channels - # Configuration defines a single channel, with the capacity - # set to (full_ stack_capacity / 8) based on 2Gb dies - # To use all 8 channels, set 'channels' parameter to 8 in - # system configuration - - # 128-bit interface legacy mode - device_bus_width = 128 - - # HBM supports BL4 and BL2 (legacy mode only) - burst_length = 4 - - # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack; - # with 8 channels, 128MB per channel - device_size = '128MB' - - device_rowbuffer_size = '2kB' - - # 1x128 configuration - devices_per_rank = 1 - - # HBM does not have a CS pin; set rank to 1 - ranks_per_channel = 1 - - # HBM has 8 or 16 banks depending on capacity - # 2Gb dies have 8 banks - banks_per_rank = 8 - - # depending on frequency, bank groups may be required - # will always have 4 bank groups when enabled - # current specifications do not define the minimum frequency for - # bank group architecture - # setting bank_groups_per_rank to 0 to disable until range is defined - bank_groups_per_rank = 0 - - # 500 MHz for 1Gbps DDR data rate - tCK = '2ns' - - # use values from IDD measurement in JEDEC spec - # use tRP value for tRCD and tCL similar to other classes - tRP = '15ns' - tRCD = '15ns' - tCL = '15ns' - tRAS = '33ns' - - # BL2 and BL4 supported, default to BL4 - # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns - tBURST = '4ns' - - # value for 2Gb device from JEDEC spec - tRFC = '160ns' - - # value for 2Gb device from JEDEC spec - tREFI = '3.9us' - - # extrapolate the following from LPDDR configs, using ns values - # to minimize burst length, prefetch differences - tWR = '18ns' - tRTP = '7.5ns' - tWTR = '10ns' - - # start with 2 cycles turnaround, similar to other memory classes - # could be more with variations across the stack - tRTW = '4ns' - - # single rank device, set to 0 - tCS = '0ns' - - # from MemCon example, tRRD is 4ns with 2ns tCK - tRRD = '4ns' - - # from MemCon example, tFAW is 30ns with 2ns tCK - tXAW = '30ns' - activation_limit = 4 - - # 4tCK - tXP = '8ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '168ns' - -# A single HBM x64 interface (one command and address bus), with -# default timings based on HBM gen1 and data publically released -# A 4H stack is defined, 8Gb per die for a total of 4GB of memory. -# Note: This defines a pseudo-channel with a unique controller -# instantiated per pseudo-channel -# Stay at same IO rate (1Gbps) to maintain timing relationship with -# HBM gen1 class (HBM_1000_4H_x128) where possible -class HBM_1000_4H_1x64(HBM_1000_4H_1x128): - # For HBM gen2 with pseudo-channel mode, configure 2X channels. 
- # Configuration defines a single pseudo channel, with the capacity - # set to (full_stack_capacity / 16) based on 8Gb dies - # To use all 16 pseudo channels, set 'channels' parameter to 16 in - # system configuration - - # 64-bit pseudo-channel interface - device_bus_width = 64 - - # HBM pseudo-channel only supports BL4 - burst_length = 4 - - # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack; - # with 16 channels, 256MB per channel - device_size = '256MB' - - # page size is halved with pseudo-channel; maintaining the same number - # of rows per pseudo-channel with 2X banks across 2 channels - device_rowbuffer_size = '1kB' - - # HBM has 8 or 16 banks depending on capacity - # Starting with 4Gb dies, 16 banks are defined - banks_per_rank = 16 - - # reset tRFC for larger, 8Gb device - # use HBM1 4Gb value as a starting point - tRFC = '260ns' - - # start with tRFC + tXP -> 260ns + 8ns = 268ns - tXS = '268ns' - # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns - tCS = '2ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '10ns' - - # self refresh exit time - tXS = '65ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl): - - # Increase buffer size to account for more bank resources - read_buffer_size = 64 - - # Set page policy to better suit DMC Huxley - page_policy = 'close_adaptive' - - # 16-bit channel interface - device_bus_width = 16 - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL32 for higher command bandwidth - burst_length = 32 - - # size of device in bytes - device_size = '1GB' - - # 2kB page with BG mode - device_rowbuffer_size = '2kB' - - # Use a 1x16 configuration - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Initial configuration will have 16 banks with Bank Group Arch - # to maximize resources and enable higher data rates - banks_per_rank = 16 - bank_groups_per_rank = 4 - - # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.455ns' - - # Greater of 2 CK or 18ns - tRCD = '18ns' - - # Base RL is 16 CK @ 687.5 MHz = 23.28ns - tCL = '23.280ns' - - # Greater of 2 CK or 18ns - tRP = '18ns' - - # Greater of 3 CK or 42ns - tRAS = '42ns' - - # Greater of 3 CK or 34ns - tWR = '34ns' - - # active powerdown and precharge powerdown exit time - # Greater of 3 CK or 7ns - tXP = '7ns' - - # self refresh exit time (tRFCab + 7.5ns) - tXS = '217.5ns' - - # Greater of 2 CK or 7.5 ns minus 2 CK - tRTP = '4.59ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. 
- # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz - tBURST = '8.73ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - tBURST_MIN = '2.91ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 687.5 MHz - tBURST_MAX = '11.64ns' - - # 8 CK @ 687.5 MHz - tCCD_L = "11.64ns" - - # LPDDR5, 8 Gbit/channel for 280ns tRFCab - tRFC = '210ns' - tREFI = '3.9us' - - # Greater of 4 CK or 6.25 ns - tWTR = '6.25ns' - # Greater of 4 CK or 12 ns - tWTR_L = '12ns' - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.455ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns - tCS = '2.91ns' - - # 2 CK - tPPD = '2.91ns' - - # Greater of 2 CK or 5 ns - tRRD = '5ns' - tRRD_L = '5ns' - - # With Bank Group Arch mode tFAW is 20 ns - tXAW = '20ns' - activation_limit = 4 - - # at 5Gbps, 4:1 WCK to CK ratio required - # 2 data beats per WCK (DDR) -> 8 per CK - beats_per_clock = 8 - - # 2 cycles required to send activate command - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - two_cycle_activate = True - tAAD = '11.640ns' - - data_clock_sync = True - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '2.91ns' - tBURST_MIN = '2.91ns' - # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST_MAX = '5.82ns' - - # 4 CK @ 687.5 MHz - tCCD_L = "5.82ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '5.82ns' - tBURST_MIN = '5.82ns' - tBURST_MAX = '5.82ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 
6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK - tCK = '1.25ns' - - # Base RL is 17 CK @ 800 MHz = 21.25ns - tCL = '21.25ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. - # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz - tBURST = '7.5ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - tBURST_MIN = '2.5ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 800 MHz - tBURST_MAX = '10ns' - - # 8 CK @ 800 MHz - tCCD_L = "10ns" - - # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # The result is that an additional 1 CK is required - tRTW = '1.25ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # 2 CK - tPPD = '2.5ns' - - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - tAAD = '10ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on initial -# JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio - tBURST = '2.5ns' - tBURST_MIN = '2.5ns' - # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST_MAX = '5ns' - - # 4 CK @ 800 MHz - tCCD_L = "5ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST = '5ns' - tBURST_MIN = '5ns' - tBURST_MAX = '5ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py new file mode 100644 index 0000000..f571920 --- /dev/null +++ b/src/mem/DRAMInterface.py @@ -0,0 +1,1473 @@ +# Copyright (c) 2012-2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. 
You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2013 Amin Farmahini-Farahani +# Copyright (c) 2015 University of Kaiserslautern +# Copyright (c) 2015 The University of Bologna +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * + +from m5.objects.AbstractMemory import AbstractMemory + +# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting +# channel, rank, bank, row and column, respectively, and going from +# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are +# suitable for an open-page policy, optimising for sequential accesses +# hitting in the open row. For a closed-page policy, RoCoRaBaCh +# maximises parallelism. +class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh'] + +# Enum for the page policy, either open, open_adaptive, close, or +# close_adaptive. +class PageManage(Enum): vals = ['open', 'open_adaptive', 'close', + 'close_adaptive'] + +class DRAMInterface(AbstractMemory): + type = 'DRAMInterface' + cxx_header = "mem/dram_ctrl.hh" + + # Allow the interface to set required controller buffer sizes + # each entry corresponds to a burst for the specific DRAM + # configuration (e.g. 
x32 with burst length 8 is 32 bytes) and not + the cacheline size or request/packet size + write_buffer_size = Param.Unsigned(64, "Number of write queue entries") + read_buffer_size = Param.Unsigned(32, "Number of read queue entries") + + # scheduler, address map and page policy + addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy") + page_policy = Param.PageManage('open_adaptive', "Page management policy") + + # enforce a limit on the number of accesses per row + max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before " + "closing"); + + # size of DRAM chip in bytes + device_size = Param.MemorySize("Size of DRAM chip") + # the physical organisation of the DRAM + device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\ + "device/chip") + burst_length = Param.Unsigned("Burst length (BL) in beats") + device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\ + "device/chip") + devices_per_rank = Param.Unsigned("Number of devices/chips per rank") + ranks_per_channel = Param.Unsigned("Number of ranks per channel") + + # default to 0 bank groups per rank, indicating bank group architecture + # is not used + # update per memory class when bank group architecture is supported + bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank") + banks_per_rank = Param.Unsigned("Number of banks per rank") + + # Enable DRAM powerdown states if True. This is False by default due to + # performance being lower when enabled + enable_dram_powerdown = Param.Bool(False, "Enable powerdown states") + + # For power modelling we need to know if the DRAM has a DLL or not + dll = Param.Bool(True, "DRAM has DLL or not") + + # In addition to the core power, DRAMPower provides the possibility to + # include RD/WR termination and IO power. This calculation assumes some + # default values. The integration of DRAMPower with gem5 does not include + # IO and RD/WR termination power by default. This might be added as an + # additional feature in the future. + + # timing behaviour and constraints - all in nanoseconds + + # the base clock period of the DRAM + tCK = Param.Latency("Clock period") + + # the amount of time in nanoseconds from issuing an activate command + # to the data being available in the row buffer for a read/write + tRCD = Param.Latency("RAS to CAS delay") + + # the time from issuing a read/write command to seeing the actual data + tCL = Param.Latency("CAS latency") + + # minimum time between a precharge and subsequent activate + tRP = Param.Latency("Row precharge time") + + # minimum time between an activate and a precharge to the same row + tRAS = Param.Latency("ACT to PRE delay") + + # minimum time between a write data transfer and a precharge + tWR = Param.Latency("Write recovery time") + + # minimum time between a read and precharge command + tRTP = Param.Latency("Read to precharge") + + # time to complete a burst transfer, typically the burst length + # divided by two due to the DDR bus, but by making it a parameter + # it is easier to also evaluate SDR memories like WideIO. + # This parameter has to account for burst length. 
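As a worked example of the conversion this comment block describes: tBURST is the burst length in beats, divided by the beats transferred per clock, times tCK. A plain-Python sketch (the helper name is illustrative; the example values appear in the DDR3-1600 and WideIO classes in this file):

def tburst_ns(burst_length, beats_per_clock, tck_ns):
    # beats -> interface clocks -> time
    return burst_length / beats_per_clock * tck_ns

print(tburst_ns(8, 2, 1.25))  # DDR3-1600, DDR bus: 4 CK * 1.25 ns -> 5.0
print(tburst_ns(4, 1, 5.0))   # WideIO-200, SDR bus: 4 CK * 5 ns   -> 20.0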
+ # Read/Write requests with data size larger than one full burst are broken + # down into multiple requests in the controller + # tBURST is equivalent to the CAS-to-CAS delay (tCCD) + # With bank group architectures, tBURST represents the CAS-to-CAS + # delay for bursts to different bank groups (tCCD_S) + tBURST = Param.Latency("Burst duration " + "(typically burst length / 2 cycles)") + + # tBURST_MAX is the column array cycle delay required before the next + # access, which can be greater than tBURST when the memory access time + # is greater than tBURST + tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay") + + # tBURST_MIN is the minimum delay between bursts, which could be less than + # tBURST when interleaving is supported + tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts") + + # CAS-to-CAS delay for bursts to the same bank group + # only utilized with bank group architectures; set to 0 for default case + # tBURST is equivalent to tCCD_S; no explicit parameter required + # for CAS-to-CAS delay for bursts to different bank groups + tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay") + + # Write-to-Write delay for bursts to the same bank group + # only utilized with bank group architectures; set to 0 for default case + # This will be used to enable different same bank group delays + # for writes versus reads + tCCD_L_WR = Param.Latency(Self.tCCD_L, + "Same bank group Write to Write delay") + + # time taken to complete one refresh cycle (N rows in all banks) + tRFC = Param.Latency("Refresh cycle time") + + # refresh command interval, how often a "ref" command needs + # to be sent. It is 7.8 us for a 64ms refresh requirement + tREFI = Param.Latency("Refresh command interval") + + # write-to-read, same rank turnaround penalty + tWTR = Param.Latency("Write to read, same rank switching time") + + # write-to-read, same rank turnaround penalty for same bank group + tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching " + "time, same bank group") + + # read-to-write, same rank turnaround penalty + tRTW = Param.Latency("Read to write, same rank switching time") + + # rank-to-rank bus delay penalty + # this does not correlate to a memory timing parameter and encompasses: + # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD + # different rank bus delay + tCS = Param.Latency("Rank to rank switching time") + + # minimum precharge to precharge delay time + tPPD = Param.Latency("0ns", "PRE to PRE delay") + + # maximum delay between two-cycle ACT command phases + tAAD = Param.Latency(Self.tCK, + "Maximum delay between two-cycle ACT commands") + + two_cycle_activate = Param.Bool(False, + "Two cycles required to send activate") + + # minimum row activate to row activate delay time + tRRD = Param.Latency("ACT to ACT delay") + + # only utilized with bank group architectures; set to 0 for default case + tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay") + + # time window in which a maximum number of activates are allowed + # to take place, set to 0 to disable + tXAW = Param.Latency("X activation window") + activation_limit = Param.Unsigned("Max number of activates in window") + + # time to exit power-down mode + # Exit power-down to next valid command delay + tXP = Param.Latency("0ns", "Power-up Delay") + + # Exit Powerdown to commands requiring a locked DLL + tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL") + + # time to exit self-refresh mode + tXS = Param.Latency("0ns", "Self-refresh exit latency") + + # 
time to exit self-refresh mode with locked DLL + tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL") + + # number of data beats per clock. with DDR, default is 2, one per edge + beats_per_clock = Param.Unsigned(2, "Data beats per clock") + + data_clock_sync = Param.Bool(False, "Synchronization commands required") + + # Currently rolled into other params + ###################################################################### + + # tRC - assumed to be tRAS + tRP + + # Power Behaviour and Constraints + # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are + # defined as VDD and VDD2. Each current is defined for each voltage domain + # separately. For example, current IDD0 is active-precharge current for + # voltage domain VDD and current IDD02 is active-precharge current for + # voltage domain VDD2. + # By default all currents are set to 0mA. Users who are only interested in + # the performance of DRAMs can leave them at 0. + + # Operating 1 Bank Active-Precharge current + IDD0 = Param.Current("0mA", "Active precharge current") + + # Operating 1 Bank Active-Precharge current multiple voltage Range + IDD02 = Param.Current("0mA", "Active precharge current VDD2") + + # Precharge Power-down Current: Slow exit + IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow") + + # Precharge Power-down Current: Slow exit multiple voltage Range + IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2") + + # Precharge Power-down Current: Fast exit + IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast") + + # Precharge Power-down Current: Fast exit multiple voltage Range + IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2") + + # Precharge Standby current + IDD2N = Param.Current("0mA", "Precharge Standby current") + + # Precharge Standby current multiple voltage range + IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2") + + # Active Power-down current: slow exit + IDD3P0 = Param.Current("0mA", "Active Powerdown slow") + + # Active Power-down current: slow exit multiple voltage range + IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2") + + # Active Power-down current : fast exit + IDD3P1 = Param.Current("0mA", "Active Powerdown fast") + + # Active Power-down current : fast exit multiple voltage range + IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2") + + # Active Standby current + IDD3N = Param.Current("0mA", "Active Standby current") + + # Active Standby current multiple voltage range + IDD3N2 = Param.Current("0mA", "Active Standby current VDD2") + + # Burst Read Operating Current + IDD4R = Param.Current("0mA", "READ current") + + # Burst Read Operating Current multiple voltage range + IDD4R2 = Param.Current("0mA", "READ current VDD2") + + # Burst Write Operating Current + IDD4W = Param.Current("0mA", "WRITE current") + + # Burst Write Operating Current multiple voltage range + IDD4W2 = Param.Current("0mA", "WRITE current VDD2") + + # Refresh Current + IDD5 = Param.Current("0mA", "Refresh current") + + # Refresh Current multiple voltage range + IDD52 = Param.Current("0mA", "Refresh current VDD2") + + # Self-Refresh Current + IDD6 = Param.Current("0mA", "Self-refresh Current") + + # Self-Refresh Current multiple voltage range + IDD62 = Param.Current("0mA", "Self-refresh Current VDD2") + + # Main voltage range of the DRAM + VDD = Param.Voltage("0V", "Main Voltage Range") + + # Second voltage range defined by some DRAMs + VDD2 = Param.Voltage("0V", "2nd Voltage Range") + +# A single DDR3-1600 x64 channel (one command and 
address bus), with +# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in +# an 8x8 configuration. +class DDR3_1600_8x8(DRAMInterface): + # size of device in bytes + device_size = '512MB' + + # 8x8 configuration, 8 devices each with an 8-bit interface + device_bus_width = 8 + + # DDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) + device_rowbuffer_size = '1kB' + + # 8x8 configuration, so 8 devices + devices_per_rank = 8 + + # Use two ranks + ranks_per_channel = 2 + + # DDR3 has 8 banks in all configurations + banks_per_rank = 8 + + # 800 MHz + tCK = '1.25ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz + tBURST = '5ns' + + # DDR3-1600 11-11-11 + tRCD = '13.75ns' + tCL = '13.75ns' + tRP = '13.75ns' + tRAS = '35ns' + tRRD = '6ns' + tXAW = '30ns' + activation_limit = 4 + tRFC = '260ns' + + tWR = '15ns' + + # Greater of 4 CK or 7.5 ns + tWTR = '7.5ns' + + # Greater of 4 CK or 7.5 ns + tRTP = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns + tRTW = '2.5ns' + + # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns + tCS = '2.5ns' + + # <=85C, half for >85C + tREFI = '7.8us' + + # active powerdown and precharge powerdown exit time + tXP = '6ns' + + # self refresh exit time + tXS = '270ns' + + # Current values from datasheet Die Rev E,J + IDD0 = '55mA' + IDD2N = '32mA' + IDD3N = '38mA' + IDD4W = '125mA' + IDD4R = '157mA' + IDD5 = '235mA' + IDD3P1 = '38mA' + IDD2P1 = '32mA' + IDD6 = '20mA' + VDD = '1.5V' + +# A single HMC-2500 x32 model based on: +# [1] DRAMSpec: a high-level DRAM bank modelling tool +# developed at the University of Kaiserslautern. This high level tool +# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to +# estimate the DRAM bank latency and power numbers. +# [2] High performance AXI-4.0 based interconnect for extensible smart memory +# cubes (E. Azarkhish et. al) +# Assumed for the HMC model is a 30 nm technology node. +# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4 +# layers). +# Each layer has 16 vaults and each vault consists of 2 banks per layer. +# In order to be able to use the same controller used for 2D DRAM generations +# for HMC, the following analogy is done: +# Channel (DDR) => Vault (HMC) +# device_size (DDR) => size of a single layer in a vault +# ranks per channel (DDR) => number of layers +# banks per rank (DDR) => banks per layer +# devices per rank (DDR) => devices per layer ( 1 for HMC). +# The parameters for which no input is available are inherited from the DDR3 +# configuration. +# This configuration includes the latencies from the DRAM to the logic layer +# of the HMC +class HMC_2500_1x32(DDR3_1600_8x8): + # size of device + # two banks per device with each bank 4MB [2] + device_size = '8MB' + + # 1x32 configuration, 1 device with 32 TSVs [2] + device_bus_width = 32 + + # HMC is a BL8 device [2] + burst_length = 8 + + # Each device has a page (row buffer) size of 256 bytes [2] + device_rowbuffer_size = '256B' + + # 1x32 configuration, so 1 device [2] + devices_per_rank = 1 + + # 4 layers so 4 ranks [2] + ranks_per_channel = 4 + + # HMC has 2 banks per layer [2] + # Each layer represents a rank. With 4 layers and 8 banks in total, each + # layer has 2 banks; thus 2 banks per rank. 
+ banks_per_rank = 2 + + # 1250 MHz [2] + tCK = '0.8ns' + + # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz + tBURST = '3.2ns' + + # Values using DRAMSpec HMC model [1] + tRCD = '10.2ns' + tCL = '9.9ns' + tRP = '7.7ns' + tRAS = '21.6ns' + + # tRRD depends on the power supply network for each vendor. + # We assume a tRRD of a double bank approach to be equal to 4 clock + # cycles (Assumption) + tRRD = '3.2ns' + + # activation limit is set to 0 since there are only 2 banks per vault + # layer. + activation_limit = 0 + + # Values using DRAMSpec HMC model [1] + tRFC = '59ns' + tWR = '8ns' + tRTP = '4.9ns' + + # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz = + # 0.8 ns (Assumption) + tCS = '0.8ns' + + # Value using DRAMSpec HMC model [1] + tREFI = '3.9us' + + # The default page policy in the vault controllers is simple closed page + # [2]; nevertheless, the 'close' policy opens and closes the row multiple + # times for bursts larger than 32 bytes. For this reason we use + # 'close_adaptive' + page_policy = 'close_adaptive' + + # RoCoRaBaCh resembles the default address mapping in HMC + addr_mapping = 'RoCoRaBaCh' + + # These parameters do not directly correlate with buffer_size in real + # hardware. Nevertheless, their value has been tuned to achieve a + # bandwidth similar to the cycle-accurate model in [2] + write_buffer_size = 32 + read_buffer_size = 32 + +# A single DDR3-2133 x64 channel refining a selected subset of the +# options for the DDR3-1600 configuration, based on the same DDR3-1600 +# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept +# consistent across the two configurations. +class DDR3_2133_8x8(DDR3_1600_8x8): + # 1066 MHz + tCK = '0.938ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz + tBURST = '3.752ns' + + # DDR3-2133 14-14-14 + tRCD = '13.09ns' + tCL = '13.09ns' + tRP = '13.09ns' + tRAS = '33ns' + tRRD = '5ns' + tXAW = '25ns' + + # Current values from datasheet + IDD0 = '70mA' + IDD2N = '37mA' + IDD3N = '44mA' + IDD4W = '157mA' + IDD4R = '191mA' + IDD5 = '250mA' + IDD3P1 = '44mA' + IDD2P1 = '43mA' + IDD6 = '20mA' + VDD = '1.5V' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4) +# in a 16x4 configuration. +# Total channel capacity is 32GB +# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel +class DDR4_2400_16x4(DRAMInterface): + # size of device + device_size = '1GB' + + # 16x4 configuration, 16 devices each with a 4-bit interface + device_bus_width = 4 + + # DDR4 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 512 bytes (1K columns x4) + device_rowbuffer_size = '512B' + + # 16x4 configuration, so 16 devices + devices_per_rank = 16 + + # Match our DDR3 configurations, which are dual rank + ranks_per_channel = 2 + + # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups + # Set to 4 for x4 case + bank_groups_per_rank = 4 + + # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all + # configurations). 
Currently we do not capture the additional + # constraints incurred by the bank groups + banks_per_rank = 16 + + # override the default buffer sizes and go for something larger to + # accommodate the larger bank count + write_buffer_size = 128 + read_buffer_size = 64 + + # 1200 MHz + tCK = '0.833ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz + # tBURST is equivalent to the CAS-to-CAS delay (tCCD) + # With bank group architectures, tBURST represents the CAS-to-CAS + # delay for bursts to different bank groups (tCCD_S) + tBURST = '3.332ns' + + # @2400 data rate, tCCD_L is 6 CK + # CAS-to-CAS delay for bursts to the same bank group + # tBURST is equivalent to tCCD_S; no explicit parameter required + # for CAS-to-CAS delay for bursts to different bank groups + tCCD_L = '5ns'; + + # DDR4-2400 17-17-17 + tRCD = '14.16ns' + tCL = '14.16ns' + tRP = '14.16ns' + tRAS = '32ns' + + # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns) + tRRD = '3.332ns' + + # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns) + tRRD_L = '4.9ns'; + + # tFAW for 512B page is MAX(16 CK, 13ns) + tXAW = '13.328ns' + activation_limit = 4 + # tRFC is 350ns + tRFC = '350ns' + + tWR = '15ns' + + # Here using the average of WTR_S and WTR_L + tWTR = '5ns' + + # Greater of 4 CK or 7.5 ns + tRTP = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns + tRTW = '1.666ns' + + # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns + tCS = '1.666ns' + + # <=85C, half for >85C + tREFI = '7.8us' + + # active powerdown and precharge powerdown exit time + tXP = '6ns' + + # self refresh exit time + # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is: + # tRFC + 10ns = 340ns + tXS = '340ns' + + # Current values from datasheet + IDD0 = '43mA' + IDD02 = '3mA' + IDD2N = '34mA' + IDD3N = '38mA' + IDD3N2 = '3mA' + IDD4W = '103mA' + IDD4R = '110mA' + IDD5 = '250mA' + IDD3P1 = '32mA' + IDD2P1 = '25mA' + IDD6 = '30mA' + VDD = '1.2V' + VDD2 = '2.5V' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8) +# in an 8x8 configuration. +# Total channel capacity is 16GB +# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel +class DDR4_2400_8x8(DDR4_2400_16x4): + # 8x8 configuration, 8 devices each with an 8-bit interface + device_bus_width = 8 + + # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) + device_rowbuffer_size = '1kB' + + # 8x8 configuration, so 8 devices + devices_per_rank = 8 + + # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns) + tRRD_L = '4.9ns'; + + tXAW = '21ns' + + # Current values from datasheet + IDD0 = '48mA' + IDD3N = '43mA' + IDD4W = '123mA' + IDD4R = '135mA' + IDD3P1 = '37mA' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16) +# in an 4x16 configuration. 
+# Total channel capacity is 4GB +# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel +class DDR4_2400_4x16(DDR4_2400_16x4): + # 4x16 configuration, 4 devices each with an 16-bit interface + device_bus_width = 16 + + # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) + device_rowbuffer_size = '2kB' + + # 4x16 configuration, so 4 devices + devices_per_rank = 4 + + # Single rank for x16 + ranks_per_channel = 1 + + # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups + # Set to 2 for x16 case + bank_groups_per_rank = 2 + + # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all + # configurations). Currently we do not capture the additional + # constraints incurred by the bank groups + banks_per_rank = 8 + + # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) + tRRD = '5.3ns' + + # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) + tRRD_L = '6.4ns'; + + tXAW = '30ns' + + # Current values from datasheet + IDD0 = '80mA' + IDD02 = '4mA' + IDD2N = '34mA' + IDD3N = '47mA' + IDD4W = '228mA' + IDD4R = '243mA' + IDD5 = '280mA' + IDD3P1 = '41mA' + +# A single LPDDR2-S4 x32 interface (one command/address bus), with +# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) +# in a 1x32 configuration. +class LPDDR2_S4_1066_1x32(DRAMInterface): + # No DLL in LPDDR2 + dll = False + + # size of device + device_size = '512MB' + + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR2_S4 is a BL4 and BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1KB + # (this depends on the memory density) + device_rowbuffer_size = '1kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 + + # Use a single rank + ranks_per_channel = 1 + + # LPDDR2-S4 has 8 banks in all configurations + banks_per_rank = 8 + + # 533 MHz + tCK = '1.876ns' + + # Fixed at 15 ns + tRCD = '15ns' + + # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time + tCL = '15ns' + + # Pre-charge one bank 15 ns (all banks 18 ns) + tRP = '15ns' + + tRAS = '42ns' + tWR = '15ns' + + tRTP = '7.5ns' + + # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. + # Note this is a BL8 DDR device. + # Requests larger than 32 bytes are broken down into multiple requests + # in the controller + tBURST = '7.5ns' + + # LPDDR2-S4, 4 Gbit + tRFC = '130ns' + tREFI = '3.9us' + + # active powerdown and precharge powerdown exit time + tXP = '7.5ns' + + # self refresh exit time + tXS = '140ns' + + # Irrespective of speed grade, tWTR is 7.5 ns + tWTR = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns + tRTW = '3.75ns' + + # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns + tCS = '3.75ns' + + # Activate to activate irrespective of density and speed grade + tRRD = '10.0ns' + + # Irrespective of density, tFAW is 50 ns + tXAW = '50ns' + activation_limit = 4 + + # Current values from datasheet + IDD0 = '15mA' + IDD02 = '70mA' + IDD2N = '2mA' + IDD2N2 = '30mA' + IDD3N = '2.5mA' + IDD3N2 = '30mA' + IDD4W = '10mA' + IDD4W2 = '190mA' + IDD4R = '3mA' + IDD4R2 = '220mA' + IDD5 = '40mA' + IDD52 = '150mA' + IDD3P1 = '1.2mA' + IDD3P12 = '8mA' + IDD2P1 = '0.6mA' + IDD2P12 = '0.8mA' + IDD6 = '1mA' + IDD62 = '3.2mA' + VDD = '1.8V' + VDD2 = '1.2V' + +# A single WideIO x128 interface (one command and address bus), with +# default timings based on an estimated WIO-200 8 Gbit part. 
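To make the AddrMap options defined near the top of DRAMInterface.py concrete, the toy decode below shows how moving the bank and channel bits toward the LSBs changes which resource consecutive bursts hit. The field widths are invented for the example; this is not gem5's actual decode logic:

def decode(addr, field_order_msb_to_lsb, widths):
    # peel fields off the address starting from the LSB
    fields, shift = {}, 0
    for name in reversed(field_order_msb_to_lsb):
        w = widths[name]
        fields[name] = (addr >> shift) & ((1 << w) - 1)
        shift += w
    return fields

widths = {'Ro': 16, 'Co': 6, 'Ra': 1, 'Ba': 3, 'Ch': 1}
# RoRaBaChCo: column bits lowest -> sequential bursts stay in one open row
print(decode(0x12345, ['Ro', 'Ra', 'Ba', 'Ch', 'Co'], widths))
# RoCoRaBaCh: bank/channel bits lowest -> sequential bursts spread across banks
print(decode(0x12345, ['Ro', 'Co', 'Ra', 'Ba', 'Ch'], widths))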
+class WideIO_200_1x128(DRAMInterface): + # No DLL for WideIO + dll = False + + # size of device + device_size = '1024MB' + + # 1x128 configuration, 1 device with a 128-bit interface + device_bus_width = 128 + + # This is a BL4 device + burst_length = 4 + + # Each device has a page (row buffer) size of 4KB + # (this depends on the memory density) + device_rowbuffer_size = '4kB' + + # 1x128 configuration, so 1 device + devices_per_rank = 1 + + # Use one rank for a one-high die stack + ranks_per_channel = 1 + + # WideIO has 4 banks in all configurations + banks_per_rank = 4 + + # 200 MHz + tCK = '5ns' + + # WIO-200 + tRCD = '18ns' + tCL = '18ns' + tRP = '18ns' + tRAS = '42ns' + tWR = '15ns' + # Read to precharge is same as the burst + tRTP = '20ns' + + # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. + # Note this is a BL4 SDR device. + tBURST = '20ns' + + # WIO 8 Gb + tRFC = '210ns' + + # WIO 8 Gb, <=85C, half for >85C + tREFI = '3.9us' + + # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns + tWTR = '15ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns + tRTW = '10ns' + + # Default different rank bus delay to 2 CK, @200 MHz = 10 ns + tCS = '10ns' + + # Activate to activate irrespective of density and speed grade + tRRD = '10.0ns' + + # Two instead of four activation window + tXAW = '50ns' + activation_limit = 2 + + # The WideIO specification does not provide current information + +# A single LPDDR3 x32 interface (one command/address bus), with +# default timings based on a LPDDR3-1600 4 Gbit part (Micron +# EDF8132A1MC) in a 1x32 configuration. +class LPDDR3_1600_1x32(DRAMInterface): + # No DLL for LPDDR3 + dll = False + + # size of device + device_size = '512MB' + + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 4KB + device_rowbuffer_size = '4kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 + + # Technically the datasheet is a dual-rank package, but for + # comparison with the LPDDR2 config we stick to a single rank + ranks_per_channel = 1 + + # LPDDR3 has 8 banks in all configurations + banks_per_rank = 8 + + # 800 MHz + tCK = '1.25ns' + + tRCD = '18ns' + + # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time + tCL = '15ns' + + tRAS = '42ns' + tWR = '15ns' + + # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns + tRTP = '7.5ns' + + # Pre-charge one bank 18 ns (all banks 21 ns) + tRP = '18ns' + + # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. + # Note this is a BL8 DDR device. 
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2 Kbits (256 bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR ( data runs at 4000 MHz )
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns'
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal to tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'
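The GDDR5 timings above also exercise the new DRAMPower glue: the
getDataRate() helper added to drampower.cc later in this patch derives beats
per clock from burst_length, tBURST_MAX and tCK, and GDDR5 is the
quad-data-rate case its comment calls out. A plain-Python sketch of that
calculation (it assumes tBURST_MAX defaults to tBURST when a class does not
set it explicitly):

    from math import ceil

    burst_length = 8
    tCK = 1.0          # ns, 1000 MHz command clock
    tBURST_MAX = 2.0   # ns, assumed equal to tBURST above

    burst_cycles = ceil(tBURST_MAX / tCK)     # divCeil in the C++ code
    data_rate = burst_length // burst_cycles  # beats per clock
    assert data_rate == 4                     # the "4 for GDDR5" case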
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'
+
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_1x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
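The pseudo-channel capacity comments above can be checked with some quick
arithmetic (plain Python, illustration only): a 4-high stack of 8 Gbit dies
is 4GB, and splitting it into 16 pseudo-channels leaves 256MB behind each
controller, which is exactly the device_size set above.

    dies_per_stack = 4
    gbit_per_die = 8
    pseudo_channels = 16

    stack_gbyte = dies_per_stack * gbit_per_die / 8          # 4 GB per stack
    per_channel_mbyte = stack_gbyte * 1024 / pseudo_channels
    assert per_channel_mbyte == 256                          # device_size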
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5.5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
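The LPDDR5-6400 variants below repeat the burst arithmetic of the 5500
classes at a 1.25 ns CK, so it is worth seeing once where the BL32 numbers
come from. A small plain-Python check using the LPDDR5_5500_1x16_BG_BL32
values above:

    tCK = 1.455                # ns at 687.5 MHz CK
    sub_burst_ck = 2           # 16 beats = 8 WCK = 2 CK at a 4:1 WCK:CK ratio

    tBURST_MIN = sub_burst_ck * tCK      # 2.91 ns, interleaved bank groups
    tBURST = 3 * sub_burst_ck * tCK      # 8.73 ns, sub-burst + gap + sub-burst
    tBURST_MAX = 4 * sub_burst_ck * tCK  # 11.64 ns, same bank group, no overlap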
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
diff --git a/src/mem/SConscript b/src/mem/SConscript
index 2fe179d..ceeed98 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-
 #
-# Copyright (c) 2018-2019 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -47,6 +47,7 @@
 SimObject('AddrMapper.py')
 SimObject('Bridge.py')
 SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
 SimObject('ExternalMaster.py')
 SimObject('ExternalSlave.py')
 SimObject('MemObject.py')
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index b646581..4055505 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
#include "debug/Drain.hh" #include "debug/QOS.hh" +#include "params/DRAMInterface.hh" #include "sim/system.hh" using namespace std; @@ -58,12 +59,13 @@ retryRdReq(false), retryWrReq(false), nextReqEvent([this]{ processNextReqEvent(); }, name()), respondEvent([this]{ processRespondEvent(); }, name()), - readBufferSize(p->read_buffer_size), - writeBufferSize(p->write_buffer_size), + dram(p->dram), + readBufferSize(dram->readBufferSize), + writeBufferSize(dram->writeBufferSize), writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0), writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0), minWritesPerSwitch(p->min_writes_per_switch), - writesThisTime(0), readsThisTime(0), tCS(p->tCS), + writesThisTime(0), readsThisTime(0), memSchedPolicy(p->mem_sched_policy), frontendLatency(p->static_frontend_latency), backendLatency(p->static_backend_latency), @@ -74,37 +76,23 @@ readQueue.resize(p->qos_priorities); writeQueue.resize(p->qos_priorities); + dram->setCtrl(this); + // perform a basic check of the write thresholds if (p->write_low_thresh_perc >= p->write_high_thresh_perc) fatal("Write buffer low threshold %d must be smaller than the " "high threshold %d\n", p->write_low_thresh_perc, p->write_high_thresh_perc); - - // determine the rows per bank by looking at the total capacity - uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); - - DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, - AbstractMemory::size()); - - // create a DRAM interface - // will only populate the ranks if DRAM is configured - dram = new DRAMInterface(*this, p, capacity, range); - DPRINTF(DRAM, "Created DRAM interface \n"); } void DRAMCtrl::init() { - MemCtrl::init(); - if (!port.isConnected()) { fatal("DRAMCtrl %s is unconnected!\n", name()); } else { port.sendRangeChange(); } - - dram->init(range); - } void @@ -114,8 +102,6 @@ isTimingMode = system()->isTimingMode(); if (isTimingMode) { - dram->startupRanks(); - // shift the bus busy time sufficiently far ahead that we never // have to worry about negative values when computing the time for // the next request, this will add an insignificant bubble at the @@ -133,7 +119,7 @@ "is responding"); // do the actual memory access and turn the packet into a response - access(pkt); + dram->access(pkt); Tick latency = 0; if (pkt->hasData()) { @@ -263,7 +249,7 @@ // address of first DRAM packet is kept unaliged. Subsequent DRAM packets // are aligned to burst size boundaries. This is to ensure we accurately // check read packets against packets in write queue. - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; unsigned pktsServicedByWrQ = 0; BurstHelper* burst_helper = NULL; @@ -363,7 +349,7 @@ // if the request size is larger than burst size, the pkt is split into // multiple DRAM packets - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; uint32_t burstSize = dram->bytesPerBurst(); for (int cnt = 0; cnt < pktCount; ++cnt) { @@ -526,7 +512,7 @@ DRAMPacket* dram_pkt = respQueue.front(); // media specific checks and functions when read response is complete - dram->respondEventDRAM(dram_pkt->rank); + dram->respondEvent(dram_pkt->rank); if (dram_pkt->burstHelper) { // it is a split packet @@ -727,12 +713,12 @@ void DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency) { - DPRINTF(DRAM, "Responding to Address %lld.. 
",pkt->getAddr()); + DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr()); bool needsResponse = pkt->needsResponse(); // do the actual memory access which also turns the packet into a // response - access(pkt); + dram->access(pkt); // turn packet around to go back to requester if response expected if (needsResponse) { @@ -877,9 +863,9 @@ // if not, shift to next burst window Tick act_at; if (twoCycleActivate) - act_at = ctrl.verifyMultiCmd(act_tick, tAAD); + act_at = ctrl->verifyMultiCmd(act_tick, tAAD); else - act_at = ctrl.verifySingleCmd(act_tick); + act_at = ctrl->verifySingleCmd(act_tick); DPRINTF(DRAM, "Activate at tick %d\n", act_at); @@ -997,7 +983,7 @@ // Issuing an explicit PRE command // Verify that we have command bandwidth to issue the precharge // if not, shift to next burst window - pre_at = ctrl.verifySingleCmd(pre_tick); + pre_at = ctrl->verifySingleCmd(pre_tick); // enforce tPPD for (int i = 0; i < banksPerRank; i++) { rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD, @@ -1096,9 +1082,9 @@ // verify that we have command bandwidth to issue the burst // if not, shift to next burst window if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay)) - cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK); + cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK); else - cmd_at = ctrl.verifySingleCmd(cmd_at); + cmd_at = ctrl->verifySingleCmd(cmd_at); // if we are interleaving bursts, ensure that // 1) we don't double interleave on next burst issue @@ -1196,7 +1182,7 @@ bool got_more_hits = false; bool got_bank_conflict = false; - for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) { + for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) { auto p = queue[i].begin(); // keep on looking until we find a hit or reach the end of the // queue @@ -1267,6 +1253,7 @@ // Update latency stats stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime; stats.totQLat += cmd_at - dram_pkt->entryTime; + stats.totBusLat += tBURST; } else { // Schedule write done event to decrement event count // after the readyTime has been reached @@ -1350,13 +1337,9 @@ // Update latency stats stats.masterReadTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; - - stats.bytesRead += dram->bytesPerBurst(); - stats.totBusLat += dram->burstDelay(); stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size; } else { ++writesThisTime; - stats.bytesWritten += dram->bytesPerBurst(); stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size; stats.masterWriteTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; @@ -1458,8 +1441,9 @@ // Figure out which read request goes next // If we are changing command type, incorporate the minimum - // bus turnaround delay which will be tCS (different rank) case - to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0); + // bus turnaround delay which will be rank to rank delay + to_read = chooseNext((*queue), switched_cmd_type ? + dram->rankDelay() : 0); if (to_read != queue->end()) { // candidate read found @@ -1538,7 +1522,8 @@ // If we are changing command type, incorporate the minimum // bus turnaround delay to_write = chooseNext((*queue), - switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0); + switched_cmd_type ? 
std::min(dram->minRdToWr(), + dram->rankDelay()) : 0); if (to_write != queue->end()) { write_found = true; @@ -1611,11 +1596,8 @@ } } -DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl, - const DRAMCtrlParams* _p, - const uint64_t capacity, - const AddrRange range) - : SimObject(_p), ctrl(_ctrl), +DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p) + : AbstractMemory(_p), addrMapping(_p->addr_mapping), burstSize((_p->devices_per_rank * _p->burst_length * _p->device_bus_width) / 8), @@ -1630,7 +1612,7 @@ bankGroupsPerRank(_p->bank_groups_per_rank), bankGroupArch(_p->bank_groups_per_rank > 0), banksPerRank(_p->banks_per_rank), rowsPerBank(0), - tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST), + tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST), tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW), tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD), tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP), @@ -1646,13 +1628,15 @@ wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW), wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L), rdToWrDlySameBG(tRTW + _p->tBURST_MAX), - rankToRankDly(ctrl.rankDelay() + tBURST), + rankToRankDly(tCS + tBURST), pageMgmt(_p->page_policy), maxAccessesPerRow(_p->max_accesses_per_row), timeStampOffset(0), activeRank(0), enableDRAMPowerdown(_p->enable_dram_powerdown), lastStatsResetTick(0), - stats(_ctrl, *this) + stats(*this), + readBufferSize(_p->read_buffer_size), + writeBufferSize(_p->write_buffer_size) { fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, " "must be a power of two\n", burstSize); @@ -1664,7 +1648,7 @@ for (int i = 0; i < ranksPerChannel; i++) { DPRINTF(DRAM, "Creating DRAM rank %d \n", i); - Rank* rank = new Rank(ctrl, _p, i, *this); + Rank* rank = new Rank(_p, i, *this); ranks.push_back(rank); } @@ -1672,6 +1656,11 @@ uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank * ranksPerChannel; + uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); + + DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, + AbstractMemory::size()); + // if actual DRAM size does not match memory capacity in system warn! 
if (deviceCapacity != capacity / (1024 * 1024)) warn("DRAM device capacity (%d Mbytes) does not match the " @@ -1726,8 +1715,10 @@ } void -DRAMInterface::init(AddrRange range) +DRAMInterface::init() { + AbstractMemory::init(); + // a bit of sanity checks on the interleaving, save it for here to // ensure that the system pointer is initialised if (range.interleaved()) { @@ -1749,7 +1740,7 @@ // channel striping has to be done at a granularity that // is equal or larger to a cache line - if (ctrl.system()->cacheLineSize() > range.granularity()) { + if (system()->cacheLineSize() > range.granularity()) { fatal("Channel interleaving of %s must be at least as large " "as the cache line size\n", name()); } @@ -1766,10 +1757,12 @@ } void -DRAMInterface::startupRanks() +DRAMInterface::startup() { - // timestamp offset should be in clock cycles for DRAMPower - timeStampOffset = divCeil(curTick(), tCK); + if (system()->isTimingMode()) { + // timestamp offset should be in clock cycles for DRAMPower + timeStampOffset = divCeil(curTick(), tCK); + } for (auto r : ranks) { r->startup(curTick() + tREFI - tRP); @@ -1815,7 +1808,7 @@ } void -DRAMInterface::respondEventDRAM(uint8_t rank) +DRAMInterface::respondEvent(uint8_t rank) { Rank& rank_ref = *ranks[rank]; @@ -1956,7 +1949,7 @@ std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP; // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl.inReadBusState(false) ? + const Tick col_allowed_at = ctrl->inReadBusState(false) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1996,9 +1989,15 @@ return make_pair(bank_mask, hidden_bank_prep); } -DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, - DRAMInterface& _dram) - : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram), +DRAMInterface* +DRAMInterfaceParams::create() +{ + return new DRAMInterface(this); +} + +DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p, + int _rank, DRAMInterface& _dram) + : EventManager(&_dram), dram(_dram), pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE), pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE), refreshState(REF_IDLE), inLowPowerState(false), rank(_rank), @@ -2011,7 +2010,7 @@ refreshEvent([this]{ processRefreshEvent(); }, name()), powerEvent([this]{ processPowerEvent(); }, name()), wakeUpEvent([this]{ processWakeUpEvent(); }, name()), - stats(_ctrl, *this) + stats(_dram, *this) { for (int b = 0; b < _p->banks_per_rank; b++) { banks[b].bank = b; @@ -2062,8 +2061,10 @@ DRAMInterface::Rank::isQueueEmpty() const { // check commmands in Q based on current bus direction - bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0)) - || (ctrl.inWriteBusState(true) && (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && + (readEntries == 0)) + || (dram.ctrl->inWriteBusState(true) && + (writeEntries == 0)); return no_queued_cmds; } @@ -2187,7 +2188,7 @@ // if a request is at the moment being handled and this request is // accessing the current rank then wait for it to finish if ((rank == dram.activeRank) - && (ctrl.requestEventScheduled())) { + && (dram.ctrl->requestEventScheduled())) { // hand control over to the request loop until it is // evaluated next DPRINTF(DRAM, "Refresh awaiting draining\n"); @@ -2262,7 +2263,7 @@ // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled // should have outstanding precharge or read response event assert(prechargeEvent.scheduled() || - 
ctrl.respondEventScheduled()); + dram.ctrl->respondEventScheduled()); // will start refresh when pwrState transitions to IDLE } @@ -2322,8 +2323,8 @@ assert(!powerEvent.scheduled()); - if ((ctrl.drainState() == DrainState::Draining) || - (ctrl.drainState() == DrainState::Drained)) { + if ((dram.ctrl->drainState() == DrainState::Draining) || + (dram.ctrl->drainState() == DrainState::Drained)) { // if draining, do not re-enter low-power mode. // simply go to IDLE and wait schedulePowerEvent(PWR_IDLE, curTick()); @@ -2548,10 +2549,10 @@ } // completed refresh event, ensure next request is scheduled - if (!ctrl.requestEventScheduled()) { + if (!dram.ctrl->requestEventScheduled()) { DPRINTF(DRAM, "Scheduling next request after refreshing" " rank %d\n", rank); - ctrl.restartScheduler(curTick()); + dram.ctrl->restartScheduler(curTick()); } } @@ -2610,8 +2611,8 @@ // bypass auto-refresh and go straight to SREF, where memory // will issue refresh immediately upon entry if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() && - (ctrl.drainState() != DrainState::Draining) && - (ctrl.drainState() != DrainState::Drained) && + (dram.ctrl->drainState() != DrainState::Draining) && + (dram.ctrl->drainState() != DrainState::Drained) && dram.enableDRAMPowerdown) { DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning " "to self refresh at %11u tick\n", rank, curTick()); @@ -2712,7 +2713,7 @@ bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (ctrl.inWriteBusState(true) && (writeEntries != 0)); + (dram.ctrl->inWriteBusState(true) && (writeEntries != 0)); } DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl) @@ -2723,15 +2724,15 @@ ADD_STAT(writeReqs, "Number of write requests accepted"), ADD_STAT(readBursts, - "Number of DRAM read bursts, " + "Number of controller read bursts, " "including those serviced by the write queue"), ADD_STAT(writeBursts, - "Number of DRAM write bursts, " + "Number of controller write bursts, " "including those merged in the write queue"), ADD_STAT(servicedByWrQ, - "Number of DRAM read bursts serviced by the write queue"), + "Number of controller read bursts serviced by the write queue"), ADD_STAT(mergedWrBursts, - "Number of DRAM write bursts merged with an existing one"), + "Number of controller write bursts merged with an existing one"), ADD_STAT(neitherReadNorWriteReqs, "Number of requests that are neither read nor write"), @@ -2739,9 +2740,6 @@ ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"), ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"), - ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), - ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), - ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"), ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"), @@ -2756,22 +2754,13 @@ ADD_STAT(wrPerTurnAround, "Writes before turning the bus around for reads"), - ADD_STAT(bytesRead, "Total number of bytes read from memory"), ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"), - ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"), ADD_STAT(bytesWrittenSys, "Total written bytes from the system interface side"), - ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"), - ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"), ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"), ADD_STAT(avgWrBWSys, "Average system 
write bandwidth in MiByte/s"), - ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), - - ADD_STAT(busUtil, "Data bus utilization in percentage"), - ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), - ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), ADD_STAT(totGap, "Total gap between requests"), ADD_STAT(avgGap, "Average gap between requests"), @@ -2803,12 +2792,11 @@ { using namespace Stats; - assert(ctrl._system); - const auto max_masters = ctrl._system->maxMasters(); + assert(ctrl.system()); + const auto max_masters = ctrl.system()->maxMasters(); avgRdQLen.precision(2); avgWrQLen.precision(2); - avgBusLat.precision(2); readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); @@ -2823,14 +2811,9 @@ .init(ctrl.writeBufferSize) .flags(nozero); - avgRdBW.precision(2); - avgWrBW.precision(2); avgRdBWSys.precision(2); avgWrBWSys.precision(2); - peakBW.precision(2); - busUtil.precision(2); avgGap.precision(2); - busUtilWrite.precision(2); // per-master bytes read and written to memory masterReadBytes @@ -2862,9 +2845,6 @@ .flags(nonan) .precision(2); - busUtilRead - .precision(2); - masterWriteRate .flags(nozero | nonan) .precision(12); @@ -2878,7 +2858,7 @@ .precision(2); for (int i = 0; i < max_masters; i++) { - const std::string master = ctrl._system->getMasterName(i); + const std::string master = ctrl.system()->getMasterName(i); masterReadBytes.subname(i, master); masterReadRate.subname(i, master); masterWriteBytes.subname(i, master); @@ -2892,22 +2872,11 @@ } // Formula stats - avgBusLat = totBusLat / (readBursts - servicedByWrQ); - - avgRdBW = (bytesRead / 1000000) / simSeconds; - avgWrBW = (bytesWritten / 1000000) / simSeconds; avgRdBWSys = (bytesReadSys / 1000000) / simSeconds; avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds; - peakBW = (SimClock::Frequency / ctrl.dram->burstDataDelay()) * - ctrl.dram->bytesPerBurst() / 1000000; - - busUtil = (avgRdBW + avgWrBW) / peakBW * 100; avgGap = totGap / (readReqs + writeReqs); - busUtilRead = avgRdBW / peakBW * 100; - busUtilWrite = avgWrBW / peakBW * 100; - masterReadRate = masterReadBytes / simSeconds; masterWriteRate = masterWriteBytes / simSeconds; masterReadAvgLat = masterReadTotalLat / masterReadAccesses; @@ -2920,8 +2889,8 @@ dram.lastStatsResetTick = curTick(); } -DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram) - : Stats::Group(&_ctrl, csprintf("dram").c_str()), +DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram) + : Stats::Group(&_dram), dram(_dram), ADD_STAT(readBursts, "Number of DRAM read bursts"), @@ -2931,10 +2900,13 @@ ADD_STAT(perBankWrBursts, "Per bank write bursts"), ADD_STAT(totQLat, "Total ticks spent queuing"), + ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), ADD_STAT(totMemAccLat, "Total ticks spent from burst creation until serviced " "by the DRAM"), + ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"), + ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"), ADD_STAT(readRowHits, "Number of row buffer hits during reads"), @@ -2947,6 +2919,12 @@ ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"), ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"), + ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), + + ADD_STAT(busUtil, "Data bus utilization in percentage"), + 
ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), + ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), + ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined") { @@ -2958,6 +2936,7 @@ using namespace Stats; avgQLat.precision(2); + avgBusLat.precision(2); avgMemAccLat.precision(2); readRowHitRate.precision(2); @@ -2971,10 +2950,16 @@ dram.maxAccessesPerRow : dram.rowBufferSize) .flags(nozero); + peakBW.precision(2); + busUtil.precision(2); + busUtilWrite.precision(2); + busUtilRead.precision(2); + pageHitRate.precision(2); // Formula stats avgQLat = totQLat / readBursts; + avgBusLat = totBusLat / readBursts; avgMemAccLat = totMemAccLat / readBursts; readRowHitRate = (readRowHits / readBursts) * 100; @@ -2982,13 +2967,19 @@ avgRdBW = (bytesRead / 1000000) / simSeconds; avgWrBW = (bytesWritten / 1000000) / simSeconds; + peakBW = (SimClock::Frequency / dram.burstDataDelay()) * + dram.bytesPerBurst() / 1000000; + + busUtil = (avgRdBW + avgWrBW) / peakBW * 100; + busUtilRead = avgRdBW / peakBW * 100; + busUtilWrite = avgWrBW / peakBW * 100; pageHitRate = (writeRowHits + readRowHits) / (writeBursts + readBursts) * 100; } -DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank) - : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()), +DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank) + : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()), rank(_rank), ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"), @@ -3047,7 +3038,7 @@ DRAMCtrl::recvFunctional(PacketPtr pkt) { // rely on the abstract memory - functionalAccess(pkt); + dram->functionalAccess(pkt); } Port & @@ -3093,6 +3084,7 @@ // if we switched to timing mode, kick things into action, // and behave as if we restored from a checkpoint startup(); + dram->startup(); } else if (isTimingMode && !system()->isTimingMode()) { // if we switch from timing mode, stop the refresh events to // not cause issues with KVM @@ -3112,7 +3104,7 @@ DRAMCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(ctrl.getAddrRange()); + ranges.push_back(ctrl.dram->getAddrRange()); return ranges; } diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index dc030b1..417e935 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -55,12 +55,15 @@ #include "enums/AddrMap.hh" #include "enums/MemSched.hh" #include "enums/PageManage.hh" +#include "mem/abstract_mem.hh" #include "mem/drampower.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/DRAMCtrl.hh" #include "sim/eventq.hh" +class DRAMInterfaceParams; + /** * A basic class to track the bank state, i.e. what row is * currently open (if any), when is the bank free to accept a new @@ -242,7 +245,7 @@ * The DRAMInterface includes a class for individual ranks * and per rank functions. 
*/ -class DRAMInterface : public SimObject +class DRAMInterface : public AbstractMemory { private: /** @@ -342,7 +345,7 @@ class Rank; struct RankStats : public Stats::Group { - RankStats(DRAMCtrl &ctrl, Rank &rank); + RankStats(DRAMInterface &dram, Rank &rank); void regStats() override; void resetStats() override; @@ -408,13 +411,6 @@ */ class Rank : public EventManager { - protected: - - /** - * A reference to the parent DRAMCtrl instance - */ - DRAMCtrl& ctrl; - private: /** @@ -534,10 +530,10 @@ */ Tick lastBurstTick; - Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, + Rank(const DRAMInterfaceParams* _p, int _rank, DRAMInterface& _dram); - const std::string name() const { return csprintf("dram_%d", rank); } + const std::string name() const { return csprintf("%d", rank); } /** * Kick off accounting for power and refresh states and @@ -659,15 +655,16 @@ * @param next Memory Command * @return true if timeStamp of Command 1 < timeStamp of Command 2 */ - static bool sortTime(const Command& cmd, const Command& cmd_next) + static bool + sortTime(const Command& cmd, const Command& cmd_next) { return cmd.timeStamp < cmd_next.timeStamp; - }; + } /** - * A reference to the parent DRAMCtrl instance + * A pointer to the parent DRAMCtrl instance */ - DRAMCtrl& ctrl; + DRAMCtrl* ctrl; /** * Memory controller configuration initialized based on parameter @@ -698,6 +695,7 @@ * DRAM timing requirements */ const Tick M5_CLASS_VAR_USED tCK; + const Tick tCS; const Tick tCL; const Tick tBURST; const Tick tBURST_MIN; @@ -781,7 +779,7 @@ struct DRAMStats : public Stats::Group { - DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram); + DRAMStats(DRAMInterface &dram); void regStats() override; void resetStats() override; @@ -798,10 +796,12 @@ // Latencies summed over all requests Stats::Scalar totQLat; + Stats::Scalar totBusLat; Stats::Scalar totMemAccLat; // Average latencies per request Stats::Formula avgQLat; + Stats::Formula avgBusLat; Stats::Formula avgMemAccLat; // Row hit count and rate @@ -817,6 +817,11 @@ // Average bandwidth Stats::Formula avgRdBW; Stats::Formula avgWrBW; + Stats::Formula peakBW; + // bus utilization + Stats::Formula busUtil; + Stats::Formula busUtilRead; + Stats::Formula busUtilWrite; Stats::Formula pageHitRate; }; @@ -828,16 +833,28 @@ std::vector<Rank*> ranks; public: + + /** + * Buffer sizes for read and write queues in the controller + * These are passed to the controller on instantiation + * Defining them here allows for buffers to be resized based + * on memory type / configuration. + */ + const uint32_t readBufferSize; + const uint32_t writeBufferSize; + + /** Setting a pointer to the controller */ + void setCtrl(DRAMCtrl* _ctrl) { ctrl = _ctrl; } + /** * Initialize the DRAM interface and verify parameters - * @param range is the address range for this interface */ - void init(AddrRange range); + void init() override; /** * Iterate through dram ranks and instantiate per rank startup routine */ - void startupRanks(); + void startup() override; /** * Iterate through dram ranks to exit self-refresh in order to drain @@ -861,15 +878,26 @@ void suspend(); /** + * Get an address in a dense range which starts from 0. The input + * address is the physical address of the request in an address + * space that contains other SimObjects apart from this + * controller. 
+     *
+     * @param addr The input address which should be in the addrRange
+     * @return An address in the continuous range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr) { return range.getOffset(addr); }
+
+    /**
+     * @return number of bytes in a burst for this interface
+     */
-    uint32_t bytesPerBurst() const { return burstSize; };
+    uint32_t bytesPerBurst() const { return burstSize; }
 
     /**
      *
     * @return number of ranks per channel for this interface
      */
-    uint32_t numRanks() const { return ranksPerChannel; };
+    uint32_t numRanks() const { return ranksPerChannel; }
 
     /*
      * @return time to send a burst of data
      */
@@ -879,7 +907,8 @@
     /*
      * @return time to send a burst of data without gaps
      */
-    Tick burstDataDelay() const
+    Tick
+    burstDataDelay() const
     {
         return (burstInterleave ? tBURST_MAX / 2 : tBURST);
     }
@@ -893,7 +922,14 @@
      *
      * @return additional bus turnaround required for read-to-write
      */
-    Tick minRdToWr() const { return tRTW; };
+    Tick minRdToWr() const { return tRTW; }
+
+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() const { return tCS; }
 
     /*
      * Function to calulate RAS cycle time for use within and
@@ -957,7 +993,8 @@
      * This requires the DRAM to be in the
      * REF IDLE state
      */
-    bool burstReady(uint8_t rank) const
+    bool
+    burstReady(uint8_t rank) const
     {
         return ranks[rank]->inRefIdleState();
     }
@@ -979,7 +1016,7 @@
      *
      * @param rank Specifies rank associated with read burst
      */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);
 
     /**
      * Check the refresh state to determine if refresh needs
@@ -989,8 +1026,7 @@
      */
     void checkRefreshState(uint8_t rank);
 
-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
 };
 
 /**
@@ -1141,20 +1177,6 @@
     void accessAndRespond(PacketPtr pkt, Tick static_latency);
 
     /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The intput address which should be in the addrRange
-     * @return An address in the continues range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
      * The memory schduler/arbiter - picks which request needs to
      * go next, based on the specified policy such as FCFS or FR-FCFS
      * and moves it to the head of the queue.
@@ -1237,6 +1259,11 @@
     std::unordered_multiset<Tick> burstTicks;
 
     /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
      * The following are basic design parameters of the memory
     * controller, and are initialized based on parameter values.
      * The rowsPerBank is determined based on the capacity, number of
@@ -1251,12 +1278,6 @@
     uint32_t readsThisTime;
 
     /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
      * Memory controller configuration initialized based on parameter
      * values.
*/ @@ -1310,10 +1331,6 @@ // Average queue lengths Stats::Average avgRdQLen; Stats::Average avgWrQLen; - // Latencies summed over all requests - Stats::Scalar totBusLat; - // Average latencies per request - Stats::Formula avgBusLat; Stats::Scalar numRdRetry; Stats::Scalar numWrRetry; @@ -1324,21 +1341,12 @@ Stats::Histogram rdPerTurnAround; Stats::Histogram wrPerTurnAround; - Stats::Scalar bytesRead; Stats::Scalar bytesReadWrQ; - Stats::Scalar bytesWritten; Stats::Scalar bytesReadSys; Stats::Scalar bytesWrittenSys; // Average bandwidth - Stats::Formula avgRdBW; - Stats::Formula avgWrBW; Stats::Formula avgRdBWSys; Stats::Formula avgWrBWSys; - Stats::Formula peakBW; - // bus utilization - Stats::Formula busUtil; - Stats::Formula busUtilRead; - Stats::Formula busUtilWrite; Stats::Scalar totGap; Stats::Formula avgGap; @@ -1367,11 +1375,6 @@ CtrlStats stats; /** - * Create pointer to interfasce to the actual media - */ - DRAMInterface* dram; - - /** * Upstream caches need this packet until true is returned, so * hold it for deletion until a subsequent call */ @@ -1449,13 +1452,6 @@ void restartScheduler(Tick tick) { schedule(nextReqEvent, tick); } /** - * Determine the required delay for an access to a different rank - * - * @return required rank to rank delay - */ - Tick rankDelay() const { return tCS; } - - /** * Check the current direction of the memory channel * * @param next_state Check either the current or next bus state diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc index 13551a0..96dcb55 100644 --- a/src/mem/drampower.cc +++ b/src/mem/drampower.cc @@ -40,13 +40,13 @@ #include "base/intmath.hh" #include "sim/core.hh" -DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) : +DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) : powerlib(libDRAMPower(getMemSpec(p), include_io)) { } Data::MemArchitectureSpec -DRAMPower::getArchParams(const DRAMCtrlParams* p) +DRAMPower::getArchParams(const DRAMInterfaceParams* p) { Data::MemArchitectureSpec archSpec; archSpec.burstLength = p->burst_length; @@ -68,7 +68,7 @@ } Data::MemTimingSpec -DRAMPower::getTimingParams(const DRAMCtrlParams* p) +DRAMPower::getTimingParams(const DRAMInterfaceParams* p) { // Set the values that are used for power calculations and ignore // the ones only used by the controller functionality in DRAMPower @@ -100,7 +100,7 @@ } Data::MemPowerSpec -DRAMPower::getPowerParams(const DRAMCtrlParams* p) +DRAMPower::getPowerParams(const DRAMInterfaceParams* p) { // All DRAMPower currents are in mA Data::MemPowerSpec powerSpec; @@ -132,7 +132,7 @@ } Data::MemorySpecification -DRAMPower::getMemSpec(const DRAMCtrlParams* p) +DRAMPower::getMemSpec(const DRAMInterfaceParams* p) { Data::MemorySpecification memSpec; memSpec.memArchSpec = getArchParams(p); @@ -142,7 +142,18 @@ } bool -DRAMPower::hasTwoVDD(const DRAMCtrlParams* p) +DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p) { return p->VDD2 == 0 ? 
false : true;
 }
+
+uint8_t
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
+{
+    uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
+    uint8_t data_rate = p->burst_length / burst_cycles;
+    // 4 for GDDR5
+    if (data_rate != 1 && data_rate != 2 && data_rate != 4 && data_rate != 8)
+        fatal("Got unexpected data rate %d, should be 1 or 2 or 4 or 8\n",
+              data_rate);
+    return data_rate;
+}
diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
index da24bca..da68a78 100644
--- a/src/mem/drampower.hh
+++ b/src/mem/drampower.hh
@@ -44,7 +44,7 @@
 #define __MEM_DRAM_POWER_HH__
 
 #include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"
 
 /**
  * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,38 +57,44 @@
 
     /**
      * Transform the architechture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+        const DRAMInterfaceParams* p);
 
     /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
      * the memSpec of DRAMPower
      */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);
 
     /**
      * Transforms the power and current parameters defined in
-     * DRAMCtrlParam to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);
+
+    /**
+     * Determine the data rate in beats per clock (1, 2, 4, or 8).
+     */
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);
 
     /**
      * Determine if DRAM has two voltage domains (or one)
      */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);
 
     /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
      */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);
 
   public:
 
     // Instance of DRAMPower Library
     libDRAMPower powerlib;
 
-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);
 };
diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -34,18 +34,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.params import * -from m5.objects.AbstractMemory import AbstractMemory +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject from m5.objects.QoSTurnaround import * # QoS Queue Selection policy used to select packets among same-QoS queues class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"] -class QoSMemCtrl(AbstractMemory): +class QoSMemCtrl(ClockedObject): type = 'QoSMemCtrl' cxx_header = "mem/qos/mem_ctrl.hh" cxx_class = 'QoS::MemCtrl' abstract = True + system = Param.System(Parent.any, "System that the controller belongs to.") + ##### QoS support parameters #### # Number of priorities in the system diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py index 6c4f263..fafac64 100644 --- a/src/mem/qos/QoSMemSinkCtrl.py +++ b/src/mem/qos/QoSMemSinkCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ from m5.params import * from m5.objects.QoSMemCtrl import * +from m5.objects.QoSMemSinkInterface import * class QoSMemSinkCtrl(QoSMemCtrl): type = 'QoSMemSinkCtrl' @@ -44,6 +45,10 @@ cxx_class = "QoS::MemSinkCtrl" port = ResponsePort("Response ports") + + interface = Param.QoSMemSinkInterface(QoSMemSinkInterface(), + "Interface to memory") + # the basic configuration of the controller architecture, note # that each entry corresponds to a burst for the specific DRAM # configuration (e.g. x32 with burst length 8 is 32 bytes) and not @@ -59,5 +64,3 @@ # response latency - time to issue a response once a request is serviced response_latency = Param.Latency("20ns", "Memory response latency") - - diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py new file mode 100644 index 0000000..5c79f64 --- /dev/null +++ b/src/mem/qos/QoSMemSinkInterface.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.objects.AbstractMemory import AbstractMemory + +class QoSMemSinkInterface(AbstractMemory): + type = 'QoSMemSinkInterface' + cxx_header = "mem/qos/mem_sink.hh" diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript index f8601b6..1d90f9c 100644 --- a/src/mem/qos/SConscript +++ b/src/mem/qos/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ SimObject('QoSMemCtrl.py') SimObject('QoSMemSinkCtrl.py') +SimObject('QoSMemSinkInterface.py') SimObject('QoSPolicy.py') SimObject('QoSTurnaround.py') diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc index 50e6035..190960b 100644 --- a/src/mem/qos/mem_ctrl.cc +++ b/src/mem/qos/mem_ctrl.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited + * Copyright (c) 2017-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -42,7 +42,7 @@ namespace QoS { MemCtrl::MemCtrl(const QoSMemCtrlParams * p) - : AbstractMemory(p), + : ClockedObject(p), policy(p->qos_policy), turnPolicy(p->qos_turnaround_policy), queuePolicy(QueuePolicy::create(p)), @@ -51,7 +51,8 @@ qosSyncroScheduler(p->qos_syncro_scheduler), totalReadQueueSize(0), totalWriteQueueSize(0), busState(READ), busStateNext(READ), - stats(*this) + stats(*this), + _system(p->system) { // Set the priority policy if (policy) { @@ -77,12 +78,6 @@ {} void -MemCtrl::init() -{ - AbstractMemory::init(); -} - -void MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos, Addr addr, uint64_t entries) { diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh index 0e29fcc..5d7c9d6 100644 --- a/src/mem/qos/mem_ctrl.hh +++ b/src/mem/qos/mem_ctrl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited + * Copyright (c) 2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -36,10 +36,10 @@ */ #include "debug/QOS.hh" -#include "mem/abstract_mem.hh" -#include "mem/qos/q_policy.hh" #include "mem/qos/policy.hh" +#include "mem/qos/q_policy.hh" #include "params/QoSMemCtrl.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include <unordered_map> @@ -56,7 +56,7 @@ * which support QoS - it provides access to a set of QoS * scheduling policies */ -class MemCtrl: public AbstractMemory +class MemCtrl : public ClockedObject { public: /** Bus Direction */ @@ -151,6 +151,9 @@ Stats::Scalar numStayWriteState; } stats; + /** Pointer to the System object */ + System* _system; + /** * Initializes dynamically counters and * statistics for a given Master @@ -266,11 +269,6 @@ virtual ~MemCtrl(); /** - * Initializes this object - */ - void init() override; - - /** * Gets the current bus state * * @return current bus state @@ -346,6 +344,10 @@ * @return total number of priority levels */ uint8_t numPriorities() const { return _numPriorities; } + + /** read the system pointer + * 
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..dbdf548 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
 #include "mem_sink.hh"
+#include "params/QoSMemSinkInterface.hh"
 #include "sim/system.hh"

 namespace QoS {
@@ -50,12 +51,15 @@
     memoryPacketSize(p->memory_packet_size),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
     port(name() + ".port", *this),
+    interface(p->interface),
     retryRdReq(false), retryWrReq(false),
     nextRequest(0), nextReqEvent(this)
 {
     // Resize read and write queue to allocate space
     // for configured QoS priorities
     readQueue.resize(numPriorities());
     writeQueue.resize(numPriorities());
+
+    interface->setMemCtrl(this);
 }

 MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
              "%s Should not see packets where cache is responding\n",
              __func__);

-    access(pkt);
+    interface->access(pkt);

     return responseLatency;
 }
@@ -101,7 +105,7 @@
 {
     pkt->pushLabel(name());

-    functionalAccess(pkt);
+    interface->functionalAccess(pkt);

     pkt->popLabel();
 }
@@ -279,7 +283,7 @@

     // Do the actual memory access which also turns the packet
     // into a response
-    access(pkt);
+    interface->access(pkt);

     // Log the response
     logResponse(pkt->isRead()? READ : WRITE,
@@ -351,7 +355,7 @@
 MemSinkCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.interface->getAddrRange());

     return ranges;
 }
@@ -390,3 +394,13 @@

     return new QoS::MemSinkCtrl(this);
 }
+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
index 9a51269..5f6c1be 100644
--- a/src/mem/qos/mem_sink.hh
+++ b/src/mem/qos/mem_sink.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -41,10 +41,14 @@
 #ifndef __MEM_QOS_MEM_SINK_HH__
 #define __MEM_QOS_MEM_SINK_HH__

+#include "mem/abstract_mem.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
 namespace QoS {

 /**
@@ -163,6 +167,11 @@
     /** Memory slave port */
     MemoryPort port;

+    /**
+     * Pointer to the interface of the actual memory media
+     */
+    QoSMemSinkInterface* const interface;
+
     /** Read request pending */
     bool retryRdReq;

@@ -244,4 +253,17 @@

 } // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Set a back-pointer to the memory controller */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl) { ctrl = _ctrl; }
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
+
 #endif /* __MEM_QOS_MEM_SINK_HH__ */
diff --git a/tests/gem5/configs/base_config.py b/tests/gem5/configs/base_config.py
index b5bddf4..cbea768 100644
--- a/tests/gem5/configs/base_config.py
+++ b/tests/gem5/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -220,7 +220,12 @@
         super(BaseSESystem, self).init_system(system)

     def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                         membus = SystemXBar(),
                         mem_mode = self.mem_mode,
                         multi_thread = (self.num_threads > 1))
@@ -272,8 +277,16 @@
         else:
             # create the memory controllers and connect them, stick with
             # the physmem name to avoid bumping all the reference stats
-            system.physmem = [self.mem_class(range = r)
-                              for r in system.mem_ranges]
+            if issubclass(self.mem_class, m5.objects.DRAMInterface):
+                mem_ctrls = []
+                for r in system.mem_ranges:
+                    mem_ctrl = DRAMCtrl()
+                    mem_ctrl.dram = self.mem_class(range = r)
+                    mem_ctrls.append(mem_ctrl)
+                system.physmem = mem_ctrls
+            else:
+                system.physmem = [self.mem_class(range = r)
+                                  for r in system.mem_ranges]

             for i in range(len(system.physmem)):
                 system.physmem[i].port = system.membus.master
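The base_config.py hunk above shows the migration pattern config scripts
follow after this change: a DRAM class is instantiated as an interface
hanging off a DRAMCtrl instead of serving as the controller itself. A
minimal sketch (DDR3_1600_8x8 stands in for any DRAMInterface subclass;
the range is arbitrary):

    from m5.objects import DRAMCtrl, DDR3_1600_8x8, AddrRange

    mem_ctrl = DRAMCtrl()
    # The address range now lives on the interface, not the controller
    mem_ctrl.dram = DDR3_1600_8x8(range=AddrRange('512MB'))
    # The controller still owns the request port, e.g.:
    # mem_ctrl.port = system.membus.master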
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 13
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-Reviewer: Daniel Carvalho <odanrc(a)yahoo.com.br>
Gerrit-Reviewer: Jason Lowe-Power <power.jg(a)gmail.com>
Gerrit-Reviewer: John Alsop <johnathan.alsop(a)amd.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba(a)amd.com>
Gerrit-Reviewer: Nikos Nikoleris <nikos.nikoleris(a)arm.com>
Gerrit-Reviewer: Srikant Bharadwaj <srikant.bharadwaj(a)amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro(a)google.com>
Gerrit-MessageType: merged