gem5-dev@gem5.org

The gem5 Developer List

Change in gem5/gem5[develop]: mem: Make DRAMCtrl a ClockedObject

Wendy Elsasser (Gerrit)
Tue, May 12, 2020 6:30 PM

Wendy Elsasser has uploaded this change for review. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make DRAMCtrl a ClockedObject
......................................................................

mem: Make DRAMCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter pointing to
the interface, which is set up when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8

M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/configs/base_config.py
24 files changed, 1,934 insertions(+), 1,760 deletions(-)
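
For readers skimming the diff below: the net effect on a user
configuration is the controller/interface split sketched here. This is a
minimal sketch assembled from the learning_gem5 hunks in this change; the
System/clock/membus boilerplate around it is assumed for illustration and
is not part of the patch.

    import m5
    from m5.objects import *

    system = System()
    system.clk_domain = SrcClockDomain(clock='1GHz',
                                       voltage_domain=VoltageDomain())
    system.mem_mode = 'timing'
    system.mem_ranges = [AddrRange('512MB')]
    system.membus = SystemXBar()

    # Before this change, the technology class *was* the controller:
    #   system.mem_ctrl = DDR3_1600_8x8()
    #   system.mem_ctrl.range = system.mem_ranges[0]
    # After it, DRAMCtrl is a ClockedObject driving a DRAMInterface (an
    # AbstractMemory) that owns the address range and timing parameters:
    system.mem_ctrl = DRAMCtrl()
    system.mem_ctrl.dram = DDR3_1600_8x8()
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master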

diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py
index 9443520..ab6b933 100644
--- a/configs/common/MemConfig.py
+++ b/configs/common/MemConfig.py
@@ -40,7 +40,7 @@
 from common import ObjectList
 from common import HMC
 
-def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size):
+def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size):
     """
     Helper function for creating a single memory controller from the given
     options.  This function is invoked multiple times in config_mem
     function
@@ -59,33 +59,33 @@
 
     # Create an instance so we can figure out the address
     # mapping and row-buffer size
-    ctrl = cls()
+    interface = intf()
 
     # Only do this for DRAMs
-    if issubclass(cls, m5.objects.DRAMCtrl):
+    if issubclass(intf, m5.objects.DRAMInterface):
         # If the channel bits are appearing after the column
         # bits, we need to add the appropriate number of bits
         # for the row buffer size
-        if ctrl.addr_mapping.value == 'RoRaBaChCo':
+        if interface.addr_mapping.value == 'RoRaBaChCo':
             # This computation only really needs to happen
             # once, but as we rely on having an instance we
             # end up having to repeat it for each and every
             # one
-            rowbuffer_size = ctrl.device_rowbuffer_size.value * \
-                ctrl.devices_per_rank.value
+            rowbuffer_size = interface.device_rowbuffer_size.value * \
+                interface.devices_per_rank.value
 
             intlv_low_bit = int(math.log(rowbuffer_size, 2))
 
     # We got all we need to configure the appropriate address
     # range
-    ctrl.range = m5.objects.AddrRange(r.start, size = r.size(),
+    interface.range = m5.objects.AddrRange(r.start, size = r.size(),
                                       intlvHighBit = \
                                           intlv_low_bit + intlv_bits - 1,
                                       xorHighBit = \
                                           xor_low_bit + intlv_bits - 1,
                                       intlvBits = intlv_bits,
                                       intlvMatch = i)
-    return ctrl
+    return interface
 def config_mem(options, system):
     """
@@ -144,10 +144,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")
 
-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []
-    if opt_elastic_trace_en and not issubclass(cls, m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf, m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
               "simple-mem.")
@@ -158,36 +158,56 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)
 
     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
     for r in system.mem_ranges:
         for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
-                                       intlv_size)
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
+                                        intlv_size)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks
 
             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown
 
             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")
 
+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Override buffer sizes with interface specific values
+            mem_ctrl.write_buffer_size = dram_intf.write_buffer_size
+            mem_ctrl.read_buffer_size = dram_intf.read_buffer_size
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)
 
-    subsystem.mem_ctrls = mem_ctrls
-
-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
         if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
             # Set memory device size. There is an independent controller for
             # each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
         else:
-            subsystem.mem_ctrls[i].port = xbar.master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
+    subsystem.mem_ctrls = mem_ctrls
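
With create_mem_ctrl() split into create_mem_intf() plus an explicitly
constructed DRAMCtrl, the per-technology state now lives on the interface.
A quick, hypothetical way to inspect the resulting hierarchy from a config
script (illustration only, not part of the patch):

    # each entry in mem_ctrls is a DRAMCtrl; its 'dram' child carries the
    # address range and the device geometry that used to sit on the
    # controller itself
    for ctrl in system.mem_ctrls:
        print(ctrl.dram.range, ctrl.dram.ranks_per_channel)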
diff --git a/configs/dram/low_power_sweep.py b/configs/dram/low_power_sweep.py
index 9a62393..4a97fcb 100644
--- a/configs/dram/low_power_sweep.py
+++ b/configs/dram/low_power_sweep.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2014-2015, 2017, 2019 ARM Limited
+# Copyright (c) 2014-2015, 2017, 2019-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@
 from __future__ import absolute_import
 
 import argparse
+import math
 
 import m5
 from m5.objects import *
@@ -57,6 +58,10 @@
 parser = argparse.ArgumentParser(
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
+dram_generators = {
+    "DRAM" : lambda x: x.createDram,
+}
+
 # Use a single-channel DDR4-2400 in 16x4 configuration by default
 parser.add_argument("--mem-type", default="DDR4_2400_16x4",
                     choices=ObjectList.mem_list.get_names(),
@@ -77,7 +82,7 @@
                     help = "Percentage of read commands")
 
 parser.add_argument("--addr-map",
-                    choices=m5.objects.AddrMap.vals,
+                    choices=ObjectList.dram_addr_map_list.get_names(),
                     default="RoRaBaCoCh", help = "DRAM address map policy")
 
 parser.add_argument("--idle-end", type=int, default=50000000,
@@ -111,14 +116,19 @@
 
 # Sanity check for memory controller class.
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")
 
 # There is no point slowing things down by saving any data.
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
+
+# enable DRAM low power states
+system.mem_ctrls[0].dram.enable_dram_powerdown = True
 
 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = args.addr_map
-system.mem_ctrls[0].page_policy = args.page_policy
+system.mem_ctrls[0].dram.addr_mapping = args.addr_map
+system.mem_ctrls[0].dram.page_policy = args.page_policy
 
 # We create a traffic generator state for each param combination we want to
 # test. Each traffic generator state is specified in the config file and the
@@ -126,28 +136,23 @@
 # Stats are dumped and reset at the state transition.
 period = 250000000
 
-# We specify the states in a config file input to the traffic generator.
-cfg_file_name = "lowp_sweep.cfg"
-cfg_file_path = os.path.dirname(__file__) + "/" + cfg_file_name
-cfg_file = open(cfg_file_path, 'w')
-
 # Get the number of banks
-nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value)
+nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value)
 
 # determine the burst size in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)
 
 # next, get the page size in bytes (the rowbuffer size is already in bytes)
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value
 
 # Inter-request delay should be such that we can hit as many transitions
 # to/from low power states as possible to. We provide a min and max itt to the
 # traffic generator and it randomises in the range. The parameter is in
 # seconds and we need it in ticks (ps).
-itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000
+itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000
 
 # The itt value when set to (tRAS + tRP + tCK) covers the case where
 # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the
@@ -155,9 +160,9 @@
 # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK.
 # As we use this delay as a unit and create multiples of it as bigger delays
 # for the sweep, this parameter works for reads, writes and mix of them.
-pd_entry_time = (system.mem_ctrls[0].tRAS.value +
-                 system.mem_ctrls[0].tRP.value +
-                 system.mem_ctrls[0].tCK.value) * 1000000000000
+pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value +
+                 system.mem_ctrls[0].dram.tRP.value +
+                 system.mem_ctrls[0].dram.tCK.value) * 1000000000000
 
 # We sweep itt max using the multipliers specified by the user.
 itt_max_str = args.itt_list.strip().split()
@@ -180,42 +185,11 @@
 # banks
 bank_util_values = [1, int(nbr_banks/2), nbr_banks]
 
-# Next we create the config file, but first a comment
-cfg_file.write("""# STATE state# period mode=DRAM
-# read_percent start_addr end_addr req_size min_itt max_itt data_limit
-# stride_size page_size #banks #banks_util addr_map #ranks\n""")
-
-addr_map = m5.objects.AddrMap.map[args.addr_map]
-
-nxt_state = 0
-for itt_max in itt_max_values:
-    for bank in bank_util_values:
-        for stride_size in stride_values:
-            cfg_file.write("STATE %d %d %s %d 0 %d %d "
-                           "%d %d %d %d %d %d %d %d %d\n" %
-                           (nxt_state, period, "DRAM", args.rd_perc, max_addr,
-                            burst_size, itt_min, itt_max, 0, stride_size,
-                            page_size, nbr_banks, bank, addr_map,
-                            args.mem_ranks))
-            nxt_state = nxt_state + 1
-
 # State for idle period
 idle_period = args.idle_end
-cfg_file.write("STATE %d %d IDLE\n" % (nxt_state, idle_period))
-
-# Init state is state 0
-cfg_file.write("INIT 0\n")
-
-# Go through the states one by one
-for state in range(1, nxt_state + 1):
-    cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state))
-
-# Transition from last state to itself to not break the probability math
-cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state, nxt_state))
-cfg_file.close()
 
 # create a traffic generator, and point it to the file we just created
-system.tgen = TrafficGen(config_file = cfg_file_path)
+system.tgen = PyTrafficGen()
 
 # add a communication monitor
 system.monitor = CommMonitor()
@@ -230,14 +204,34 @@
 # every period, dump and reset all stats
 periodicStatDump(period)
 
+# run Forrest, run!
 root = Root(full_system = False, system = system)
 root.system.mem_mode = 'timing'
 
 m5.instantiate()
 
+def trace():
+    addr_map = ObjectList.dram_addr_map_list.get(args.addr_map)
+    generator = dram_generators["DRAM"](system.tgen)
+    for itt_max in itt_max_values:
+        for bank in bank_util_values:
+            for stride_size in stride_values:
+                num_seq_pkts = int(math.ceil(float(stride_size) / burst_size))
+                yield generator(period,
+                                0, max_addr, burst_size, int(itt_min),
+                                int(itt_max), args.rd_perc, 0,
+                                num_seq_pkts, page_size, nbr_banks, bank,
+                                addr_map, args.mem_ranks)
+    yield system.tgen.createIdle(idle_period)
+    yield system.tgen.createExit(0)
+
+system.tgen.start(trace())
+
 # Simulate for exactly as long as it takes to go through all the states
 # This is why sim exists.
-m5.simulate(nxt_state * period + idle_period)
+m5.simulate()
+
 print("--- Done DRAM low power sweep ---")
 print("Fixed params - ")
 print("\tburst: %d, banks: %d, max stride: %d, itt min: %s ns" %
@@ -247,4 +241,3 @@
 print("\titt max values", itt_max_values)
 print("\tbank utilization values", bank_util_values)
 print("\tstride values:", stride_values)
-print("Traffic gen config file:", cfg_file_name)
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index d3c86c3..6a49f44 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@

 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")
 
 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
 
 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map
 
 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated
 
 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value
 
 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)
 
 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value
 
 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt = getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-              system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt = getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+              system.mem_ctrls[0].dram.tBURST.value) * 1000000000000
 
 # assume we start at 0
 max_addr = mem_range.end
diff --git a/configs/learning_gem5/part1/simple.py b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part1/two_level.py b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
 system.system_port = system.membus.slave
 
 # Create a DDR3 memory controller
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Create a process for a simple "Hello World" application
diff --git a/configs/learning_gem5/part2/simple_cache.py b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part2/simple_memobj.py b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master
 
 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part3/simple_ruby.py b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
 system.cpu = [TimingSimpleCPU() for i in range(2)]
 
 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 
 # create the interrupt controller for the CPU and connect to the membus
 for cpu in system.cpu:
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..dff5000 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@

 from m5.params import *
 from m5.proxy import *
-from m5.objects.AbstractMemory import *
 from m5.objects.QoSMemCtrl import *
 
 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']
 
-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
-
-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']
-
 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM
@@ -72,8 +58,11 @@
     # bus in front of the controller for multiple ports
     port = SlavePort("Slave port")
 
-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface(Parent.any, "DRAM interface")
+
+    # Set default buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
     # the cacheline size or request/packet size
     write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
    @@ -93,15 +82,6 @@

     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")
 
-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")
-
     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +89,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")

-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst lenght (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-    # DRAMPower provides in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
-    # timing behaviour and constraints - all in nanoseconds
-    # the base clock period of the DRAM
-    tCK = Param.Latency("Clock period")
-    # the amount of time in nanoseconds from issuing an activate command
-    # to the data being available in the row buffer for a read/write
-    tRCD = Param.Latency("RAS to CAS delay")
-    # the time from issuing a read/write command to seeing the actual data
-    tCL = Param.Latency("CAS latency")
-    # minimum time between a precharge and subsequent activate
-    tRP = Param.Latency("Row precharge time")
-    # minimum time between an activate and a precharge to the same row
-    tRAS = Param.Latency("ACT to PRE delay")
-    # minimum time between a write data transfer and a precharge
-    tWR = Param.Latency("Write recovery time")
-    # minimum time between a read and precharge command
-    tRTP = Param.Latency("Read to precharge")
-    # time to complete a burst transfer, typically the burst length
-    # divided by two due to the DDR bus, but by making it a parameter
-    # it is easier to also evaluate SDR memories like WideIO.
-    # This parameter has to account for burst length.
-    # Read/Write requests with data size larger than one full burst are broken
-    # down into multiple requests in the controller
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = Param.Latency("Burst duration "
-                           "(typically burst length / 2 cycles)")
-    # tBURST_MAX is the column array cycle delay required before next access,
-    # which could be greater than tBURST when the memory access time is greater
-    # than tBURST
-    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
-    # tBURST_MIN is the minimum delay between bursts, which could be less than
-    # tBURST when interleaving is supported
-    tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts")
-    # CAS-to-CAS delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
-    # Write-to-Write delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # This will be used to enable different same bank group delays
-    # for writes versus reads
-    tCCD_L_WR = Param.Latency(Self.tCCD_L,
-        "Same bank group Write to Write delay")
-    # time taken to complete one refresh cycle (N rows in all banks)
-    tRFC = Param.Latency("Refresh cycle time")
-    # refresh command interval, how often a "ref" command needs
-    # to be sent. It is 7.8 us for a 64ms refresh requirement
-    tREFI = Param.Latency("Refresh command interval")
-    # write-to-read, same rank turnaround penalty
-    tWTR = Param.Latency("Write to read, same rank switching time")
-    # write-to-read, same rank turnaround penalty for same bank group
-    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
-                           "time, same bank group")
-    # read-to-write, same rank turnaround penalty
-    tRTW = Param.Latency("Read to write, same rank switching time")
-    # rank-to-rank bus delay penalty
-    # this does not correlate to a memory timing parameter and encompasses:
-    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
-    # different rank bus delay
-    tCS = Param.Latency("Rank to rank switching time")
-    # minimum precharge to precharge delay time
-    tPPD = Param.Latency("0ns", "PRE to PRE delay")
-    # maximum delay between two-cycle ACT command phases
-    tAAD = Param.Latency(Self.tCK,
-                         "Maximum delay between two-cycle ACT commands")
-    two_cycle_activate = Param.Bool(False,
-                         "Two cycles required to send activate")
-    # minimum row activate to row activate delay time
-    tRRD = Param.Latency("ACT to ACT delay")
-    # only utilized with bank group architectures; set to 0 for default case
-    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
-    # time window in which a maximum number of activates are allowed
-    # to take place, set to 0 to disable
-    tXAW = Param.Latency("X activation window")
-    activation_limit = Param.Unsigned("Max number of activates in window")
-    # time to exit power-down mode
-    # Exit power-down to next valid command delay
-    tXP = Param.Latency("0ns", "Power-up Delay")
-    # Exit Powerdown to commands requiring a locked DLL
-    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
-    # time to exit self-refresh mode
-    tXS = Param.Latency("0ns", "Self-refresh exit latency")
-    # time to exit self-refresh mode with locked DLL
-    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
-    # number of data beats per clock. with DDR, default is 2, one per edge
-    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
-    data_clock_sync = Param.Bool(False, "Synchronization commands required")
-    # Currently rolled into other params
-    ######################################################################
-    # tRC  - assumed to be tRAS + tRP
-    # Power Behaviour and Constraints
-    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
-    # defined as VDD and VDD2. Each current is defined for each voltage domain
-    # separately. For example, current IDD0 is active-precharge current for
-    # voltage domain VDD and current IDD02 is active-precharge current for
-    # voltage domain VDD2.
-    # By default all currents are set to 0mA. Users who are only interested in
-    # the performance of DRAMs can leave them at 0.
-    # Operating 1 Bank Active-Precharge current
-    IDD0 = Param.Current("0mA", "Active precharge current")
-    # Operating 1 Bank Active-Precharge current multiple voltage Range
-    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
-    # Precharge Power-down Current: Slow exit
-    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
-    # Precharge Power-down Current: Slow exit multiple voltage Range
-    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
-    # Precharge Power-down Current: Fast exit
-    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
-    # Precharge Power-down Current: Fast exit multiple voltage Range
-    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
-    # Precharge Standby current
-    IDD2N = Param.Current("0mA", "Precharge Standby current")
-    # Precharge Standby current multiple voltage range
-    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
-    # Active Power-down current: slow exit
-    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
-    # Active Power-down current: slow exit multiple voltage range
-    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
-    # Active Power-down current : fast exit
-    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
-    # Active Power-down current : fast exit multiple voltage range
-    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
-    # Active Standby current
-    IDD3N = Param.Current("0mA", "Active Standby current")
-    # Active Standby current multiple voltage range
-    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
-    # Burst Read Operating Current
-    IDD4R = Param.Current("0mA", "READ current")
-    # Burst Read Operating Current multiple voltage range
-    IDD4R2 = Param.Current("0mA", "READ current VDD2")
-    # Burst Write Operating Current
-    IDD4W = Param.Current("0mA", "WRITE current")
-    # Burst Write Operating Current multiple voltage range
-    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
-    # Refresh Current
-    IDD5 = Param.Current("0mA", "Refresh current")
-    # Refresh Current multiple voltage range
-    IDD52 = Param.Current("0mA", "Refresh current VDD2")
-    # Self-Refresh Current
-    IDD6 = Param.Current("0mA", "Self-refresh Current")
-    # Self-Refresh Current multiple voltage range
-    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
-    # Main voltage range of the DRAM
-    VDD = Param.Voltage("0V", "Main Voltage Range")
-    # Second voltage range defined by some DRAMs
-    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
-# A single DDR3-1600 x64 channel (one command and address bus), with
-# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
-# an 8x8 configuration.
-class DDR3_1600_8x8(DRAMCtrl):
-    # size of device in bytes
-    device_size = '512MB'
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # DDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # Use two ranks
-    ranks_per_channel = 2
-    # DDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
-    tBURST = '5ns'
-    # DDR3-1600 11-11-11
-    tRCD = '13.75ns'
-    tCL = '13.75ns'
-    tRP = '13.75ns'
-    tRAS = '35ns'
-    tRRD = '6ns'
-    tXAW = '30ns'
-    activation_limit = 4
-    tRFC = '260ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns
-    tWTR = '7.5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    tXS = '270ns'
-    # Current values from datasheet Die Rev E,J
-    IDD0 = '55mA'
-    IDD2N = '32mA'
-    IDD3N = '38mA'
-    IDD4W = '125mA'
-    IDD4R = '157mA'
-    IDD5 = '235mA'
-    IDD3P1 = '38mA'
-    IDD2P1 = '32mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single HMC-2500 x32 model based on:
-# [1] DRAMSpec: a high-level DRAM bank modelling tool
-# developed at the University of Kaiserslautern. This high level tool
-# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
-# estimate the DRAM bank latency and power numbers.
-# [2] High performance AXI-4.0 based interconnect for extensible smart memory
-# cubes (E. Azarkhish et. al)
-# Assumed for the HMC model is a 30 nm technology node.
-# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4
-# layers).
-# Each layer has 16 vaults and each vault consists of 2 banks per layer.
-# In order to be able to use the same controller used for 2D DRAM generations
-# for HMC, the following analogy is done:
-# Channel (DDR) => Vault (HMC)
-# device_size (DDR) => size of a single layer in a vault
-# ranks per channel (DDR) => number of layers
-# banks per rank (DDR) => banks per layer
-# devices per rank (DDR) => devices per layer ( 1 for HMC).
-# The parameters for which no input is available are inherited from the DDR3
-# configuration.
-# This configuration includes the latencies from the DRAM to the logic layer
-# of the HMC
-class HMC_2500_1x32(DDR3_1600_8x8):
-    # size of device
-    # two banks per device with each bank 4MB [2]
-    device_size = '8MB'
-    # 1x32 configuration, 1 device with 32 TSVs [2]
-    device_bus_width = 32
-    # HMC is a BL8 device [2]
-    burst_length = 8
-    # Each device has a page (row buffer) size of 256 bytes [2]
-    device_rowbuffer_size = '256B'
-    # 1x32 configuration, so 1 device [2]
-    devices_per_rank = 1
-    # 4 layers so 4 ranks [2]
-    ranks_per_channel = 4
-    # HMC has 2 banks per layer [2]
-    # Each layer represents a rank. With 4 layers and 8 banks in total, each
-    # layer has 2 banks; thus 2 banks per rank.
-    banks_per_rank = 2
-    # 1250 MHz [2]
-    tCK = '0.8ns'
-    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
-    tBURST = '3.2ns'
-    # Values using DRAMSpec HMC model [1]
-    tRCD = '10.2ns'
-    tCL = '9.9ns'
-    tRP = '7.7ns'
-    tRAS = '21.6ns'
-    # tRRD depends on the power supply network for each vendor.
-    # We assume a tRRD of a double bank approach to be equal to 4 clock
-    # cycles (Assumption)
-    tRRD = '3.2ns'
-    # activation limit is set to 0 since there are only 2 banks per vault
-    # layer.
-    activation_limit = 0
-    # Values using DRAMSpec HMC model [1]
-    tRFC = '59ns'
-    tWR = '8ns'
-    tRTP = '4.9ns'
-    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz =
-    # 0.8 ns (Assumption)
-    tCS = '0.8ns'
-    # Value using DRAMSpec HMC model [1]
-    tREFI = '3.9us'
-    # The default page policy in the vault controllers is simple closed page
-    # [2] nevertheless 'close' policy opens and closes the row multiple times
-    # for bursts largers than 32Bytes. For this reason we use 'close_adaptive'
-    page_policy = 'close_adaptive'
-    # RoCoRaBaCh resembles the default address mapping in HMC
-    addr_mapping = 'RoCoRaBaCh'
-    min_writes_per_switch = 8
-    # These parameters do not directly correlate with buffer_size in real
-    # hardware. Nevertheless, their value has been tuned to achieve a
-    # bandwidth similar to the cycle-accurate model in [2]
-    write_buffer_size = 32
-    read_buffer_size = 32
-    # The static latency of the vault controllers is estimated to be smaller
-    # than a full DRAM channel controller
-    static_backend_latency='4ns'
-    static_frontend_latency='4ns'

-# A single DDR3-2133 x64 channel refining a selected subset of the
-# options for the DDR-1600 configuration, based on the same DDR3-1600
-# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
-# consistent across the two configurations.
-class DDR3_2133_8x8(DDR3_1600_8x8):
-    # 1066 MHz
-    tCK = '0.938ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
-    tBURST = '3.752ns'
-    # DDR3-2133 14-14-14
-    tRCD = '13.09ns'
-    tCL = '13.09ns'
-    tRP = '13.09ns'
-    tRAS = '33ns'
-    tRRD = '5ns'
-    tXAW = '25ns'
-    # Current values from datasheet
-    IDD0 = '70mA'
-    IDD2N = '37mA'
-    IDD3N = '44mA'
-    IDD4W = '157mA'
-    IDD4R = '191mA'
-    IDD5 = '250mA'
-    IDD3P1 = '44mA'
-    IDD2P1 = '43mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
-# in an 16x4 configuration.
-# Total channel capacity is 32GB
-# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
-class DDR4_2400_16x4(DRAMCtrl):
-    # size of device
-    device_size = '1GB'
-    # 16x4 configuration, 16 devices each with a 4-bit interface
-    device_bus_width = 4
-    # DDR4 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
-    device_rowbuffer_size = '512B'
-    # 16x4 configuration, so 16 devices
-    devices_per_rank = 16
-    # Match our DDR3 configurations which is dual rank
-    ranks_per_channel = 2
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 4 for x4 case
-    bank_groups_per_rank = 4
-    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 16
-    # override the default buffer sizes and go for something larger to
-    # accommodate the larger bank count
-    write_buffer_size = 128
-    read_buffer_size = 64
-    # 1200 MHz
-    tCK = '0.833ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '3.332ns'
-    # @2400 data rate, tCCD_L is 6 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '5ns';
-    # DDR4-2400 17-17-17
-    tRCD = '14.16ns'
-    tCL = '14.16ns'
-    tRP = '14.16ns'
-    tRAS = '32ns'
-    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
-    tRRD = '3.332ns'
-    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns';
-    # tFAW for 512B page is MAX(16 CK, 13ns)
-    tXAW = '13.328ns'
-    activation_limit = 4
-    # tRFC is 350ns
-    tRFC = '350ns'
-    tWR = '15ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
-    tRTW = '1.666ns'
-    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
-    tCS = '1.666ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
-    # tRFC + 10ns = 340ns
-    tXS = '340ns'
-    # Current values from datasheet
-    IDD0 = '43mA'
-    IDD02 = '3mA'
-    IDD2N = '34mA'
-    IDD3N = '38mA'
-    IDD3N2 = '3mA'
-    IDD4W = '103mA'
-    IDD4R = '110mA'
-    IDD5 = '250mA'
-    IDD3P1 = '32mA'
-    IDD2P1 = '25mA'
-    IDD6 = '30mA'
-    VDD = '1.2V'
-    VDD2 = '2.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
-# in an 8x8 configuration.
-# Total channel capacity is 16GB
-# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
-class DDR4_2400_8x8(DDR4_2400_16x4):
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns';
-    tXAW = '21ns'
-    # Current values from datasheet
-    IDD0 = '48mA'
-    IDD3N = '43mA'
-    IDD4W = '123mA'
-    IDD4R = '135mA'
-    IDD3P1 = '37mA'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
-# in an 4x16 configuration.
-# Total channel capacity is 4GB
-# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel
-class DDR4_2400_4x16(DDR4_2400_16x4):
-    # 4x16 configuration, 4 devices each with an 16-bit interface
-    device_bus_width = 16
-    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
-    device_rowbuffer_size = '2kB'
-    # 4x16 configuration, so 4 devices
-    devices_per_rank = 4
-    # Single rank for x16
-    ranks_per_channel = 1
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 2 for x16 case
-    bank_groups_per_rank = 2
-    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 8
-    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
-    tRRD = '5.3ns'
-    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
-    tRRD_L = '6.4ns';
-    tXAW = '30ns'
-    # Current values from datasheet
-    IDD0 = '80mA'
-    IDD02 = '4mA'
-    IDD2N = '34mA'
-    IDD3N = '47mA'
-    IDD4W = '228mA'
-    IDD4R = '243mA'
-    IDD5 = '280mA'
-    IDD3P1 = '41mA'

-# A single LPDDR2-S4 x32 interface (one command/address bus), with
-# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
-# in a 1x32 configuration.
-class LPDDR2_S4_1066_1x32(DRAMCtrl):
-    # No DLL in LPDDR2
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR2_S4 is a BL4 and BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '1kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR2-S4 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 533 MHz
-    tCK = '1.876ns'
-    # Fixed at 15 ns
-    tRCD = '15ns'
-    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
-    tCL = '15ns'
-    # Pre-charge one bank 15 ns (all banks 18 ns)
-    tRP = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    tRTP = '7.5ns'
-    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '7.5ns'
-    # LPDDR2-S4, 4 Gbit
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
-    tRTW = '3.75ns'
-    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
-    tCS = '3.75ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of density, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '15mA'
-    IDD02 = '70mA'
-    IDD2N = '2mA'
-    IDD2N2 = '30mA'
-    IDD3N = '2.5mA'
-    IDD3N2 = '30mA'
-    IDD4W = '10mA'
-    IDD4W2 = '190mA'
-    IDD4R = '3mA'
-    IDD4R2 = '220mA'
-    IDD5 = '40mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.2mA'
-    IDD3P12 = '8mA'
-    IDD2P1 = '0.6mA'
-    IDD2P12 = '0.8mA'
-    IDD6 = '1mA'
-    IDD62 = '3.2mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single WideIO x128 interface (one command and address bus), with
-# default timings based on an estimated WIO-200 8 Gbit part.
-class WideIO_200_1x128(DRAMCtrl):
-    # No DLL for WideIO
-    dll = False
-    # size of device
-    device_size = '1024MB'
-    # 1x128 configuration, 1 device with a 128-bit interface
-    device_bus_width = 128
-    # This is a BL4 device
-    burst_length = 4
-    # Each device has a page (row buffer) size of 4KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '4kB'
-    # 1x128 configuration, so 1 device
-    devices_per_rank = 1
-    # Use one rank for a one-high die stack
-    ranks_per_channel = 1
-    # WideIO has 4 banks in all configurations
-    banks_per_rank = 4
-    # 200 MHz
-    tCK = '5ns'
-    # WIO-200
-    tRCD = '18ns'
-    tCL = '18ns'
-    tRP = '18ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Read to precharge is same as the burst
-    tRTP = '20ns'
-    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
-    # Note this is a BL4 SDR device.
-    tBURST = '20ns'
-    # WIO 8 Gb
-    tRFC = '210ns'
-    # WIO 8 Gb, <=85C, half for >85C
-    tREFI = '3.9us'
-    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
-    tWTR = '15ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
-    tRTW = '10ns'
-    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
-    tCS = '10ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Two instead of four activation window
-    tXAW = '50ns'
-    activation_limit = 2
-    # The WideIO specification does not provide current information

-# A single LPDDR3 x32 interface (one command/address bus), with
-# default timings based on a LPDDR3-1600 4 Gbit part (Micron
-# EDF8132A1MC) in a 1x32 configuration.
-class LPDDR3_1600_1x32(DRAMCtrl):
-    # No DLL for LPDDR3
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 4KB
-    device_rowbuffer_size = '4kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Technically the datasheet is a dual-rank package, but for
-    # comparison with the LPDDR2 config we stick to a single rank
-    ranks_per_channel = 1
-    # LPDDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    tRCD = '18ns'
-    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
-    tCL = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
-    tRTP = '7.5ns'
-    # Pre-charge one bank 18 ns (all banks 21 ns)
-    tRP = '18ns'
-    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '5ns'
-    # LPDDR3, 4 Gb
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of size, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '8mA'
-    IDD02 = '60mA'
-    IDD2N = '0.8mA'
-    IDD2N2 = '26mA'
-    IDD3N = '2mA'
-    IDD3N2 = '34mA'
-    IDD4W = '2mA'
-    IDD4W2 = '190mA'
-    IDD4R = '2mA'
-    IDD4R2 = '230mA'
-    IDD5 = '28mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.4mA'
-    IDD3P12 = '11mA'
-    IDD2P1 = '0.8mA'
-    IDD2P12 = '1.8mA'
-    IDD6 = '0.5mA'
-    IDD62 = '1.8mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single GDDR5 x64 interface, with
-# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
-# H5GQ1H24AFR) in a 2x32 configuration.
-class GDDR5_4000_2x32(DRAMCtrl):
-    # size of device
-    device_size = '128MB'
-    # 2x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # GDDR5 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
-    device_rowbuffer_size = '256B'
-    # 2x32 configuration, so 2 devices
-    devices_per_rank = 2
-    # assume single rank
-    ranks_per_channel = 1
-    # GDDR5 has 4 bank groups
-    bank_groups_per_rank = 4
-    # GDDR5 has 16 banks with 4 bank groups
-    banks_per_rank = 16
-    # 1000 MHz
-    tCK = '1ns'
-    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
-    # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz )
-    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '2ns'
-    # @1000MHz data rate, tCCD_L is 3 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '3ns';
-    tRCD = '12ns'
-    # tCL is not directly found in datasheet and assumed equal tRCD
-    tCL = '12ns'
-    tRP = '12ns'
-    tRAS = '28ns'
-    # RRD_S (different bank group)
-    # RRD_S is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD = '6ns'
-    # RRD_L (same bank group)
-    # RRD_L is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD_L = '6ns'
-    tXAW = '23ns'
-    # tXAW < 4 x tRRD.
-    # Therefore, activation limit is set to 0
-    activation_limit = 0
-    tRFC = '65ns'
-    tWR = '12ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Read-to-Precharge 2 CK
-    tRTP = '2ns'
-    # Assume 2 cycles
-    tRTW = '2ns'

-# A single HBM x128 interface (one command and address bus), with
-# default timings based on data publically released
-# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
-# IDD measurement values, and by extrapolating data from other classes.
-# Architecture values based on published HBM spec
-# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
-class HBM_1000_4H_1x128(DRAMCtrl):
-    # HBM gen1 supports up to 8 128-bit physical channels
-    # Configuration defines a single channel, with the capacity
-    # set to (full_ stack_capacity / 8) based on 2Gb dies
-    # To use all 8 channels, set 'channels' parameter to 8 in
-    # system configuration
-    # 128-bit interface legacy mode
-    device_bus_width = 128
-    # HBM supports BL4 and BL2 (legacy mode only)
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
-    # with 8 channels, 128MB per channel
-    device_size = '128MB'
-    device_rowbuffer_size = '2kB'
-    # 1x128 configuration
-    devices_per_rank = 1
-    # HBM does not have a CS pin; set rank to 1
-    ranks_per_channel = 1
-    # HBM has 8 or 16 banks depending on capacity
-    # 2Gb dies have 8 banks
-    banks_per_rank = 8
-    # depending on frequency, bank groups may be required
-    # will always have 4 bank groups when enabled
-    # current specifications do not define the minimum frequency for
-    # bank group architecture
-    # setting bank_groups_per_rank to 0 to disable until range is defined
-    bank_groups_per_rank = 0
-    # 500 MHz for 1Gbps DDR data rate
-    tCK = '2ns'
-    # use values from IDD measurement in JEDEC spec
-    # use tRP value for tRCD and tCL similar to other classes
-    tRP = '15ns'
-    tRCD = '15ns'
-    tCL = '15ns'
-    tRAS = '33ns'
-    # BL2 and BL4 supported, default to BL4
-    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
-    tBURST = '4ns'
-    # value for 2Gb device from JEDEC spec
-    tRFC = '160ns'
-    # value for 2Gb device from JEDEC spec
-    tREFI = '3.9us'
-    # extrapolate the following from LPDDR configs, using ns values
-    # to minimize burst length, prefetch differences
-    tWR = '18ns'
-    tRTP = '7.5ns'
-    tWTR = '10ns'
-    # start with 2 cycles turnaround, similar to other memory classes
-    # could be more with variations across the stack
-    tRTW = '4ns'
-    # single rank device, set to 0
-    tCS = '0ns'
-    # from MemCon example, tRRD is 4ns with 2ns tCK
-    tRRD = '4ns'
-    # from MemCon example, tFAW is 30ns with 2ns tCK
-    tXAW = '30ns'
-    activation_limit = 4
-    # 4tCK
-    tXP = '8ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '168ns'

-# A single HBM x64 interface (one command and address bus), with
-# default timings based on HBM gen1 and data publically released
-# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
-# Note: This defines a pseudo-channel with a unique controller
-# instantiated per pseudo-channel
-# Stay at same IO rate (1Gbps) to maintain timing relationship with
-# HBM gen1 class (HBM_1000_4H_x128) where possible
-class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
-    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
-    # Configuration defines a single pseudo channel, with the capacity
-    # set to (full_ stack_capacity / 16) based on 8Gb dies
-    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
-    # system configuration
-    # 64-bit pseudo-channle interface
-    device_bus_width = 64
-    # HBM pseudo-channel only supports BL4
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
-    # with 16 channels, 256MB per channel
-    device_size = '256MB'
-    # page size is halved with pseudo-channel; maintaining the same same number
-    # of rows per pseudo-channel with 2X banks across 2 channels
-    device_rowbuffer_size = '1kB'
-    # HBM has 8 or 16 banks depending on capacity
-    # Starting with 4Gb dies, 16 banks are defined
-    banks_per_rank = 16
-    # reset tRFC for larger, 8Gb device
-    # use HBM1 4Gb value as a starting point
-    tRFC = '260ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '268ns'
-    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
-    tCS = '2ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '10ns'
-    # self refresh exit time
-    tXS = '65ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl):

  • Increase buffer size to account for more bank resources

  • read_buffer_size = 64
  • Set page policy to better suit DMC Huxley

  • page_policy = 'close_adaptive'
  • 16-bit channel interface

  • device_bus_width = 16
  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL32 for higher command bandwidth

  • burst_length = 32
  • size of device in bytes

  • device_size = '1GB'
  • 2kB page with BG mode

  • device_rowbuffer_size = '2kB'
  • Use a 1x16 configuration

  • devices_per_rank = 1
  • Use a single rank

  • ranks_per_channel = 1
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Initial configuration will have 16 banks with Bank Group Arch

  • to maximim resources and enable higher data rates

  • banks_per_rank = 16
  • bank_groups_per_rank = 4
  • 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK

  • tCK = '1.455ns'
  • Greater of 2 CK or 18ns

  • tRCD = '18ns'
  • Base RL is 16 CK @ 687.5 MHz = 23.28ns

  • tCL = '23.280ns'
  • Greater of 2 CK or 18ns

  • tRP = '18ns'
  • Greater of 3 CK or 42ns

  • tRAS = '42ns'
  • Greater of 3 CK or 34ns

  • tWR = '34ns'
  • active powerdown and precharge powerdown exit time

  • Greater of 3 CK or 7ns

  • tXP = '7ns'
  • self refresh exit time (tRFCab + 7.5ns)

  • tXS = '217.5ns'
  • Greater of 2 CK or 7.5 ns minus 2 CK

  • tRTP = '4.59ns'
  • With BG architecture, burst of 32 transferred in two 16-beat

  • sub-bursts, with a 16-beat gap in between.

  • Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz

  • tBURST is the delay to transfer the Bstof32 =  6 CK @ 687.5 MHz

  • tBURST = '8.73ns'
  • can interleave a Bstof32 from another bank group at tBURST_MIN

  • 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz

  • tBURST_MIN = '2.91ns'
  • tBURST_MAX is the maximum burst delay for same bank group timing

  • this is 8 CK @ 687.5 MHz

  • tBURST_MAX = '11.64ns'
  • 8 CK @ 687.5 MHz

  • tCCD_L = "11.64ns"
  • LPDDR5, 8 Gbit/channel for 280ns tRFCab

  • tRFC = '210ns'
  • tREFI = '3.9us'
  • Greater of 4 CK or 6.25 ns

  • tWTR = '6.25ns'
  • Greater of 4 CK or 12 ns

  • tWTR_L = '12ns'
  • Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL

  • tWCKDQ0/tCK will be 1 CK for most cases

  • For gem5 RL = WL and BL/n is already accounted for with tBURST

  • Result is and additional 1 CK is required

  • tRTW = '1.455ns'
  • Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns

  • tCS = '2.91ns'
  • 2 CK

  • tPPD = '2.91ns'
  • Greater of 2 CK or 5 ns

  • tRRD = '5ns'
  • tRRD_L = '5ns'
  • With Bank Group Arch mode tFAW is 20 ns

  • tXAW = '20ns'
  • activation_limit = 4
  • at 5Gbps, 4:1 WCK to CK ratio required

  • 2 data beats per WCK (DDR) -> 8 per CK

  • beats_per_clock = 8
  • 2 cycles required to send activate command

  • 2 command phases can be sent back-to-back or

  • with a gap up to tAAD = 8 CK

  • two_cycle_activate = True
  • tAAD = '11.640ns'
  • data_clock_sync = True

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):

  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL16 for smaller access granularity

  • burst_length = 16
  • For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST = '2.91ns'
  • tBURST_MIN = '2.91ns'
  • For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST_MAX = '5.82ns'
  • 4 CK @ 687.5 MHz

  • tCCD_L = "5.82ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):

  • 4kB page with 8B mode

  • device_rowbuffer_size = '4kB'
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Select 8B

  • banks_per_rank = 8
  • bank_groups_per_rank = 0
  • For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio

  • tBURST = '5.82ns'
  • tBURST_MIN = '5.82ns'
  • tBURST_MAX = '5.82ns'
  • Greater of 4 CK or 12 ns

  • tWTR = '12ns'
  • Greater of 2 CK or 10 ns

  • tRRD = '10ns'
  • With 8B mode tFAW is 40 ns

  • tXAW = '40ns'
  • activation_limit = 4
  • Reset BG arch timing for 8B mode

  • tCCD_L = "0ns"
  • tRRD_L = "0ns"
  • tWTR_L = "0ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):

  • 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK

  • tCK = '1.25ns'
  • Base RL is 17 CK @ 800 MHz = 21.25ns

  • tCL = '21.25ns'
  • With BG architecture, burst of 32 transferred in two 16-beat

  • sub-bursts, with a 16-beat gap in between.

  • Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz

  • tBURST is the delay to transfer the Bstof32 =  6 CK @ 800 MHz

  • tBURST = '7.5ns'
  • can interleave a Bstof32 from another bank group at tBURST_MIN

  • 16-beats is 8 WCK @2.3 GHz or 2 CK @ 800 MHz

  • tBURST_MIN = '2.5ns'
  • tBURST_MAX is the maximum burst delay for same bank group timing

  • this is 8 CK @ 800 MHz

  • tBURST_MAX = '10ns'
  • 8 CK @ 800 MHz

  • tCCD_L = "10ns"
  • Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL

  • tWCKDQ0/tCK will be 1 CK for most cases

  • For gem5 RL = WL and BL/n is already accounted for with tBURST

  • Result is and additional 1 CK is required

  • tRTW = '1.25ns'
  • Default different rank bus delay to 2 CK, @687.5 MHz = 2.5 ns

  • tCS = '2.5ns'
  • 2 CK

  • tPPD = '2.5ns'
  • 2 command phases can be sent back-to-back or

  • with a gap up to tAAD = 8 CK

  • tAAD = '10ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on initial
-# JEDEC specifcation
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):

  • LPDDR5 is a BL16 or BL32 device

  • With BG mode, BL16 and BL32 are supported

  • Use BL16 for smaller access granularity

  • burst_length = 16
  • For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio

  • tBURST = '2.5ns'
  • tBURST_MIN = '2.5ns'
  • For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio

  • tBURST_MAX = '5ns'
  • 4 CK @ 800 MHz

  • tCCD_L = "5ns"

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):

  • 4kB page with 8B mode

  • device_rowbuffer_size = '4kB'
  • LPDDR5 supports configurable bank options

  • 8B  : BL32, all frequencies

  • 16B : BL32 or BL16, <=3.2Gbps

  • 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps

  • Select 8B

  • banks_per_rank = 8
  • bank_groups_per_rank = 0
  • For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio

  • tBURST = '5ns'
  • tBURST_MIN = '5ns'
  • tBURST_MAX = '5ns'
  • Greater of 4 CK or 12 ns

  • tWTR = '12ns'
  • Greater of 2 CK or 10 ns

  • tRRD = '10ns'
  • With 8B mode tFAW is 40 ns

  • tXAW = '40ns'
  • activation_limit = 4
  • Reset BG arch timing for 8B mode

  • tCCD_L = "0ns"
  • tRRD_L = "0ns"
  • tWTR_L = "0ns"
    diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py
    new file mode 100644
    index 0000000..35bf8a3
    --- /dev/null
    +++ b/src/mem/DRAMInterface.py
    @@ -0,0 +1,1483 @@
    +# Copyright (c) 2012-2020 ARM Limited
    +# All rights reserved.
    +#
    +# The license below extends only to copyright in the software and shall
    +# not be construed as granting a license to any other intellectual
    +# property including but not limited to intellectual property relating
    +# to a hardware implementation of the functionality of the software
    +# licensed hereunder.  You may use the software subject to the license
    +# terms below provided that you ensure that this notice is replicated
    +# unmodified and in its entirety in all distributions of the software,
    +# modified or unmodified, in source code or in binary form.
    +#
    +# Copyright (c) 2013 Amin Farmahini-Farahani
    +# Copyright (c) 2015 University of Kaiserslautern
    +# Copyright (c) 2015 The University of Bologna
    +# All rights reserved.
    +#
    +# Redistribution and use in source and binary forms, with or without
    +# modification, are permitted provided that the following conditions are
    +# met: redistributions of source code must retain the above copyright
    +# notice, this list of conditions and the following disclaimer;
    +# redistributions in binary form must reproduce the above copyright
    +# notice, this list of conditions and the following disclaimer in the
    +# documentation and/or other materials provided with the distribution;
    +# neither the name of the copyright holders nor the names of its
    +# contributors may be used to endorse or promote products derived from
    +# this software without specific prior written permission.
    +#
    +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+from AbstractMemory import AbstractMemory
+from DRAMCtrl import *
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']

+class DRAMInterface(AbstractMemory):

+    type = 'DRAMInterface'
+    cxx_header = "mem/dram_ctrl.hh"
+
+    # scheduler, address map and page policy
+    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
+    page_policy = Param.PageManage('open_adaptive', "Page management policy")
+
+    # Allow the interface to set required controller buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
+    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
+    # the cacheline size or request/packet size
+    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
+    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+
+    # enforce a limit on the number of accesses per row
+    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
+                                          "closing");
+
+    # size of DRAM Chip in Bytes
+    device_size = Param.MemorySize("Size of DRAM chip")
+
+    # the physical organisation of the DRAM
+    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
+                                      "device/chip")
+    burst_length = Param.Unsigned("Burst length (BL) in beats")
+    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
+                                             "device/chip")
+    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
+    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
+
+    # default to 0 bank groups per rank, indicating bank group architecture
+    # is not used
+    # update per memory class when bank group architecture is supported
+    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
+    banks_per_rank = Param.Unsigned("Number of banks per rank")
+
+    # Enable DRAM powerdown states if True. This is False by default due to
+    # performance being lower when enabled
+    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
+
+    # For power modelling we need to know if the DRAM has a DLL or not
+    dll = Param.Bool(True, "DRAM has DLL or not")
+
+    # DRAMPower provides in addition to the core power, the possibility to
+    # include RD/WR termination and IO power. This calculation assumes some
+    # default values. The integration of DRAMPower with gem5 does not include
+    # IO and RD/WR termination power by default. This might be added as an
+    # additional feature in the future.
+
+    # timing behaviour and constraints - all in nanoseconds
+
+    # the base clock period of the DRAM
+    tCK = Param.Latency("Clock period")
+
+    # rank-to-rank bus delay penalty
+    # this does not correlate to a memory timing parameter and encompasses:
+    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
+    # different rank bus delay
+    tCS = Param.Latency("Rank to rank switching time")
+
+    # the amount of time in nanoseconds from issuing an activate command
+    # to the data being available in the row buffer for a read/write
+    tRCD = Param.Latency("RAS to CAS delay")
+
+    # the time from issuing a read/write command to seeing the actual data
+    tCL = Param.Latency("CAS latency")
+
+    # minimum time between a precharge and subsequent activate
+    tRP = Param.Latency("Row precharge time")
+
+    # minimum time between an activate and a precharge to the same row
+    tRAS = Param.Latency("ACT to PRE delay")
+
+    # minimum time between a write data transfer and a precharge
+    tWR = Param.Latency("Write recovery time")
+
+    # minimum time between a read and precharge command
+    tRTP = Param.Latency("Read to precharge")
+
+    # time to complete a burst transfer, typically the burst length
+    # divided by two due to the DDR bus, but by making it a parameter
+    # it is easier to also evaluate SDR memories like WideIO.
+    # This parameter has to account for burst length.
+    # Read/Write requests with data size larger than one full burst are broken
+    # down into multiple requests in the controller
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = Param.Latency("Burst duration "
+                           "(typically burst length / 2 cycles)")
+
+    # tBURST_MAX is the column array cycle delay required before next access,
+    # which could be greater than tBURST when the memory access time is
+    # greater than tBURST
+    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
+
+    # tBURST_MIN is the minimum delay between bursts, which could be less
+    # than tBURST when interleaving is supported
+    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
+
+    # CAS-to-CAS delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
+
+    # Write-to-Write delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # This will be used to enable different same bank group delays
+    # for writes versus reads
+    tCCD_L_WR = Param.Latency(Self.tCCD_L, "Same bank group Write to Write " \
+                                           "delay")
+
+    # time taken to complete one refresh cycle (N rows in all banks)
+    tRFC = Param.Latency("Refresh cycle time")
+
+    # refresh command interval, how often a "ref" command needs
+    # to be sent. It is 7.8 us for a 64ms refresh requirement
+    tREFI = Param.Latency("Refresh command interval")
+
+    # write-to-read, same rank turnaround penalty
+    tWTR = Param.Latency("Write to read, same rank switching time")
+
+    # write-to-read, same rank turnaround penalty for same bank group
+    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
+                           "time, same bank group")
+
+    # read-to-write, same rank turnaround penalty
+    tRTW = Param.Latency("Read to write, same rank switching time")
+
+    # minimum precharge to precharge delay time
+    tPPD = Param.Latency("0ns", "PRE to PRE delay")
+
+    # maximum delay between two-cycle ACT command phases
+    tAAD = Param.Latency(Self.tCK,
+                         "Maximum delay between two-cycle ACT commands")
+
+    two_cycle_activate = Param.Bool(False,
+                         "Two cycles required to send activate")
+
+    # minimum row activate to row activate delay time
+    tRRD = Param.Latency("ACT to ACT delay")
+
+    # only utilized with bank group architectures; set to 0 for default case
+    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
+
+    # time window in which a maximum number of activates are allowed
+    # to take place, set to 0 to disable
+    tXAW = Param.Latency("X activation window")
+    activation_limit = Param.Unsigned("Max number of activates in window")
+
+    # time to exit power-down mode
+    # Exit power-down to next valid command delay
+    tXP = Param.Latency("0ns", "Power-up Delay")
+
+    # Exit Powerdown to commands requiring a locked DLL
+    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
+
+    # time to exit self-refresh mode
+    tXS = Param.Latency("0ns", "Self-refresh exit latency")
+
+    # time to exit self-refresh mode with locked DLL
+    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
+
+    # number of data beats per clock. with DDR, default is 2, one per edge
+    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
+
+    data_clock_sync = Param.Bool(False, "Synchronization commands required")
+
+    # Currently rolled into other params
+    ######################################################################
+    # tRC  - assumed to be tRAS + tRP
+
+    # Power Behaviour and Constraints
+    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
+    # defined as VDD and VDD2. Each current is defined for each voltage domain
+    # separately. For example, current IDD0 is active-precharge current for
+    # voltage domain VDD and current IDD02 is active-precharge current for
+    # voltage domain VDD2.
+    # By default all currents are set to 0mA. Users who are only interested in
+    # the performance of DRAMs can leave them at 0.
+
+    # Operating 1 Bank Active-Precharge current
+    IDD0 = Param.Current("0mA", "Active precharge current")
+
+    # Operating 1 Bank Active-Precharge current multiple voltage Range
+    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
+
+    # Precharge Power-down Current: Slow exit
+    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
+
+    # Precharge Power-down Current: Slow exit multiple voltage Range
+    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
+
+    # Precharge Power-down Current: Fast exit
+    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
+
+    # Precharge Power-down Current: Fast exit multiple voltage Range
+    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
+
+    # Precharge Standby current
+    IDD2N = Param.Current("0mA", "Precharge Standby current")
+
+    # Precharge Standby current multiple voltage range
+    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
+
+    # Active Power-down current: slow exit
+    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
+
+    # Active Power-down current: slow exit multiple voltage range
+    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
+
+    # Active Power-down current: fast exit
+    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
+
+    # Active Power-down current: fast exit multiple voltage range
+    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
+
+    # Active Standby current
+    IDD3N = Param.Current("0mA", "Active Standby current")
+
+    # Active Standby current multiple voltage range
+    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
+
+    # Burst Read Operating Current
+    IDD4R = Param.Current("0mA", "READ current")
+
+    # Burst Read Operating Current multiple voltage range
+    IDD4R2 = Param.Current("0mA", "READ current VDD2")
+
+    # Burst Write Operating Current
+    IDD4W = Param.Current("0mA", "WRITE current")
+
+    # Burst Write Operating Current multiple voltage range
+    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
+
+    # Refresh Current
+    IDD5 = Param.Current("0mA", "Refresh current")
+
+    # Refresh Current multiple voltage range
+    IDD52 = Param.Current("0mA", "Refresh current VDD2")
+
+    # Self-Refresh Current
+    IDD6 = Param.Current("0mA", "Self-refresh Current")
+
+    # Self-Refresh Current multiple voltage range
+    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
+
+    # Main voltage range of the DRAM
+    VDD = Param.Voltage("0V", "Main Voltage Range")
+
+    # Second voltage range defined by some DRAMs
+    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
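
A minimal usage sketch for the parameters above: after this change a config
script instantiates the controller and hangs an interface off its new 'dram'
parameter (see the dram_ctrl.cc hunk below), with the address range set on
the interface rather than the controller. The 'system' and 'membus' objects
are the usual config-script assumptions, not part of this patch:

    system.mem_ctrl = DRAMCtrl()
    # the interface, not the controller, is now the AbstractMemory
    system.mem_ctrl.dram = DDR3_1600_8x8()
    # the address range is defined per interface
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master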

+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):

+    # size of device in bytes
+    device_size = '512MB'
+
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # DDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # Use two ranks
+    ranks_per_channel = 2
+
+    # DDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
+    tBURST = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # DDR3-1600 11-11-11
+    tRCD = '13.75ns'
+    tCL = '13.75ns'
+    tRP = '13.75ns'
+    tRAS = '35ns'
+    tRRD = '6ns'
+    tXAW = '30ns'
+    activation_limit = 4
+    tRFC = '260ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    tXS = '270ns'
+
+    # Current values from datasheet Die Rev E,J
+    IDD0 = '55mA'
+    IDD2N = '32mA'
+    IDD3N = '38mA'
+    IDD4W = '125mA'
+    IDD4R = '157mA'
+    IDD5 = '235mA'
+    IDD3P1 = '38mA'
+    IDD2P1 = '32mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart
+# memory cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of
+# memory (4 layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM
+# generations for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer (1 for HMC).
+# The parameters for which no input is available are inherited from the
+# DDR3 configuration.
+# This configuration includes the latencies from the DRAM to the logic
+# layer of the HMC
+class HMC_2500_1x32_Interface(DDR3_1600_8x8):
+    # A single HMC-2500 x32 controller
+    # The buffer parameters do not directly correlate with buffer_size in
+    # real hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32
+
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
+    # = 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed page
+    # [2]; nevertheless the 'close' policy opens and closes the row multiple
+    # times for bursts larger than 32 bytes. For this reason we use
+    # 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'

+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):

+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):

+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns';
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):

+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns';
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'

+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):

+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):

+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information

+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):

+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):

+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # @1000 MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns';
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'

+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'

+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_1x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+

+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+

+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+

+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+

+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+

+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+

+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
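
The classes in this file follow a single pattern: a new device variant
subclasses an existing interface and overrides only the timings that differ.
As a hypothetical sketch (a DDR3-1333 derived by scaling the DDR3_1600_8x8
comments, not taken from any datasheet):

    class DDR3_1333_8x8(DDR3_1600_8x8):
        # 667 MHz
        tCK = '1.5ns'
        # 8 beats across an x64 interface translates to 4 clocks @ 667 MHz
        tBURST = '6ns'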
    diff --git a/src/mem/SConscript b/src/mem/SConscript
    index b77dbb1..76ffdbd 100644
    --- a/src/mem/SConscript
    +++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-

-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018, 2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall

@@ -47,6 +47,7 @@
SimObject('AddrMapper.py')
SimObject('Bridge.py')
SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
SimObject('ExternalMaster.py')
SimObject('ExternalSlave.py')
SimObject('MemObject.py')
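
Registering DRAMInterface.py as a SimObject makes the interface classes above
importable in config scripts alongside the controller, e.g.:

    from m5.objects import DRAMCtrl, DDR3_1600_8x8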
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index dc244fe..533aa01 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
+#include "params/DRAMInterface.hh"
 #include "sim/system.hh"

 using namespace std;
@@ -58,12 +59,13 @@
     retryRdReq(false), retryWrReq(false),
     nextReqEvent([this]{ processNextReqEvent(); }, name()),
     respondEvent([this]{ processRespondEvent(); }, name()),
+    dram(p->dram),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
-    writesThisTime(0), readsThisTime(0), tCS(p->tCS),
+    writesThisTime(0), readsThisTime(0),
     memSchedPolicy(p->mem_sched_policy),
     frontendLatency(p->static_frontend_latency),
     backendLatency(p->static_backend_latency),
@@ -75,37 +77,23 @@
     readQueue.resize(p->qos_priorities);
     writeQueue.resize(p->qos_priorities);

+    dram->setCtrl(this);
+
     // perform a basic check of the write thresholds
     if (p->write_low_thresh_perc >= p->write_high_thresh_perc)
         fatal("Write buffer low threshold %d must be smaller than the "
               "high threshold %d\n", p->write_low_thresh_perc,
               p->write_high_thresh_perc);
-
-    // determine the rows per bank by looking at the total capacity
-    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
-
-    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
-            AbstractMemory::size());
-
-    // create a DRAM interface
-    // will only populate the ranks if DRAM is configured
-    dram = new DRAMInterface(*this, p, capacity, range);
-    DPRINTF(DRAM, "Created DRAM interface \n");
 }

 void
 DRAMCtrl::init()
 {
-    MemCtrl::init();
-
     if (!port.isConnected()) {
         fatal("DRAMCtrl %s is unconnected!\n", name());
     } else {
         port.sendRangeChange();
     }
-
-    dram->init(range);
 }

 void
@@ -115,8 +103,6 @@
     isTimingMode = system()->isTimingMode();

     if (isTimingMode) {
-        dram->startup();
-
         // shift the bus busy time sufficiently far ahead that we never
         // have to worry about negative values when computing the time for
         // the next request, this will add an insignificant bubble at the
@@ -134,7 +120,7 @@
              "is responding");

     // do the actual memory access and turn the packet into a response
-    access(pkt);
+    dram->access(pkt);

     Tick latency = 0;
     if (pkt->hasData()) {
@@ -264,7 +250,7 @@
     // address of first DRAM packet is kept unaliged. Subsequent DRAM packets
     // are aligned to burst size boundaries. This is to ensure we accurately
     // check read packets against packets in write queue.
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     unsigned pktsServicedByWrQ = 0;
     BurstHelper* burst_helper = NULL;
@@ -364,7 +350,7 @@

     // if the request size is larger than burst size, the pkt is split into
     // multiple DRAM packets
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     uint32_t burstSize = dram->bytesPerBurst();
     for (int cnt = 0; cnt < pktCount; ++cnt) {
@@ -527,7 +513,7 @@
     DRAMPacket* dram_pkt = respQueue.front();

     // media specific checks and functions when read response is complete
-    dram->respondEventDRAM(dram_pkt->rank);
+    dram->respondEvent(dram_pkt->rank);

     if (dram_pkt->burstHelper) {
         // it is a split packet
@@ -726,12 +712,12 @@
 void
 DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency)
 {
-    DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
+    DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr());

     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
     // response
-    access(pkt);
+    dram->access(pkt);

     // turn packet around to go back to requester if response expected
     if (needsResponse) {
@@ -876,9 +862,9 @@
     // if not, shift to next burst window
     Tick act_at;
     if (twoCycleActivate)
-        act_at = ctrl.verifyMultiCmd(act_tick, tAAD);
+        act_at = ctrl->verifyMultiCmd(act_tick, tAAD);
     else
-        act_at = ctrl.verifySingleCmd(act_tick);
+        act_at = ctrl->verifySingleCmd(act_tick);

     DPRINTF(DRAM, "Activate at tick %d\n", act_at);

@@ -996,7 +982,7 @@
         // Issuing an explicit PRE command
         // Verify that we have command bandwidth to issue the precharge
         // if not, shift to next burst window
-        pre_at = ctrl.verifySingleCmd(pre_tick);
+        pre_at = ctrl->verifySingleCmd(pre_tick);
         // enforce tPPD
         for (int i = 0; i < banksPerRank; i++) {
             rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD,
@@ -1046,7 +1032,7 @@

     // first clean up the burstTick set, removing old entries
     // before adding new entries for next burst
-    ctrl.pruneBurstTick();
+    ctrl->pruneBurstTick();

     // get the rank
     Rank& rank_ref = *ranks[dram_pkt->rank];
@@ -1098,9 +1084,9 @@
     // verify that we have command bandwidth to issue the burst
     // if not, shift to next burst window
     if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay))
-        cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK);
+        cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK);
     else
-        cmd_at = ctrl.verifySingleCmd(cmd_at);
+        cmd_at = ctrl->verifySingleCmd(cmd_at);

     // if we are interleaving bursts, ensure that
     // 1) we don't double interleave on next burst issue
@@ -1200,9 +1186,9 @@

         // either look at the read queue or write queue
         const std::vector<DRAMPacketQueue>& queue =
-                ctrl.selQueue(dram_pkt->isRead());
+                ctrl->selQueue(dram_pkt->isRead());

-        for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) {
+        for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) {
             auto p = queue[i].begin();
             // keep on looking until we find a hit or reach the end of the
             // queue
@@ -1273,6 +1259,7 @@
         // Update latency stats
         stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime;
         stats.totQLat += cmd_at - dram_pkt->entryTime;
+        stats.totBusLat += tBURST;
     } else {
         // Schedule write done event to decrement event count
         // after the readyTime has been reached
@@ -1338,13 +1325,9 @@
         // Update latency stats
         stats.masterReadTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
-
-        stats.bytesRead += dram->bytesPerBurst();
-        stats.totBusLat += dram->burstDly();
         stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size;
     } else {
         ++writesThisTime;
-        stats.bytesWritten += dram->bytesPerBurst();
         stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size;
         stats.masterWriteTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
@@ -1446,8 +1429,9 @@

                 // Figure out which read request goes next
                 // If we are changing command type, incorporate the minimum
-                // bus turnaround delay which will be tCS (different rank) case
-                to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0);
+                // bus turnaround delay which will be rank to rank delay
+                to_read = chooseNext((*queue), switched_cmd_type ?
+                                               dram->rankDelay() : 0);

                 if (to_read != queue->end()) {
                     // candidate read found
@@ -1526,7 +1510,8 @@
             // If we are changing command type, incorporate the minimum
             // bus turnaround delay
             to_write = chooseNext((*queue),
-                     switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0);
+                     switched_cmd_type ? std::min(dram->minRdToWr(),
+                                                  dram->rankDelay()) : 0);

             if (to_write != queue->end()) {
                 write_found = true;
@@ -1599,11 +1584,8 @@
     }
 }

-DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl,
-                             const DRAMCtrlParams* _p,
-                             const uint64_t capacity,
-                             const AddrRange range)
-    : SimObject(_p), ctrl(_ctrl),
+DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p)
+    : AbstractMemory(_p),
       addrMapping(_p->addr_mapping),
       burstSize((_p->devices_per_rank * _p->burst_length *
                  _p->device_bus_width) / 8),
@@ -1618,7 +1600,7 @@
       bankGroupsPerRank(_p->bank_groups_per_rank),
       bankGroupArch(_p->bank_groups_per_rank > 0),
       banksPerRank(_p->banks_per_rank), rowsPerBank(0),
-      tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST),
+      tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST),
       tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW),
       tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD),
       tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP),
@@ -1634,12 +1616,12 @@
       wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW),
       wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L),
       rdToWrDlySameBG(tRTW + _p->tBURST_MAX),
-      rankToRankDly(ctrl.rankDelay() + tBURST),
+      rankToRankDly(tCS + tBURST),
       pageMgmt(_p->page_policy),
       maxAccessesPerRow(_p->max_accesses_per_row),
       timeStampOffset(0), activeRank(0),
       enableDRAMPowerdown(_p->enable_dram_powerdown),
-      stats(_ctrl, *this)
+      stats(*this)
 {
     fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, "
              "must be a power of two\n", burstSize);
@@ -1651,7 +1633,7 @@

     for (int i = 0; i < ranksPerChannel; i++) {
         DPRINTF(DRAM, "Creating DRAM rank %d \n", i);
-        Rank* rank = new Rank(ctrl, _p, i, *this);
+        Rank* rank = new Rank(_p, i, *this);
         ranks.push_back(rank);
     }

@@ -1659,6 +1641,11 @@
     uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank *
                               ranksPerChannel;

+    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
+
+    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
+            AbstractMemory::size());
+
     // if actual DRAM size does not match memory capacity in system warn!
     if (deviceCapacity != capacity / (1024 * 1024))
         warn("DRAM device capacity (%d Mbytes) does not match the "
@@ -1713,8 +1700,10 @@
 }

 void
-DRAMInterface::init(AddrRange range)
+DRAMInterface::init()
 {
+    AbstractMemory::init();
+
     // a bit of sanity checks on the interleaving, save it for here to
     // ensure that the system pointer is initialised
     if (range.interleaved()) {
@@ -1736,7 +1725,7 @@

             // channel striping has to be done at a granularity that
             // is equal or larger to a cache line
-            if (ctrl.system()->cacheLineSize() > range.granularity()) {
+            if (system()->cacheLineSize() > range.granularity()) {
                 fatal("Channel interleaving of %s must be at least as large "
                       "as the cache line size\n", name());
             }
@@ -1755,8 +1744,10 @@
 void
 DRAMInterface::startup()
 {
-    // timestamp offset should be in clock cycles for DRAMPower
-    timeStampOffset = divCeil(curTick(), tCK);
+    if (system()->isTimingMode()) {
+        // timestamp offset should be in clock cycles for DRAMPower
+        timeStampOffset = divCeil(curTick(), tCK);
+    }

     for (auto r : ranks) {
         r->startup(curTick() + tREFI - tRP);
@@ -1802,7 +1793,7 @@
 }

 void
-DRAMInterface::respondEventDRAM(uint8_t rank)
+DRAMInterface::respondEvent(uint8_t rank)
 {
     Rank& rank_ref = *ranks[rank];

@@ -1943,7 +1934,7 @@
                 std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP;

             // When is the earliest the R/W burst can issue?
-            const Tick col_allowed_at = ctrl.inReadBusState(false) ?
+            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
                                           ranks[i]->banks[j].rdAllowedAt :
                                           ranks[i]->banks[j].wrAllowedAt;
             Tick col_at = std::max(col_allowed_at, act_at + tRCD);
@@ -1983,9 +1974,15 @@
     return make_pair(bank_mask, hidden_bank_prep);
 }

-DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
-                          DRAMInterface& _dram)
-    : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram),
+DRAMInterface*
+DRAMInterfaceParams::create()
+{
+    return new DRAMInterface(this);
+}
+
+DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p,
+                          int _rank, DRAMInterface& _dram)
+    : EventManager(&_dram), dram(_dram),
       pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE),
       pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE),
       refreshState(REF_IDLE), inLowPowerState(false), rank(_rank),
@@ -1998,7 +1995,7 @@
       refreshEvent([this]{ processRefreshEvent(); }, name()),
       powerEvent([this]{ processPowerEvent(); }, name()),
       wakeUpEvent([this]{ processWakeUpEvent(); }, name()),
-      stats(_ctrl, *this)
+      stats(_dram, *this)
 {
     for (int b = 0; b < _p->banks_per_rank; b++) {
         banks[b].bank = b;
@@ -2049,8 +2046,10 @@
 DRAMInterface::Rank::isQueueEmpty() const
 {
     // check commmands in Q based on current bus direction
-    bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0))
-                       || (!ctrl.inReadBusState(true) && (writeEntries == 0));
+    bool no_queued_cmds = (dram.ctrl->inReadBusState(true) &&
+                          (readEntries == 0))
+                       || (!dram.ctrl->inReadBusState(true) &&
+                          (writeEntries == 0));
     return no_queued_cmds;
 }

@@ -2174,7 +2173,7 @@
     // if a request is at the moment being handled and this request is
     // accessing the current rank then wait for it to finish
     if ((rank == dram.activeRank)
-        && (ctrl.nextReqEvent.scheduled())) {
+        && (dram.ctrl->nextReqEvent.scheduled())) {
         // hand control over to the request loop until it is
         // evaluated next
         DPRINTF(DRAM, "Refresh awaiting draining\n");
@@ -2249,7 +2248,7 @@
             // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled
             // should have outstanding precharge or read response event
             assert(prechargeEvent.scheduled() ||
-                   ctrl.respondEvent.scheduled());
+                   dram.ctrl->respondEvent.scheduled());
             // will start refresh when pwrState transitions to IDLE
         }

@@ -2309,8 +2308,8 @@

         assert(!powerEvent.scheduled());

-        if ((ctrl.drainState() == DrainState::Draining) ||
-            (ctrl.drainState() == DrainState::Drained)) {
+        if ((dram.ctrl->drainState() == DrainState::Draining) ||
+            (dram.ctrl->drainState() == DrainState::Drained)) {
             // if draining, do not re-enter low-power mode.
             // simply go to IDLE and wait
             schedulePowerEvent(PWR_IDLE, curTick());
@@ -2535,10 +2534,10 @@
         }

         // completed refresh event, ensure next request is scheduled
-        if (!ctrl.nextReqEvent.scheduled()) {
+        if (!dram.ctrl->nextReqEvent.scheduled()) {
             DPRINTF(DRAM, "Scheduling next request after refreshing"
                            " rank %d\n", rank);
-            schedule(ctrl.nextReqEvent, curTick());
+            schedule(dram.ctrl->nextReqEvent, curTick());
         }
     }

@@ -2597,8 +2596,8 @@
         // bypass auto-refresh and go straight to SREF, where memory
         // will issue refresh immediately upon entry
         if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() &&
-           (ctrl.drainState() != DrainState::Draining) &&
-           (ctrl.drainState() != DrainState::Drained) &&
+           (dram.ctrl->drainState() != DrainState::Draining) &&
+           (dram.ctrl->drainState() != DrainState::Drained) &&
            dram.enableDRAMPowerdown) {
             DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning "
                     "to self refresh at %11u tick\n", rank, curTick());
@@ -2669,7 +2668,7 @@
     // power (mW) = ----------- * ----------
     //              time (tick)  tick_frequency
     stats.averagePower = (stats.totalEnergy.value() /
-                          (curTick() - ctrl.lastStatsResetTick)) *
+                          (curTick() - dram.ctrl->lastStatsResetTick)) *
                          (SimClock::Frequency / 1000000000.0);
 }

@@ -2699,7 +2698,7 @@
 bool
 DRAMInterface::Rank::forceSelfRefreshExit() const {
     return (readEntries != 0) ||
-           (!ctrl.inReadBusState(true) && (writeEntries != 0));
+           (!dram.ctrl->inReadBusState(true) && (writeEntries != 0));
 }

 DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl)
@@ -2710,15 +2709,15 @@
     ADD_STAT(writeReqs, "Number of write requests accepted"),

     ADD_STAT(readBursts,
-             "Number of DRAM read bursts, "
+             "Number of controller read bursts, "
              "including those serviced by the write queue"),
     ADD_STAT(writeBursts,
-             "Number of DRAM write bursts, "
+             "Number of controller write bursts, "
              "including those merged in the write queue"),
     ADD_STAT(servicedByWrQ,
-             "Number of DRAM read bursts serviced by the write queue"),
+             "Number of controller read bursts serviced by the write queue"),
     ADD_STAT(mergedWrBursts,
-             "Number of DRAM write bursts merged with an existing one"),
+             "Number of controller write bursts merged with an existing one"),
     ADD_STAT(neitherReadNorWriteReqs,
              "Number of requests that are neither read nor write"),

@@ -2726,9 +2725,6 @@
     ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"),
     ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"),

-    ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
-    ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
-
     ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"),
     ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"),

@@ -2743,22 +2739,13 @@
     ADD_STAT(wrPerTurnAround,
              "Writes before turning the bus around for reads"),

-    ADD_STAT(bytesRead, "Total number of bytes read from memory"),
     ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"),
-    ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
     ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"),
     ADD_STAT(bytesWrittenSys,
              "Total written bytes from the system interface side"),

-    ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"),
-    ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"),
     ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"),
     ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"),
-    ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
-
-    ADD_STAT(busUtil, "Data bus utilization in percentage"),
-    ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
-    ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),

     ADD_STAT(totGap, "Total gap between requests"),
     ADD_STAT(avgGap, "Average gap between requests"),
@@ -2790,12 +2777,11 @@
 {
     using namespace Stats;

-    assert(ctrl._system);
-    const auto max_masters = ctrl._system->maxMasters();
+    assert(ctrl.system());
+    const auto max_masters = ctrl.system()->maxMasters();

     avgRdQLen.precision(2);
     avgWrQLen.precision(2);
-    avgBusLat.precision(2);

     readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
     writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
@@ -2810,14 +2796,9 @@
         .init(ctrl.writeBufferSize)
         .flags(nozero);

-    avgRdBW.precision(2);
-    avgWrBW.precision(2);
     avgRdBWSys.precision(2);
     avgWrBWSys.precision(2);
-    peakBW.precision(2);
-    busUtil.precision(2);
     avgGap.precision(2);
-    busUtilWrite.precision(2);

     // per-master bytes read and written to memory
     masterReadBytes
@@ -2849,9 +2830,6 @@
         .flags(nonan)
         .precision(2);

-    busUtilRead
-        .precision(2);
-
     masterWriteRate
         .flags(nozero | nonan)
         .precision(12);
@@ -2865,7 +2843,7 @@
         .precision(2);

     for (int i = 0; i < max_masters; i++) {
-        const std::string master = ctrl._system->getMasterName(i);
+        const std::string master = ctrl.system()->getMasterName(i);
         masterReadBytes.subname(i, master);
         masterReadRate.subname(i, master);
         masterWriteBytes.subname(i, master);
@@ -2879,22 +2857,11 @@
     }

     // Formula stats
-    avgBusLat = totBusLat / (readBursts - servicedByWrQ);
-
-    avgRdBW = (bytesRead / 1000000) / simSeconds;
-    avgWrBW = (bytesWritten / 1000000) / simSeconds;
     avgRdBWSys = (bytesReadSys / 1000000) / simSeconds;
     avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds;
-    peakBW = (SimClock::Frequency / ctrl.dram->burstDataDly()) *
-              ctrl.dram->bytesPerBurst() / 1000000;
-    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;

     avgGap = totGap / (readReqs + writeReqs);

-    busUtilRead = avgRdBW / peakBW * 100;
-    busUtilWrite = avgWrBW / peakBW * 100;
-
     masterReadRate = masterReadBytes / simSeconds;
     masterWriteRate = masterWriteBytes / simSeconds;
     masterReadAvgLat = masterReadTotalLat / masterReadAccesses;
@@ -2907,8 +2874,8 @@
     ctrl.lastStatsResetTick = curTick();
 }

-DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram)
-    : Stats::Group(&_ctrl, csprintf("dram").c_str()),
+DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram)
+    : Stats::Group(&_dram),
       dram(_dram),

     ADD_STAT(readBursts, "Number of DRAM read bursts"),
@@ -2918,10 +2885,13 @@
     ADD_STAT(perBankWrBursts, "Per bank write bursts"),

     ADD_STAT(totQLat, "Total ticks spent queuing"),
+    ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
     ADD_STAT(totMemAccLat,
              "Total ticks spent from burst creation until serviced "
              "by the DRAM"),

     ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"),
+    ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
     ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"),

     ADD_STAT(readRowHits, "Number of row buffer hits during reads"),
@@ -2934,6 +2904,12 @@
     ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
     ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"),
     ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"),
+    ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
+
+    ADD_STAT(busUtil, "Data bus utilization in percentage"),
+    ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
+    ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),
+
     ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined")
 {
@@ -2945,6 +2921,7 @@
     using namespace Stats;

     avgQLat.precision(2);
+    avgBusLat.precision(2);
     avgMemAccLat.precision(2);

     readRowHitRate.precision(2);
@@ -2958,10 +2935,16 @@
              dram.maxAccessesPerRow : dram.rowBufferSize)
         .flags(nozero);

+    peakBW.precision(2);
+    busUtil.precision(2);
+    busUtilWrite.precision(2);
+    busUtilRead.precision(2);
+
     pageHitRate.precision(2);

     // Formula stats
     avgQLat = totQLat / readBursts;
+    avgBusLat = totBusLat / readBursts;
     avgMemAccLat = totMemAccLat / readBursts;

     readRowHitRate = (readRowHits / readBursts) * 100;
@@ -2969,13 +2952,19 @@

     avgRdBW = (bytesRead / 1000000) / simSeconds;
     avgWrBW = (bytesWritten / 1000000) / simSeconds;
+    peakBW = (SimClock::Frequency / dram.burstDataDly()) *
+              dram.bytesPerBurst() / 1000000;
+    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;
+    busUtilRead = avgRdBW / peakBW * 100;
+    busUtilWrite = avgWrBW / peakBW * 100;

     pageHitRate = (writeRowHits + readRowHits) /
                   (writeBursts + readBursts) * 100;
 }

-DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank)
-    : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()),
+DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank)
+    : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()),
       rank(_rank),

     ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"),
@@ -3034,7 +3023,7 @@
 DRAMCtrl::recvFunctional(PacketPtr pkt)
 {
     // rely on the abstract memory
-    functionalAccess(pkt);
+    dram->functionalAccess(pkt);
 }

 Port &
@@ -3099,7 +3088,7 @@
 DRAMCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(ctrl.getAddrRange());
+    ranges.push_back(ctrl.dram->getAddrRange());
     return ranges;
 }

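Note that moving DRAMStats and RankStats under the interface changes where
per-DRAM statistics land in stats.txt: the rank groups are renamed from
"dram_rank<N>" to "rank<N>" and now hang off the dram child. A small,
illustrative post-processing sketch (not part of the patch, path assumed):

    # Print only the interface-level stats, e.g. ...dram.avgBusLat and
    # ...dram.rank0.*; controller-level stats keep their old position.
    for line in open('m5out/stats.txt'):
        if '.dram.' in line:
            print(line.rstrip())
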
diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
index 4464f7a..1b6d8b5 100644
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -56,12 +56,15 @@
 #include "enums/AddrMap.hh"
 #include "enums/MemSched.hh"
 #include "enums/PageManage.hh"
+#include "mem/abstract_mem.hh"
 #include "mem/drampower.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/DRAMCtrl.hh"
 #include "sim/eventq.hh"

+class DRAMInterfaceParams;
+
 /**
  * A basic class to track the bank state, i.e. what row is
  * currently open (if any), when is the bank free to accept a new
@@ -243,7 +246,7 @@
  * The DRAMInterface includes a class for individual ranks
  * and per rank functions.
  */
-class DRAMInterface : public SimObject
+class DRAMInterface : public AbstractMemory
 {
   private:
     /**
@@ -340,7 +343,7 @@
     class Rank;
     struct RankStats : public Stats::Group
     {
-        RankStats(DRAMCtrl &ctrl, Rank &rank);
+        RankStats(DRAMInterface &dram, Rank &rank);

         void regStats() override;
         void resetStats() override;
@@ -406,13 +409,6 @@
      */
     class Rank : public EventManager
     {
-      protected:
-
-        /**
-         * A reference to the parent DRAMCtrl instance
-         */
-        DRAMCtrl& ctrl;
-
       private:

         /**
@@ -532,10 +528,10 @@
          */
         Tick lastBurstTick;

-        Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
+        Rank(const DRAMInterfaceParams* _p, int _rank,
              DRAMInterface& _dram);

-        const std::string name() const { return csprintf("dram_%d", rank); }
+        const std::string name() const { return csprintf("%d", rank); }

         /**
          * Kick off accounting for power and refresh states and
@@ -662,9 +658,9 @@
     };

     /**
-     * A reference to the parent DRAMCtrl instance
+     * A pointer to the parent DRAMCtrl instance
      */
-    DRAMCtrl& ctrl;
+    DRAMCtrl* ctrl;

     /**
      * Memory controller configuration initialized based on parameter
@@ -695,6 +691,7 @@
      * DRAM timing requirements
      */
     const Tick M5_CLASS_VAR_USED tCK;
+    const Tick tCS;
     const Tick tCL;
     const Tick tBURST;
     const Tick tBURST_MIN;
@@ -774,7 +771,7 @@
                    bool trace = true);

     struct DRAMStats : public Stats::Group {
-        DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram);
+        DRAMStats(DRAMInterface &dram);

         void regStats() override;

@@ -790,10 +787,12 @@

         // Latencies summed over all requests
         Stats::Scalar totQLat;
+        Stats::Scalar totBusLat;
         Stats::Scalar totMemAccLat;

         // Average latencies per request
         Stats::Formula avgQLat;
+        Stats::Formula avgBusLat;
         Stats::Formula avgMemAccLat;

         // Row hit count and rate
@@ -809,6 +808,11 @@
         // Average bandwidth
         Stats::Formula avgRdBW;
         Stats::Formula avgWrBW;
+        Stats::Formula peakBW;
+        // bus utilization
+        Stats::Formula busUtil;
+        Stats::Formula busUtilRead;
+        Stats::Formula busUtilWrite;
         Stats::Formula pageHitRate;
     };

@@ -820,11 +824,16 @@
     std::vector<Rank*> ranks;

   public:
+    /** Setting a pointer to the controller */
+    void setCtrl(DRAMCtrl* _ctrl)
+    {
+        ctrl = _ctrl;
+    };
+
     /**
      * Initialize the DRAM interface and verify parameters
-     * @param range is the address range for this interface
      */
-    void init(AddrRange range);
+    void init();

     /**
      * Iterate through dram ranks and instantiate per rank startup routine
@@ -853,6 +862,20 @@
     void suspend();

     /**
+     * Get an address in a dense range which starts from 0. The input
+     * address is the physical address of the request in an address
+     * space that contains other SimObjects apart from this
+     * controller.
+     *
+     * @param addr The intput address which should be in the addrRange
+     * @return An address in the continues range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr)
+    {
+        return range.getOffset(addr);
+    }
+
+    /**
      * @return number of bytes in a burst for this interface
      */
     uint32_t bytesPerBurst() { return burstSize; };
@@ -887,6 +910,13 @@
      */
     Tick minRdToWr() { return tRTW; };

+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() { return tCS; };
+
     /*
      * Function to calulate RAS cycle time for use within and
      * outside of this class
@@ -968,7 +998,7 @@
      *
      * @param rank Specifies rank associated with read burst
      */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);

     /**
      * Check the refresh state to determine if refresh needs
@@ -1004,8 +1034,7 @@
         virtual void process() { rank->resetStats(); };
     };

-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
 };

 /**
@@ -1170,20 +1199,6 @@
     void accessAndRespond(PacketPtr pkt, Tick static_latency);

     /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The intput address which should be in the addrRange
-     * @return An address in the continues range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
      * The memory schduler/arbiter - picks which request needs to
      * go next, based on the specified policy such as FCFS or FR-FCFS
      * and moves it to the head of the queue.
@@ -1265,6 +1280,11 @@
     std::unordered_multiset<Tick> burstTicks;

     /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
      * The following are basic design parameters of the memory
      * controller, and are initialized based on parameter values.
      * The rowsPerBank is determined based on the capacity, number of
@@ -1279,12 +1299,6 @@
     uint32_t readsThisTime;

     /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
      * Memory controller configuration initialized based on parameter
      * values.
      */
@@ -1338,10 +1352,6 @@
         // Average queue lengths
         Stats::Average avgRdQLen;
         Stats::Average avgWrQLen;
-        // Latencies summed over all requests
-        Stats::Scalar totBusLat;
-
-        // Average latencies per request
-        Stats::Formula avgBusLat;

         Stats::Scalar numRdRetry;
         Stats::Scalar numWrRetry;
@@ -1352,21 +1362,12 @@
         Stats::Histogram rdPerTurnAround;
         Stats::Histogram wrPerTurnAround;

-        Stats::Scalar bytesRead;
         Stats::Scalar bytesReadWrQ;
-        Stats::Scalar bytesWritten;
         Stats::Scalar bytesReadSys;
         Stats::Scalar bytesWrittenSys;
         // Average bandwidth
-        Stats::Formula avgRdBW;
-        Stats::Formula avgWrBW;
         Stats::Formula avgRdBWSys;
         Stats::Formula avgWrBWSys;
-        Stats::Formula peakBW;
-        // bus utilization
-        Stats::Formula busUtil;
-        Stats::Formula busUtilRead;
-        Stats::Formula busUtilWrite;

         Stats::Scalar totGap;
         Stats::Formula avgGap;
@@ -1405,11 +1406,6 @@
     /** The time when stats were last reset used to calculate average power */
     Tick lastStatsResetTick;

-    /**
-     * Create pointer to interfasce to the actual media
-     */
-    DRAMInterface* dram;
-
     DRAMCtrl(const DRAMCtrlParams* p);

     DrainState drain() override;
@@ -1458,13 +1454,6 @@
     };

     /**
-     * Determine the required delay for an access to a different rank
-     *
-     * @return required rank to rank delay
-     */
-    Tick rankDelay() { return tCS; };
-
-    /**
      * Check the current direction of the memory channel
      * @param next_state Check either the current or next bus state
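
Because the timing and geometry parameters move with the interface, scripts
that used to read them off the controller now go through its dram child. A
hedged sketch, assuming a script context with a single DRAMCtrl in
mem_ctrls (this mirrors the low_power_sweep.py changes in this patch):

    dram = system.mem_ctrls[0].dram

    # geometry: burst size in bytes, from interface parameters
    burst_size = int((dram.devices_per_rank.value *
                      dram.device_bus_width.value *
                      dram.burst_length.value) / 8)

    # timing: tBURST is in seconds; convert to ticks (ps)
    itt_min = dram.tBURST.value * 1000000000000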
diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc
index f506928..7a44aa1 100644
--- a/src/mem/drampower.cc
+++ b/src/mem/drampower.cc
@@ -40,13 +40,13 @@
 #include "base/intmath.hh"
 #include "sim/core.hh"

-DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) :
+DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) :
     powerlib(libDRAMPower(getMemSpec(p), include_io))
 {
 }

 Data::MemArchitectureSpec
-DRAMPower::getArchParams(const DRAMCtrlParams* p)
+DRAMPower::getArchParams(const DRAMInterfaceParams* p)
 {
     Data::MemArchitectureSpec archSpec;
     archSpec.burstLength = p->burst_length;
@@ -68,7 +68,7 @@
 }

 Data::MemTimingSpec
-DRAMPower::getTimingParams(const DRAMCtrlParams* p)
+DRAMPower::getTimingParams(const DRAMInterfaceParams* p)
 {
     // Set the values that are used for power calculations and ignore
     // the ones only used by the controller functionality in DRAMPower
@@ -100,7 +100,7 @@
 }

 Data::MemPowerSpec
-DRAMPower::getPowerParams(const DRAMCtrlParams* p)
+DRAMPower::getPowerParams(const DRAMInterfaceParams* p)
 {
     // All DRAMPower currents are in mA
     Data::MemPowerSpec powerSpec;
@@ -132,7 +132,7 @@
 }

 Data::MemorySpecification
-DRAMPower::getMemSpec(const DRAMCtrlParams* p)
+DRAMPower::getMemSpec(const DRAMInterfaceParams* p)
 {
     Data::MemorySpecification memSpec;
     memSpec.memArchSpec = getArchParams(p);
@@ -142,13 +142,13 @@
 }

 bool
-DRAMPower::hasTwoVDD(const DRAMCtrlParams* p)
+DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p)
 {
     return p->VDD2 == 0 ? false : true;
 }

 uint8_t
-DRAMPower::getDataRate(const DRAMCtrlParams* p)
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
 {
     uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
     uint8_t data_rate = p->burst_length / burst_cycles;
diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
index ed47476..da68a78 100644
--- a/src/mem/drampower.hh
+++ b/src/mem/drampower.hh
@@ -44,7 +44,7 @@
 #define MEM_DRAM_POWER_HH

 #include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"

 /**
  * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,43 +57,44 @@

     /**
      * Transform the architechture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+                                     const DRAMInterfaceParams* p);

     /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
      * the memSpec of DRAMPower
      */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);

     /**
      * Transforms the power and current parameters defined in
-     * DRAMCtrlParam to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);

     /**
      * Determine data rate, either one or two.
      */
-    static uint8_t getDataRate(const DRAMCtrlParams* p);
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);

     /**
      * Determine if DRAM has two voltage domains (or one)
      */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);

     /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
      */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);

   public:

     // Instance of DRAMPower Library
     libDRAMPower powerlib;

-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);
 };

diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -34,18 +34,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 from m5.params import *
-from m5.objects.AbstractMemory import AbstractMemory
+from m5.proxy import *
+from m5.objects.ClockedObject import ClockedObject
 from m5.objects.QoSTurnaround import *

 # QoS Queue Selection policy used to select packets among same-QoS queues
 class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"]

-class QoSMemCtrl(AbstractMemory):
+class QoSMemCtrl(ClockedObject):
     type = 'QoSMemCtrl'
     cxx_header = "mem/qos/mem_ctrl.hh"
     cxx_class = 'QoS::MemCtrl'
     abstract = True

+    system = Param.System(Parent.any, "System that the controller belongs to.")
+
     ##### QoS support parameters #####

     # Number of priorities in the system

diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py
index 572cad5..03a988a 100644
--- a/src/mem/qos/QoSMemSinkCtrl.py
+++ b/src/mem/qos/QoSMemSinkCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@

 from m5.params import *
 from m5.objects.QoSMemCtrl import *
+from QoSMemSinkInterface import *

 class QoSMemSinkCtrl(QoSMemCtrl):
     type = 'QoSMemSinkCtrl'
@@ -44,6 +45,10 @@
     cxx_class = "QoS::MemSinkCtrl"

     port = SlavePort("Slave ports")

+    intf = Param.QoSMemSinkInterface(QoSMemSinkInterface(), "Interface to "\
+                                     "memory")
+
     # the basic configuration of the controller architecture, note
     # that each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
@@ -59,5 +64,3 @@

     # response latency - time to issue a response once a request is serviced
     response_latency = Param.Latency("20ns", "Memory response latency")

diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py
new file mode 100644
index 0000000..fd8254f
--- /dev/null
+++ b/src/mem/qos/QoSMemSinkInterface.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Matteo Andreozzi
+#          Wendy Elsasser
+
+from AbstractMemory import AbstractMemory
+
+class QoSMemSinkInterface(AbstractMemory):
+    type = 'QoSMemSinkInterface'
+    cxx_header = "mem/qos/mem_sink.hh"
diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript
index f8601b6..1d90f9c 100644
--- a/src/mem/qos/SConscript
+++ b/src/mem/qos/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -37,6 +37,7 @@

 SimObject('QoSMemCtrl.py')
 SimObject('QoSMemSinkCtrl.py')
+SimObject('QoSMemSinkInterface.py')
 SimObject('QoSPolicy.py')
 SimObject('QoSTurnaround.py')

diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc
index 50e6035..190960b 100644
--- a/src/mem/qos/mem_ctrl.cc
+++ b/src/mem/qos/mem_ctrl.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited
+ * Copyright (c) 2017-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -42,7 +42,7 @@
 namespace QoS {

 MemCtrl::MemCtrl(const QoSMemCtrlParams * p)
-    : AbstractMemory(p),
+    : ClockedObject(p),
       policy(p->qos_policy),
       turnPolicy(p->qos_turnaround_policy),
       queuePolicy(QueuePolicy::create(p)),
@@ -51,7 +51,8 @@
       qosSyncroScheduler(p->qos_syncro_scheduler),
       totalReadQueueSize(0), totalWriteQueueSize(0),
       busState(READ), busStateNext(READ),
-      stats(*this)
+      stats(*this),
+      _system(p->system)
 {
     // Set the priority policy
     if (policy) {
@@ -77,12 +78,6 @@
 {}

 void
-MemCtrl::init()
-{
-    AbstractMemory::init();
-}
-
-void
 MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos,
                     Addr addr, uint64_t entries)
 {
diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh
index 0e29fcc..50ddc94 100644
--- a/src/mem/qos/mem_ctrl.hh
+++ b/src/mem/qos/mem_ctrl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited
+ * Copyright (c) 2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -36,9 +36,9 @@
  */

 #include "debug/QOS.hh"
-#include "mem/abstract_mem.hh"
-#include "mem/qos/q_policy.hh"
+#include "mem/mem_object.hh"
 #include "mem/qos/policy.hh"
+#include "mem/qos/q_policy.hh"
 #include "params/QoSMemCtrl.hh"
 #include "sim/system.hh"

@@ -49,6 +49,8 @@
 #ifndef MEM_QOS_MEM_CTRL_HH
 #define MEM_QOS_MEM_CTRL_HH

+class System;
+
 namespace QoS {

 /**
@@ -56,7 +58,7 @@
  * which support QoS - it provides access to a set of QoS
  * scheduling policies
 */
-class MemCtrl: public AbstractMemory
+class MemCtrl: public ClockedObject
 {
   public:
     /** Bus Direction */
@@ -151,6 +153,9 @@
         Stats::Scalar numStayWriteState;
     } stats;

+    /** Pointer to the System object */
+    System* _system;
+
     /**
      * Initializes dynamically counters and
      * statistics for a given Master
@@ -266,11 +271,6 @@
     virtual ~MemCtrl();

     /**
-     * Initializes this object
-     */
-    void init() override;
-
-    /**
      * Gets the current bus state
      * @return current bus state
@@ -346,6 +346,10 @@
      * @return total number of priority levels
      */
     uint8_t numPriorities() const { return _numPriorities; }
+
+    /** read the system pointer
+     * @return pointer to the system object */
+    System* system() const { return _system; }
 };

 template<typename Queues>
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..fb06b9d 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
 #include "mem_sink.hh"
+#include "params/QoSMemSinkInterface.hh"
 #include "sim/system.hh"

 namespace QoS {
@@ -50,12 +51,15 @@
     memoryPacketSize(p->memory_packet_size),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size), port(name() + ".port", *this),
+    intf(p->intf),
     retryRdReq(false), retryWrReq(false), nextRequest(0), nextReqEvent(this)
 {
     // Resize read and write queue to allocate space
     // for configured QoS priorities
     readQueue.resize(numPriorities());
     writeQueue.resize(numPriorities());
+
+    intf->setMemCtrl(this);
 }

 MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
              "%s Should not see packets where cache is responding\n",
              func);

-    access(pkt);
+    intf->access(pkt);
     return responseLatency;
 }

@@ -101,7 +105,7 @@
 {
     pkt->pushLabel(name());

-    functionalAccess(pkt);
+    intf->functionalAccess(pkt);

     pkt->popLabel();
 }
@@ -279,7 +283,7 @@

     // Do the actual memory access which also turns the packet
     // into a response
-    access(pkt);
+    intf->access(pkt);

     // Log the response
     logResponse(pkt->isRead()? READ : WRITE,
@@ -351,7 +355,7 @@
 MemSinkCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.intf->getAddrRange());
     return ranges;
 }

@@ -390,3 +394,19 @@
     return new QoS::MemSinkCtrl(this);
 }

+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+void
+QoSMemSinkInterface::init()
+{
+    AbstractMemory::init();
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
index 9a51269..3b10abd 100644
--- a/src/mem/qos/mem_sink.hh
+++ b/src/mem/qos/mem_sink.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -41,10 +41,14 @@
 #ifndef MEM_QOS_MEM_SINK_HH
 #define MEM_QOS_MEM_SINK_HH

+#include "mem/abstract_mem.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
 namespace QoS {

 /**
@@ -163,6 +167,11 @@
     /** Memory slave port */
     MemoryPort port;

+    /**
+     * Create pointer to interface of actual media
+     */
+    QoSMemSinkInterface* const intf;
+
     /** Read request pending */
     bool retryRdReq;

@@ -244,4 +253,23 @@

 } // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Initialize the memory interface */
+    void init();
+
+    /** Setting a pointer to the interface */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl)
+    {
+        ctrl = _ctrl;
+    };
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
 #endif /* MEM_QOS_MEM_SINK_HH */
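
The QoS memory sink follows the same controller/interface pairing. A
minimal usage sketch (illustrative only; the priority count and range
value are placeholders, not defaults from this patch):

    from m5.objects import *

    sink = QoSMemSinkCtrl(qos_priorities = 4)
    # the backing AbstractMemory is attached via the new 'intf' parameter
    sink.intf = QoSMemSinkInterface(range = AddrRange('512MB'))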
diff --git a/tests/configs/base_config.py b/tests/configs/base_config.py
index 0f79938..e2d3851 100644
--- a/tests/configs/base_config.py
+++ b/tests/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -221,7 +221,12 @@
         super(BaseSESystem, self).init_system(system)

     def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                         membus = SystemXBar(),
                         mem_mode = self.mem_mode,
                         multi_thread = (self.num_threads > 1))
@@ -275,6 +280,16 @@
         # the physmem name to avoid bumping all the reference stats
-        system.physmem = [self.mem_class(range = r)
-                          for r in system.mem_ranges]
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrls = []
+            for r in system.mem_ranges:
+                mem_ctrl = DRAMCtrl()
+                mem_ctrl.dram = self.mem_class(range = r)
+                mem_ctrls.append(mem_ctrl)
+            system.physmem = mem_ctrls
+        else:
+            system.physmem = [self.mem_class(range = r)
+                              for r in system.mem_ranges]
         for i in range(len(system.physmem)):
             system.physmem[i].port = system.membus.master
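
The wrap-or-passthrough pattern above generalizes. A helper along these
lines (a sketch, not part of this patch) keeps a script working whether
mem_class is a DRAMInterface subclass or a classic single-object memory
such as SimpleMemory:

    def make_mem(mem_class, rng):
        # DRAM interfaces need a DRAMCtrl in front of them; anything
        # else already acts as its own controller
        if issubclass(mem_class, m5.objects.DRAMInterface):
            ctrl = m5.objects.DRAMCtrl()
            ctrl.dram = mem_class(range = rng)
            return ctrl
        return mem_class(range = rng)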
    

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 1
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-MessageType: newchange

Wendy Elsasser has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/28968 ) Change subject: mem: Make DRAMCtrl a ClockedObject ...................................................................... mem: Make DRAMCtrl a ClockedObject Made DRAMCtrl a ClockedObject, with DRAMInterface defined as an AbstractMemory. The address ranges are now defined per interface. Currently the model only includes a DRAMInterface but this can be expanded for other media types. The controller object includes a parameter to the interface, which is setup when gem5 is configured. Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8 --- M configs/common/MemConfig.py M configs/dram/low_power_sweep.py M configs/dram/sweep.py M configs/learning_gem5/part1/simple.py M configs/learning_gem5/part1/two_level.py M configs/learning_gem5/part2/simple_cache.py M configs/learning_gem5/part2/simple_memobj.py M configs/learning_gem5/part3/simple_ruby.py M src/mem/DRAMCtrl.py A src/mem/DRAMInterface.py M src/mem/SConscript M src/mem/dram_ctrl.cc M src/mem/dram_ctrl.hh M src/mem/drampower.cc M src/mem/drampower.hh M src/mem/qos/QoSMemCtrl.py M src/mem/qos/QoSMemSinkCtrl.py A src/mem/qos/QoSMemSinkInterface.py M src/mem/qos/SConscript M src/mem/qos/mem_ctrl.cc M src/mem/qos/mem_ctrl.hh M src/mem/qos/mem_sink.cc M src/mem/qos/mem_sink.hh M tests/configs/base_config.py 24 files changed, 1,934 insertions(+), 1,760 deletions(-) diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py index 9443520..ab6b933 100644 --- a/configs/common/MemConfig.py +++ b/configs/common/MemConfig.py @@ -40,7 +40,7 @@ from common import ObjectList from common import HMC -def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size): +def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size): """ Helper function for creating a single memoy controller from the given options. 
-    return ctrl
+    return interface

 def config_mem(options, system):
     """
@@ -144,10 +144,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")

-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []

-    if opt_elastic_trace_en and not issubclass(cls, m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf, m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
                 "simple-mem.")
@@ -158,36 +158,56 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)

     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
     for r in system.mem_ranges:
         for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
                                        intlv_size)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks

             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown

             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")

+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Override buffer sizes with interface specific values
+            mem_ctrl.write_buffer_size = dram_intf.write_buffer_size
+            mem_ctrl.read_buffer_size = dram_intf.read_buffer_size
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)

-    subsystem.mem_ctrls = mem_ctrls
-
-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
         if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
             # Set memory device size. There is an independent controller
             # for each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
         else:
-            subsystem.mem_ctrls[i].port = xbar.master
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
+    subsystem.mem_ctrls = mem_ctrls
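The learning_gem5 hunks further down show the same change in its smallest form: a memory is now built from two objects, a DRAMCtrl that owns the port, queues and scheduler, and a DRAMInterface subclass that owns the address range and the device timing. Pulled out of the diff, a complete minimal setup looks roughly like this (a sketch only, using the classes as renamed by this patch plus the usual System boilerplate around them):

import m5
from m5.objects import *

system = System()
system.clk_domain = SrcClockDomain(clock = '1GHz',
                                   voltage_domain = VoltageDomain())
system.mem_mode = 'timing'
system.mem_ranges = [AddrRange('512MB')]
system.membus = SystemXBar()

# the controller is a ClockedObject holding the port, queues and scheduler
system.mem_ctrl = DRAMCtrl()
# the interface is an AbstractMemory holding the range and DRAM timing
system.mem_ctrl.dram = DDR3_1600_8x8()
system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master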
-cfg_file_name = "lowp_sweep.cfg" -cfg_file_path = os.path.dirname(__file__) + "/" +cfg_file_name -cfg_file = open(cfg_file_path, 'w') - # Get the number of banks -nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value) +nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value) # determine the burst size in bytes -burst_size = int((system.mem_ctrls[0].devices_per_rank.value * - system.mem_ctrls[0].device_bus_width.value * - system.mem_ctrls[0].burst_length.value) / 8) +burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value * + system.mem_ctrls[0].dram.device_bus_width.value * + system.mem_ctrls[0].dram.burst_length.value) / 8) # next, get the page size in bytes (the rowbuffer size is already in bytes) -page_size = system.mem_ctrls[0].devices_per_rank.value * \ - system.mem_ctrls[0].device_rowbuffer_size.value +page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \ + system.mem_ctrls[0].dram.device_rowbuffer_size.value # Inter-request delay should be such that we can hit as many transitions # to/from low power states as possible to. We provide a min and max itt to the # traffic generator and it randomises in the range. The parameter is in # seconds and we need it in ticks (ps). -itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000 +itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000 #The itt value when set to (tRAS + tRP + tCK) covers the case where # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the @@ -155,9 +160,9 @@ # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK. # As we use this delay as a unit and create multiples of it as bigger delays # for the sweep, this parameter works for reads, writes and mix of them. -pd_entry_time = (system.mem_ctrls[0].tRAS.value + - system.mem_ctrls[0].tRP.value + - system.mem_ctrls[0].tCK.value) * 1000000000000 +pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value + + system.mem_ctrls[0].dram.tRP.value + + system.mem_ctrls[0].dram.tCK.value) * 1000000000000 # We sweep itt max using the multipliers specified by the user. 
itt_max_str = args.itt_list.strip().split() @@ -180,42 +185,11 @@ # banks bank_util_values = [1, int(nbr_banks/2), nbr_banks] -# Next we create the config file, but first a comment -cfg_file.write("""# STATE state# period mode=DRAM -# read_percent start_addr end_addr req_size min_itt max_itt data_limit -# stride_size page_size #banks #banks_util addr_map #ranks\n""") - -addr_map = m5.objects.AddrMap.map[args.addr_map] - -nxt_state = 0 -for itt_max in itt_max_values: - for bank in bank_util_values: - for stride_size in stride_values: - cfg_file.write("STATE %d %d %s %d 0 %d %d " - "%d %d %d %d %d %d %d %d %d\n" % - (nxt_state, period, "DRAM", args.rd_perc, max_addr, - burst_size, itt_min, itt_max, 0, stride_size, - page_size, nbr_banks, bank, addr_map, - args.mem_ranks)) - nxt_state = nxt_state + 1 - # State for idle period idle_period = args.idle_end -cfg_file.write("STATE %d %d IDLE\n" % (nxt_state, idle_period)) - -# Init state is state 0 -cfg_file.write("INIT 0\n") - -# Go through the states one by one -for state in range(1, nxt_state + 1): - cfg_file.write("TRANSITION %d %d 1\n" % (state - 1, state)) - -# Transition from last state to itself to not break the probability math -cfg_file.write("TRANSITION %d %d 1\n" % (nxt_state, nxt_state)) -cfg_file.close() # create a traffic generator, and point it to the file we just created -system.tgen = TrafficGen(config_file = cfg_file_path) +system.tgen = PyTrafficGen() # add a communication monitor system.monitor = CommMonitor() @@ -230,14 +204,34 @@ # every period, dump and reset all stats periodicStatDump(period) +# run Forrest, run! root = Root(full_system = False, system = system) root.system.mem_mode = 'timing' m5.instantiate() +def trace(): + addr_map = ObjectList.dram_addr_map_list.get(args.addr_map) + generator = dram_generators["DRAM"](system.tgen) + for itt_max in itt_max_values: + for bank in bank_util_values: + for stride_size in stride_values: + num_seq_pkts = int(math.ceil(float(stride_size) / burst_size)) + yield generator(period, + 0, max_addr, burst_size, int(itt_min), + int(itt_max), args.rd_perc, 0, + num_seq_pkts, page_size, nbr_banks, bank, + addr_map, args.mem_ranks) + + yield system.tgen.createIdle(idle_period) + yield system.tgen.createExit(0) + +system.tgen.start(trace()) + # Simulate for exactly as long as it takes to go through all the states # This is why sim exists. 
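With the .cfg plumbing gone, PyTrafficGen is driven directly from Python: each value yielded by a generator describes one traffic state, and the generator is handed to start(). Reduced to a single state (argument order exactly as in the trace() generator above; the variable names are the script's own), the idiom is roughly:

def states():
    # one DRAM state: run for `period` ticks with the given access pattern
    yield system.tgen.createDram(period, 0, max_addr, burst_size,
                                 int(itt_min), int(itt_max), args.rd_perc,
                                 0, num_seq_pkts, page_size, nbr_banks,
                                 bank, addr_map, args.mem_ranks)
    yield system.tgen.createIdle(idle_period)   # then sit idle
    yield system.tgen.createExit(0)             # and finally exit

system.tgen.start(states())
m5.simulate()   # runs until the exit state is reached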
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index d3c86c3..6a49f44 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@
 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map

 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated

 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value

 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt = getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-              system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt = getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+              system.mem_ctrls[0].dram.tBURST.value) * 1000000000000

 # assume we start at 0
 max_addr = mem_range.end
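Both sweep scripts now pull the device geometry through the .dram child rather than off the controller. As a concrete check of that arithmetic, these are the numbers it produces for the default DDR3_1600_8x8 interface (plain Python, values copied from the class definition in this patch):

devices_per_rank = 8        # 8x8 configuration
device_bus_width = 8        # bits per device
burst_length = 8            # DDR3 is a BL8 device

# bits moved by one burst across the whole rank, converted to bytes
burst_size = (devices_per_rank * device_bus_width * burst_length) // 8
assert burst_size == 64     # bytes per burst on the x64 channel

# row buffer per rank: per-device page times the number of devices
page_size = devices_per_rank * 1024   # '1kB' page per device
assert page_size == 8192              # 8kB per rank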
diff --git a/configs/learning_gem5/part1/simple.py b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part1/two_level.py b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
 system.system_port = system.membus.slave

 # Create a DDR3 memory controller
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Create a process for a simple "Hello World" application
diff --git a/configs/learning_gem5/part2/simple_cache.py b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part2/simple_memobj.py b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
 system.cpu.interrupts[0].int_slave = system.membus.master

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
 system.mem_ctrl.port = system.membus.master

 # Connect the system up to the membus
diff --git a/configs/learning_gem5/part3/simple_ruby.py b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
 system.cpu = [TimingSimpleCPU() for i in range(2)]

 # Create a DDR3 memory controller and connect it to the membus
-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]

 # create the interrupt controller for the CPU and connect to the membus
 for cpu in system.cpu:
diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..dff5000 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@
 from m5.params import *
 from m5.proxy import *
-from m5.objects.AbstractMemory import *
 from m5.objects.QoSMemCtrl import *

 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']

-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
-
-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']
-
 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM
@@ -72,8 +58,11 @@
     # bus in front of the controller for multiple ports
     port = SlavePort("Slave port")

-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface(Parent.any, "DRAM interface")
+
+    # Set default buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
     # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
     # the cacheline size or request/packet size
     write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
@@ -93,15 +82,6 @@
     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")

-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")

     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +89,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")
-
-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst lenght (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-
-    # DRAMPower provides in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
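Everything deleted from here to the end of DRAMCtrl.py is the bulk of the move: the remaining timing and current parameters plus all the per-technology classes (DDR3, DDR4, LPDDR2/3/5, WideIO, GDDR5, HBM and HMC) reappear in the new DRAMInterface.py. For scripts that set these knobs by hand, the porting rule implied by the hunks above is that scheduling and queueing stay on the controller while anything device specific moves behind the dram child; a hedged before/after sketch (attribute names as in this patch):

ctrl = system.mem_ctrls[0]

# stays on the controller (DRAMCtrl)
ctrl.mem_sched_policy = 'frfcfs'
ctrl.write_buffer_size = 128

# moves to the interface (DRAMInterface)
ctrl.dram.addr_mapping = 'RoRaBaCoCh'
ctrl.dram.page_policy = 'open_adaptive'
ctrl.dram.ranks_per_channel = 2
ctrl.dram.enable_dram_powerdown = True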
- - # timing behaviour and constraints - all in nanoseconds - - # the base clock period of the DRAM - tCK = Param.Latency("Clock period") - - # the amount of time in nanoseconds from issuing an activate command - # to the data being available in the row buffer for a read/write - tRCD = Param.Latency("RAS to CAS delay") - - # the time from issuing a read/write command to seeing the actual data - tCL = Param.Latency("CAS latency") - - # minimum time between a precharge and subsequent activate - tRP = Param.Latency("Row precharge time") - - # minimum time between an activate and a precharge to the same row - tRAS = Param.Latency("ACT to PRE delay") - - # minimum time between a write data transfer and a precharge - tWR = Param.Latency("Write recovery time") - - # minimum time between a read and precharge command - tRTP = Param.Latency("Read to precharge") - - # time to complete a burst transfer, typically the burst length - # divided by two due to the DDR bus, but by making it a parameter - # it is easier to also evaluate SDR memories like WideIO. - # This parameter has to account for burst length. - # Read/Write requests with data size larger than one full burst are broken - # down into multiple requests in the controller - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = Param.Latency("Burst duration " - "(typically burst length / 2 cycles)") - - # tBURST_MAX is the column array cycle delay required before next access, - # which could be greater than tBURST when the memory access time is greater - # than tBURST - tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay") - - # tBURST_MIN is the minimum delay between bursts, which could be less than - # tBURST when interleaving is supported - tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts") - - # CAS-to-CAS delay for bursts to the same bank group - # only utilized with bank group architectures; set to 0 for default case - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay") - - # Write-to-Write delay for bursts to the same bank group - # only utilized with bank group architectures; set to 0 for default case - # This will be used to enable different same bank group delays - # for writes versus reads - tCCD_L_WR = Param.Latency(Self.tCCD_L, - "Same bank group Write to Write delay") - - # time taken to complete one refresh cycle (N rows in all banks) - tRFC = Param.Latency("Refresh cycle time") - - # refresh command interval, how often a "ref" command needs - # to be sent. 
It is 7.8 us for a 64ms refresh requirement - tREFI = Param.Latency("Refresh command interval") - - # write-to-read, same rank turnaround penalty - tWTR = Param.Latency("Write to read, same rank switching time") - - # write-to-read, same rank turnaround penalty for same bank group - tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching " - "time, same bank group") - - # read-to-write, same rank turnaround penalty - tRTW = Param.Latency("Read to write, same rank switching time") - - # rank-to-rank bus delay penalty - # this does not correlate to a memory timing parameter and encompasses: - # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD - # different rank bus delay - tCS = Param.Latency("Rank to rank switching time") - - # minimum precharge to precharge delay time - tPPD = Param.Latency("0ns", "PRE to PRE delay") - - # maximum delay between two-cycle ACT command phases - tAAD = Param.Latency(Self.tCK, - "Maximum delay between two-cycle ACT commands") - - two_cycle_activate = Param.Bool(False, - "Two cycles required to send activate") - - # minimum row activate to row activate delay time - tRRD = Param.Latency("ACT to ACT delay") - - # only utilized with bank group architectures; set to 0 for default case - tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay") - - # time window in which a maximum number of activates are allowed - # to take place, set to 0 to disable - tXAW = Param.Latency("X activation window") - activation_limit = Param.Unsigned("Max number of activates in window") - - # time to exit power-down mode - # Exit power-down to next valid command delay - tXP = Param.Latency("0ns", "Power-up Delay") - - # Exit Powerdown to commands requiring a locked DLL - tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL") - - # time to exit self-refresh mode - tXS = Param.Latency("0ns", "Self-refresh exit latency") - - # time to exit self-refresh mode with locked DLL - tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL") - - # number of data beats per clock. with DDR, default is 2, one per edge - beats_per_clock = Param.Unsigned(2, "Data beats per clock") - - data_clock_sync = Param.Bool(False, "Synchronization commands required") - - # Currently rolled into other params - ###################################################################### - - # tRC - assumed to be tRAS + tRP - - # Power Behaviour and Constraints - # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are - # defined as VDD and VDD2. Each current is defined for each voltage domain - # separately. For example, current IDD0 is active-precharge current for - # voltage domain VDD and current IDD02 is active-precharge current for - # voltage domain VDD2. - # By default all currents are set to 0mA. Users who are only interested in - # the performance of DRAMs can leave them at 0. 
- - # Operating 1 Bank Active-Precharge current - IDD0 = Param.Current("0mA", "Active precharge current") - - # Operating 1 Bank Active-Precharge current multiple voltage Range - IDD02 = Param.Current("0mA", "Active precharge current VDD2") - - # Precharge Power-down Current: Slow exit - IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow") - - # Precharge Power-down Current: Slow exit multiple voltage Range - IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2") - - # Precharge Power-down Current: Fast exit - IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast") - - # Precharge Power-down Current: Fast exit multiple voltage Range - IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2") - - # Precharge Standby current - IDD2N = Param.Current("0mA", "Precharge Standby current") - - # Precharge Standby current multiple voltage range - IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2") - - # Active Power-down current: slow exit - IDD3P0 = Param.Current("0mA", "Active Powerdown slow") - - # Active Power-down current: slow exit multiple voltage range - IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2") - - # Active Power-down current : fast exit - IDD3P1 = Param.Current("0mA", "Active Powerdown fast") - - # Active Power-down current : fast exit multiple voltage range - IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2") - - # Active Standby current - IDD3N = Param.Current("0mA", "Active Standby current") - - # Active Standby current multiple voltage range - IDD3N2 = Param.Current("0mA", "Active Standby current VDD2") - - # Burst Read Operating Current - IDD4R = Param.Current("0mA", "READ current") - - # Burst Read Operating Current multiple voltage range - IDD4R2 = Param.Current("0mA", "READ current VDD2") - - # Burst Write Operating Current - IDD4W = Param.Current("0mA", "WRITE current") - - # Burst Write Operating Current multiple voltage range - IDD4W2 = Param.Current("0mA", "WRITE current VDD2") - - # Refresh Current - IDD5 = Param.Current("0mA", "Refresh current") - - # Refresh Current multiple voltage range - IDD52 = Param.Current("0mA", "Refresh current VDD2") - - # Self-Refresh Current - IDD6 = Param.Current("0mA", "Self-refresh Current") - - # Self-Refresh Current multiple voltage range - IDD62 = Param.Current("0mA", "Self-refresh Current VDD2") - - # Main voltage range of the DRAM - VDD = Param.Voltage("0V", "Main Voltage Range") - - # Second voltage range defined by some DRAMs - VDD2 = Param.Voltage("0V", "2nd Voltage Range") - -# A single DDR3-1600 x64 channel (one command and address bus), with -# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in -# an 8x8 configuration. 
-class DDR3_1600_8x8(DRAMCtrl): - # size of device in bytes - device_size = '512MB' - - # 8x8 configuration, 8 devices each with an 8-bit interface - device_bus_width = 8 - - # DDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) - device_rowbuffer_size = '1kB' - - # 8x8 configuration, so 8 devices - devices_per_rank = 8 - - # Use two ranks - ranks_per_channel = 2 - - # DDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz - tBURST = '5ns' - - # DDR3-1600 11-11-11 - tRCD = '13.75ns' - tCL = '13.75ns' - tRP = '13.75ns' - tRAS = '35ns' - tRRD = '6ns' - tXAW = '30ns' - activation_limit = 4 - tRFC = '260ns' - - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns - tWTR = '7.5ns' - - # Greater of 4 CK or 7.5 ns - tRTP = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # <=85C, half for >85C - tREFI = '7.8us' - - # active powerdown and precharge powerdown exit time - tXP = '6ns' - - # self refresh exit time - tXS = '270ns' - - # Current values from datasheet Die Rev E,J - IDD0 = '55mA' - IDD2N = '32mA' - IDD3N = '38mA' - IDD4W = '125mA' - IDD4R = '157mA' - IDD5 = '235mA' - IDD3P1 = '38mA' - IDD2P1 = '32mA' - IDD6 = '20mA' - VDD = '1.5V' - -# A single HMC-2500 x32 model based on: -# [1] DRAMSpec: a high-level DRAM bank modelling tool -# developed at the University of Kaiserslautern. This high level tool -# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to -# estimate the DRAM bank latency and power numbers. -# [2] High performance AXI-4.0 based interconnect for extensible smart memory -# cubes (E. Azarkhish et. al) -# Assumed for the HMC model is a 30 nm technology node. -# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4 -# layers). -# Each layer has 16 vaults and each vault consists of 2 banks per layer. -# In order to be able to use the same controller used for 2D DRAM generations -# for HMC, the following analogy is done: -# Channel (DDR) => Vault (HMC) -# device_size (DDR) => size of a single layer in a vault -# ranks per channel (DDR) => number of layers -# banks per rank (DDR) => banks per layer -# devices per rank (DDR) => devices per layer ( 1 for HMC). -# The parameters for which no input is available are inherited from the DDR3 -# configuration. -# This configuration includes the latencies from the DRAM to the logic layer -# of the HMC -class HMC_2500_1x32(DDR3_1600_8x8): - # size of device - # two banks per device with each bank 4MB [2] - device_size = '8MB' - - # 1x32 configuration, 1 device with 32 TSVs [2] - device_bus_width = 32 - - # HMC is a BL8 device [2] - burst_length = 8 - - # Each device has a page (row buffer) size of 256 bytes [2] - device_rowbuffer_size = '256B' - - # 1x32 configuration, so 1 device [2] - devices_per_rank = 1 - - # 4 layers so 4 ranks [2] - ranks_per_channel = 4 - - # HMC has 2 banks per layer [2] - # Each layer represents a rank. With 4 layers and 8 banks in total, each - # layer has 2 banks; thus 2 banks per rank. 
- banks_per_rank = 2 - - # 1250 MHz [2] - tCK = '0.8ns' - - # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz - tBURST = '3.2ns' - - # Values using DRAMSpec HMC model [1] - tRCD = '10.2ns' - tCL = '9.9ns' - tRP = '7.7ns' - tRAS = '21.6ns' - - # tRRD depends on the power supply network for each vendor. - # We assume a tRRD of a double bank approach to be equal to 4 clock - # cycles (Assumption) - tRRD = '3.2ns' - - # activation limit is set to 0 since there are only 2 banks per vault - # layer. - activation_limit = 0 - - # Values using DRAMSpec HMC model [1] - tRFC = '59ns' - tWR = '8ns' - tRTP = '4.9ns' - - # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz = - # 0.8 ns (Assumption) - tCS = '0.8ns' - - # Value using DRAMSpec HMC model [1] - tREFI = '3.9us' - - # The default page policy in the vault controllers is simple closed page - # [2] nevertheless 'close' policy opens and closes the row multiple times - # for bursts largers than 32Bytes. For this reason we use 'close_adaptive' - page_policy = 'close_adaptive' - - # RoCoRaBaCh resembles the default address mapping in HMC - addr_mapping = 'RoCoRaBaCh' - min_writes_per_switch = 8 - - # These parameters do not directly correlate with buffer_size in real - # hardware. Nevertheless, their value has been tuned to achieve a - # bandwidth similar to the cycle-accurate model in [2] - write_buffer_size = 32 - read_buffer_size = 32 - - # The static latency of the vault controllers is estimated to be smaller - # than a full DRAM channel controller - static_backend_latency='4ns' - static_frontend_latency='4ns' - -# A single DDR3-2133 x64 channel refining a selected subset of the -# options for the DDR-1600 configuration, based on the same DDR3-1600 -# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept -# consistent across the two configurations. -class DDR3_2133_8x8(DDR3_1600_8x8): - # 1066 MHz - tCK = '0.938ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz - tBURST = '3.752ns' - - # DDR3-2133 14-14-14 - tRCD = '13.09ns' - tCL = '13.09ns' - tRP = '13.09ns' - tRAS = '33ns' - tRRD = '5ns' - tXAW = '25ns' - - # Current values from datasheet - IDD0 = '70mA' - IDD2N = '37mA' - IDD3N = '44mA' - IDD4W = '157mA' - IDD4R = '191mA' - IDD5 = '250mA' - IDD3P1 = '44mA' - IDD2P1 = '43mA' - IDD6 ='20mA' - VDD = '1.5V' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4) -# in an 16x4 configuration. -# Total channel capacity is 32GB -# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel -class DDR4_2400_16x4(DRAMCtrl): - # size of device - device_size = '1GB' - - # 16x4 configuration, 16 devices each with a 4-bit interface - device_bus_width = 4 - - # DDR4 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 512 byte (1K columns x4) - device_rowbuffer_size = '512B' - - # 16x4 configuration, so 16 devices - devices_per_rank = 16 - - # Match our DDR3 configurations which is dual rank - ranks_per_channel = 2 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 4 for x4 case - bank_groups_per_rank = 4 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). 
Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 16 - - # override the default buffer sizes and go for something larger to - # accommodate the larger bank count - write_buffer_size = 128 - read_buffer_size = 64 - - # 1200 MHz - tCK = '0.833ns' - - # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '3.332ns' - - # @2400 data rate, tCCD_L is 6 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '5ns'; - - # DDR4-2400 17-17-17 - tRCD = '14.16ns' - tCL = '14.16ns' - tRP = '14.16ns' - tRAS = '32ns' - - # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns) - tRRD = '3.332ns' - - # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns) - tRRD_L = '4.9ns'; - - # tFAW for 512B page is MAX(16 CK, 13ns) - tXAW = '13.328ns' - activation_limit = 4 - # tRFC is 350ns - tRFC = '350ns' - - tWR = '15ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Greater of 4 CK or 7.5 ns - tRTP = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns - tRTW = '1.666ns' - - # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns - tCS = '1.666ns' - - # <=85C, half for >85C - tREFI = '7.8us' - - # active powerdown and precharge powerdown exit time - tXP = '6ns' - - # self refresh exit time - # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is: - # tRFC + 10ns = 340ns - tXS = '340ns' - - # Current values from datasheet - IDD0 = '43mA' - IDD02 = '3mA' - IDD2N = '34mA' - IDD3N = '38mA' - IDD3N2 = '3mA' - IDD4W = '103mA' - IDD4R = '110mA' - IDD5 = '250mA' - IDD3P1 = '32mA' - IDD2P1 = '25mA' - IDD6 = '30mA' - VDD = '1.2V' - VDD2 = '2.5V' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8) -# in an 8x8 configuration. -# Total channel capacity is 16GB -# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel -class DDR4_2400_8x8(DDR4_2400_16x4): - # 8x8 configuration, 8 devices each with an 8-bit interface - device_bus_width = 8 - - # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) - device_rowbuffer_size = '1kB' - - # 8x8 configuration, so 8 devices - devices_per_rank = 8 - - # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns) - tRRD_L = '4.9ns'; - - tXAW = '21ns' - - # Current values from datasheet - IDD0 = '48mA' - IDD3N = '43mA' - IDD4W = '123mA' - IDD4R = '135mA' - IDD3P1 = '37mA' - -# A single DDR4-2400 x64 channel (one command and address bus), with -# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16) -# in an 4x16 configuration. 
-# Total channel capacity is 4GB -# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel -class DDR4_2400_4x16(DDR4_2400_16x4): - # 4x16 configuration, 4 devices each with an 16-bit interface - device_bus_width = 16 - - # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) - device_rowbuffer_size = '2kB' - - # 4x16 configuration, so 4 devices - devices_per_rank = 4 - - # Single rank for x16 - ranks_per_channel = 1 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 2 for x16 case - bank_groups_per_rank = 2 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 8 - - # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) - tRRD = '5.3ns' - - # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) - tRRD_L = '6.4ns'; - - tXAW = '30ns' - - # Current values from datasheet - IDD0 = '80mA' - IDD02 = '4mA' - IDD2N = '34mA' - IDD3N = '47mA' - IDD4W = '228mA' - IDD4R = '243mA' - IDD5 = '280mA' - IDD3P1 = '41mA' - -# A single LPDDR2-S4 x32 interface (one command/address bus), with -# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) -# in a 1x32 configuration. -class LPDDR2_S4_1066_1x32(DRAMCtrl): - # No DLL in LPDDR2 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR2_S4 is a BL4 and BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1KB - # (this depends on the memory density) - device_rowbuffer_size = '1kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR2-S4 has 8 banks in all configurations - banks_per_rank = 8 - - # 533 MHz - tCK = '1.876ns' - - # Fixed at 15 ns - tRCD = '15ns' - - # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time - tCL = '15ns' - - # Pre-charge one bank 15 ns (all banks 18 ns) - tRP = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - tRTP = '7.5ns' - - # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. - # Note this is a BL8 DDR device. - # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '7.5ns' - - # LPDDR2-S4, 4 Gbit - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns - tRTW = '3.75ns' - - # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns - tCS = '3.75ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of density, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '15mA' - IDD02 = '70mA' - IDD2N = '2mA' - IDD2N2 = '30mA' - IDD3N = '2.5mA' - IDD3N2 = '30mA' - IDD4W = '10mA' - IDD4W2 = '190mA' - IDD4R = '3mA' - IDD4R2 = '220mA' - IDD5 = '40mA' - IDD52 = '150mA' - IDD3P1 = '1.2mA' - IDD3P12 = '8mA' - IDD2P1 = '0.6mA' - IDD2P12 = '0.8mA' - IDD6 = '1mA' - IDD62 = '3.2mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single WideIO x128 interface (one command and address bus), with -# default timings based on an estimated WIO-200 8 Gbit part. 
-class WideIO_200_1x128(DRAMCtrl): - # No DLL for WideIO - dll = False - - # size of device - device_size = '1024MB' - - # 1x128 configuration, 1 device with a 128-bit interface - device_bus_width = 128 - - # This is a BL4 device - burst_length = 4 - - # Each device has a page (row buffer) size of 4KB - # (this depends on the memory density) - device_rowbuffer_size = '4kB' - - # 1x128 configuration, so 1 device - devices_per_rank = 1 - - # Use one rank for a one-high die stack - ranks_per_channel = 1 - - # WideIO has 4 banks in all configurations - banks_per_rank = 4 - - # 200 MHz - tCK = '5ns' - - # WIO-200 - tRCD = '18ns' - tCL = '18ns' - tRP = '18ns' - tRAS = '42ns' - tWR = '15ns' - # Read to precharge is same as the burst - tRTP = '20ns' - - # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. - # Note this is a BL4 SDR device. - tBURST = '20ns' - - # WIO 8 Gb - tRFC = '210ns' - - # WIO 8 Gb, <=85C, half for >85C - tREFI = '3.9us' - - # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns - tWTR = '15ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns - tRTW = '10ns' - - # Default different rank bus delay to 2 CK, @200 MHz = 10 ns - tCS = '10ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Two instead of four activation window - tXAW = '50ns' - activation_limit = 2 - - # The WideIO specification does not provide current information - -# A single LPDDR3 x32 interface (one command/address bus), with -# default timings based on a LPDDR3-1600 4 Gbit part (Micron -# EDF8132A1MC) in a 1x32 configuration. -class LPDDR3_1600_1x32(DRAMCtrl): - # No DLL for LPDDR3 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 4KB - device_rowbuffer_size = '4kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Technically the datasheet is a dual-rank package, but for - # comparison with the LPDDR2 config we stick to a single rank - ranks_per_channel = 1 - - # LPDDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - tRCD = '18ns' - - # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time - tCL = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns - tRTP = '7.5ns' - - # Pre-charge one bank 18 ns (all banks 21 ns) - tRP = '18ns' - - # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. - # Note this is a BL8 DDR device. 
- # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '5ns' - - # LPDDR3, 4 Gb - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of size, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '8mA' - IDD02 = '60mA' - IDD2N = '0.8mA' - IDD2N2 = '26mA' - IDD3N = '2mA' - IDD3N2 = '34mA' - IDD4W = '2mA' - IDD4W2 = '190mA' - IDD4R = '2mA' - IDD4R2 = '230mA' - IDD5 = '28mA' - IDD52 = '150mA' - IDD3P1 = '1.4mA' - IDD3P12 = '11mA' - IDD2P1 = '0.8mA' - IDD2P12 = '1.8mA' - IDD6 = '0.5mA' - IDD62 = '1.8mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single GDDR5 x64 interface, with -# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix -# H5GQ1H24AFR) in a 2x32 configuration. -class GDDR5_4000_2x32(DRAMCtrl): - # size of device - device_size = '128MB' - - # 2x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # GDDR5 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 2Kbits (256Bytes) - device_rowbuffer_size = '256B' - - # 2x32 configuration, so 2 devices - devices_per_rank = 2 - - # assume single rank - ranks_per_channel = 1 - - # GDDR5 has 4 bank groups - bank_groups_per_rank = 4 - - # GDDR5 has 16 banks with 4 bank groups - banks_per_rank = 16 - - # 1000 MHz - tCK = '1ns' - - # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz - # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz ) - # 8 beats at 4000 MHz = 2 beats at 1000 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '2ns' - - # @1000MHz data rate, tCCD_L is 3 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '3ns'; - - tRCD = '12ns' - - # tCL is not directly found in datasheet and assumed equal tRCD - tCL = '12ns' - - tRP = '12ns' - tRAS = '28ns' - - # RRD_S (different bank group) - # RRD_S is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD = '6ns' - - # RRD_L (same bank group) - # RRD_L is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD_L = '6ns' - - tXAW = '23ns' - - # tXAW < 4 x tRRD. - # Therefore, activation limit is set to 0 - activation_limit = 0 - - tRFC = '65ns' - tWR = '12ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Read-to-Precharge 2 CK - tRTP = '2ns' - - # Assume 2 cycles - tRTW = '2ns' - -# A single HBM x128 interface (one command and address bus), with -# default timings based on data publically released -# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014), -# IDD measurement values, and by extrapolating data from other classes. -# Architecture values based on published HBM spec -# A 4H stack is defined, 2Gb per die for a total of 1GB of memory. 
-class HBM_1000_4H_1x128(DRAMCtrl): - # HBM gen1 supports up to 8 128-bit physical channels - # Configuration defines a single channel, with the capacity - # set to (full_ stack_capacity / 8) based on 2Gb dies - # To use all 8 channels, set 'channels' parameter to 8 in - # system configuration - - # 128-bit interface legacy mode - device_bus_width = 128 - - # HBM supports BL4 and BL2 (legacy mode only) - burst_length = 4 - - # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack; - # with 8 channels, 128MB per channel - device_size = '128MB' - - device_rowbuffer_size = '2kB' - - # 1x128 configuration - devices_per_rank = 1 - - # HBM does not have a CS pin; set rank to 1 - ranks_per_channel = 1 - - # HBM has 8 or 16 banks depending on capacity - # 2Gb dies have 8 banks - banks_per_rank = 8 - - # depending on frequency, bank groups may be required - # will always have 4 bank groups when enabled - # current specifications do not define the minimum frequency for - # bank group architecture - # setting bank_groups_per_rank to 0 to disable until range is defined - bank_groups_per_rank = 0 - - # 500 MHz for 1Gbps DDR data rate - tCK = '2ns' - - # use values from IDD measurement in JEDEC spec - # use tRP value for tRCD and tCL similar to other classes - tRP = '15ns' - tRCD = '15ns' - tCL = '15ns' - tRAS = '33ns' - - # BL2 and BL4 supported, default to BL4 - # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns - tBURST = '4ns' - - # value for 2Gb device from JEDEC spec - tRFC = '160ns' - - # value for 2Gb device from JEDEC spec - tREFI = '3.9us' - - # extrapolate the following from LPDDR configs, using ns values - # to minimize burst length, prefetch differences - tWR = '18ns' - tRTP = '7.5ns' - tWTR = '10ns' - - # start with 2 cycles turnaround, similar to other memory classes - # could be more with variations across the stack - tRTW = '4ns' - - # single rank device, set to 0 - tCS = '0ns' - - # from MemCon example, tRRD is 4ns with 2ns tCK - tRRD = '4ns' - - # from MemCon example, tFAW is 30ns with 2ns tCK - tXAW = '30ns' - activation_limit = 4 - - # 4tCK - tXP = '8ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '168ns' - -# A single HBM x64 interface (one command and address bus), with -# default timings based on HBM gen1 and data publically released -# A 4H stack is defined, 8Gb per die for a total of 4GB of memory. -# Note: This defines a pseudo-channel with a unique controller -# instantiated per pseudo-channel -# Stay at same IO rate (1Gbps) to maintain timing relationship with -# HBM gen1 class (HBM_1000_4H_x128) where possible -class HBM_1000_4H_1x64(HBM_1000_4H_1x128): - # For HBM gen2 with pseudo-channel mode, configure 2X channels. 
- # Configuration defines a single pseudo channel, with the capacity - # set to (full_ stack_capacity / 16) based on 8Gb dies - # To use all 16 pseudo channels, set 'channels' parameter to 16 in - # system configuration - - # 64-bit pseudo-channle interface - device_bus_width = 64 - - # HBM pseudo-channel only supports BL4 - burst_length = 4 - - # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack; - # with 16 channels, 256MB per channel - device_size = '256MB' - - # page size is halved with pseudo-channel; maintaining the same same number - # of rows per pseudo-channel with 2X banks across 2 channels - device_rowbuffer_size = '1kB' - - # HBM has 8 or 16 banks depending on capacity - # Starting with 4Gb dies, 16 banks are defined - banks_per_rank = 16 - - # reset tRFC for larger, 8Gb device - # use HBM1 4Gb value as a starting point - tRFC = '260ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '268ns' - # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns - tCS = '2ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '10ns' - - # self refresh exit time - tXS = '65ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl): - - # Increase buffer size to account for more bank resources - read_buffer_size = 64 - - # Set page policy to better suit DMC Huxley - page_policy = 'close_adaptive' - - # 16-bit channel interface - device_bus_width = 16 - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL32 for higher command bandwidth - burst_length = 32 - - # size of device in bytes - device_size = '1GB' - - # 2kB page with BG mode - device_rowbuffer_size = '2kB' - - # Use a 1x16 configuration - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Initial configuration will have 16 banks with Bank Group Arch - # to maximim resources and enable higher data rates - banks_per_rank = 16 - bank_groups_per_rank = 4 - - # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.455ns' - - # Greater of 2 CK or 18ns - tRCD = '18ns' - - # Base RL is 16 CK @ 687.5 MHz = 23.28ns - tCL = '23.280ns' - - # Greater of 2 CK or 18ns - tRP = '18ns' - - # Greater of 3 CK or 42ns - tRAS = '42ns' - - # Greater of 3 CK or 34ns - tWR = '34ns' - - # active powerdown and precharge powerdown exit time - # Greater of 3 CK or 7ns - tXP = '7ns' - - # self refresh exit time (tRFCab + 7.5ns) - tXS = '217.5ns' - - # Greater of 2 CK or 7.5 ns minus 2 CK - tRTP = '4.59ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. 
- # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz - tBURST = '8.73ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - tBURST_MIN = '2.91ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 687.5 MHz - tBURST_MAX = '11.64ns' - - # 8 CK @ 687.5 MHz - tCCD_L = "11.64ns" - - # LPDDR5, 8 Gbit/channel for 280ns tRFCab - tRFC = '210ns' - tREFI = '3.9us' - - # Greater of 4 CK or 6.25 ns - tWTR = '6.25ns' - # Greater of 4 CK or 12 ns - tWTR_L = '12ns' - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.455ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns - tCS = '2.91ns' - - # 2 CK - tPPD = '2.91ns' - - # Greater of 2 CK or 5 ns - tRRD = '5ns' - tRRD_L = '5ns' - - # With Bank Group Arch mode tFAW is 20 ns - tXAW = '20ns' - activation_limit = 4 - - # at 5Gbps, 4:1 WCK to CK ratio required - # 2 data beats per WCK (DDR) -> 8 per CK - beats_per_clock = 8 - - # 2 cycles required to send activate command - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - two_cycle_activate = True - tAAD = '11.640ns' - - data_clock_sync = True - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '2.91ns' - tBURST_MIN = '2.91ns' - # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST_MAX = '5.82ns' - - # 4 CK @ 687.5 MHz - tCCD_L = "5.82ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '5.82ns' - tBURST_MIN = '5.82ns' - tBURST_MAX = '5.82ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 
5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.25ns' - - # Base RL is 17 CK @ 800 MHz = 21.25ns - tCL = '21.25ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. - # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz - tBURST = '7.5ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.3 GHz or 2 CK @ 800 MHz - tBURST_MIN = '2.5ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 800 MHz - tBURST_MAX = '10ns' - - # 8 CK @ 800 MHz - tCCD_L = "10ns" - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.25ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.5 ns - tCS = '2.5ns' - - # 2 CK - tPPD = '2.5ns' - - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - tAAD = '10ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on initial -# JEDEC specifcation -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio - tBURST = '2.5ns' - tBURST_MIN = '2.5ns' - # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST_MAX = '5ns' - - # 4 CK @ 800 MHz - tCCD_L = "5ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST = '5ns' - tBURST_MIN = '5ns' - tBURST_MAX = '5ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py new file mode 100644 index 0000000..35bf8a3 --- /dev/null +++ b/src/mem/DRAMInterface.py @@ -0,0 +1,1483 @@ +# Copyright (c) 2012-2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. 
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Copyright (c) 2013 Amin Farmahini-Farahani
+# Copyright (c) 2015 University of Kaiserslautern
+# Copyright (c) 2015 The University of Bologna
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from AbstractMemory import AbstractMemory
+from DRAMCtrl import *
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']
+
+class DRAMInterface(AbstractMemory):
+    type = 'DRAMInterface'
+    cxx_header = "mem/dram_ctrl.hh"
+
+    # scheduler, address map and page policy
+    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
+    page_policy = Param.PageManage('open_adaptive', "Page management policy")
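
As a quick illustration of how these two policies might be overridden per interface (a minimal sketch, not part of the patch; it assumes the DDR3_1600_8x8 class defined further down in this file and the per-interface 'range' inherited from AbstractMemory):

    # Hypothetical config-script fragment: select a mapping and page
    # policy suited to sequential, row-hit-friendly traffic.
    from m5.objects import DDR3_1600_8x8, AddrRange

    dram = DDR3_1600_8x8(range=AddrRange('512MB'))
    dram.addr_mapping = 'RoRaBaChCo'    # open-page oriented mapping
    dram.page_policy = 'open_adaptive'  # keep rows open across hits
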
+    # Allow the interface to set required controller buffer sizes
+    # each entry corresponds to a burst for the specific DRAM
+    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
+    # the cacheline size or request/packet size
+    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
+    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+
+    # enforce a limit on the number of accesses per row
+    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
+                                          "closing");
+
+    # size of DRAM Chip in Bytes
+    device_size = Param.MemorySize("Size of DRAM chip")
+    # the physical organisation of the DRAM
+    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\
+                                      "device/chip")
+    burst_length = Param.Unsigned("Burst length (BL) in beats")
+    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
+                                           "device/chip")
+    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
+
+    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
+    # default to 0 bank groups per rank, indicating bank group architecture
+    # is not used
+    # update per memory class when bank group architecture is supported
+    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
+    banks_per_rank = Param.Unsigned("Number of banks per rank")
+
+    # Enable DRAM powerdown states if True. This is False by default due to
+    # performance being lower when enabled
+    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
+
+    # For power modelling we need to know if the DRAM has a DLL or not
+    dll = Param.Bool(True, "DRAM has DLL or not")
+
+    # DRAMPower provides in addition to the core power, the possibility to
+    # include RD/WR termination and IO power. This calculation assumes some
+    # default values. The integration of DRAMPower with gem5 does not include
+    # IO and RD/WR termination power by default. This might be added as an
+    # additional feature in the future.
+
+    # timing behaviour and constraints - all in nanoseconds
+
+    # the base clock period of the DRAM
+    tCK = Param.Latency("Clock period")
+
+    # rank-to-rank bus delay penalty
+    # this does not correlate to a memory timing parameter and encompasses:
+    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
+    # different rank bus delay
+    tCS = Param.Latency("Rank to rank switching time")
+
+    # the amount of time in nanoseconds from issuing an activate command
+    # to the data being available in the row buffer for a read/write
+    tRCD = Param.Latency("RAS to CAS delay")
+
+    # the time from issuing a read/write command to seeing the actual data
+    tCL = Param.Latency("CAS latency")
+
+    # minimum time between a precharge and subsequent activate
+    tRP = Param.Latency("Row precharge time")
+
+    # minimum time between an activate and a precharge to the same row
+    tRAS = Param.Latency("ACT to PRE delay")
+
+    # minimum time between a write data transfer and a precharge
+    tWR = Param.Latency("Write recovery time")
+
+    # minimum time between a read and precharge command
+    tRTP = Param.Latency("Read to precharge")
+
+    # time to complete a burst transfer, typically the burst length
+    # divided by two due to the DDR bus, but by making it a parameter
+    # it is easier to also evaluate SDR memories like WideIO.
+    # This parameter has to account for burst length.
+    # Read/Write requests with data size larger than one full burst are broken
+    # down into multiple requests in the controller
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = Param.Latency("Burst duration "
+                           "(typically burst length / 2 cycles)")
+
+    # tBURST_MAX is the column array cycle delay required before next access,
+    # which could be greater than tBURST when the memory access time is
+    # greater than tBURST
+    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
+
+    # tBURST_MIN is the minimum delay between bursts, which could be less than
+    # tBURST when interleaving is supported
+    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
+
+    # CAS-to-CAS delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
+
+    # Write-to-Write delay for bursts to the same bank group
+    # only utilized with bank group architectures; set to 0 for default case
+    # This will be used to enable different same bank group delays
+    # for writes versus reads
+    tCCD_L_WR = Param.Latency(Self.tCCD_L, "Same bank group Write to Write "\
+                                  "delay")
+
+    # time taken to complete one refresh cycle (N rows in all banks)
+    tRFC = Param.Latency("Refresh cycle time")
+
+    # refresh command interval, how often a "ref" command needs
+    # to be sent. It is 7.8 us for a 64ms refresh requirement
+    tREFI = Param.Latency("Refresh command interval")
+
+    # write-to-read, same rank turnaround penalty
+    tWTR = Param.Latency("Write to read, same rank switching time")
+
+    # write-to-read, same rank turnaround penalty for same bank group
+    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
+                           "time, same bank group")
+
+    # read-to-write, same rank turnaround penalty
+    tRTW = Param.Latency("Read to write, same rank switching time")
+
+    # minimum precharge to precharge delay time
+    tPPD = Param.Latency("0ns", "PRE to PRE delay")
+
+    # maximum delay between two-cycle ACT command phases
+    tAAD = Param.Latency(Self.tCK,
+                         "Maximum delay between two-cycle ACT commands")
+
+    two_cycle_activate = Param.Bool(False,
+                         "Two cycles required to send activate")
+
+    # minimum row activate to row activate delay time
+    tRRD = Param.Latency("ACT to ACT delay")
+
+    # only utilized with bank group architectures; set to 0 for default case
+    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
+
+    # time window in which a maximum number of activates are allowed
+    # to take place, set to 0 to disable
+    tXAW = Param.Latency("X activation window")
+    activation_limit = Param.Unsigned("Max number of activates in window")
+
+    # time to exit power-down mode
+    # Exit power-down to next valid command delay
+    tXP = Param.Latency("0ns", "Power-up Delay")
+
+    # Exit Powerdown to commands requiring a locked DLL
+    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
+
+    # time to exit self-refresh mode
+    tXS = Param.Latency("0ns", "Self-refresh exit latency")
+
+    # time to exit self-refresh mode with locked DLL
+    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
+
+    # number of data beats per clock. with DDR, default is 2, one per edge
+    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
+
+    data_clock_sync = Param.Bool(False, "Synchronization commands required")
+
+    # Currently rolled into other params
+    ######################################################################
+
+    # tRC  - assumed to be tRAS + tRP
+
+    # Power Behaviour and Constraints
+    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
+    # defined as VDD and VDD2. Each current is defined for each voltage domain
+    # separately. For example, current IDD0 is active-precharge current for
+    # voltage domain VDD and current IDD02 is active-precharge current for
+    # voltage domain VDD2.
+    # By default all currents are set to 0mA. Users who are only interested in
+    # the performance of DRAMs can leave them at 0.
+
+    # Operating 1 Bank Active-Precharge current
+    IDD0 = Param.Current("0mA", "Active precharge current")
+
+    # Operating 1 Bank Active-Precharge current multiple voltage Range
+    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
+
+    # Precharge Power-down Current: Slow exit
+    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
+
+    # Precharge Power-down Current: Slow exit multiple voltage Range
+    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
+
+    # Precharge Power-down Current: Fast exit
+    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
+
+    # Precharge Power-down Current: Fast exit multiple voltage Range
+    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
+
+    # Precharge Standby current
+    IDD2N = Param.Current("0mA", "Precharge Standby current")
+
+    # Precharge Standby current multiple voltage range
+    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
+
+    # Active Power-down current: slow exit
+    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
+
+    # Active Power-down current: slow exit multiple voltage range
+    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
+
+    # Active Power-down current : fast exit
+    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
+
+    # Active Power-down current : fast exit multiple voltage range
+    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
+
+    # Active Standby current
+    IDD3N = Param.Current("0mA", "Active Standby current")
+
+    # Active Standby current multiple voltage range
+    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
+
+    # Burst Read Operating Current
+    IDD4R = Param.Current("0mA", "READ current")
+
+    # Burst Read Operating Current multiple voltage range
+    IDD4R2 = Param.Current("0mA", "READ current VDD2")
+
+    # Burst Write Operating Current
+    IDD4W = Param.Current("0mA", "WRITE current")
+
+    # Burst Write Operating Current multiple voltage range
+    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
+
+    # Refresh Current
+    IDD5 = Param.Current("0mA", "Refresh current")
+
+    # Refresh Current multiple voltage range
+    IDD52 = Param.Current("0mA", "Refresh current VDD2")
+
+    # Self-Refresh Current
+    IDD6 = Param.Current("0mA", "Self-refresh Current")
+
+    # Self-Refresh Current multiple voltage range
+    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
+
+    # Main voltage range of the DRAM
+    VDD = Param.Voltage("0V", "Main Voltage Range")
+
+    # Second voltage range defined by some DRAMs
+    VDD2 = Param.Voltage("0V", "2nd Voltage Range")
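
Since the controller now takes the interface as a parameter (per the commit message above), a configuration script pairs the two objects along these lines (a sketch, not part of the patch; the parameter name 'dram' matches the p->dram accesses in the dram_ctrl.cc hunks below):

    # Hypothetical wiring of a controller to its media interface.
    from m5.objects import DRAMCtrl, DDR3_1600_8x8, AddrRange

    ctrl = DRAMCtrl()
    # The address range now lives on the interface, not the controller.
    ctrl.dram = DDR3_1600_8x8(range=AddrRange('512MB'))
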
+
+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):
+    # size of device in bytes
+    device_size = '512MB'
+
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # DDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # Use two ranks
+    ranks_per_channel = 2
+
+    # DDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
+    tBURST = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # DDR3-1600 11-11-11
+    tRCD = '13.75ns'
+    tCL = '13.75ns'
+    tRP = '13.75ns'
+    tRAS = '35ns'
+    tRRD = '6ns'
+    tXAW = '30ns'
+    activation_limit = 4
+    tRFC = '260ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    tXS = '270ns'
+
+    # Current values from datasheet Die Rev E,J
+    IDD0 = '55mA'
+    IDD2N = '32mA'
+    IDD3N = '38mA'
+    IDD4W = '125mA'
+    IDD4R = '157mA'
+    IDD5 = '235mA'
+    IDD3P1 = '38mA'
+    IDD2P1 = '32mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'
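
The theoretical peak bandwidth that the relocated peakBW/busUtil stats (see the dram_ctrl.cc hunks below) derive from these parameters can be sanity-checked by hand; a back-of-the-envelope sketch for this DDR3-1600 class:

    # 8 devices x 8 bits each = 64-bit data bus, i.e. 8 bytes per beat.
    bus_width_bytes = 8 * 8 // 8
    # DDR: 2 beats per 1.25 ns clock -> 1600 MT/s.
    beats_per_sec = 2 * 800e6
    peak_bw = bus_width_bytes * beats_per_sec
    print(peak_bw / 1e9)   # 12.8 GB/s per channel
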
+
+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart memory
+# cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4
+# layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM generations
+# for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer ( 1 for HMC)
+# The parameters for which no input is available are inherited from the DDR3
+# configuration.
+# This configuration includes the latencies from the DRAM to the logic layer
+# of the HMC
+class HMC_2500_1x32_Interface(DDR3_1600_8x8):
+    # A single HMC-2500 x32 controller
+    # The buffer parameters do not directly correlate with buffer_size in
+    # real hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32
+
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz =
+    # 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed page
+    # [2]; nevertheless the 'close' policy opens and closes the row multiple
+    # times for bursts larger than 32 bytes. For this reason we use
+    # 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'
+
+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):
+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):
+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns';
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns';
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'
+
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns';
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'
+
+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):
+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):
+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information
+
+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):
+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR ( data runs at 4000 MHz )
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns';
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'
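
As the comment above notes, a full stack is modelled by instantiating one of these interfaces per channel. In a standard configuration script that might look roughly like this (a sketch; it assumes the usual --mem-type/--mem-channels options consumed by the MemConfig helpers):

    # e.g. on the se.py/fs.py command line:
    #   --mem-type=HBM_1000_4H_1x128 --mem-channels=8
    # or directly on an options object before config_mem() runs:
    options.mem_type = 'HBM_1000_4H_1x128'
    options.mem_channels = 8
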
+
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 280ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # Result is an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # Result is an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
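
All of the LPDDR5 values above derive from the CK period, which is easier to audit programmatically than by eye. A small sketch using the 5500 MT/s numbers:

    tCK = 1.455e-9                # ~687.5 MHz CK for 5.5 Gb/s data
    print(8 / tCK / 1e9)          # 8 beats per CK -> ~5.5 Gb/s per pin
    print(6 * tCK * 1e9)          # tBURST: 6 CK = 8.73 ns
    print(2 * tCK * 1e9)          # tBURST_MIN: 2 CK = 2.91 ns
    print(8 * tCK * 1e9)          # tBURST_MAX / tCCD_L: 8 CK = 11.64 ns
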
#include "debug/Drain.hh" #include "debug/QOS.hh" +#include "params/DRAMInterface.hh" #include "sim/system.hh" using namespace std; @@ -58,12 +59,13 @@ retryRdReq(false), retryWrReq(false), nextReqEvent([this]{ processNextReqEvent(); }, name()), respondEvent([this]{ processRespondEvent(); }, name()), + dram(p->dram), readBufferSize(p->read_buffer_size), writeBufferSize(p->write_buffer_size), writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0), writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0), minWritesPerSwitch(p->min_writes_per_switch), - writesThisTime(0), readsThisTime(0), tCS(p->tCS), + writesThisTime(0), readsThisTime(0), memSchedPolicy(p->mem_sched_policy), frontendLatency(p->static_frontend_latency), backendLatency(p->static_backend_latency), @@ -75,37 +77,23 @@ readQueue.resize(p->qos_priorities); writeQueue.resize(p->qos_priorities); + dram->setCtrl(this); + // perform a basic check of the write thresholds if (p->write_low_thresh_perc >= p->write_high_thresh_perc) fatal("Write buffer low threshold %d must be smaller than the " "high threshold %d\n", p->write_low_thresh_perc, p->write_high_thresh_perc); - - // determine the rows per bank by looking at the total capacity - uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); - - DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, - AbstractMemory::size()); - - // create a DRAM interface - // will only populate the ranks if DRAM is configured - dram = new DRAMInterface(*this, p, capacity, range); - DPRINTF(DRAM, "Created DRAM interface \n"); } void DRAMCtrl::init() { - MemCtrl::init(); - if (!port.isConnected()) { fatal("DRAMCtrl %s is unconnected!\n", name()); } else { port.sendRangeChange(); } - - dram->init(range); - } void @@ -115,8 +103,6 @@ isTimingMode = system()->isTimingMode(); if (isTimingMode) { - dram->startup(); - // shift the bus busy time sufficiently far ahead that we never // have to worry about negative values when computing the time for // the next request, this will add an insignificant bubble at the @@ -134,7 +120,7 @@ "is responding"); // do the actual memory access and turn the packet into a response - access(pkt); + dram->access(pkt); Tick latency = 0; if (pkt->hasData()) { @@ -264,7 +250,7 @@ // address of first DRAM packet is kept unaliged. Subsequent DRAM packets // are aligned to burst size boundaries. This is to ensure we accurately // check read packets against packets in write queue. - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; unsigned pktsServicedByWrQ = 0; BurstHelper* burst_helper = NULL; @@ -364,7 +350,7 @@ // if the request size is larger than burst size, the pkt is split into // multiple DRAM packets - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; uint32_t burstSize = dram->bytesPerBurst(); for (int cnt = 0; cnt < pktCount; ++cnt) { @@ -527,7 +513,7 @@ DRAMPacket* dram_pkt = respQueue.front(); // media specific checks and functions when read response is complete - dram->respondEventDRAM(dram_pkt->rank); + dram->respondEvent(dram_pkt->rank); if (dram_pkt->burstHelper) { // it is a split packet @@ -726,12 +712,12 @@ void DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency) { - DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr()); + DPRINTF(DRAM, "Responding to Address %lld.. 
\n",pkt->getAddr()); bool needsResponse = pkt->needsResponse(); // do the actual memory access which also turns the packet into a // response - access(pkt); + dram->access(pkt); // turn packet around to go back to requester if response expected if (needsResponse) { @@ -876,9 +862,9 @@ // if not, shift to next burst window Tick act_at; if (twoCycleActivate) - act_at = ctrl.verifyMultiCmd(act_tick, tAAD); + act_at = ctrl->verifyMultiCmd(act_tick, tAAD); else - act_at = ctrl.verifySingleCmd(act_tick); + act_at = ctrl->verifySingleCmd(act_tick); DPRINTF(DRAM, "Activate at tick %d\n", act_at); @@ -996,7 +982,7 @@ // Issuing an explicit PRE command // Verify that we have command bandwidth to issue the precharge // if not, shift to next burst window - pre_at = ctrl.verifySingleCmd(pre_tick); + pre_at = ctrl->verifySingleCmd(pre_tick); // enforce tPPD for (int i = 0; i < banksPerRank; i++) { rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD, @@ -1046,7 +1032,7 @@ // first clean up the burstTick set, removing old entries // before adding new entries for next burst - ctrl.pruneBurstTick(); + ctrl->pruneBurstTick(); // get the rank Rank& rank_ref = *ranks[dram_pkt->rank]; @@ -1098,9 +1084,9 @@ // verify that we have command bandwidth to issue the burst // if not, shift to next burst window if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay)) - cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK); + cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK); else - cmd_at = ctrl.verifySingleCmd(cmd_at); + cmd_at = ctrl->verifySingleCmd(cmd_at); // if we are interleaving bursts, ensure that // 1) we don't double interleave on next burst issue @@ -1200,9 +1186,9 @@ // either look at the read queue or write queue const std::vector<DRAMPacketQueue>& queue = - ctrl.selQueue(dram_pkt->isRead()); + ctrl->selQueue(dram_pkt->isRead()); - for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) { + for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) { auto p = queue[i].begin(); // keep on looking until we find a hit or reach the end of the // queue @@ -1273,6 +1259,7 @@ // Update latency stats stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime; stats.totQLat += cmd_at - dram_pkt->entryTime; + stats.totBusLat += tBURST; } else { // Schedule write done event to decrement event count // after the readyTime has been reached @@ -1338,13 +1325,9 @@ // Update latency stats stats.masterReadTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; - - stats.bytesRead += dram->bytesPerBurst(); - stats.totBusLat += dram->burstDly(); stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size; } else { ++writesThisTime; - stats.bytesWritten += dram->bytesPerBurst(); stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size; stats.masterWriteTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; @@ -1446,8 +1429,9 @@ // Figure out which read request goes next // If we are changing command type, incorporate the minimum - // bus turnaround delay which will be tCS (different rank) case - to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0); + // bus turnaround delay which will be rank to rank delay + to_read = chooseNext((*queue), switched_cmd_type ? + dram->rankDelay() : 0); if (to_read != queue->end()) { // candidate read found @@ -1526,7 +1510,8 @@ // If we are changing command type, incorporate the minimum // bus turnaround delay to_write = chooseNext((*queue), - switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0); + switched_cmd_type ? 
std::min(dram->minRdToWr(), + dram->rankDelay()) : 0); if (to_write != queue->end()) { write_found = true; @@ -1599,11 +1584,8 @@ } } -DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl, - const DRAMCtrlParams* _p, - const uint64_t capacity, - const AddrRange range) - : SimObject(_p), ctrl(_ctrl), +DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p) + : AbstractMemory(_p), addrMapping(_p->addr_mapping), burstSize((_p->devices_per_rank * _p->burst_length * _p->device_bus_width) / 8), @@ -1618,7 +1600,7 @@ bankGroupsPerRank(_p->bank_groups_per_rank), bankGroupArch(_p->bank_groups_per_rank > 0), banksPerRank(_p->banks_per_rank), rowsPerBank(0), - tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST), + tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST), tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW), tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD), tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP), @@ -1634,12 +1616,12 @@ wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW), wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L), rdToWrDlySameBG(tRTW + _p->tBURST_MAX), - rankToRankDly(ctrl.rankDelay() + tBURST), + rankToRankDly(tCS + tBURST), pageMgmt(_p->page_policy), maxAccessesPerRow(_p->max_accesses_per_row), timeStampOffset(0), activeRank(0), enableDRAMPowerdown(_p->enable_dram_powerdown), - stats(_ctrl, *this) + stats(*this) { fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, " "must be a power of two\n", burstSize); @@ -1651,7 +1633,7 @@ for (int i = 0; i < ranksPerChannel; i++) { DPRINTF(DRAM, "Creating DRAM rank %d \n", i); - Rank* rank = new Rank(ctrl, _p, i, *this); + Rank* rank = new Rank(_p, i, *this); ranks.push_back(rank); } @@ -1659,6 +1641,11 @@ uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank * ranksPerChannel; + uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); + + DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, + AbstractMemory::size()); + // if actual DRAM size does not match memory capacity in system warn! if (deviceCapacity != capacity / (1024 * 1024)) warn("DRAM device capacity (%d Mbytes) does not match the " @@ -1713,8 +1700,10 @@ } void -DRAMInterface::init(AddrRange range) +DRAMInterface::init() { + AbstractMemory::init(); + // a bit of sanity checks on the interleaving, save it for here to // ensure that the system pointer is initialised if (range.interleaved()) { @@ -1736,7 +1725,7 @@ // channel striping has to be done at a granularity that // is equal or larger to a cache line - if (ctrl.system()->cacheLineSize() > range.granularity()) { + if (system()->cacheLineSize() > range.granularity()) { fatal("Channel interleaving of %s must be at least as large " "as the cache line size\n", name()); } @@ -1755,8 +1744,10 @@ void DRAMInterface::startup() { - // timestamp offset should be in clock cycles for DRAMPower - timeStampOffset = divCeil(curTick(), tCK); + if (system()->isTimingMode()) { + // timestamp offset should be in clock cycles for DRAMPower + timeStampOffset = divCeil(curTick(), tCK); + } for (auto r : ranks) { r->startup(curTick() + tREFI - tRP); @@ -1802,7 +1793,7 @@ } void -DRAMInterface::respondEventDRAM(uint8_t rank) +DRAMInterface::respondEvent(uint8_t rank) { Rank& rank_ref = *ranks[rank]; @@ -1943,7 +1934,7 @@ std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP; // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl.inReadBusState(false) ? 
+ const Tick col_allowed_at = ctrl->inReadBusState(false) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1983,9 +1974,15 @@ return make_pair(bank_mask, hidden_bank_prep); } -DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, - DRAMInterface& _dram) - : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram), +DRAMInterface* +DRAMInterfaceParams::create() +{ + return new DRAMInterface(this); +} + +DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p, + int _rank, DRAMInterface& _dram) + : EventManager(&_dram), dram(_dram), pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE), pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE), refreshState(REF_IDLE), inLowPowerState(false), rank(_rank), @@ -1998,7 +1995,7 @@ refreshEvent([this]{ processRefreshEvent(); }, name()), powerEvent([this]{ processPowerEvent(); }, name()), wakeUpEvent([this]{ processWakeUpEvent(); }, name()), - stats(_ctrl, *this) + stats(_dram, *this) { for (int b = 0; b < _p->banks_per_rank; b++) { banks[b].bank = b; @@ -2049,8 +2046,10 @@ DRAMInterface::Rank::isQueueEmpty() const { // check commmands in Q based on current bus direction - bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0)) - || (!ctrl.inReadBusState(true) && (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && + (readEntries == 0)) + || (!dram.ctrl->inReadBusState(true) && + (writeEntries == 0)); return no_queued_cmds; } @@ -2174,7 +2173,7 @@ // if a request is at the moment being handled and this request is // accessing the current rank then wait for it to finish if ((rank == dram.activeRank) - && (ctrl.nextReqEvent.scheduled())) { + && (dram.ctrl->nextReqEvent.scheduled())) { // hand control over to the request loop until it is // evaluated next DPRINTF(DRAM, "Refresh awaiting draining\n"); @@ -2249,7 +2248,7 @@ // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled // should have outstanding precharge or read response event assert(prechargeEvent.scheduled() || - ctrl.respondEvent.scheduled()); + dram.ctrl->respondEvent.scheduled()); // will start refresh when pwrState transitions to IDLE } @@ -2309,8 +2308,8 @@ assert(!powerEvent.scheduled()); - if ((ctrl.drainState() == DrainState::Draining) || - (ctrl.drainState() == DrainState::Drained)) { + if ((dram.ctrl->drainState() == DrainState::Draining) || + (dram.ctrl->drainState() == DrainState::Drained)) { // if draining, do not re-enter low-power mode. 
// simply go to IDLE and wait schedulePowerEvent(PWR_IDLE, curTick()); @@ -2535,10 +2534,10 @@ } // completed refresh event, ensure next request is scheduled - if (!ctrl.nextReqEvent.scheduled()) { + if (!dram.ctrl->nextReqEvent.scheduled()) { DPRINTF(DRAM, "Scheduling next request after refreshing" " rank %d\n", rank); - schedule(ctrl.nextReqEvent, curTick()); + schedule(dram.ctrl->nextReqEvent, curTick()); } } @@ -2597,8 +2596,8 @@ // bypass auto-refresh and go straight to SREF, where memory // will issue refresh immediately upon entry if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() && - (ctrl.drainState() != DrainState::Draining) && - (ctrl.drainState() != DrainState::Drained) && + (dram.ctrl->drainState() != DrainState::Draining) && + (dram.ctrl->drainState() != DrainState::Drained) && dram.enableDRAMPowerdown) { DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning " "to self refresh at %11u tick\n", rank, curTick()); @@ -2669,7 +2668,7 @@ // power (mW) = ----------- * ---------- // time (tick) tick_frequency stats.averagePower = (stats.totalEnergy.value() / - (curTick() - ctrl.lastStatsResetTick)) * + (curTick() - dram.ctrl->lastStatsResetTick)) * (SimClock::Frequency / 1000000000.0); } @@ -2699,7 +2698,7 @@ bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (!ctrl.inReadBusState(true) && (writeEntries != 0)); + (!dram.ctrl->inReadBusState(true) && (writeEntries != 0)); } DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl) @@ -2710,15 +2709,15 @@ ADD_STAT(writeReqs, "Number of write requests accepted"), ADD_STAT(readBursts, - "Number of DRAM read bursts, " + "Number of controller read bursts, " "including those serviced by the write queue"), ADD_STAT(writeBursts, - "Number of DRAM write bursts, " + "Number of controller write bursts, " "including those merged in the write queue"), ADD_STAT(servicedByWrQ, - "Number of DRAM read bursts serviced by the write queue"), + "Number of controller read bursts serviced by the write queue"), ADD_STAT(mergedWrBursts, - "Number of DRAM write bursts merged with an existing one"), + "Number of controller write bursts merged with an existing one"), ADD_STAT(neitherReadNorWriteReqs, "Number of requests that are neither read nor write"), @@ -2726,9 +2725,6 @@ ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"), ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"), - ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), - ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), - ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"), ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"), @@ -2743,22 +2739,13 @@ ADD_STAT(wrPerTurnAround, "Writes before turning the bus around for reads"), - ADD_STAT(bytesRead, "Total number of bytes read from memory"), ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"), - ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"), ADD_STAT(bytesWrittenSys, "Total written bytes from the system interface side"), - ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"), - ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"), ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"), ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"), - ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), - - ADD_STAT(busUtil, "Data bus utilization in percentage"), - 
ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), - ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), ADD_STAT(totGap, "Total gap between requests"), ADD_STAT(avgGap, "Average gap between requests"), @@ -2790,12 +2777,11 @@ { using namespace Stats; - assert(ctrl._system); - const auto max_masters = ctrl._system->maxMasters(); + assert(ctrl.system()); + const auto max_masters = ctrl.system()->maxMasters(); avgRdQLen.precision(2); avgWrQLen.precision(2); - avgBusLat.precision(2); readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); @@ -2810,14 +2796,9 @@ .init(ctrl.writeBufferSize) .flags(nozero); - avgRdBW.precision(2); - avgWrBW.precision(2); avgRdBWSys.precision(2); avgWrBWSys.precision(2); - peakBW.precision(2); - busUtil.precision(2); avgGap.precision(2); - busUtilWrite.precision(2); // per-master bytes read and written to memory masterReadBytes @@ -2849,9 +2830,6 @@ .flags(nonan) .precision(2); - busUtilRead - .precision(2); - masterWriteRate .flags(nozero | nonan) .precision(12); @@ -2865,7 +2843,7 @@ .precision(2); for (int i = 0; i < max_masters; i++) { - const std::string master = ctrl._system->getMasterName(i); + const std::string master = ctrl.system()->getMasterName(i); masterReadBytes.subname(i, master); masterReadRate.subname(i, master); masterWriteBytes.subname(i, master); @@ -2879,22 +2857,11 @@ } // Formula stats - avgBusLat = totBusLat / (readBursts - servicedByWrQ); - - avgRdBW = (bytesRead / 1000000) / simSeconds; - avgWrBW = (bytesWritten / 1000000) / simSeconds; avgRdBWSys = (bytesReadSys / 1000000) / simSeconds; avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds; - peakBW = (SimClock::Frequency / ctrl.dram->burstDataDly()) * - ctrl.dram->bytesPerBurst() / 1000000; - - busUtil = (avgRdBW + avgWrBW) / peakBW * 100; avgGap = totGap / (readReqs + writeReqs); - busUtilRead = avgRdBW / peakBW * 100; - busUtilWrite = avgWrBW / peakBW * 100; - masterReadRate = masterReadBytes / simSeconds; masterWriteRate = masterWriteBytes / simSeconds; masterReadAvgLat = masterReadTotalLat / masterReadAccesses; @@ -2907,8 +2874,8 @@ ctrl.lastStatsResetTick = curTick(); } -DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram) - : Stats::Group(&_ctrl, csprintf("dram").c_str()), +DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram) + : Stats::Group(&_dram), dram(_dram), ADD_STAT(readBursts, "Number of DRAM read bursts"), @@ -2918,10 +2885,13 @@ ADD_STAT(perBankWrBursts, "Per bank write bursts"), ADD_STAT(totQLat, "Total ticks spent queuing"), + ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), ADD_STAT(totMemAccLat, "Total ticks spent from burst creation until serviced " "by the DRAM"), + ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"), + ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"), ADD_STAT(readRowHits, "Number of row buffer hits during reads"), @@ -2934,6 +2904,12 @@ ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"), ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"), + ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), + + ADD_STAT(busUtil, "Data bus utilization in percentage"), + ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), + ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), + ADD_STAT(pageHitRate, 
"Row buffer hit rate, read and write combined") { @@ -2945,6 +2921,7 @@ using namespace Stats; avgQLat.precision(2); + avgBusLat.precision(2); avgMemAccLat.precision(2); readRowHitRate.precision(2); @@ -2958,10 +2935,16 @@ dram.maxAccessesPerRow : dram.rowBufferSize) .flags(nozero); + peakBW.precision(2); + busUtil.precision(2); + busUtilWrite.precision(2); + busUtilRead.precision(2); + pageHitRate.precision(2); // Formula stats avgQLat = totQLat / readBursts; + avgBusLat = totBusLat / readBursts; avgMemAccLat = totMemAccLat / readBursts; readRowHitRate = (readRowHits / readBursts) * 100; @@ -2969,13 +2952,19 @@ avgRdBW = (bytesRead / 1000000) / simSeconds; avgWrBW = (bytesWritten / 1000000) / simSeconds; + peakBW = (SimClock::Frequency / dram.burstDataDly()) * + dram.bytesPerBurst() / 1000000; + + busUtil = (avgRdBW + avgWrBW) / peakBW * 100; + busUtilRead = avgRdBW / peakBW * 100; + busUtilWrite = avgWrBW / peakBW * 100; pageHitRate = (writeRowHits + readRowHits) / (writeBursts + readBursts) * 100; } -DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank) - : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()), +DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank) + : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()), rank(_rank), ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"), @@ -3034,7 +3023,7 @@ DRAMCtrl::recvFunctional(PacketPtr pkt) { // rely on the abstract memory - functionalAccess(pkt); + dram->functionalAccess(pkt); } Port & @@ -3099,7 +3088,7 @@ DRAMCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(ctrl.getAddrRange()); + ranges.push_back(ctrl.dram->getAddrRange()); return ranges; } diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index 4464f7a..1b6d8b5 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -56,12 +56,15 @@ #include "enums/AddrMap.hh" #include "enums/MemSched.hh" #include "enums/PageManage.hh" +#include "mem/abstract_mem.hh" #include "mem/drampower.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/DRAMCtrl.hh" #include "sim/eventq.hh" +class DRAMInterfaceParams; + /** * A basic class to track the bank state, i.e. what row is * currently open (if any), when is the bank free to accept a new @@ -243,7 +246,7 @@ * The DRAMInterface includes a class for individual ranks * and per rank functions. 
*/ -class DRAMInterface : public SimObject +class DRAMInterface : public AbstractMemory { private: /** @@ -340,7 +343,7 @@ class Rank; struct RankStats : public Stats::Group { - RankStats(DRAMCtrl &ctrl, Rank &rank); + RankStats(DRAMInterface &dram, Rank &rank); void regStats() override; void resetStats() override; @@ -406,13 +409,6 @@ */ class Rank : public EventManager { - protected: - - /** - * A reference to the parent DRAMCtrl instance - */ - DRAMCtrl& ctrl; - private: /** @@ -532,10 +528,10 @@ */ Tick lastBurstTick; - Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, + Rank(const DRAMInterfaceParams* _p, int _rank, DRAMInterface& _dram); - const std::string name() const { return csprintf("dram_%d", rank); } + const std::string name() const { return csprintf("%d", rank); } /** * Kick off accounting for power and refresh states and @@ -662,9 +658,9 @@ }; /** - * A reference to the parent DRAMCtrl instance + * A pointer to the parent DRAMCtrl instance */ - DRAMCtrl& ctrl; + DRAMCtrl* ctrl; /** * Memory controller configuration initialized based on parameter @@ -695,6 +691,7 @@ * DRAM timing requirements */ const Tick M5_CLASS_VAR_USED tCK; + const Tick tCS; const Tick tCL; const Tick tBURST; const Tick tBURST_MIN; @@ -774,7 +771,7 @@ bool trace = true); struct DRAMStats : public Stats::Group { - DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram); + DRAMStats(DRAMInterface &dram); void regStats() override; @@ -790,10 +787,12 @@ // Latencies summed over all requests Stats::Scalar totQLat; + Stats::Scalar totBusLat; Stats::Scalar totMemAccLat; // Average latencies per request Stats::Formula avgQLat; + Stats::Formula avgBusLat; Stats::Formula avgMemAccLat; // Row hit count and rate @@ -809,6 +808,11 @@ // Average bandwidth Stats::Formula avgRdBW; Stats::Formula avgWrBW; + Stats::Formula peakBW; + // bus utilization + Stats::Formula busUtil; + Stats::Formula busUtilRead; + Stats::Formula busUtilWrite; Stats::Formula pageHitRate; }; @@ -820,11 +824,16 @@ std::vector<Rank*> ranks; public: + /** Setting a pointer to the controller */ + void setCtrl(DRAMCtrl* _ctrl) + { + ctrl = _ctrl; + }; + /** * Initialize the DRAM interface and verify parameters - * @param range is the address range for this interface */ - void init(AddrRange range); + void init(); /** * Iterate through dram ranks and instantiate per rank startup routine @@ -853,6 +862,20 @@ void suspend(); /** + * Get an address in a dense range which starts from 0. The input + * address is the physical address of the request in an address + * space that contains other SimObjects apart from this + * controller. 
+ * + * @param addr The intput address which should be in the addrRange + * @return An address in the continues range [0, max) + */ + Addr getCtrlAddr(Addr addr) + { + return range.getOffset(addr); + } + + /** * @return number of bytes in a burst for this interface */ uint32_t bytesPerBurst () { return burstSize; }; @@ -887,6 +910,13 @@ */ Tick minRdToWr () { return tRTW; }; + /** + * Determine the required delay for an access to a different rank + * + * @return required rank to rank delay + */ + Tick rankDelay() { return tCS; }; + /* * Function to calulate RAS cycle time for use within and * outside of this class @@ -968,7 +998,7 @@ * * @param rank Specifies rank associated with read burst */ - void respondEventDRAM(uint8_t rank); + void respondEvent(uint8_t rank); /** * Check the refresh state to determine if refresh needs @@ -1004,8 +1034,7 @@ virtual void process() { rank->resetStats(); }; }; - DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, - uint64_t capacity, AddrRange range); + DRAMInterface(const DRAMInterfaceParams* _p); }; /** @@ -1170,20 +1199,6 @@ void accessAndRespond(PacketPtr pkt, Tick static_latency); /** - * Get an address in a dense range which starts from 0. The input - * address is the physical address of the request in an address - * space that contains other SimObjects apart from this - * controller. - * - * @param addr The intput address which should be in the addrRange - * @return An address in the continues range [0, max) - */ - Addr getCtrlAddr(Addr addr) - { - return range.getOffset(addr); - } - - /** * The memory schduler/arbiter - picks which request needs to * go next, based on the specified policy such as FCFS or FR-FCFS * and moves it to the head of the queue. @@ -1265,6 +1280,11 @@ std::unordered_multiset<Tick> burstTicks; /** + * Create pointer to interface of the actual dram media + */ + DRAMInterface* const dram; + + /** * The following are basic design parameters of the memory * controller, and are initialized based on parameter values. * The rowsPerBank is determined based on the capacity, number of @@ -1279,12 +1299,6 @@ uint32_t readsThisTime; /** - * Basic memory timing parameters initialized based on parameter - * values. These will be used across memory interfaces. - */ - const Tick tCS; - - /** * Memory controller configuration initialized based on parameter * values. 
*/ @@ -1338,10 +1352,6 @@ // Average queue lengths Stats::Average avgRdQLen; Stats::Average avgWrQLen; - // Latencies summed over all requests - Stats::Scalar totBusLat; - // Average latencies per request - Stats::Formula avgBusLat; Stats::Scalar numRdRetry; Stats::Scalar numWrRetry; @@ -1352,21 +1362,12 @@ Stats::Histogram rdPerTurnAround; Stats::Histogram wrPerTurnAround; - Stats::Scalar bytesRead; Stats::Scalar bytesReadWrQ; - Stats::Scalar bytesWritten; Stats::Scalar bytesReadSys; Stats::Scalar bytesWrittenSys; // Average bandwidth - Stats::Formula avgRdBW; - Stats::Formula avgWrBW; Stats::Formula avgRdBWSys; Stats::Formula avgWrBWSys; - Stats::Formula peakBW; - // bus utilization - Stats::Formula busUtil; - Stats::Formula busUtilRead; - Stats::Formula busUtilWrite; Stats::Scalar totGap; Stats::Formula avgGap; @@ -1405,11 +1406,6 @@ /** The time when stats were last reset used to calculate average power */ Tick lastStatsResetTick; - /** - * Create pointer to interfasce to the actual media - */ - DRAMInterface* dram; - DRAMCtrl(const DRAMCtrlParams* p); DrainState drain() override; @@ -1458,13 +1454,6 @@ }; /** - * Determine the required delay for an access to a different rank - * - * @return required rank to rank delay - */ - Tick rankDelay() { return tCS; }; - - /** * Check the current direction of the memory channel * * @param next_state Check either the current or next bus state diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc index f506928..7a44aa1 100644 --- a/src/mem/drampower.cc +++ b/src/mem/drampower.cc @@ -40,13 +40,13 @@ #include "base/intmath.hh" #include "sim/core.hh" -DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) : +DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) : powerlib(libDRAMPower(getMemSpec(p), include_io)) { } Data::MemArchitectureSpec -DRAMPower::getArchParams(const DRAMCtrlParams* p) +DRAMPower::getArchParams(const DRAMInterfaceParams* p) { Data::MemArchitectureSpec archSpec; archSpec.burstLength = p->burst_length; @@ -68,7 +68,7 @@ } Data::MemTimingSpec -DRAMPower::getTimingParams(const DRAMCtrlParams* p) +DRAMPower::getTimingParams(const DRAMInterfaceParams* p) { // Set the values that are used for power calculations and ignore // the ones only used by the controller functionality in DRAMPower @@ -100,7 +100,7 @@ } Data::MemPowerSpec -DRAMPower::getPowerParams(const DRAMCtrlParams* p) +DRAMPower::getPowerParams(const DRAMInterfaceParams* p) { // All DRAMPower currents are in mA Data::MemPowerSpec powerSpec; @@ -132,7 +132,7 @@ } Data::MemorySpecification -DRAMPower::getMemSpec(const DRAMCtrlParams* p) +DRAMPower::getMemSpec(const DRAMInterfaceParams* p) { Data::MemorySpecification memSpec; memSpec.memArchSpec = getArchParams(p); @@ -142,13 +142,13 @@ } bool -DRAMPower::hasTwoVDD(const DRAMCtrlParams* p) +DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p) { return p->VDD2 == 0 ? 
false : true; } uint8_t -DRAMPower::getDataRate(const DRAMCtrlParams* p) +DRAMPower::getDataRate(const DRAMInterfaceParams* p) { uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK); uint8_t data_rate = p->burst_length / burst_cycles; diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh index ed47476..da68a78 100644 --- a/src/mem/drampower.hh +++ b/src/mem/drampower.hh @@ -44,7 +44,7 @@ #define __MEM_DRAM_POWER_HH__ #include "libdrampower/LibDRAMPower.h" -#include "params/DRAMCtrl.hh" +#include "params/DRAMInterface.hh" /** * DRAMPower is a standalone tool which calculates the power consumed by a @@ -57,43 +57,44 @@ /** * Transform the architechture parameters defined in - * DRAMCtrlParams to the memSpec of DRAMPower + * DRAMInterfaceParams to the memSpec of DRAMPower */ - static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p); + static Data::MemArchitectureSpec getArchParams( + const DRAMInterfaceParams* p); /** - * Transforms the timing parameters defined in DRAMCtrlParams to + * Transforms the timing parameters defined in DRAMInterfaceParams to * the memSpec of DRAMPower */ - static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p); + static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p); /** * Transforms the power and current parameters defined in - * DRAMCtrlParam to the memSpec of DRAMPower + * DRAMInterfaceParams to the memSpec of DRAMPower */ - static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p); + static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p); /** * Determine data rate, either one or two. */ - static uint8_t getDataRate(const DRAMCtrlParams* p); + static uint8_t getDataRate(const DRAMInterfaceParams* p); /** * Determine if DRAM has two voltage domains (or one) */ - static bool hasTwoVDD(const DRAMCtrlParams* p); + static bool hasTwoVDD(const DRAMInterfaceParams* p); /** - * Return an instance of MemSpec based on the DRAMCtrlParams + * Return an instance of MemSpec based on the DRAMInterfaceParams */ - static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p); + static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p); public: // Instance of DRAMPower Library libDRAMPower powerlib; - DRAMPower(const DRAMCtrlParams* p, bool include_io); + DRAMPower(const DRAMInterfaceParams* p, bool include_io); }; diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py index 1cd3f0b..f55105b 100644 --- a/src/mem/qos/QoSMemCtrl.py +++ b/src/mem/qos/QoSMemCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -34,18 +34,21 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from m5.params import * -from m5.objects.AbstractMemory import AbstractMemory +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject from m5.objects.QoSTurnaround import * # QoS Queue Selection policy used to select packets among same-QoS queues class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"] -class QoSMemCtrl(AbstractMemory): +class QoSMemCtrl(ClockedObject): type = 'QoSMemCtrl' cxx_header = "mem/qos/mem_ctrl.hh" cxx_class = 'QoS::MemCtrl' abstract = True + system = Param.System(Parent.any, "System that the controller belongs to.") + ##### QoS support parameters #### # Number of priorities in the system diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py index 572cad5..03a988a 100644 --- a/src/mem/qos/QoSMemSinkCtrl.py +++ b/src/mem/qos/QoSMemSinkCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ from m5.params import * from m5.objects.QoSMemCtrl import * +from QoSMemSinkInterface import * class QoSMemSinkCtrl(QoSMemCtrl): type = 'QoSMemSinkCtrl' @@ -44,6 +45,10 @@ cxx_class = "QoS::MemSinkCtrl" port = SlavePort("Slave ports") + + intf = Param.QoSMemSinkInterface(QoSMemSinkInterface(), "Interface to "\ + "memory") + # the basic configuration of the controller architecture, note # that each entry corresponds to a burst for the specific DRAM # configuration (e.g. x32 with burst length 8 is 32 bytes) and not @@ -59,5 +64,3 @@ # response latency - time to issue a response once a request is serviced response_latency = Param.Latency("20ns", "Memory response latency") - - diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py new file mode 100644 index 0000000..fd8254f --- /dev/null +++ b/src/mem/qos/QoSMemSinkInterface.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Matteo Andreozzi +# Wendy Elsasser + +from AbstractMemory import AbstractMemory + +class QoSMemSinkInterface(AbstractMemory): + type = 'QoSMemSinkInterface' + cxx_header = "mem/qos/mem_sink.hh" diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript index f8601b6..1d90f9c 100644 --- a/src/mem/qos/SConscript +++ b/src/mem/qos/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ SimObject('QoSMemCtrl.py') SimObject('QoSMemSinkCtrl.py') +SimObject('QoSMemSinkInterface.py') SimObject('QoSPolicy.py') SimObject('QoSTurnaround.py') diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc index 50e6035..190960b 100644 --- a/src/mem/qos/mem_ctrl.cc +++ b/src/mem/qos/mem_ctrl.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited + * Copyright (c) 2017-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -42,7 +42,7 @@ namespace QoS { MemCtrl::MemCtrl(const QoSMemCtrlParams * p) - : AbstractMemory(p), + : ClockedObject(p), policy(p->qos_policy), turnPolicy(p->qos_turnaround_policy), queuePolicy(QueuePolicy::create(p)), @@ -51,7 +51,8 @@ qosSyncroScheduler(p->qos_syncro_scheduler), totalReadQueueSize(0), totalWriteQueueSize(0), busState(READ), busStateNext(READ), - stats(*this) + stats(*this), + _system(p->system) { // Set the priority policy if (policy) { @@ -77,12 +78,6 @@ {} void -MemCtrl::init() -{ - AbstractMemory::init(); -} - -void MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos, Addr addr, uint64_t entries) { diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh index 0e29fcc..50ddc94 100644 --- a/src/mem/qos/mem_ctrl.hh +++ b/src/mem/qos/mem_ctrl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited + * Copyright (c) 2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -36,9 +36,9 @@ */ #include "debug/QOS.hh" -#include "mem/abstract_mem.hh" -#include "mem/qos/q_policy.hh" +#include "mem/mem_object.hh" #include "mem/qos/policy.hh" +#include "mem/qos/q_policy.hh" #include "params/QoSMemCtrl.hh" #include "sim/system.hh" @@ -49,6 +49,8 @@ #ifndef __MEM_QOS_MEM_CTRL_HH__ #define __MEM_QOS_MEM_CTRL_HH__ +class System; + namespace QoS { /** @@ -56,7 +58,7 @@ * which support QoS - it provides access to a set of QoS * scheduling policies */ -class MemCtrl: public AbstractMemory +class MemCtrl: public ClockedObject { public: /** Bus Direction */ @@ -151,6 +153,9 @@ Stats::Scalar numStayWriteState; } stats; + /** Pointer to the System object */ + System* _system; + /** * Initializes dynamically counters and * statistics for a given Master @@ -266,11 +271,6 @@ virtual ~MemCtrl(); /** - * Initializes this object - */ - void init() override; - - /** * Gets the current bus state * * @return current bus state @@ -346,6 +346,10 @@ * @return 
total number of priority levels */ uint8_t numPriorities() const { return _numPriorities; } + + /** read the system pointer + * @return pointer to the system object */ + System* system() const { return _system; } }; template<typename Queues> diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc index 1f104e4..fb06b9d 100644 --- a/src/mem/qos/mem_sink.cc +++ b/src/mem/qos/mem_sink.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited + * Copyright (c) 2018-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -40,6 +40,7 @@ #include "debug/Drain.hh" #include "debug/QOS.hh" #include "mem_sink.hh" +#include "params/QoSMemSinkInterface.hh" #include "sim/system.hh" namespace QoS { @@ -50,12 +51,15 @@ memoryPacketSize(p->memory_packet_size), readBufferSize(p->read_buffer_size), writeBufferSize(p->write_buffer_size), port(name() + ".port", *this), + intf(p->intf), retryRdReq(false), retryWrReq(false), nextRequest(0), nextReqEvent(this) { // Resize read and write queue to allocate space // for configured QoS priorities readQueue.resize(numPriorities()); writeQueue.resize(numPriorities()); + + intf->setMemCtrl(this); } MemSinkCtrl::~MemSinkCtrl() @@ -92,7 +96,7 @@ "%s Should not see packets where cache is responding\n", __func__); - access(pkt); + intf->access(pkt); return responseLatency; } @@ -101,7 +105,7 @@ { pkt->pushLabel(name()); - functionalAccess(pkt); + intf->functionalAccess(pkt); pkt->popLabel(); } @@ -279,7 +283,7 @@ // Do the actual memory access which also turns the packet // into a response - access(pkt); + intf->access(pkt); // Log the response logResponse(pkt->isRead()? READ : WRITE, @@ -351,7 +355,7 @@ MemSinkCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(memory.getAddrRange()); + ranges.push_back(memory.intf->getAddrRange()); return ranges; } @@ -390,3 +394,19 @@ return new QoS::MemSinkCtrl(this); } +QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p) + : AbstractMemory(_p) +{ +} + +void +QoSMemSinkInterface::init() +{ + AbstractMemory::init(); +} + +QoSMemSinkInterface* +QoSMemSinkInterfaceParams::create() +{ + return new QoSMemSinkInterface(this); +} diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh index 9a51269..3b10abd 100644 --- a/src/mem/qos/mem_sink.hh +++ b/src/mem/qos/mem_sink.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited + * Copyright (c) 2018-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -41,10 +41,14 @@ #ifndef __MEM_QOS_MEM_SINK_HH__ #define __MEM_QOS_MEM_SINK_HH__ +#include "mem/abstract_mem.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/QoSMemSinkCtrl.hh" +class QoSMemSinkInterfaceParams; +class QoSMemSinkInterface; + namespace QoS { /** @@ -163,6 +167,11 @@ /** Memory slave port */ MemoryPort port; + /** + * Create pointer to interface of actual media + */ + QoSMemSinkInterface* const intf; + /** Read request pending */ bool retryRdReq; @@ -244,4 +253,23 @@ } // namespace QoS +class QoSMemSinkInterface : public AbstractMemory +{ + public: + /** Initialize the memory interface */ + void init(); + + /** Setting a pointer to the interface */ + void setMemCtrl(QoS::MemSinkCtrl* _ctrl) + { + ctrl = _ctrl; + }; + + /** Pointer to the controller */ + QoS::MemSinkCtrl* ctrl; + + QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p); +}; + + #endif /* __MEM_QOS_MEM_SINK_HH__ */ diff --git 
a/tests/configs/base_config.py b/tests/configs/base_config.py index 0f79938..e2d3851 100644 --- a/tests/configs/base_config.py +++ b/tests/configs/base_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012-2013, 2017-2018 ARM Limited +# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -221,7 +221,12 @@ super(BaseSESystem, self).init_system(system) def create_system(self): - system = System(physmem = self.mem_class(), + if issubclass(self.mem_class, m5.objects.DRAMInterface): + mem_ctrl = DRAMCtrl() + mem_ctrl.dram = self.mem_class() + else: + mem_ctrl = self.mem_class() + system = System(physmem = mem_ctrl, membus = SystemXBar(), mem_mode = self.mem_mode, multi_thread = (self.num_threads > 1)) @@ -275,6 +280,16 @@ # the physmem name to avoid bumping all the reference stats system.physmem = [self.mem_class(range = r) for r in system.mem_ranges] + if issubclass(self.mem_class, m5.objects.DRAMInterface): + mem_ctrls = [] + for r in system.mem_ranges: + mem_ctrl = DRAMCtrl() + mem_ctrl.dram = self.mem_class(range = r) + mem_ctrls.append(mem_ctrl) + system.physmem = mem_ctrls + else: + system.physmem = [self.mem_class(range = r) + for r in system.mem_ranges] for i in range(len(system.physmem)): system.physmem[i].port = system.membus.master -- To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968 To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings Gerrit-Project: public/gem5 Gerrit-Branch: develop Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8 Gerrit-Change-Number: 28968 Gerrit-PatchSet: 1 Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com> Gerrit-MessageType: newchange
JL
Jason Lowe-Power (Gerrit)
Tue, Sep 8, 2020 4:38 PM

Jason Lowe-Power has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make MemCtrl a ClockedObject
......................................................................

mem: Make MemCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter to the
interface, which is setup when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28968
Reviewed-by: Jason Lowe-Power <power.jg(a)gmail.com>
Maintainer: Jason Lowe-Power <power.jg(a)gmail.com>
Tested-by: kokoro <noreply+kokoro(a)google.com>

M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/example/memcheck.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M configs/ruby/Ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/gem5/configs/base_config.py
26 files changed, 1,913 insertions(+), 1,736 deletions(-)

Approvals:
Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
kokoro: Regressions pass
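
The file-by-file diff follows. As orientation for readers: after this change a
config instantiates two objects where it previously instantiated one. A minimal
sketch of the new hookup, assuming the stock DDR3_1600_8x8 interface and a
single memory range (clock/voltage domains, a CPU and workload setup are
omitted here):

    import m5
    from m5.objects import *

    system = System()
    system.mem_ranges = [AddrRange('512MB')]
    system.membus = SystemXBar()

    # The controller is now a ClockedObject that owns the queues and
    # request scheduling...
    system.mem_ctrl = DRAMCtrl()
    # ...while the interface is an AbstractMemory that owns the address
    # range and the per-device timing/architecture parameters.
    system.mem_ctrl.dram = DDR3_1600_8x8()
    system.mem_ctrl.dram.range = system.mem_ranges[0]
    system.mem_ctrl.port = system.membus.master

This is the pattern the learning_gem5 scripts below are updated to.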

diff --git a/configs/common/MemConfig.py b/configs/common/MemConfig.py
index b530145..1ace875 100644
--- a/configs/common/MemConfig.py
+++ b/configs/common/MemConfig.py
@@ -40,7 +40,7 @@
from common import ObjectList
from common import HMC

-def create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits, intlv_size,
+def create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits, intlv_size,
xor_low_bit):
"""
Helper function for creating a single memory controller from the given
@@ -63,32 +63,32 @@

     # Create an instance so we can figure out the address
     # mapping and row-buffer size
-    ctrl = cls()
+    interface = intf()

     # Only do this for DRAMs
-    if issubclass(cls, m5.objects.DRAMCtrl):
+    if issubclass(intf, m5.objects.DRAMInterface):
         # If the channel bits are appearing after the column
         # bits, we need to add the appropriate number of bits
         # for the row buffer size
-        if ctrl.addr_mapping.value == 'RoRaBaChCo':
+        if interface.addr_mapping.value == 'RoRaBaChCo':
             # This computation only really needs to happen
             # once, but as we rely on having an instance we
             # end up having to repeat it for each and every
             # one
-            rowbuffer_size = ctrl.device_rowbuffer_size.value * \
-                ctrl.devices_per_rank.value
+            rowbuffer_size = interface.device_rowbuffer_size.value * \
+                interface.devices_per_rank.value

             intlv_low_bit = int(math.log(rowbuffer_size, 2))

     # We got all we need to configure the appropriate address
     # range
-    ctrl.range = m5.objects.AddrRange(r.start, size = r.size(),
+    interface.range = m5.objects.AddrRange(r.start, size = r.size(),
                                       intlvHighBit =
                                           intlv_low_bit + intlv_bits - 1,
                                       xorHighBit = xor_high_bit,
                                       intlvBits = intlv_bits,
                                       intlvMatch = i)
-    return ctrl
+    return interface

 def config_mem(options, system):
     """
@@ -148,10 +148,10 @@
     if 2 ** intlv_bits != nbr_mem_ctrls:
         fatal("Number of memory channels must be a power of 2")

-    cls = ObjectList.mem_list.get(opt_mem_type)
+    intf = ObjectList.mem_list.get(opt_mem_type)
     mem_ctrls = []
-    if opt_elastic_trace_en and not issubclass(cls,
-            m5.objects.SimpleMemory):
+    if opt_elastic_trace_en and not issubclass(intf,
+            m5.objects.SimpleMemory):
         fatal("When elastic trace is enabled, configure mem-type as "
               "simple-mem.")

@@ -162,36 +162,53 @@
     intlv_size = max(opt_mem_channels_intlv, system.cache_line_size.value)

     # For every range (most systems will only have one), create an
-    # array of controllers and set their parameters to match their
-    # address mapping in the case of a DRAM
+    # array of memory interfaces and set their parameters to match
+    # their address mapping in the case of a DRAM
    for r in system.mem_ranges:
        for i in range(nbr_mem_ctrls):
-            mem_ctrl = create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
-                                       intlv_size, opt_xor_low_bit)
+            # Create the DRAM interface
+            dram_intf = create_mem_intf(intf, r, i, nbr_mem_ctrls, intlv_bits,
+                                        intlv_size, opt_xor_low_bit)
+
             # Set the number of ranks based on the command-line
             # options if it was explicitly set
-            if issubclass(cls, m5.objects.DRAMCtrl) and opt_mem_ranks:
-                mem_ctrl.ranks_per_channel = opt_mem_ranks
+            if issubclass(intf, m5.objects.DRAMInterface) and opt_mem_ranks:
+                dram_intf.ranks_per_channel = opt_mem_ranks

             # Enable low-power DRAM states if option is set
-            if issubclass(cls, m5.objects.DRAMCtrl):
-                mem_ctrl.enable_dram_powerdown = opt_dram_powerdown
+            if issubclass(intf, m5.objects.DRAMInterface):
+                dram_intf.enable_dram_powerdown = opt_dram_powerdown

             if opt_elastic_trace_en:
-                mem_ctrl.latency = '1ns'
+                dram_intf.latency = '1ns'
                 print("For elastic trace, over-riding Simple Memory "
                     "latency to 1ns.")

+            # Create the controller that will drive the interface
+            if opt_mem_type == "HMC_2500_1x32":
+                # The static latency of the vault controllers is estimated
+                # to be smaller than a full DRAM channel controller
+                mem_ctrl = m5.objects.DRAMCtrl(min_writes_per_switch = 8,
+                                               static_backend_latency = '4ns',
+                                               static_frontend_latency = '4ns')
+            else:
+                mem_ctrl = m5.objects.DRAMCtrl()
+
+            # Hookup the controller to the interface and add to the list
+            mem_ctrl.dram = dram_intf
             mem_ctrls.append(mem_ctrl)

+    # Create a controller and connect the interfaces to a controller
+    for i in range(len(mem_ctrls)):
+        if opt_mem_type == "HMC_2500_1x32":
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar[i/4].master
+            # Set memory device size. There is an independent controller for
+            # each vault. All vaults are same size.
+            mem_ctrls[i].dram.device_size = options.hmc_dev_vault_size
+        else:
+            # Connect the controllers to the membus
+            mem_ctrls[i].port = xbar.master
+
     subsystem.mem_ctrls = mem_ctrls

-    # Connect the controllers to the membus
-    for i in range(len(subsystem.mem_ctrls)):
-        if opt_mem_type == "HMC_2500_1x32":
-            subsystem.mem_ctrls[i].port = xbar[i/4].master
-            # Set memory device size. There is an independent controller for
-            # each vault. All vaults are same size.
-            subsystem.mem_ctrls[i].device_size = options.hmc_dev_vault_size
-        else:
-            subsystem.mem_ctrls[i].port = xbar.master
    

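The interleaving logic in create_mem_intf is unchanged; it is simply applied to
the interface rather than the controller, since the interface now owns the
address range. To make the per-channel AddrRange construction concrete, a
sketch with hypothetical numbers (two channels interleaved at a 64-byte
granularity; the XOR-hashing bits are left at their defaults here):

    import m5.objects

    r = m5.objects.AddrRange('1GB')   # hypothetical full memory range
    intlv_bits = 1                    # log2(number of channels) = 1
    intlv_low_bit = 6                 # 64-byte interleaving granularity

    # Channel i owns every other 64-byte block, selected by address bit 6
    channel_ranges = [
        m5.objects.AddrRange(r.start, size = r.size(),
                             intlvHighBit = intlv_low_bit + intlv_bits - 1,
                             intlvBits = intlv_bits,
                             intlvMatch = i)
        for i in range(2)
    ]

Each such range would be assigned to one DRAM interface, and the matching
controller advertises it through its port.
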
diff --git a/configs/dram/low_power_sweep.py b/configs/dram/low_power_sweep.py
index 9a62393..0da2b93 100644
--- a/configs/dram/low_power_sweep.py
+++ b/configs/dram/low_power_sweep.py
@@ -111,14 +111,19 @@

 # Sanity check for memory controller class.
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # There is no point slowing things down by saving any data.
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True
+
+# enable DRAM low power states
+system.mem_ctrls[0].dram.enable_dram_powerdown = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = args.addr_map
-system.mem_ctrls[0].page_policy = args.page_policy
+system.mem_ctrls[0].dram.addr_mapping = args.addr_map
+system.mem_ctrls[0].dram.page_policy = args.page_policy

 # We create a traffic generator state for each param combination we want to
 # test. Each traffic generator state is specified in the config file and the
@@ -132,22 +137,22 @@
 cfg_file = open(cfg_file_path, 'w')

 # Get the number of banks
-nbr_banks = int(system.mem_ctrls[0].banks_per_rank.value)
+nbr_banks = int(system.mem_ctrls[0].dram.banks_per_rank.value)

 # determine the burst size in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes (the rowbuffer size is already in bytes)
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # Inter-request delay should be such that we can hit as many transitions
 # to/from low power states as possible to. We provide a min and max itt to the
 # traffic generator and it randomises in the range. The parameter is in
 # seconds and we need it in ticks (ps).
-itt_min = system.mem_ctrls[0].tBURST.value * 1000000000000
+itt_min = system.mem_ctrls[0].dram.tBURST.value * 1000000000000

 #The itt value when set to (tRAS + tRP + tCK) covers the case where
 # a read command is delayed beyond the delay from ACT to PRE_PDN entry of the
@@ -155,9 +160,9 @@
 # between a write and power down entry will be tRCD + tCL + tWR + tRP + tCK.
 # As we use this delay as a unit and create multiples of it as bigger delays
 # for the sweep, this parameter works for reads, writes and mix of them.
-pd_entry_time = (system.mem_ctrls[0].tRAS.value +
-                 system.mem_ctrls[0].tRP.value +
-                 system.mem_ctrls[0].tCK.value) * 1000000000000
+pd_entry_time = (system.mem_ctrls[0].dram.tRAS.value +
+                 system.mem_ctrls[0].dram.tRP.value +
+                 system.mem_ctrls[0].dram.tCK.value) * 1000000000000

 # We sweep itt max using the multipliers specified by the user.
 itt_max_str = args.itt_list.strip().split()
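
For a sense of the derived quantities above, take the DDR3_1600_8x8
organisation: 8 devices per rank, x8 devices, burst length 8 and a 1kB row
buffer per device. A standalone sketch of the same arithmetic:

    devices_per_rank = 8          # DDR3_1600_8x8 organisation
    device_bus_width = 8          # bits per device
    burst_length = 8              # beats per burst
    device_rowbuffer_size = 1024  # bytes per device

    # 8 devices x 8 bits x 8 beats / 8 bits-per-byte = 64 bytes per burst
    burst_size = (devices_per_rank * device_bus_width * burst_length) // 8

    # 8 devices x 1kB row buffer = 8kB page
    page_size = devices_per_rank * device_rowbuffer_size

So every burst moves 64 bytes, and a page (row) covers 8kB of the address
space, which is what the sweep uses to stride through banks and rows.
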
diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py
index a340b46..a771c5c 100644
--- a/configs/dram/sweep.py
+++ b/configs/dram/sweep.py
@@ -116,13 +116,15 @@

 # the following assumes that we are using the native DRAM
 # controller, check to be sure
 if not isinstance(system.mem_ctrls[0], m5.objects.DRAMCtrl):
-    fatal("This script assumes the memory is a DRAMCtrl subclass")
+    fatal("This script assumes the controller is a DRAMCtrl subclass")
+if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
+    fatal("This script assumes the memory is a DRAMInterface subclass")

 # there is no point slowing things down by saving any data
-system.mem_ctrls[0].null = True
+system.mem_ctrls[0].dram.null = True

 # Set the address mapping based on input argument
-system.mem_ctrls[0].addr_mapping = options.addr_map
+system.mem_ctrls[0].dram.addr_mapping = options.addr_map

 # stay in each state for 0.25 ms, long enough to warm things up, and
 # short enough to avoid hitting a refresh
@@ -133,21 +135,21 @@
 # the DRAM maximum bandwidth to ensure that it is saturated

 # get the number of banks
-nbr_banks = system.mem_ctrls[0].banks_per_rank.value
+nbr_banks = system.mem_ctrls[0].dram.banks_per_rank.value

 # determine the burst length in bytes
-burst_size = int((system.mem_ctrls[0].devices_per_rank.value *
-                  system.mem_ctrls[0].device_bus_width.value *
-                  system.mem_ctrls[0].burst_length.value) / 8)
+burst_size = int((system.mem_ctrls[0].dram.devices_per_rank.value *
+                  system.mem_ctrls[0].dram.device_bus_width.value *
+                  system.mem_ctrls[0].dram.burst_length.value) / 8)

 # next, get the page size in bytes
-page_size = system.mem_ctrls[0].devices_per_rank.value * \
-    system.mem_ctrls[0].device_rowbuffer_size.value
+page_size = system.mem_ctrls[0].dram.devices_per_rank.value * \
+    system.mem_ctrls[0].dram.device_rowbuffer_size.value

 # match the maximum bandwidth of the memory, the parameter is in seconds
 # and we need it in ticks (ps)
-itt =  getattr(system.mem_ctrls[0].tBURST_MIN, 'value',
-               system.mem_ctrls[0].tBURST.value) * 1000000000000
+itt =  getattr(system.mem_ctrls[0].dram.tBURST_MIN, 'value',
+               system.mem_ctrls[0].dram.tBURST.value) * 1000000000000

 # assume we start at 0
 max_addr = mem_range.end
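
A note on the getattr above: tBURST_MIN defaults to Self.tBURST (a proxy, as
the DRAMCtrl.py hunk further down shows), so an interface that never overrides
it may not expose a plain .value, and the script then falls back to tBURST.
The multiplication converts seconds to ticks, since gem5 ticks are
picoseconds. The idiom in isolation, with a hypothetical latency object:

    class FakeLatency(object):    # stand-in for a resolved latency param
        value = 5e-9              # hypothetical tBURST of 5 ns

    # use .value when present, otherwise the supplied fallback
    itt = getattr(FakeLatency, 'value', 5e-9) * 1000000000000
    print(itt)                    # 5000.0 ticks (ps)
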
    diff --git a/configs/example/memcheck.py b/configs/example/memcheck.py
    index 6d80d60..6bccd54 100644
    --- a/configs/example/memcheck.py
    +++ b/configs/example/memcheck.py
    @@ -217,7 +217,7 @@
    proto_tester = TrafficGen(config_file = cfg_file_path)

# Set up the system along with a DRAM controller

-system = System(physmem = DDR3_1600_8x8())
+system = System(physmem = DRAMCtrl(dram = DDR3_1600_8x8()))

system.voltage_domain = VoltageDomain(voltage = '1V')

diff --git a/configs/learning_gem5/part1/simple.py
b/configs/learning_gem5/part1/simple.py
index ef73a06..cfd15be 100644
--- a/configs/learning_gem5/part1/simple.py
+++ b/configs/learning_gem5/part1/simple.py
@@ -77,8 +77,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part1/two_level.py
b/configs/learning_gem5/part1/two_level.py
index 564c785..0dbcfc7 100644
--- a/configs/learning_gem5/part1/two_level.py
+++ b/configs/learning_gem5/part1/two_level.py
@@ -132,8 +132,9 @@
system.system_port = system.membus.slave

# Create a DDR3 memory controller

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Create a process for a simple "Hello World" application

diff --git a/configs/learning_gem5/part2/simple_cache.py
b/configs/learning_gem5/part2/simple_cache.py
index 8d98d92..fbea73d 100644
--- a/configs/learning_gem5/part2/simple_cache.py
+++ b/configs/learning_gem5/part2/simple_cache.py
@@ -76,8 +76,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part2/simple_memobj.py
b/configs/learning_gem5/part2/simple_memobj.py
index d30977c..e792eb9 100644
--- a/configs/learning_gem5/part2/simple_memobj.py
+++ b/configs/learning_gem5/part2/simple_memobj.py
@@ -74,8 +74,9 @@
system.cpu.interrupts[0].int_slave = system.membus.master

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]
system.mem_ctrl.port = system.membus.master

# Connect the system up to the membus

diff --git a/configs/learning_gem5/part3/simple_ruby.py
b/configs/learning_gem5/part3/simple_ruby.py
index c47ee7e..7f70a8c 100644
--- a/configs/learning_gem5/part3/simple_ruby.py
+++ b/configs/learning_gem5/part3/simple_ruby.py
@@ -68,8 +68,9 @@
system.cpu = [TimingSimpleCPU() for i in range(2)]

# Create a DDR3 memory controller and connect it to the membus

-system.mem_ctrl = DDR3_1600_8x8()
-system.mem_ctrl.range = system.mem_ranges[0]
+system.mem_ctrl = DRAMCtrl()
+system.mem_ctrl.dram = DDR3_1600_8x8()
+system.mem_ctrl.dram.range = system.mem_ranges[0]

# create the interrupt controller for the CPU and connect to the membus

for cpu in system.cpu:
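
The learning_gem5 updates above all apply the same three-line change. Scripts
that instantiate one channel per memory range can wrap the pattern in a loop,
much as the updated base_config.py in this change does; a sketch using the
same object names as the configs above:

    system.mem_ctrls = []
    for r in system.mem_ranges:
        ctrl = DRAMCtrl()
        ctrl.dram = DDR3_1600_8x8(range = r)
        ctrl.port = system.membus.master
        system.mem_ctrls.append(ctrl)
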
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 9bceaa3..9f400a8 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -130,15 +130,16 @@
         dir_ranges = []
         for r in system.mem_ranges:
             mem_type = ObjectList.mem_list.get(options.mem_type)
-            mem_ctrl = MemConfig.create_mem_ctrl(mem_type, r, index,
+            dram_intf = MemConfig.create_mem_intf(mem_type, r, index,
                 options.num_dirs, int(math.log(options.num_dirs, 2)),
                 intlv_size, options.xor_low_bit)
+            mem_ctrl = m5.objects.DRAMCtrl(dram = dram_intf)

             if options.access_backing_store:
                 mem_ctrl.kvm_map=False

             mem_ctrls.append(mem_ctrl)

-            dir_ranges.append(mem_ctrl.range)
+            dir_ranges.append(mem_ctrl.dram.range)

             if crossbar != None:
                 mem_ctrl.port = crossbar.master
    

diff --git a/src/mem/DRAMCtrl.py b/src/mem/DRAMCtrl.py
index 0f70dff..b7b43dc 100644
--- a/src/mem/DRAMCtrl.py
+++ b/src/mem/DRAMCtrl.py
@@ -40,26 +40,12 @@

from m5.params import *
from m5.proxy import *
-from m5.objects.AbstractMemory import *
from m5.objects.QoSMemCtrl import *

 # Enum for memory scheduling algorithms, currently First-Come
 # First-Served and a First-Row Hit then First-Come First-Served
 class MemSched(Enum): vals = ['fcfs', 'frfcfs']

-# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
-# channel, rank, bank, row and column, respectively, and going from
-# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
-# suitable for an open-page policy, optimising for sequential accesses
-# hitting in the open row. For a closed-page policy, RoCoRaBaCh
-# maximises parallelism.
-class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']

-# Enum for the page policy, either open, open_adaptive, close, or
-# close_adaptive.
-class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
-                                'close_adaptive']

 # DRAMCtrl is a single-channel single-ported DRAM controller model
 # that aims to model the most important system-level performance
 # effects of a DRAM without getting into too much detail of the DRAM

@@ -72,12 +58,11 @@
# bus in front of the controller for multiple ports
port = SlavePort("Slave port")

-    # the basic configuration of the controller architecture, note
-    # that each entry corresponds to a burst for the specific DRAM
-    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
-    # the cacheline size or request/packet size
-    write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
-    read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
+    # Interface to volatile, DRAM media
+    dram = Param.DRAMInterface("DRAM interface")
+
+    # read and write buffer depths are set in the interface
+    # the controller will read these values when instantiated

     # threshold in percent for when to forcefully trigger writes and
     # start emptying the write buffer

@@ -93,15 +78,6 @@

     # scheduler, address map and page policy
     mem_sched_policy = Param.MemSched('frfcfs', "Memory scheduling policy")
-    addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
-    page_policy = Param.PageManage('open_adaptive', "Page management policy")
-
-    # enforce a limit on the number of accesses per row
-    max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before "
-                                          "closing");
-
-    # size of DRAM Chip in Bytes
-    device_size = Param.MemorySize("Size of DRAM chip")

     # pipeline latency of the controller and PHY, split into a
     # frontend part and a backend part, with reads and writes serviced
@@ -109,1404 +85,3 @@
     # serviced by the memory seeing the sum of the two
     static_frontend_latency = Param.Latency("10ns", "Static frontend latency")
     static_backend_latency = Param.Latency("10ns", "Static backend latency")

-    # the physical organisation of the DRAM
-    device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "
-                                      "device/chip")
-    burst_length = Param.Unsigned("Burst length (BL) in beats")
-    device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "
-                                             "device/chip")
-    devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
-    ranks_per_channel = Param.Unsigned("Number of ranks per channel")
-
-    # default to 0 bank groups per rank, indicating bank group architecture
-    # is not used
-    # update per memory class when bank group architecture is supported
-    bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank")
-    banks_per_rank = Param.Unsigned("Number of banks per rank")
-
-    # Enable DRAM powerdown states if True. This is False by default due to
-    # performance being lower when enabled
-    enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
-
-    # For power modelling we need to know if the DRAM has a DLL or not
-    dll = Param.Bool(True, "DRAM has DLL or not")
-
-    # DRAMPower provides, in addition to the core power, the possibility to
-    # include RD/WR termination and IO power. This calculation assumes some
-    # default values. The integration of DRAMPower with gem5 does not include
-    # IO and RD/WR termination power by default. This might be added as an
-    # additional feature in the future.
-
-    # timing behaviour and constraints - all in nanoseconds
-
-    # the base clock period of the DRAM
-    tCK = Param.Latency("Clock period")
-    # the amount of time in nanoseconds from issuing an activate command
-    # to the data being available in the row buffer for a read/write
-    tRCD = Param.Latency("RAS to CAS delay")
-    # the time from issuing a read/write command to seeing the actual data
-    tCL = Param.Latency("CAS latency")
-    # minimum time between a precharge and subsequent activate
-    tRP = Param.Latency("Row precharge time")
-    # minimum time between an activate and a precharge to the same row
-    tRAS = Param.Latency("ACT to PRE delay")
-    # minimum time between a write data transfer and a precharge
-    tWR = Param.Latency("Write recovery time")
-    # minimum time between a read and precharge command
-    tRTP = Param.Latency("Read to precharge")
-
-    # time to complete a burst transfer, typically the burst length
-    # divided by two due to the DDR bus, but by making it a parameter
-    # it is easier to also evaluate SDR memories like WideIO.
-    # This parameter has to account for burst length.
-    # Read/Write requests with data size larger than one full burst are broken
-    # down into multiple requests in the controller
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = Param.Latency("Burst duration "
-                           "(typically burst length / 2 cycles)")
-
-    # tBURST_MAX is the column array cycle delay required before next access,
-    # which could be greater than tBURST when the memory access time is
-    # greater than tBURST
-    tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
-
-    # tBURST_MIN is the minimum delay between bursts, which could be less
-    # than tBURST when interleaving is supported
-    tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts")
-
-    # CAS-to-CAS delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
-
-    # Write-to-Write delay for bursts to the same bank group
-    # only utilized with bank group architectures; set to 0 for default case
-    # This will be used to enable different same bank group delays
-    # for writes versus reads
-    tCCD_L_WR = Param.Latency(Self.tCCD_L,
-                              "Same bank group Write to Write delay")
-
-    # time taken to complete one refresh cycle (N rows in all banks)
-    tRFC = Param.Latency("Refresh cycle time")
-
-    # refresh command interval, how often a "ref" command needs
-    # to be sent. It is 7.8 us for a 64ms refresh requirement
-    tREFI = Param.Latency("Refresh command interval")
-
-    # write-to-read, same rank turnaround penalty
-    tWTR = Param.Latency("Write to read, same rank switching time")
-
-    # write-to-read, same rank turnaround penalty for same bank group
-    tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
-                           "time, same bank group")
-
-    # read-to-write, same rank turnaround penalty
-    tRTW = Param.Latency("Read to write, same rank switching time")
-
-    # rank-to-rank bus delay penalty
-    # this does not correlate to a memory timing parameter and encompasses:
-    # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD
-    # different rank bus delay
-    tCS = Param.Latency("Rank to rank switching time")
-
-    # minimum precharge to precharge delay time
-    tPPD = Param.Latency("0ns", "PRE to PRE delay")
-
-    # maximum delay between two-cycle ACT command phases
-    tAAD = Param.Latency(Self.tCK,
-                         "Maximum delay between two-cycle ACT commands")
-
-    two_cycle_activate = Param.Bool(False,
-                                    "Two cycles required to send activate")
-
-    # minimum row activate to row activate delay time
-    tRRD = Param.Latency("ACT to ACT delay")
-    # only utilized with bank group architectures; set to 0 for default case
-    tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
-
-    # time window in which a maximum number of activates are allowed
-    # to take place, set to 0 to disable
-    tXAW = Param.Latency("X activation window")
-    activation_limit = Param.Unsigned("Max number of activates in window")
-
-    # time to exit power-down mode
-    # Exit power-down to next valid command delay
-    tXP = Param.Latency("0ns", "Power-up Delay")
-    # Exit Powerdown to commands requiring a locked DLL
-    tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
-    # time to exit self-refresh mode
-    tXS = Param.Latency("0ns", "Self-refresh exit latency")
-    # time to exit self-refresh mode with locked DLL
-    tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
-
-    # number of data beats per clock. with DDR, default is 2, one per edge
-    beats_per_clock = Param.Unsigned(2, "Data beats per clock")
-
-    data_clock_sync = Param.Bool(False, "Synchronization commands required")
-
-    # Currently rolled into other params
-    ######################################################################
-    # tRC  - assumed to be tRAS + tRP
-
-    # Power Behaviour and Constraints
-    # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are
-    # defined as VDD and VDD2. Each current is defined for each voltage domain
-    # separately. For example, current IDD0 is active-precharge current for
-    # voltage domain VDD and current IDD02 is active-precharge current for
-    # voltage domain VDD2.
-    # By default all currents are set to 0mA. Users who are only interested in
-    # the performance of DRAMs can leave them at 0.
-
-    # Operating 1 Bank Active-Precharge current
-    IDD0 = Param.Current("0mA", "Active precharge current")
-    # Operating 1 Bank Active-Precharge current multiple voltage Range
-    IDD02 = Param.Current("0mA", "Active precharge current VDD2")
-    # Precharge Power-down Current: Slow exit
-    IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
-    # Precharge Power-down Current: Slow exit multiple voltage Range
-    IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
-    # Precharge Power-down Current: Fast exit
-    IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
-    # Precharge Power-down Current: Fast exit multiple voltage Range
-    IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
-    # Precharge Standby current
-    IDD2N = Param.Current("0mA", "Precharge Standby current")
-    # Precharge Standby current multiple voltage range
-    IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
-    # Active Power-down current: slow exit
-    IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
-    # Active Power-down current: slow exit multiple voltage range
-    IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
-    # Active Power-down current: fast exit
-    IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
-    # Active Power-down current: fast exit multiple voltage range
-    IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
-    # Active Standby current
-    IDD3N = Param.Current("0mA", "Active Standby current")
-    # Active Standby current multiple voltage range
-    IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
-    # Burst Read Operating Current
-    IDD4R = Param.Current("0mA", "READ current")
-    # Burst Read Operating Current multiple voltage range
-    IDD4R2 = Param.Current("0mA", "READ current VDD2")
-    # Burst Write Operating Current
-    IDD4W = Param.Current("0mA", "WRITE current")
-    # Burst Write Operating Current multiple voltage range
-    IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
-    # Refresh Current
-    IDD5 = Param.Current("0mA", "Refresh current")
-    # Refresh Current multiple voltage range
-    IDD52 = Param.Current("0mA", "Refresh current VDD2")
-    # Self-Refresh Current
-    IDD6 = Param.Current("0mA", "Self-refresh Current")
-    # Self-Refresh Current multiple voltage range
-    IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
-    # Main voltage range of the DRAM
-    VDD = Param.Voltage("0V", "Main Voltage Range")
-    # Second voltage range defined by some DRAMs
-    VDD2 = Param.Voltage("0V", "2nd Voltage Range")

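One way to read the timing parameters above: tBURST follows directly from
burst_length, the number of beats per clock, and tCK, exactly as the
per-class comments below spell out. A quick sanity check against the
DDR3-1600 values (plain Python, not part of this change):

    # Illustrative only: tBURST = (burst_length / beats_per_clock) * tCK
    tCK_ns = 1.25        # DDR3-1600: 800 MHz clock
    burst_length = 8     # BL8 device
    beats_per_clock = 2  # DDR: one beat per clock edge
    print(burst_length / beats_per_clock * tCK_ns)
    # 5.0 -> matches tBURST = '5ns' in DDR3_1600_8x8 below
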
-# A single DDR3-1600 x64 channel (one command and address bus), with
-# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
-# an 8x8 configuration.
-class DDR3_1600_8x8(DRAMCtrl):

-    # size of device in bytes
-    device_size = '512MB'
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # DDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # Use two ranks
-    ranks_per_channel = 2
-    # DDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz
-    tBURST = '5ns'
-    # DDR3-1600 11-11-11
-    tRCD = '13.75ns'
-    tCL = '13.75ns'
-    tRP = '13.75ns'
-    tRAS = '35ns'
-    tRRD = '6ns'
-    tXAW = '30ns'
-    activation_limit = 4
-    tRFC = '260ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns
-    tWTR = '7.5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    tXS = '270ns'
-    # Current values from datasheet Die Rev E,J
-    IDD0 = '55mA'
-    IDD2N = '32mA'
-    IDD3N = '38mA'
-    IDD4W = '125mA'
-    IDD4R = '157mA'
-    IDD5 = '235mA'
-    IDD3P1 = '38mA'
-    IDD2P1 = '32mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single HMC-2500 x32 model based on:
-# [1] DRAMSpec: a high-level DRAM bank modelling tool
-# developed at the University of Kaiserslautern. This high level tool
-# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
-# estimate the DRAM bank latency and power numbers.
-# [2] High performance AXI-4.0 based interconnect for extensible smart memory
-# cubes (E. Azarkhish et. al)
-# Assumed for the HMC model is a 30 nm technology node.
-# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory
-# (4 layers).
-# Each layer has 16 vaults and each vault consists of 2 banks per layer.
-# In order to be able to use the same controller used for 2D DRAM generations
-# for HMC, the following analogy is done:
-# Channel (DDR) => Vault (HMC)
-# device_size (DDR) => size of a single layer in a vault
-# ranks per channel (DDR) => number of layers
-# banks per rank (DDR) => banks per layer
-# devices per rank (DDR) => devices per layer (1 for HMC).
-# The parameters for which no input is available are inherited from the DDR3
-# configuration.
-# This configuration includes the latencies from the DRAM to the logic layer
-# of the HMC
-class HMC_2500_1x32(DDR3_1600_8x8):
-    # size of device
-    # two banks per device with each bank 4MB [2]
-    device_size = '8MB'
-    # 1x32 configuration, 1 device with 32 TSVs [2]
-    device_bus_width = 32
-    # HMC is a BL8 device [2]
-    burst_length = 8
-    # Each device has a page (row buffer) size of 256 bytes [2]
-    device_rowbuffer_size = '256B'
-    # 1x32 configuration, so 1 device [2]
-    devices_per_rank = 1
-    # 4 layers so 4 ranks [2]
-    ranks_per_channel = 4
-    # HMC has 2 banks per layer [2]
-    # Each layer represents a rank. With 4 layers and 8 banks in total, each
-    # layer has 2 banks; thus 2 banks per rank.
-    banks_per_rank = 2
-    # 1250 MHz [2]
-    tCK = '0.8ns'
-    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
-    tBURST = '3.2ns'
-    # Values using DRAMSpec HMC model [1]
-    tRCD = '10.2ns'
-    tCL = '9.9ns'
-    tRP = '7.7ns'
-    tRAS = '21.6ns'
-    # tRRD depends on the power supply network for each vendor.
-    # We assume a tRRD of a double bank approach to be equal to 4 clock
-    # cycles (Assumption)
-    tRRD = '3.2ns'
-    # activation limit is set to 0 since there are only 2 banks per vault
-    # layer.
-    activation_limit = 0
-    # Values using DRAMSpec HMC model [1]
-    tRFC = '59ns'
-    tWR = '8ns'
-    tRTP = '4.9ns'
-    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
-    # = 0.8 ns (Assumption)
-    tCS = '0.8ns'
-    # Value using DRAMSpec HMC model [1]
-    tREFI = '3.9us'
-    # The default page policy in the vault controllers is simple closed page
-    # [2]; nevertheless the 'close' policy opens and closes the row multiple
-    # times for bursts larger than 32 bytes. For this reason we use
-    # 'close_adaptive'
-    page_policy = 'close_adaptive'
-    # RoCoRaBaCh resembles the default address mapping in HMC
-    addr_mapping = 'RoCoRaBaCh'
-    min_writes_per_switch = 8
-    # These parameters do not directly correlate with buffer_size in real
-    # hardware. Nevertheless, their value has been tuned to achieve a
-    # bandwidth similar to the cycle-accurate model in [2]
-    write_buffer_size = 32
-    read_buffer_size = 32
-    # The static latency of the vault controllers is estimated to be smaller
-    # than a full DRAM channel controller
-    static_backend_latency = '4ns'
-    static_frontend_latency = '4ns'

-# A single DDR3-2133 x64 channel refining a selected subset of the
-# options for the DDR-1600 configuration, based on the same DDR3-1600
-# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
-# consistent across the two configurations.
-class DDR3_2133_8x8(DDR3_1600_8x8):
-    # 1066 MHz
-    tCK = '0.938ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
-    tBURST = '3.752ns'
-    # DDR3-2133 14-14-14
-    tRCD = '13.09ns'
-    tCL = '13.09ns'
-    tRP = '13.09ns'
-    tRAS = '33ns'
-    tRRD = '5ns'
-    tXAW = '25ns'
-    # Current values from datasheet
-    IDD0 = '70mA'
-    IDD2N = '37mA'
-    IDD3N = '44mA'
-    IDD4W = '157mA'
-    IDD4R = '191mA'
-    IDD5 = '250mA'
-    IDD3P1 = '44mA'
-    IDD2P1 = '43mA'
-    IDD6 = '20mA'
-    VDD = '1.5V'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
-# in a 16x4 configuration.
-# Total channel capacity is 32GB
-# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
-class DDR4_2400_16x4(DRAMCtrl):
-    # size of device
-    device_size = '1GB'
-    # 16x4 configuration, 16 devices each with a 4-bit interface
-    device_bus_width = 4
-    # DDR4 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 512 bytes (1K columns x4)
-    device_rowbuffer_size = '512B'
-    # 16x4 configuration, so 16 devices
-    devices_per_rank = 16
-    # Match our DDR3 configurations, which are dual rank
-    ranks_per_channel = 2
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 4 for x4 case
-    bank_groups_per_rank = 4
-    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 16
-    # override the default buffer sizes and go for something larger to
-    # accommodate the larger bank count
-    write_buffer_size = 128
-    read_buffer_size = 64
-    # 1200 MHz
-    tCK = '0.833ns'
-    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '3.332ns'
-    # @2400 data rate, tCCD_L is 6 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '5ns'
-    # DDR4-2400 17-17-17
-    tRCD = '14.16ns'
-    tCL = '14.16ns'
-    tRP = '14.16ns'
-    tRAS = '32ns'
-    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
-    tRRD = '3.332ns'
-    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns'
-    # tFAW for 512B page is MAX(16 CK, 13ns)
-    tXAW = '13.328ns'
-    activation_limit = 4
-    # tRFC is 350ns
-    tRFC = '350ns'
-    tWR = '15ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Greater of 4 CK or 7.5 ns
-    tRTP = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
-    tRTW = '1.666ns'
-    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
-    tCS = '1.666ns'
-    # <=85C, half for >85C
-    tREFI = '7.8us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '6ns'
-    # self refresh exit time
-    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
-    # tRFC + 10ns = 340ns
-    tXS = '340ns'
-    # Current values from datasheet
-    IDD0 = '43mA'
-    IDD02 = '3mA'
-    IDD2N = '34mA'
-    IDD3N = '38mA'
-    IDD3N2 = '3mA'
-    IDD4W = '103mA'
-    IDD4R = '110mA'
-    IDD5 = '250mA'
-    IDD3P1 = '32mA'
-    IDD2P1 = '25mA'
-    IDD6 = '30mA'
-    VDD = '1.2V'
-    VDD2 = '2.5V'

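The bank-group timings in DDR4_2400_16x4 above can be checked the same
way: the comment quotes tCCD_L as 6 CK at the 2400 data rate. A quick
check (plain Python, not part of this change):

    # Illustrative only: 6 CK at tCK = 0.833 ns
    print(round(6 * 0.833, 2))  # 5.0 -> matches tCCD_L = '5ns'
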
-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
-# in an 8x8 configuration.
-# Total channel capacity is 16GB
-# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
-class DDR4_2400_8x8(DDR4_2400_16x4):
-    # 8x8 configuration, 8 devices each with an 8-bit interface
-    device_bus_width = 8
-    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
-    device_rowbuffer_size = '1kB'
-    # 8x8 configuration, so 8 devices
-    devices_per_rank = 8
-    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
-    tRRD_L = '4.9ns'
-    tXAW = '21ns'
-    # Current values from datasheet
-    IDD0 = '48mA'
-    IDD3N = '43mA'
-    IDD4W = '123mA'
-    IDD4R = '135mA'
-    IDD3P1 = '37mA'

-# A single DDR4-2400 x64 channel (one command and address bus), with
-# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
-# in a 4x16 configuration.
-# Total channel capacity is 4GB
-# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
-class DDR4_2400_4x16(DDR4_2400_16x4):
-    # 4x16 configuration, 4 devices each with a 16-bit interface
-    device_bus_width = 16
-    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
-    device_rowbuffer_size = '2kB'
-    # 4x16 configuration, so 4 devices
-    devices_per_rank = 4
-    # Single rank for x16
-    ranks_per_channel = 1
-    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
-    # Set to 2 for x16 case
-    bank_groups_per_rank = 2
-    # DDR4 has 16 banks (x4,x8) and 8 banks (x16) (4 bank groups in all
-    # configurations). Currently we do not capture the additional
-    # constraints incurred by the bank groups
-    banks_per_rank = 8
-    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
-    tRRD = '5.3ns'
-    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
-    tRRD_L = '6.4ns'
-    tXAW = '30ns'
-    # Current values from datasheet
-    IDD0 = '80mA'
-    IDD02 = '4mA'
-    IDD2N = '34mA'
-    IDD3N = '47mA'
-    IDD4W = '228mA'
-    IDD4R = '243mA'
-    IDD5 = '280mA'
-    IDD3P1 = '41mA'

-# A single LPDDR2-S4 x32 interface (one command/address bus), with
-# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
-# in a 1x32 configuration.
-class LPDDR2_S4_1066_1x32(DRAMCtrl):
-    # No DLL in LPDDR2
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR2_S4 is a BL4 and BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 1KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '1kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR2-S4 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 533 MHz
-    tCK = '1.876ns'
-    # Fixed at 15 ns
-    tRCD = '15ns'
-    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
-    tCL = '15ns'
-    # Pre-charge one bank 15 ns (all banks 18 ns)
-    tRP = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    tRTP = '7.5ns'
-    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '7.5ns'
-    # LPDDR2-S4, 4 Gbit
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
-    tRTW = '3.75ns'
-    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
-    tCS = '3.75ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of density, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '15mA'
-    IDD02 = '70mA'
-    IDD2N = '2mA'
-    IDD2N2 = '30mA'
-    IDD3N = '2.5mA'
-    IDD3N2 = '30mA'
-    IDD4W = '10mA'
-    IDD4W2 = '190mA'
-    IDD4R = '3mA'
-    IDD4R2 = '220mA'
-    IDD5 = '40mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.2mA'
-    IDD3P12 = '8mA'
-    IDD2P1 = '0.6mA'
-    IDD2P12 = '0.8mA'
-    IDD6 = '1mA'
-    IDD62 = '3.2mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single WideIO x128 interface (one command and address bus), with
-# default timings based on an estimated WIO-200 8 Gbit part.
-class WideIO_200_1x128(DRAMCtrl):
-    # No DLL for WideIO
-    dll = False
-    # size of device
-    device_size = '1024MB'
-    # 1x128 configuration, 1 device with a 128-bit interface
-    device_bus_width = 128
-    # This is a BL4 device
-    burst_length = 4
-    # Each device has a page (row buffer) size of 4KB
-    # (this depends on the memory density)
-    device_rowbuffer_size = '4kB'
-    # 1x128 configuration, so 1 device
-    devices_per_rank = 1
-    # Use one rank for a one-high die stack
-    ranks_per_channel = 1
-    # WideIO has 4 banks in all configurations
-    banks_per_rank = 4
-    # 200 MHz
-    tCK = '5ns'
-    # WIO-200
-    tRCD = '18ns'
-    tCL = '18ns'
-    tRP = '18ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Read to precharge is same as the burst
-    tRTP = '20ns'
-    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
-    # Note this is a BL4 SDR device.
-    tBURST = '20ns'
-    # WIO 8 Gb
-    tRFC = '210ns'
-    # WIO 8 Gb, <=85C, half for >85C
-    tREFI = '3.9us'
-    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
-    tWTR = '15ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
-    tRTW = '10ns'
-    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
-    tCS = '10ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Two instead of four activation window
-    tXAW = '50ns'
-    activation_limit = 2
-    # The WideIO specification does not provide current information

-# A single LPDDR3 x32 interface (one command/address bus), with
-# default timings based on a LPDDR3-1600 4 Gbit part (Micron
-# EDF8132A1MC) in a 1x32 configuration.
-class LPDDR3_1600_1x32(DRAMCtrl):
-    # No DLL for LPDDR3
-    dll = False
-    # size of device
-    device_size = '512MB'
-    # 1x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # LPDDR3 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 4KB
-    device_rowbuffer_size = '4kB'
-    # 1x32 configuration, so 1 device
-    devices_per_rank = 1
-    # Technically the datasheet is a dual-rank package, but for
-    # comparison with the LPDDR2 config we stick to a single rank
-    ranks_per_channel = 1
-    # LPDDR3 has 8 banks in all configurations
-    banks_per_rank = 8
-    # 800 MHz
-    tCK = '1.25ns'
-    tRCD = '18ns'
-    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
-    tCL = '15ns'
-    tRAS = '42ns'
-    tWR = '15ns'
-    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
-    tRTP = '7.5ns'
-    # Pre-charge one bank 18 ns (all banks 21 ns)
-    tRP = '18ns'
-    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
-    # Note this is a BL8 DDR device.
-    # Requests larger than 32 bytes are broken down into multiple requests
-    # in the controller
-    tBURST = '5ns'
-    # LPDDR3, 4 Gb
-    tRFC = '130ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '7.5ns'
-    # self refresh exit time
-    tXS = '140ns'
-    # Irrespective of speed grade, tWTR is 7.5 ns
-    tWTR = '7.5ns'
-    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
-    tRTW = '2.5ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # Activate to activate irrespective of density and speed grade
-    tRRD = '10.0ns'
-    # Irrespective of size, tFAW is 50 ns
-    tXAW = '50ns'
-    activation_limit = 4
-    # Current values from datasheet
-    IDD0 = '8mA'
-    IDD02 = '60mA'
-    IDD2N = '0.8mA'
-    IDD2N2 = '26mA'
-    IDD3N = '2mA'
-    IDD3N2 = '34mA'
-    IDD4W = '2mA'
-    IDD4W2 = '190mA'
-    IDD4R = '2mA'
-    IDD4R2 = '230mA'
-    IDD5 = '28mA'
-    IDD52 = '150mA'
-    IDD3P1 = '1.4mA'
-    IDD3P12 = '11mA'
-    IDD2P1 = '0.8mA'
-    IDD2P12 = '1.8mA'
-    IDD6 = '0.5mA'
-    IDD62 = '1.8mA'
-    VDD = '1.8V'
-    VDD2 = '1.2V'

-# A single GDDR5 x64 interface, with
-# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
-# H5GQ1H24AFR) in a 2x32 configuration.
-class GDDR5_4000_2x32(DRAMCtrl):
-    # size of device
-    device_size = '128MB'
-    # 2x32 configuration, 1 device with a 32-bit interface
-    device_bus_width = 32
-    # GDDR5 is a BL8 device
-    burst_length = 8
-    # Each device has a page (row buffer) size of 2Kbits (256 bytes)
-    device_rowbuffer_size = '256B'
-    # 2x32 configuration, so 2 devices
-    devices_per_rank = 2
-    # assume single rank
-    ranks_per_channel = 1
-    # GDDR5 has 4 bank groups
-    bank_groups_per_rank = 4
-    # GDDR5 has 16 banks with 4 bank groups
-    banks_per_rank = 16
-    # 1000 MHz
-    tCK = '1ns'
-    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
-    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
-    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
-    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
-    # With bank group architectures, tBURST represents the CAS-to-CAS
-    # delay for bursts to different bank groups (tCCD_S)
-    tBURST = '2ns'
-    # @1000MHz data rate, tCCD_L is 3 CK
-    # CAS-to-CAS delay for bursts to the same bank group
-    # tBURST is equivalent to tCCD_S; no explicit parameter required
-    # for CAS-to-CAS delay for bursts to different bank groups
-    tCCD_L = '3ns'
-    tRCD = '12ns'
-    # tCL is not directly found in datasheet and assumed equal tRCD
-    tCL = '12ns'
-    tRP = '12ns'
-    tRAS = '28ns'
-    # RRD_S (different bank group)
-    # RRD_S is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD = '6ns'
-    # RRD_L (same bank group)
-    # RRD_L is 5.5 ns in datasheet.
-    # rounded to the next multiple of tCK
-    tRRD_L = '6ns'
-    tXAW = '23ns'
-    # tXAW < 4 x tRRD.
-    # Therefore, activation limit is set to 0
-    activation_limit = 0
-    tRFC = '65ns'
-    tWR = '12ns'
-    # Here using the average of WTR_S and WTR_L
-    tWTR = '5ns'
-    # Read-to-Precharge 2 CK
-    tRTP = '2ns'
-    # Assume 2 cycles
-    tRTW = '2ns'

-# A single HBM x128 interface (one command and address bus), with
-# default timings based on data publicly released
-# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
-# IDD measurement values, and by extrapolating data from other classes.
-# Architecture values based on published HBM spec
-# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
-class HBM_1000_4H_1x128(DRAMCtrl):
-    # HBM gen1 supports up to 8 128-bit physical channels
-    # Configuration defines a single channel, with the capacity
-    # set to (full_stack_capacity / 8) based on 2Gb dies
-    # To use all 8 channels, set 'channels' parameter to 8 in
-    # system configuration
-
-    # 128-bit interface legacy mode
-    device_bus_width = 128
-    # HBM supports BL4 and BL2 (legacy mode only)
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
-    # with 8 channels, 128MB per channel
-    device_size = '128MB'
-    device_rowbuffer_size = '2kB'
-    # 1x128 configuration
-    devices_per_rank = 1
-    # HBM does not have a CS pin; set rank to 1
-    ranks_per_channel = 1
-    # HBM has 8 or 16 banks depending on capacity
-    # 2Gb dies have 8 banks
-    banks_per_rank = 8
-    # depending on frequency, bank groups may be required
-    # will always have 4 bank groups when enabled
-    # current specifications do not define the minimum frequency for
-    # bank group architecture
-    # setting bank_groups_per_rank to 0 to disable until range is defined
-    bank_groups_per_rank = 0
-    # 500 MHz for 1Gbps DDR data rate
-    tCK = '2ns'
-    # use values from IDD measurement in JEDEC spec
-    # use tRP value for tRCD and tCL similar to other classes
-    tRP = '15ns'
-    tRCD = '15ns'
-    tCL = '15ns'
-    tRAS = '33ns'
-    # BL2 and BL4 supported, default to BL4
-    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
-    tBURST = '4ns'
-    # value for 2Gb device from JEDEC spec
-    tRFC = '160ns'
-    # value for 2Gb device from JEDEC spec
-    tREFI = '3.9us'
-    # extrapolate the following from LPDDR configs, using ns values
-    # to minimize burst length, prefetch differences
-    tWR = '18ns'
-    tRTP = '7.5ns'
-    tWTR = '10ns'
-    # start with 2 cycles turnaround, similar to other memory classes
-    # could be more with variations across the stack
-    tRTW = '4ns'
-    # single rank device, set to 0
-    tCS = '0ns'
-    # from MemCon example, tRRD is 4ns with 2ns tCK
-    tRRD = '4ns'
-    # from MemCon example, tFAW is 30ns with 2ns tCK
-    tXAW = '30ns'
-    activation_limit = 4
-    # 4tCK
-    tXP = '8ns'
-    # start with tRFC + tXP -> 160ns + 8ns = 168ns
-    tXS = '168ns'

-# A single HBM x64 interface (one command and address bus), with
-# default timings based on HBM gen1 and data publicly released
-# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
-# Note: This defines a pseudo-channel with a unique controller
-# instantiated per pseudo-channel
-# Stay at same IO rate (1Gbps) to maintain timing relationship with
-# HBM gen1 class (HBM_1000_4H_x128) where possible
-class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
-    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
-    # Configuration defines a single pseudo channel, with the capacity
-    # set to (full_stack_capacity / 16) based on 8Gb dies
-    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
-    # system configuration
-
-    # 64-bit pseudo-channel interface
-    device_bus_width = 64
-    # HBM pseudo-channel only supports BL4
-    burst_length = 4
-    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
-    # with 16 channels, 256MB per channel
-    device_size = '256MB'
-    # page size is halved with pseudo-channel; maintaining the same number
-    # of rows per pseudo-channel with 2X banks across 2 channels
-    device_rowbuffer_size = '1kB'
-    # HBM has 8 or 16 banks depending on capacity
-    # Starting with 4Gb dies, 16 banks are defined
-    banks_per_rank = 16
-    # reset tRFC for larger, 8Gb device
-    # use HBM1 4Gb value as a starting point
-    tRFC = '260ns'
-    # start with tRFC + tXP -> 260ns + 8ns = 268ns
-    tXS = '268ns'
-    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
-    tCS = '2ns'
-    tREFI = '3.9us'
-    # active powerdown and precharge powerdown exit time
-    tXP = '10ns'
-    # self refresh exit time
-    tXS = '65ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl):
-    # Increase buffer size to account for more bank resources
-    read_buffer_size = 64
-    # Set page policy to better suit DMC Huxley
-    page_policy = 'close_adaptive'
-    # 16-bit channel interface
-    device_bus_width = 16
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL32 for higher command bandwidth
-    burst_length = 32
-    # size of device in bytes
-    device_size = '1GB'
-    # 2kB page with BG mode
-    device_rowbuffer_size = '2kB'
-    # Use a 1x16 configuration
-    devices_per_rank = 1
-    # Use a single rank
-    ranks_per_channel = 1
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Initial configuration will have 16 banks with Bank Group Arch
-    # to maximize resources and enable higher data rates
-    banks_per_rank = 16
-    bank_groups_per_rank = 4
-    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
-    tCK = '1.455ns'
-    # Greater of 2 CK or 18ns
-    tRCD = '18ns'
-    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
-    tCL = '23.280ns'
-    # Greater of 2 CK or 18ns
-    tRP = '18ns'
-    # Greater of 3 CK or 42ns
-    tRAS = '42ns'
-    # Greater of 3 CK or 34ns
-    tWR = '34ns'
-    # active powerdown and precharge powerdown exit time
-    # Greater of 3 CK or 7ns
-    tXP = '7ns'
-    # self refresh exit time (tRFCab + 7.5ns)
-    tXS = '217.5ns'
-    # Greater of 2 CK or 7.5 ns minus 2 CK
-    tRTP = '4.59ns'
-    # With BG architecture, burst of 32 transferred in two 16-beat
-    # sub-bursts, with a 16-beat gap in between.
-    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
-    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
-    tBURST = '8.73ns'
-    # can interleave a Bstof32 from another bank group at tBURST_MIN
-    # 16 beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
-    tBURST_MIN = '2.91ns'
-    # tBURST_MAX is the maximum burst delay for same bank group timing
-    # this is 8 CK @ 687.5 MHz
-    tBURST_MAX = '11.64ns'
-    # 8 CK @ 687.5 MHz
-    tCCD_L = '11.64ns'
-    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
-    tRFC = '210ns'
-    tREFI = '3.9us'
-    # Greater of 4 CK or 6.25 ns
-    tWTR = '6.25ns'
-    # Greater of 4 CK or 12 ns
-    tWTR_L = '12ns'
-    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
-    # tWCKDQ0/tCK will be 1 CK for most cases
-    # For gem5 RL = WL and BL/n is already accounted for with tBURST
-    # The result is that an additional 1 CK is required
-    tRTW = '1.455ns'
-    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
-    tCS = '2.91ns'
-    # 2 CK
-    tPPD = '2.91ns'
-    # Greater of 2 CK or 5 ns
-    tRRD = '5ns'
-    tRRD_L = '5ns'
-    # With Bank Group Arch mode tFAW is 20 ns
-    tXAW = '20ns'
-    activation_limit = 4
-    # at 5Gbps, 4:1 WCK to CK ratio required
-    # 2 data beats per WCK (DDR) -> 8 per CK
-    beats_per_clock = 8
-    # 2 cycles required to send activate command
-    # 2 command phases can be sent back-to-back or
-    # with a gap up to tAAD = 8 CK
-    two_cycle_activate = True
-    tAAD = '11.640ns'
-    data_clock_sync = True

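The LPDDR5 burst numbers above follow the same CK arithmetic: a Bstof32
completes in 6 CK, can be interleaved from another bank group after 2 CK,
and is bounded by 8 CK for the same bank group. A quick check (plain
Python, not part of this change):

    # Illustrative only: LPDDR5-5500 BG mode, tCK = 1.455 ns
    tCK_ns = 1.455
    print(round(6 * tCK_ns, 2))  # 8.73  -> tBURST     ('8.73ns')
    print(round(2 * tCK_ns, 2))  # 2.91  -> tBURST_MIN ('2.91ns')
    print(round(8 * tCK_ns, 2))  # 11.64 -> tBURST_MAX ('11.64ns')
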
-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL16 for smaller access granularity
-    burst_length = 16
-    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST = '2.91ns'
-    tBURST_MIN = '2.91ns'
-    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST_MAX = '5.82ns'
-    # 4 CK @ 687.5 MHz
-    tCCD_L = '5.82ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# Starting with 5.5Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
-    # 4kB page with 8B mode
-    device_rowbuffer_size = '4kB'
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Select 8B
-    banks_per_rank = 8
-    bank_groups_per_rank = 0
-    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
-    tBURST = '5.82ns'
-    tBURST_MIN = '5.82ns'
-    tBURST_MAX = '5.82ns'
-    # Greater of 4 CK or 12 ns
-    tWTR = '12ns'
-    # Greater of 2 CK or 10 ns
-    tRRD = '10ns'
-    # With 8B mode tFAW is 40 ns
-    tXAW = '40ns'
-    activation_limit = 4
-    # Reset BG arch timing for 8B mode
-    tCCD_L = '0ns'
-    tRRD_L = '0ns'
-    tWTR_L = '0ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture
-# burst of 32, which means bursts can be interleaved
-class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
-    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
-    tCK = '1.25ns'
-    # Base RL is 17 CK @ 800 MHz = 21.25ns
-    tCL = '21.25ns'
-    # With BG architecture, burst of 32 transferred in two 16-beat
-    # sub-bursts, with a 16-beat gap in between.
-    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
-    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
-    tBURST = '7.5ns'
-    # can interleave a Bstof32 from another bank group at tBURST_MIN
-    # 16 beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
-    tBURST_MIN = '2.5ns'
-    # tBURST_MAX is the maximum burst delay for same bank group timing
-    # this is 8 CK @ 800 MHz
-    tBURST_MAX = '10ns'
-    # 8 CK @ 800 MHz
-    tCCD_L = '10ns'
-    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
-    # tWCKDQ0/tCK will be 1 CK for most cases
-    # For gem5 RL = WL and BL/n is already accounted for with tBURST
-    # The result is that an additional 1 CK is required
-    tRTW = '1.25ns'
-    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
-    tCS = '2.5ns'
-    # 2 CK
-    tPPD = '2.5ns'
-    # 2 command phases can be sent back-to-back or
-    # with a gap up to tAAD = 8 CK
-    tAAD = '10ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on initial
-# JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 16-bank mode with bank-group architecture, burst of 16
-class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
-    # LPDDR5 is a BL16 or BL32 device
-    # With BG mode, BL16 and BL32 are supported
-    # Use BL16 for smaller access granularity
-    burst_length = 16
-    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
-    tBURST = '2.5ns'
-    tBURST_MIN = '2.5ns'
-    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
-    tBURST_MAX = '5ns'
-    # 4 CK @ 800 MHz
-    tCCD_L = '5ns'

-# A single LPDDR5 x16 interface (one command/address bus)
-# for a single x16 channel with default timings based on
-# initial JEDEC specification
-# 6.4Gbps data rates and 8Gbit die
-# Configuring for 8-bank mode, burst of 32
-class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
-    # 4kB page with 8B mode
-    device_rowbuffer_size = '4kB'
-    # LPDDR5 supports configurable bank options
-    # 8B  : BL32, all frequencies
-    # 16B : BL32 or BL16, <=3.2Gbps
-    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
-    # Select 8B
-    banks_per_rank = 8
-    bank_groups_per_rank = 0
-    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
-    tBURST = '5ns'
-    tBURST_MIN = '5ns'
-    tBURST_MAX = '5ns'
-    # Greater of 4 CK or 12 ns
-    tWTR = '12ns'
-    # Greater of 2 CK or 10 ns
-    tRRD = '10ns'
-    # With 8B mode tFAW is 40 ns
-    tXAW = '40ns'
-    activation_limit = 4
-    # Reset BG arch timing for 8B mode
-    tCCD_L = '0ns'
-    tRRD_L = '0ns'
-    tWTR_L = '0ns'
diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py
new file mode 100644
index 0000000..f571920
--- /dev/null
+++ b/src/mem/DRAMInterface.py
@@ -0,0 +1,1473 @@
    +# Copyright (c) 2012-2020 ARM Limited
    +# All rights reserved.
    +#
    +# The license below extends only to copyright in the software and shall
    +# not be construed as granting a license to any other intellectual
    +# property including but not limited to intellectual property relating
    +# to a hardware implementation of the functionality of the software
    +# licensed hereunder.  You may use the software subject to the license
    +# terms below provided that you ensure that this notice is replicated
    +# unmodified and in its entirety in all distributions of the software,
    +# modified or unmodified, in source code or in binary form.
    +#
    +# Copyright (c) 2013 Amin Farmahini-Farahani
    +# Copyright (c) 2015 University of Kaiserslautern
    +# Copyright (c) 2015 The University of Bologna
    +# All rights reserved.
    +#
    +# Redistribution and use in source and binary forms, with or without
    +# modification, are permitted provided that the following conditions are
    +# met: redistributions of source code must retain the above copyright
    +# notice, this list of conditions and the following disclaimer;
    +# redistributions in binary form must reproduce the above copyright
    +# notice, this list of conditions and the following disclaimer in the
    +# documentation and/or other materials provided with the distribution;
    +# neither the name of the copyright holders nor the names of its
    +# contributors may be used to endorse or promote products derived from
    +# this software without specific prior written permission.
    +#
    +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.AbstractMemory import AbstractMemory
+
+# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting
+# channel, rank, bank, row and column, respectively, and going from
+# MSB to LSB.  Available are RoRaBaChCo and RoRaBaCoCh, that are
+# suitable for an open-page policy, optimising for sequential accesses
+# hitting in the open row. For a closed-page policy, RoCoRaBaCh
+# maximises parallelism.
+class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh']
+
+# Enum for the page policy, either open, open_adaptive, close, or
+# close_adaptive.
+class PageManage(Enum): vals = ['open', 'open_adaptive', 'close',
+                                'close_adaptive']
    

+class DRAMInterface(AbstractMemory):

  • type = 'DRAMInterface'
  • cxx_header = "mem/dram_ctrl.hh"
  • Allow the interface to set required controller buffer sizes

  • each entry corresponds to a burst for the specific DRAM

  • configuration (e.g. x32 with burst length 8 is 32 bytes) and not

  • the cacheline size or request/packet size

  • write_buffer_size = Param.Unsigned(64, "Number of write queue entries")
  • read_buffer_size = Param.Unsigned(32, "Number of read queue entries")
  • scheduler, address map and page policy

  • addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy")
  • page_policy = Param.PageManage('open_adaptive', "Page management
    policy")
  • enforce a limit on the number of accesses per row

  • max_accesses_per_row = Param.Unsigned(16, "Max accesses per row
    before "
  •                                      "closing");
    
  • size of DRAM Chip in Bytes

  • device_size = Param.MemorySize("Size of DRAM chip")
  • the physical organisation of the DRAM

  • device_bus_width = Param.Unsigned("data bus width in bits for each
    DRAM "\
  •                                  "device/chip")
    
  • burst_length = Param.Unsigned("Burst lenght (BL) in beats")
  • device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\
  •                                       "device/chip")
    
  • devices_per_rank = Param.Unsigned("Number of devices/chips per rank")
  • ranks_per_channel = Param.Unsigned("Number of ranks per channel")
  • default to 0 bank groups per rank, indicating bank group architecture

  • is not used

  • update per memory class when bank group architecture is supported

  • bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per
    rank")
  • banks_per_rank = Param.Unsigned("Number of banks per rank")
  • Enable DRAM powerdown states if True. This is False by default due to

  • performance being lower when enabled

  • enable_dram_powerdown = Param.Bool(False, "Enable powerdown states")
  • For power modelling we need to know if the DRAM has a DLL or not

  • dll = Param.Bool(True, "DRAM has DLL or not")
  • DRAMPower provides in addition to the core power, the possibility to

  • include RD/WR termination and IO power. This calculation assumes some

  • default values. The integration of DRAMPower with gem5 does not

include

  • IO and RD/WR termination power by default. This might be added as an

  • additional feature in the future.

  • timing behaviour and constraints - all in nanoseconds

  • the base clock period of the DRAM

  • tCK = Param.Latency("Clock period")
  • the amount of time in nanoseconds from issuing an activate command

  • to the data being available in the row buffer for a read/write

  • tRCD = Param.Latency("RAS to CAS delay")
  • the time from issuing a read/write command to seeing the actual data

  • tCL = Param.Latency("CAS latency")
  • minimum time between a precharge and subsequent activate

  • tRP = Param.Latency("Row precharge time")
  • minimum time between an activate and a precharge to the same row

  • tRAS = Param.Latency("ACT to PRE delay")
  • minimum time between a write data transfer and a precharge

  • tWR = Param.Latency("Write recovery time")
  • minimum time between a read and precharge command

  • tRTP = Param.Latency("Read to precharge")
  • time to complete a burst transfer, typically the burst length

  • divided by two due to the DDR bus, but by making it a parameter

  • it is easier to also evaluate SDR memories like WideIO.

  • This parameter has to account for burst length.

  • Read/Write requests with data size larger than one full burst are

broken

  • down into multiple requests in the controller

  • tBURST is equivalent to the CAS-to-CAS delay (tCCD)

  • With bank group architectures, tBURST represents the CAS-to-CAS

  • delay for bursts to different bank groups (tCCD_S)

  • tBURST = Param.Latency("Burst duration "
  •                       "(typically burst length / 2 cycles)")
    
  • tBURST_MAX is the column array cycle delay required before next

access,

  • which could be greater than tBURST when the memory access time is

greater

  • than tBURST

  • tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay")
  • tBURST_MIN is the minimum delay between bursts, which could be less

than

  • tBURST when interleaving is supported

  • tBURST_MIN = Param.Latency(Self.tBURST, "Minimim delay between bursts")
  • CAS-to-CAS delay for bursts to the same bank group

  • only utilized with bank group architectures; set to 0 for default

case

  • tBURST is equivalent to tCCD_S; no explicit parameter required

  • for CAS-to-CAS delay for bursts to different bank groups

  • tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay")
  • Write-to-Write delay for bursts to the same bank group

  • only utilized with bank group architectures; set to 0 for default

case

  • This will be used to enable different same bank group delays

  • for writes versus reads

  • tCCD_L_WR = Param.Latency(Self.tCCD_L,
  •  "Same bank group Write to Write delay")
    
  • time taken to complete one refresh cycle (N rows in all banks)

  • tRFC = Param.Latency("Refresh cycle time")
  • refresh command interval, how often a "ref" command needs

  • to be sent. It is 7.8 us for a 64ms refresh requirement

  • tREFI = Param.Latency("Refresh command interval")
  • write-to-read, same rank turnaround penalty

  • tWTR = Param.Latency("Write to read, same rank switching time")
  • write-to-read, same rank turnaround penalty for same bank group

  • tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching "
  •                       "time, same bank group")
    
  • read-to-write, same rank turnaround penalty

  • tRTW = Param.Latency("Read to write, same rank switching time")
  • rank-to-rank bus delay penalty

  • this does not correlate to a memory timing parameter and encompasses:

  • 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD

  • different rank bus delay

  • tCS = Param.Latency("Rank to rank switching time")
  • minimum precharge to precharge delay time

  • tPPD = Param.Latency("0ns", "PRE to PRE delay")
  • maximum delay between two-cycle ACT command phases

  • tAAD = Param.Latency(Self.tCK,
  •                     "Maximum delay between two-cycle ACT commands")
    
  • two_cycle_activate = Param.Bool(False,
  •                     "Two cycles required to send activate")
    
  • minimum row activate to row activate delay time

  • tRRD = Param.Latency("ACT to ACT delay")
  • only utilized with bank group architectures; set to 0 for default

case

  • tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay")
  • time window in which a maximum number of activates are allowed

  • to take place, set to 0 to disable

  • tXAW = Param.Latency("X activation window")
  • activation_limit = Param.Unsigned("Max number of activates in window")
  • time to exit power-down mode

  • Exit power-down to next valid command delay

  • tXP = Param.Latency("0ns", "Power-up Delay")
  • Exit Powerdown to commands requiring a locked DLL

  • tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL")
  • time to exit self-refresh mode

  • tXS = Param.Latency("0ns", "Self-refresh exit latency")
  • time to exit self-refresh mode with locked DLL

  • tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL")
  • number of data beats per clock. with DDR, default is 2, one per edge

  • beats_per_clock = Param.Unsigned(2, "Data beats per clock")
  • data_clock_sync = Param.Bool(False, "Synchronization commands
    required")
  • Currently rolled into other params

  • ######################################################################
  • tRC  - assumed to be tRAS + tRP

  • Power Behaviour and Constraints

  • DRAMs like LPDDR and WideIO have 2 external voltage domains. These

are

  • defined as VDD and VDD2. Each current is defined for each voltage

domain

  • separately. For example, current IDD0 is active-precharge current for

  • voltage domain VDD and current IDD02 is active-precharge current for

  • voltage domain VDD2.

  • By default all currents are set to 0mA. Users who are only

interested in

  • the performance of DRAMs can leave them at 0.

  • Operating 1 Bank Active-Precharge current

  • IDD0 = Param.Current("0mA", "Active precharge current")
  • Operating 1 Bank Active-Precharge current multiple voltage Range

  • IDD02 = Param.Current("0mA", "Active precharge current VDD2")
  • Precharge Power-down Current: Slow exit

  • IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow")
  • Precharge Power-down Current: Slow exit multiple voltage Range

  • IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2")
  • Precharge Power-down Current: Fast exit

  • IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast")
  • Precharge Power-down Current: Fast exit multiple voltage Range

  • IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2")
  • Precharge Standby current

  • IDD2N = Param.Current("0mA", "Precharge Standby current")
  • Precharge Standby current multiple voltage range

  • IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2")
  • Active Power-down current: slow exit

  • IDD3P0 = Param.Current("0mA", "Active Powerdown slow")
  • Active Power-down current: slow exit multiple voltage range

  • IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2")
  • Active Power-down current : fast exit

  • IDD3P1 = Param.Current("0mA", "Active Powerdown fast")
  • Active Power-down current : fast exit multiple voltage range

  • IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2")
  • Active Standby current

  • IDD3N = Param.Current("0mA", "Active Standby current")
  • Active Standby current multiple voltage range

  • IDD3N2 = Param.Current("0mA", "Active Standby current VDD2")
  • Burst Read Operating Current

  • IDD4R = Param.Current("0mA", "READ current")
  • Burst Read Operating Current multiple voltage range

  • IDD4R2 = Param.Current("0mA", "READ current VDD2")
  • Burst Write Operating Current

  • IDD4W = Param.Current("0mA", "WRITE current")
  • Burst Write Operating Current multiple voltage range

  • IDD4W2 = Param.Current("0mA", "WRITE current VDD2")
  • Refresh Current

  • IDD5 = Param.Current("0mA", "Refresh current")
  • Refresh Current multiple voltage range

  • IDD52 = Param.Current("0mA", "Refresh current VDD2")
  • Self-Refresh Current

  • IDD6 = Param.Current("0mA", "Self-refresh Current")
  • Self-Refresh Current multiple voltage range

  • IDD62 = Param.Current("0mA", "Self-refresh Current VDD2")
  • Main voltage range of the DRAM

  • VDD = Param.Voltage("0V", "Main Voltage Range")
  • Second voltage range defined by some DRAMs

  • VDD2 = Param.Voltage("0V", "2nd Voltage Range")

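With DRAMInterface defined as an AbstractMemory, a config script now
instantiates the controller and the interface separately and assigns the
address range on the interface. A minimal wiring sketch (hypothetical;
the controller-side parameter name 'dram' is an assumption based on the
commit message, not something shown in this hunk):

    # Hypothetical sketch of the new controller/interface split
    from m5.objects import *

    system = System()
    system.mem_ranges = [AddrRange('512MB')]

    system.mem_ctrl = DRAMCtrl()            # now a ClockedObject
    system.mem_ctrl.dram = DDR3_1600_8x8()  # media-specific interface
    # address ranges now live on the interface, not the controller
    system.mem_ctrl.dram.range = system.mem_ranges[0]
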
+# A single DDR3-1600 x64 channel (one command and address bus), with
+# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in
+# an 8x8 configuration.
+class DDR3_1600_8x8(DRAMInterface):

  • size of device in bytes

  • device_size = '512MB'
  • 8x8 configuration, 8 devices each with an 8-bit interface

  • device_bus_width = 8
  • DDR3 is a BL8 device

  • burst_length = 8
  • Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)

  • device_rowbuffer_size = '1kB'
  • 8x8 configuration, so 8 devices

  • devices_per_rank = 8
  • Use two ranks

  • ranks_per_channel = 2
  • DDR3 has 8 banks in all configurations

  • banks_per_rank = 8
  • 800 MHz

  • tCK = '1.25ns'
  • 8 beats across an x64 interface translates to 4 clocks @ 800 MHz

  • tBURST = '5ns'
  • DDR3-1600 11-11-11

  • tRCD = '13.75ns'
  • tCL = '13.75ns'
  • tRP = '13.75ns'
  • tRAS = '35ns'
  • tRRD = '6ns'
  • tXAW = '30ns'
  • activation_limit = 4
  • tRFC = '260ns'
  • tWR = '15ns'
  • Greater of 4 CK or 7.5 ns

  • tWTR = '7.5ns'
  • Greater of 4 CK or 7.5 ns

  • tRTP = '7.5ns'
  • Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns

  • tRTW = '2.5ns'
  • Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns

  • tCS = '2.5ns'
  • <=85C, half for >85C

  • tREFI = '7.8us'
  • active powerdown and precharge powerdown exit time

  • tXP = '6ns'
  • self refresh exit time

  • tXS = '270ns'
  • Current values from datasheet Die Rev E,J

  • IDD0 = '55mA'
  • IDD2N = '32mA'
  • IDD3N = '38mA'
  • IDD4W = '125mA'
  • IDD4R = '157mA'
  • IDD5 = '235mA'
  • IDD3P1 = '38mA'
  • IDD2P1 = '32mA'
  • IDD6 = '20mA'
  • VDD = '1.5V'

+# A single HMC-2500 x32 model based on:
+# [1] DRAMSpec: a high-level DRAM bank modelling tool
+# developed at the University of Kaiserslautern. This high level tool
+# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to
+# estimate the DRAM bank latency and power numbers.
+# [2] High performance AXI-4.0 based interconnect for extensible smart
+# memory cubes (E. Azarkhish et al.)
+# Assumed for the HMC model is a 30 nm technology node.
+# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory
+# (4 layers).
+# Each layer has 16 vaults and each vault consists of 2 banks per layer.
+# In order to be able to use the same controller used for 2D DRAM
+# generations for HMC, the following analogy is done:
+# Channel (DDR) => Vault (HMC)
+# device_size (DDR) => size of a single layer in a vault
+# ranks per channel (DDR) => number of layers
+# banks per rank (DDR) => banks per layer
+# devices per rank (DDR) => devices per layer (1 for HMC)
+# The parameters for which no input is available are inherited from the
+# DDR3 configuration.
+# This configuration includes the latencies from the DRAM to the logic
+# layer of the HMC
+class HMC_2500_1x32(DDR3_1600_8x8):
+    # size of device
+    # two banks per device with each bank 4MB [2]
+    device_size = '8MB'
+
+    # 1x32 configuration, 1 device with 32 TSVs [2]
+    device_bus_width = 32
+
+    # HMC is a BL8 device [2]
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 256 bytes [2]
+    device_rowbuffer_size = '256B'
+
+    # 1x32 configuration, so 1 device [2]
+    devices_per_rank = 1
+
+    # 4 layers so 4 ranks [2]
+    ranks_per_channel = 4
+
+    # HMC has 2 banks per layer [2]
+    # Each layer represents a rank. With 4 layers and 8 banks in total, each
+    # layer has 2 banks; thus 2 banks per rank.
+    banks_per_rank = 2
+
+    # 1250 MHz [2]
+    tCK = '0.8ns'
+
+    # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz
+    tBURST = '3.2ns'
+
+    # Values using DRAMSpec HMC model [1]
+    tRCD = '10.2ns'
+    tCL = '9.9ns'
+    tRP = '7.7ns'
+    tRAS = '21.6ns'
+
+    # tRRD depends on the power supply network for each vendor.
+    # We assume a tRRD of a double bank approach to be equal to 4 clock
+    # cycles (Assumption)
+    tRRD = '3.2ns'
+
+    # activation limit is set to 0 since there are only 2 banks per vault
+    # layer.
+    activation_limit = 0
+
+    # Values using DRAMSpec HMC model [1]
+    tRFC = '59ns'
+    tWR = '8ns'
+    tRTP = '4.9ns'
+
+    # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz
+    # = 0.8 ns (Assumption)
+    tCS = '0.8ns'
+
+    # Value using DRAMSpec HMC model [1]
+    tREFI = '3.9us'
+
+    # The default page policy in the vault controllers is simple closed
+    # page [2]; nevertheless, the 'close' policy opens and closes the row
+    # multiple times for bursts larger than 32Bytes. For this reason we
+    # use 'close_adaptive'
+    page_policy = 'close_adaptive'
+
+    # RoCoRaBaCh resembles the default address mapping in HMC
+    addr_mapping = 'RoCoRaBaCh'
+
+    # These parameters do not directly correlate with buffer_size in real
+    # hardware. Nevertheless, their value has been tuned to achieve a
+    # bandwidth similar to the cycle-accurate model in [2]
+    write_buffer_size = 32
+    read_buffer_size = 32

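A configuration sketch for the vault analogy above (illustrative;
assumes the common Options/MemConfig flow, which is not part of this
change): one HMC stack exposes its 16 vaults as 16 single-vault
channels.

    options.mem_type = 'HMC_2500_1x32'
    options.mem_channels = 16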
+# A single DDR3-2133 x64 channel refining a selected subset of the
+# options for the DDR3-1600 configuration, based on the same DDR3-1600
+# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept
+# consistent across the two configurations.
+class DDR3_2133_8x8(DDR3_1600_8x8):
+    # 1066 MHz
+    tCK = '0.938ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz
+    tBURST = '3.752ns'
+
+    # DDR3-2133 14-14-14
+    tRCD = '13.09ns'
+    tCL = '13.09ns'
+    tRP = '13.09ns'
+    tRAS = '33ns'
+    tRRD = '5ns'
+    tXAW = '25ns'
+
+    # Current values from datasheet
+    IDD0 = '70mA'
+    IDD2N = '37mA'
+    IDD3N = '44mA'
+    IDD4W = '157mA'
+    IDD4R = '191mA'
+    IDD5 = '250mA'
+    IDD3P1 = '44mA'
+    IDD2P1 = '43mA'
+    IDD6 = '20mA'
+    VDD = '1.5V'

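As a cross-check of the tBURST values in these DDR3 classes, a BL8 burst
occupies burst_length / 2 clocks on a DDR bus (plain Python,
illustrative only):

    tCK_ns = 0.938
    burst_length = 8
    tBURST_ns = (burst_length / 2) * tCK_ns   # 4 clocks -> 3.752 ns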
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4)
+# in a 16x4 configuration.
+# Total channel capacity is 32GB
+# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel
+class DDR4_2400_16x4(DRAMInterface):
+    # size of device
+    device_size = '1GB'
+
+    # 16x4 configuration, 16 devices each with a 4-bit interface
+    device_bus_width = 4
+
+    # DDR4 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 512 byte (1K columns x4)
+    device_rowbuffer_size = '512B'
+
+    # 16x4 configuration, so 16 devices
+    devices_per_rank = 16
+
+    # Match our DDR3 configurations which is dual rank
+    ranks_per_channel = 2
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 4 for x4 case
+    bank_groups_per_rank = 4
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 16
+
+    # override the default buffer sizes and go for something larger to
+    # accommodate the larger bank count
+    write_buffer_size = 128
+    read_buffer_size = 64
+
+    # 1200 MHz
+    tCK = '0.833ns'
+
+    # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '3.332ns'
+
+    # @2400 data rate, tCCD_L is 6 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '5ns'
+
+    # DDR4-2400 17-17-17
+    tRCD = '14.16ns'
+    tCL = '14.16ns'
+    tRP = '14.16ns'
+    tRAS = '32ns'
+
+    # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns)
+    tRRD = '3.332ns'
+
+    # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns'
+
+    # tFAW for 512B page is MAX(16 CK, 13ns)
+    tXAW = '13.328ns'
+    activation_limit = 4
+    # tRFC is 350ns
+    tRFC = '350ns'
+
+    tWR = '15ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Greater of 4 CK or 7.5 ns
+    tRTP = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns
+    tRTW = '1.666ns'
+
+    # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns
+    tCS = '1.666ns'
+
+    # <=85C, half for >85C
+    tREFI = '7.8us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '6ns'
+
+    # self refresh exit time
+    # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is:
+    # tRFC + 10ns = 340ns
+    tXS = '340ns'
+
+    # Current values from datasheet
+    IDD0 = '43mA'
+    IDD02 = '3mA'
+    IDD2N = '34mA'
+    IDD3N = '38mA'
+    IDD3N2 = '3mA'
+    IDD4W = '103mA'
+    IDD4R = '110mA'
+    IDD5 = '250mA'
+    IDD3P1 = '32mA'
+    IDD2P1 = '25mA'
+    IDD6 = '30mA'
+    VDD = '1.2V'
+    VDD2 = '2.5V'

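The channel capacity in the comment above follows directly from the
geometry (illustrative arithmetic):

    devices_per_rank = 16
    ranks_per_channel = 2
    device_size_GB = 1
    channel_GB = devices_per_rank * ranks_per_channel * device_size_GB  # 32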
+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8)
+# in an 8x8 configuration.
+# Total channel capacity is 16GB
+# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel
+class DDR4_2400_8x8(DDR4_2400_16x4):
+    # 8x8 configuration, 8 devices each with an 8-bit interface
+    device_bus_width = 8
+
+    # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8)
+    device_rowbuffer_size = '1kB'
+
+    # 8x8 configuration, so 8 devices
+    devices_per_rank = 8
+
+    # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns)
+    tRRD_L = '4.9ns'
+
+    tXAW = '21ns'
+
+    # Current values from datasheet
+    IDD0 = '48mA'
+    IDD3N = '43mA'
+    IDD4W = '123mA'
+    IDD4R = '135mA'
+    IDD3P1 = '37mA'

+# A single DDR4-2400 x64 channel (one command and address bus), with
+# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16)
+# in a 4x16 configuration.
+# Total channel capacity is 4GB
+# 4 devices/rank * 1 rank/channel * 1GB/device = 4GB/channel
+class DDR4_2400_4x16(DDR4_2400_16x4):
+    # 4x16 configuration, 4 devices each with a 16-bit interface
+    device_bus_width = 16
+
+    # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16)
+    device_rowbuffer_size = '2kB'
+
+    # 4x16 configuration, so 4 devices
+    devices_per_rank = 4
+
+    # Single rank for x16
+    ranks_per_channel = 1
+
+    # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups
+    # Set to 2 for x16 case
+    bank_groups_per_rank = 2
+
+    # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all
+    # configurations). Currently we do not capture the additional
+    # constraints incurred by the bank groups
+    banks_per_rank = 8
+
+    # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns)
+    tRRD = '5.3ns'
+
+    # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns)
+    tRRD_L = '6.4ns'
+
+    tXAW = '30ns'
+
+    # Current values from datasheet
+    IDD0 = '80mA'
+    IDD02 = '4mA'
+    IDD2N = '34mA'
+    IDD3N = '47mA'
+    IDD4W = '228mA'
+    IDD4R = '243mA'
+    IDD5 = '280mA'
+    IDD3P1 = '41mA'

+# A single LPDDR2-S4 x32 interface (one command/address bus), with
+# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1)
+# in a 1x32 configuration.
+class LPDDR2_S4_1066_1x32(DRAMInterface):
+    # No DLL in LPDDR2
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR2_S4 is a BL4 and BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 1KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '1kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR2-S4 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 533 MHz
+    tCK = '1.876ns'
+
+    # Fixed at 15 ns
+    tRCD = '15ns'
+
+    # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time
+    tCL = '15ns'
+
+    # Pre-charge one bank 15 ns (all banks 18 ns)
+    tRP = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    tRTP = '7.5ns'
+
+    # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '7.5ns'
+
+    # LPDDR2-S4, 4 Gbit
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns
+    tRTW = '3.75ns'
+
+    # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns
+    tCS = '3.75ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of density, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '15mA'
+    IDD02 = '70mA'
+    IDD2N = '2mA'
+    IDD2N2 = '30mA'
+    IDD3N = '2.5mA'
+    IDD3N2 = '30mA'
+    IDD4W = '10mA'
+    IDD4W2 = '190mA'
+    IDD4R = '3mA'
+    IDD4R2 = '220mA'
+    IDD5 = '40mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.2mA'
+    IDD3P12 = '8mA'
+    IDD2P1 = '0.6mA'
+    IDD2P12 = '0.8mA'
+    IDD6 = '1mA'
+    IDD62 = '3.2mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

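The 32-byte limit mentioned in the tBURST comment is the interface burst
size, computed with the same formula the C++ constructor uses
(devices_per_rank * burst_length * device_bus_width / 8); an
illustrative check:

    burst_bytes = (1 * 8 * 32) // 8   # = 32 bytes per burst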
+# A single WideIO x128 interface (one command and address bus), with
+# default timings based on an estimated WIO-200 8 Gbit part.
+class WideIO_200_1x128(DRAMInterface):
+    # No DLL for WideIO
+    dll = False
+
+    # size of device
+    device_size = '1024MB'
+
+    # 1x128 configuration, 1 device with a 128-bit interface
+    device_bus_width = 128
+
+    # This is a BL4 device
+    burst_length = 4
+
+    # Each device has a page (row buffer) size of 4KB
+    # (this depends on the memory density)
+    device_rowbuffer_size = '4kB'
+
+    # 1x128 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Use one rank for a one-high die stack
+    ranks_per_channel = 1
+
+    # WideIO has 4 banks in all configurations
+    banks_per_rank = 4
+
+    # 200 MHz
+    tCK = '5ns'
+
+    # WIO-200
+    tRCD = '18ns'
+    tCL = '18ns'
+    tRP = '18ns'
+    tRAS = '42ns'
+    tWR = '15ns'
+    # Read to precharge is same as the burst
+    tRTP = '20ns'
+
+    # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz.
+    # Note this is a BL4 SDR device.
+    tBURST = '20ns'
+
+    # WIO 8 Gb
+    tRFC = '210ns'
+
+    # WIO 8 Gb, <=85C, half for >85C
+    tREFI = '3.9us'
+
+    # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns
+    tWTR = '15ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns
+    tRTW = '10ns'
+
+    # Default different rank bus delay to 2 CK, @200 MHz = 10 ns
+    tCS = '10ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Two instead of four activation window
+    tXAW = '50ns'
+    activation_limit = 2
+
+    # The WideIO specification does not provide current information

+# A single LPDDR3 x32 interface (one command/address bus), with
+# default timings based on a LPDDR3-1600 4 Gbit part (Micron
+# EDF8132A1MC) in a 1x32 configuration.
+class LPDDR3_1600_1x32(DRAMInterface):
+    # No DLL for LPDDR3
+    dll = False
+
+    # size of device
+    device_size = '512MB'
+
+    # 1x32 configuration, 1 device with a 32-bit interface
+    device_bus_width = 32
+
+    # LPDDR3 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 4KB
+    device_rowbuffer_size = '4kB'
+
+    # 1x32 configuration, so 1 device
+    devices_per_rank = 1
+
+    # Technically the datasheet is a dual-rank package, but for
+    # comparison with the LPDDR2 config we stick to a single rank
+    ranks_per_channel = 1
+
+    # LPDDR3 has 8 banks in all configurations
+    banks_per_rank = 8
+
+    # 800 MHz
+    tCK = '1.25ns'
+
+    tRCD = '18ns'
+
+    # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time
+    tCL = '15ns'
+
+    tRAS = '42ns'
+    tWR = '15ns'
+
+    # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns
+    tRTP = '7.5ns'
+
+    # Pre-charge one bank 18 ns (all banks 21 ns)
+    tRP = '18ns'
+
+    # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz.
+    # Note this is a BL8 DDR device.
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'

+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2Kbits (256Bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR (data runs at 4000 MHz)
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns'
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'

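The activation_limit = 0 setting above is a pure timing argument: four
back-to-back activates already span 4 * tRRD = 24 ns, which exceeds the
tXAW = 23 ns window, so the four-activate limit can never bind
(illustrative check):

    tRRD_ns, tXAW_ns = 6, 23
    assert 4 * tRRD_ns > tXAW_ns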
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'

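A configuration sketch for the note about using all eight channels
(illustrative; assumes the common Options/MemConfig flow, which is not
part of this change):

    options.mem_type = 'HBM_1000_4H_1x128'
    options.mem_channels = 8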
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # self refresh exit time
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'

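Capacity check for the pseudo-channel split above (illustrative
arithmetic): a 4-high stack of 8Gb dies is 4GB, and dividing it across
16 pseudo-channels yields the 256MB device_size:

    stack_MB = 4 * 8 * 1024 // 8      # 4 dies x 8 Gbit = 4096 MB
    per_channel_MB = stack_MB // 16   # = 256 MB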
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 280ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True

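The LPDDR5 clock numbers above derive mechanically from the pin rate:
data toggles on both WCK edges and WCK runs at 4x CK (illustrative
derivation):

    data_rate_mbps = 5500
    ck_mhz = data_rate_mbps / 2 / 4     # 687.5 MHz
    tck_ns = 1000 / ck_mhz              # ~1.455 ns
    tburst_ns = 6 * tck_ns              # ~8.73 ns for a Bstof32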
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"

+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B  : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
diff --git a/src/mem/SConscript b/src/mem/SConscript
index 2fe179d..ceeed98 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-

-# Copyright (c) 2018-2019 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -47,6 +47,7 @@
 SimObject('AddrMapper.py')
 SimObject('Bridge.py')
 SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
 SimObject('ExternalMaster.py')
 SimObject('ExternalSlave.py')
 SimObject('MemObject.py')
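With DRAMInterface.py registered as a SimObject, the interface classes
above can be instantiated from any config script (illustrative sketch):

    import m5.objects

    intf = m5.objects.DDR3_1600_8x8(range=m5.objects.AddrRange('512MB'))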
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index b646581..4055505 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
+#include "params/DRAMInterface.hh"
 #include "sim/system.hh"

 using namespace std;
@@ -58,12 +59,13 @@
     retryRdReq(false), retryWrReq(false),
     nextReqEvent([this]{ processNextReqEvent(); }, name()),
     respondEvent([this]{ processRespondEvent(); }, name()),
-    readBufferSize(p->read_buffer_size),
-    writeBufferSize(p->write_buffer_size),
+    dram(p->dram),
+    readBufferSize(dram->readBufferSize),
+    writeBufferSize(dram->writeBufferSize),
     writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0),
     writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0),
     minWritesPerSwitch(p->min_writes_per_switch),
-    writesThisTime(0), readsThisTime(0), tCS(p->tCS),
+    writesThisTime(0), readsThisTime(0),
     memSchedPolicy(p->mem_sched_policy),
     frontendLatency(p->static_frontend_latency),
     backendLatency(p->static_backend_latency),
@@ -74,37 +76,23 @@
     readQueue.resize(p->qos_priorities);
     writeQueue.resize(p->qos_priorities);

+    dram->setCtrl(this);
+
     // perform a basic check of the write thresholds
     if (p->write_low_thresh_perc >= p->write_high_thresh_perc)
         fatal("Write buffer low threshold %d must be smaller than the "
               "high threshold %d\n", p->write_low_thresh_perc,
               p->write_high_thresh_perc);
-
-    // determine the rows per bank by looking at the total capacity
-    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
-
-    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
-            AbstractMemory::size());
-
-    // create a DRAM interface
-    // will only populate the ranks if DRAM is configured
-    dram = new DRAMInterface(*this, p, capacity, range);
-    DPRINTF(DRAM, "Created DRAM interface \n");
 }

 void
 DRAMCtrl::init()
 {
-    MemCtrl::init();
-
     if (!port.isConnected()) {
         fatal("DRAMCtrl %s is unconnected!\n", name());
     } else {
         port.sendRangeChange();
     }
-
-    dram->init(range);
 }
@@ -114,8 +102,6 @@
     isTimingMode = system()->isTimingMode();

     if (isTimingMode) {
-        dram->startupRanks();
-
         // shift the bus busy time sufficiently far ahead that we never
         // have to worry about negative values when computing the time for
         // the next request, this will add an insignificant bubble at the
@@ -133,7 +119,7 @@
              "is responding");

     // do the actual memory access and turn the packet into a response
-    access(pkt);
+    dram->access(pkt);

     Tick latency = 0;
     if (pkt->hasData()) {
@@ -263,7 +249,7 @@
     // address of first DRAM packet is kept unaligned. Subsequent DRAM packets
     // are aligned to burst size boundaries. This is to ensure we accurately
     // check read packets against packets in write queue.
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     unsigned pktsServicedByWrQ = 0;
     BurstHelper* burst_helper = NULL;
@@ -363,7 +349,7 @@
     // if the request size is larger than burst size, the pkt is split into
     // multiple DRAM packets
-    const Addr base_addr = getCtrlAddr(pkt->getAddr());
+    const Addr base_addr = dram->getCtrlAddr(pkt->getAddr());
     Addr addr = base_addr;
     uint32_t burstSize = dram->bytesPerBurst();
     for (int cnt = 0; cnt < pktCount; ++cnt) {
@@ -526,7 +512,7 @@
     DRAMPacket* dram_pkt = respQueue.front();

     // media specific checks and functions when read response is complete
-    dram->respondEventDRAM(dram_pkt->rank);
+    dram->respondEvent(dram_pkt->rank);

     if (dram_pkt->burstHelper) {
         // it is a split packet
@@ -727,12 +713,12 @@
 void
 DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency)
 {
-    DPRINTF(DRAM, "Responding to Address %lld.. ",pkt->getAddr());
+    DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr());

     bool needsResponse = pkt->needsResponse();
     // do the actual memory access which also turns the packet into a
     // response
-    access(pkt);
+    dram->access(pkt);

     // turn packet around to go back to requester if response expected
     if (needsResponse) {
@@ -877,9 +863,9 @@
     // if not, shift to next burst window
     Tick act_at;
     if (twoCycleActivate)
-        act_at = ctrl.verifyMultiCmd(act_tick, tAAD);
+        act_at = ctrl->verifyMultiCmd(act_tick, tAAD);
     else
-        act_at = ctrl.verifySingleCmd(act_tick);
+        act_at = ctrl->verifySingleCmd(act_tick);

     DPRINTF(DRAM, "Activate at tick %d\n", act_at);
@@ -997,7 +983,7 @@
         // Issuing an explicit PRE command
         // Verify that we have command bandwidth to issue the precharge
         // if not, shift to next burst window
-        pre_at = ctrl.verifySingleCmd(pre_tick);
+        pre_at = ctrl->verifySingleCmd(pre_tick);
         // enforce tPPD
         for (int i = 0; i < banksPerRank; i++) {
             rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD,
@@ -1096,9 +1082,9 @@
     // verify that we have command bandwidth to issue the burst
     // if not, shift to next burst window
     if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay))
-        cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK);
+        cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK);
     else
-        cmd_at = ctrl.verifySingleCmd(cmd_at);
+        cmd_at = ctrl->verifySingleCmd(cmd_at);

     // if we are interleaving bursts, ensure that
     // 1) we don't double interleave on next burst issue
    

@@ -1196,7 +1182,7 @@
             bool got_more_hits = false;
             bool got_bank_conflict = false;

-            for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) {
+            for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) {
                 auto p = queue[i].begin();
                 // keep on looking until we find a hit or reach the end of the
                 // queue
@@ -1267,6 +1253,7 @@
         // Update latency stats
         stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime;
         stats.totQLat += cmd_at - dram_pkt->entryTime;
+        stats.totBusLat += tBURST;
     } else {
         // Schedule write done event to decrement event count
         // after the readyTime has been reached
@@ -1350,13 +1337,9 @@
         // Update latency stats
         stats.masterReadTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
-        stats.bytesRead += dram->bytesPerBurst();
-        stats.totBusLat += dram->burstDelay();
         stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size;
     } else {
         ++writesThisTime;
-        stats.bytesWritten += dram->bytesPerBurst();
         stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size;
         stats.masterWriteTotalLat[dram_pkt->masterId()] +=
             dram_pkt->readyTime - dram_pkt->entryTime;
@@ -1458,8 +1441,9 @@
                 // Figure out which read request goes next
                 // If we are changing command type, incorporate the minimum
-                // bus turnaround delay which will be tCS (different rank) case
-                to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0);
+                // bus turnaround delay which will be rank to rank delay
+                to_read = chooseNext((*queue), switched_cmd_type ?
+                                               dram->rankDelay() : 0);

                 if (to_read != queue->end()) {
                     // candidate read found
@@ -1538,7 +1522,8 @@
             // If we are changing command type, incorporate the minimum
             // bus turnaround delay
             to_write = chooseNext((*queue),
-                     switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0);
+                     switched_cmd_type ? std::min(dram->minRdToWr(),
+                                                  dram->rankDelay()) : 0);

             if (to_write != queue->end()) {
                 write_found = true;
    

@@ -1611,11 +1596,8 @@
     }
 }

-DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl,
-                             const DRAMCtrlParams* _p,
-                             const uint64_t capacity,
-                             const AddrRange range)
-    : SimObject(_p), ctrl(_ctrl),
+DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p)
+    : AbstractMemory(_p),
       addrMapping(_p->addr_mapping),
       burstSize((_p->devices_per_rank * _p->burst_length *
                  _p->device_bus_width) / 8),
@@ -1630,7 +1612,7 @@
       bankGroupsPerRank(_p->bank_groups_per_rank),
       bankGroupArch(_p->bank_groups_per_rank > 0),
      banksPerRank(_p->banks_per_rank), rowsPerBank(0),
-      tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST),
+      tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST),
       tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW),
       tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD),
       tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP),
@@ -1646,13 +1628,15 @@
       wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW),
       wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L),
       rdToWrDlySameBG(tRTW + _p->tBURST_MAX),
-      rankToRankDly(ctrl.rankDelay() + tBURST),
+      rankToRankDly(tCS + tBURST),
       pageMgmt(_p->page_policy),
       maxAccessesPerRow(_p->max_accesses_per_row),
       timeStampOffset(0), activeRank(0),
       enableDRAMPowerdown(_p->enable_dram_powerdown),
       lastStatsResetTick(0),
-      stats(_ctrl, *this)
+      stats(*this),
+      readBufferSize(_p->read_buffer_size),
+      writeBufferSize(_p->write_buffer_size)
 {
     fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, "
              "must be a power of two\n", burstSize);
@@ -1664,7 +1648,7 @@
     for (int i = 0; i < ranksPerChannel; i++) {
         DPRINTF(DRAM, "Creating DRAM rank %d \n", i);
-        Rank* rank = new Rank(ctrl, _p, i, *this);
+        Rank* rank = new Rank(_p, i, *this);
         ranks.push_back(rank);
     }
@@ -1672,6 +1656,11 @@
     uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank *
                               ranksPerChannel;

+    uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size());
+
+    DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity,
+            AbstractMemory::size());
+
     // if actual DRAM size does not match memory capacity in system warn!
     if (deviceCapacity != capacity / (1024 * 1024))
         warn("DRAM device capacity (%d Mbytes) does not match the "
@@ -1726,8 +1715,10 @@
 }

 void
-DRAMInterface::init(AddrRange range)
+DRAMInterface::init()
 {
+    AbstractMemory::init();
+
     // a bit of sanity checks on the interleaving, save it for here to
     // ensure that the system pointer is initialised
     if (range.interleaved()) {
@@ -1749,7 +1740,7 @@
             // channel striping has to be done at a granularity that
             // is equal or larger to a cache line
-            if (ctrl.system()->cacheLineSize() > range.granularity()) {
+            if (system()->cacheLineSize() > range.granularity()) {
                 fatal("Channel interleaving of %s must be at least as large "
                       "as the cache line size\n", name());
             }
@@ -1766,10 +1757,12 @@
 }

 void
-DRAMInterface::startupRanks()
+DRAMInterface::startup()
 {
-    // timestamp offset should be in clock cycles for DRAMPower
-    timeStampOffset = divCeil(curTick(), tCK);
+    if (system()->isTimingMode()) {
+        // timestamp offset should be in clock cycles for DRAMPower
+        timeStampOffset = divCeil(curTick(), tCK);
+    }

     for (auto r : ranks) {
         r->startup(curTick() + tREFI - tRP);

@@ -1815,7 +1808,7 @@
}

void
-DRAMInterface::respondEventDRAM(uint8_t rank)
+DRAMInterface::respondEvent(uint8_t rank)
{
Rank& rank_ref = *ranks[rank];

@@ -1956,7 +1949,7 @@
std::max(ranks[i]->banks[j].preAllowedAt, curTick()) +
tRP;

              // When is the earliest the R/W burst can issue?
  •            const Tick col_allowed_at = ctrl.inReadBusState(false) ?
    
  •            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
    

ranks[i]->banks[j].rdAllowedAt :

ranks[i]->banks[j].wrAllowedAt;
Tick col_at = std::max(col_allowed_at, act_at + tRCD);
@@ -1996,9 +1989,15 @@
return make_pair(bank_mask, hidden_bank_prep);
}

-DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int
_rank,

  •                     DRAMInterface& _dram)
    
  • : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram),
    +DRAMInterface*
    +DRAMInterfaceParams::create()
    +{
  • return new DRAMInterface(this);
    +}

+DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p,

  •                     int _rank, DRAMInterface& _dram)
    
  • : EventManager(&_dram), dram(_dram),
    pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE),
    pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE),
    refreshState(REF_IDLE), inLowPowerState(false), rank(_rank),
    @@ -2011,7 +2010,7 @@
    refreshEvent([this]{ processRefreshEvent(); }, name()),
    powerEvent([this]{ processPowerEvent(); }, name()),
    wakeUpEvent([this]{ processWakeUpEvent(); }, name()),
  •  stats(_ctrl, *this)
    
  •  stats(_dram, *this)
    
    {
    for (int b = 0; b < _p->banks_per_rank; b++) {
    banks[b].bank = b;
    @@ -2062,8 +2061,10 @@
    DRAMInterface::Rank::isQueueEmpty() const
    {
    // check commmands in Q based on current bus direction
  • bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0))
  •                   || (ctrl.inWriteBusState(true) && (writeEntries ==  
    

0));

  • bool no_queued_cmds = (dram.ctrl->inReadBusState(true) &&
  •                      (readEntries == 0))
    
  •                   || (dram.ctrl->inWriteBusState(true) &&
    
  •                      (writeEntries == 0));
    return no_queued_cmds;
    
    }

@@ -2187,7 +2188,7 @@
     // if a request is at the moment being handled and this request is
     // accessing the current rank then wait for it to finish
     if ((rank == dram.activeRank)
-        && (ctrl.requestEventScheduled())) {
+        && (dram.ctrl->requestEventScheduled())) {
         // hand control over to the request loop until it is
         // evaluated next
         DPRINTF(DRAM, "Refresh awaiting draining\n");
@@ -2262,7 +2263,7 @@
             // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled
             // should have outstanding precharge or read response event
             assert(prechargeEvent.scheduled() ||
-                   ctrl.respondEventScheduled());
+                   dram.ctrl->respondEventScheduled());
             // will start refresh when pwrState transitions to IDLE
         }
@@ -2322,8 +2323,8 @@
         assert(!powerEvent.scheduled());

-        if ((ctrl.drainState() == DrainState::Draining) ||
-            (ctrl.drainState() == DrainState::Drained)) {
+        if ((dram.ctrl->drainState() == DrainState::Draining) ||
+            (dram.ctrl->drainState() == DrainState::Drained)) {
            // if draining, do not re-enter low-power mode.
            // simply go to IDLE and wait
            schedulePowerEvent(PWR_IDLE, curTick());
@@ -2548,10 +2549,10 @@
         }

         // completed refresh event, ensure next request is scheduled
-        if (!ctrl.requestEventScheduled()) {
+        if (!dram.ctrl->requestEventScheduled()) {
             DPRINTF(DRAM, "Scheduling next request after refreshing"
                            " rank %d\n", rank);
-            ctrl.restartScheduler(curTick());
+            dram.ctrl->restartScheduler(curTick());
         }
     }
@@ -2610,8 +2611,8 @@
             // bypass auto-refresh and go straight to SREF, where memory
             // will issue refresh immediately upon entry
             if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() &&
-                (ctrl.drainState() != DrainState::Draining) &&
-                (ctrl.drainState() != DrainState::Drained) &&
+                (dram.ctrl->drainState() != DrainState::Draining) &&
+                (dram.ctrl->drainState() != DrainState::Drained) &&
                 dram.enableDRAMPowerdown) {
                 DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning "
                         "to self refresh at %11u tick\n", rank, curTick());
@@ -2712,7 +2713,7 @@
 bool
 DRAMInterface::Rank::forceSelfRefreshExit() const {
     return (readEntries != 0) ||
-           (ctrl.inWriteBusState(true) && (writeEntries != 0));
+           (dram.ctrl->inWriteBusState(true) && (writeEntries != 0));
 }

 DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl)
@@ -2723,15 +2724,15 @@
       ADD_STAT(writeReqs, "Number of write requests accepted"),

       ADD_STAT(readBursts,
-               "Number of DRAM read bursts, "
+               "Number of controller read bursts, "
                "including those serviced by the write queue"),
       ADD_STAT(writeBursts,
-               "Number of DRAM write bursts, "
+               "Number of controller write bursts, "
                "including those merged in the write queue"),
       ADD_STAT(servicedByWrQ,
-               "Number of DRAM read bursts serviced by the write queue"),
+               "Number of controller read bursts serviced by the write queue"),
       ADD_STAT(mergedWrBursts,
-               "Number of DRAM write bursts merged with an existing one"),
+               "Number of controller write bursts merged with an existing one"),
       ADD_STAT(neitherReadNorWriteReqs,
                "Number of requests that are neither read nor write"),
@@ -2739,9 +2740,6 @@
       ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"),
       ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"),

-      ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
-      ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
-
       ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"),
       ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"),
@@ -2756,22 +2754,13 @@
       ADD_STAT(wrPerTurnAround,
                "Writes before turning the bus around for reads"),

-      ADD_STAT(bytesRead, "Total number of bytes read from memory"),
       ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"),
-      ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
       ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"),
       ADD_STAT(bytesWrittenSys,
                "Total written bytes from the system interface side"),

-      ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"),
-      ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"),
       ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"),
       ADD_STAT(avgWrBWSys, "Average system write bandwidth in MiByte/s"),
-      ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
-
-      ADD_STAT(busUtil, "Data bus utilization in percentage"),
-      ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
-      ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),

       ADD_STAT(totGap, "Total gap between requests"),
       ADD_STAT(avgGap, "Average gap between requests"),
@@ -2803,12 +2792,11 @@
 {
     using namespace Stats;

-    assert(ctrl._system);
-    const auto max_masters = ctrl._system->maxMasters();
+    assert(ctrl.system());
+    const auto max_masters = ctrl.system()->maxMasters();

     avgRdQLen.precision(2);
     avgWrQLen.precision(2);
-    avgBusLat.precision(2);

     readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
     writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1);
@@ -2823,14 +2811,9 @@
         .init(ctrl.writeBufferSize)
         .flags(nozero);

-    avgRdBW.precision(2);
-    avgWrBW.precision(2);
     avgRdBWSys.precision(2);
     avgWrBWSys.precision(2);
-    peakBW.precision(2);
-    busUtil.precision(2);
     avgGap.precision(2);
-    busUtilWrite.precision(2);

     // per-master bytes read and written to memory
     masterReadBytes
@@ -2862,9 +2845,6 @@
         .flags(nonan)
         .precision(2);

-    busUtilRead
-        .precision(2);
-
     masterWriteRate
         .flags(nozero | nonan)
         .precision(12);
@@ -2878,7 +2858,7 @@
         .precision(2);

     for (int i = 0; i < max_masters; i++) {
-        const std::string master = ctrl._system->getMasterName(i);
+        const std::string master = ctrl.system()->getMasterName(i);
         masterReadBytes.subname(i, master);
         masterReadRate.subname(i, master);
         masterWriteBytes.subname(i, master);
@@ -2892,22 +2872,11 @@
     }

     // Formula stats
-    avgBusLat = totBusLat / (readBursts - servicedByWrQ);
-
-    avgRdBW = (bytesRead / 1000000) / simSeconds;
-    avgWrBW = (bytesWritten / 1000000) / simSeconds;
     avgRdBWSys = (bytesReadSys / 1000000) / simSeconds;
     avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds;
-    peakBW = (SimClock::Frequency / ctrl.dram->burstDataDelay()) *
-              ctrl.dram->bytesPerBurst() / 1000000;
-
-    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;

     avgGap = totGap / (readReqs + writeReqs);

-    busUtilRead = avgRdBW / peakBW * 100;
-    busUtilWrite = avgWrBW / peakBW * 100;
-
     masterReadRate = masterReadBytes / simSeconds;
     masterWriteRate = masterWriteBytes / simSeconds;
     masterReadAvgLat = masterReadTotalLat / masterReadAccesses;
    

@@ -2920,8 +2889,8 @@
     dram.lastStatsResetTick = curTick();
 }

-DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram)
-    : Stats::Group(&_ctrl, csprintf("dram").c_str()),
+DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram)
+    : Stats::Group(&_dram),
       dram(_dram),

       ADD_STAT(readBursts, "Number of DRAM read bursts"),
@@ -2931,10 +2900,13 @@
       ADD_STAT(perBankWrBursts, "Per bank write bursts"),

       ADD_STAT(totQLat, "Total ticks spent queuing"),
+      ADD_STAT(totBusLat, "Total ticks spent in databus transfers"),
       ADD_STAT(totMemAccLat,
                "Total ticks spent from burst creation until serviced "
                "by the DRAM"),

       ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"),
+      ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"),
       ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"),

       ADD_STAT(readRowHits, "Number of row buffer hits during reads"),
@@ -2947,6 +2919,12 @@
       ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"),
       ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"),
       ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"),
+      ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"),
+
+      ADD_STAT(busUtil, "Data bus utilization in percentage"),
+      ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"),
+      ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"),
+
       ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined")
 {
@@ -2958,6 +2936,7 @@
     using namespace Stats;

     avgQLat.precision(2);
+    avgBusLat.precision(2);
     avgMemAccLat.precision(2);

     readRowHitRate.precision(2);
@@ -2971,10 +2950,16 @@
              dram.maxAccessesPerRow : dram.rowBufferSize)
         .flags(nozero);

+    peakBW.precision(2);
+    busUtil.precision(2);
+    busUtilWrite.precision(2);
+    busUtilRead.precision(2);
+
     pageHitRate.precision(2);

     // Formula stats
     avgQLat = totQLat / readBursts;
+    avgBusLat = totBusLat / readBursts;
     avgMemAccLat = totMemAccLat / readBursts;

     readRowHitRate = (readRowHits / readBursts) * 100;
@@ -2982,13 +2967,19 @@
     avgRdBW = (bytesRead / 1000000) / simSeconds;
     avgWrBW = (bytesWritten / 1000000) / simSeconds;
+    peakBW = (SimClock::Frequency / dram.burstDataDelay()) *
+              dram.bytesPerBurst() / 1000000;
+
+    busUtil = (avgRdBW + avgWrBW) / peakBW * 100;
+    busUtilRead = avgRdBW / peakBW * 100;
+    busUtilWrite = avgWrBW / peakBW * 100;
+
     pageHitRate = (writeRowHits + readRowHits) /
                   (writeBursts + readBursts) * 100;
 }

-DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank)
-    : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()),
+DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank)
+    : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()),
       rank(_rank),

       ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"),
@@ -3047,7 +3038,7 @@
 DRAMCtrl::recvFunctional(PacketPtr pkt)
 {
     // rely on the abstract memory
-    functionalAccess(pkt);
+    dram->functionalAccess(pkt);
 }

 Port &
@@ -3093,6 +3084,7 @@
         // if we switched to timing mode, kick things into action,
         // and behave as if we restored from a checkpoint
         startup();
+        dram->startup();
     } else if (isTimingMode && !system()->isTimingMode()) {
         // if we switch from timing mode, stop the refresh events to
         // not cause issues with KVM
@@ -3112,7 +3104,7 @@
 DRAMCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(ctrl.getAddrRange());
+    ranges.push_back(ctrl.dram->getAddrRange());
     return ranges;
 }

diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh
index dc030b1..417e935 100644
--- a/src/mem/dram_ctrl.hh
+++ b/src/mem/dram_ctrl.hh
@@ -55,12 +55,15 @@
#include "enums/AddrMap.hh"
#include "enums/MemSched.hh"
#include "enums/PageManage.hh"
+#include "mem/abstract_mem.hh"
#include "mem/drampower.hh"
#include "mem/qos/mem_ctrl.hh"
#include "mem/qport.hh"
#include "params/DRAMCtrl.hh"
#include "sim/eventq.hh"

+class DRAMInterfaceParams;
+
/**

  • A basic class to track the bank state, i.e. what row is
  • currently open (if any), when is the bank free to accept a new
    @@ -242,7 +245,7 @@
  • The DRAMInterface includes a class for individual ranks
  • and per rank functions.
    /
    -class DRAMInterface : public SimObject
    +class DRAMInterface : public AbstractMemory
    {
    private:
    /
    *
    @@ -342,7 +345,7 @@
    class Rank;
    struct RankStats : public Stats::Group
    {
  •    RankStats(DRAMCtrl &ctrl, Rank &rank);
    
  •    RankStats(DRAMInterface &dram, Rank &rank);
    
        void regStats() override;
        void resetStats() override;
    

@@ -408,13 +411,6 @@
*/
class Rank : public EventManager
{

  •  protected:
    
  •    /**
    
  •     * A reference to the parent DRAMCtrl instance
    
  •     */
    
  •    DRAMCtrl& ctrl;
    
  •   private:
    
        /**
    

@@ -534,10 +530,10 @@
*/
Tick lastBurstTick;

  •    Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank,
    
  •    Rank(const DRAMInterfaceParams* _p, int _rank,
             DRAMInterface& _dram);
    
  •    const std::string name() const { return csprintf("dram_%d", rank);  
    

}

  •    const std::string name() const { return csprintf("%d", rank); }
    
        /**
         * Kick off accounting for power and refresh states and
    

@@ -659,15 +655,16 @@
* @param next Memory Command
* @return true if timeStamp of Command 1 < timeStamp of Command 2
*/

  • static bool sortTime(const Command& cmd, const Command& cmd_next)
  • static bool
  • sortTime(const Command& cmd, const Command& cmd_next)
    {
    return cmd.timeStamp < cmd_next.timeStamp;
  • };
  • }

    /**
    
  • * A reference to the parent DRAMCtrl instance
    
  • * A pointer to the parent DRAMCtrl instance
     */
    
  • DRAMCtrl& ctrl;
  • DRAMCtrl* ctrl;

    /**

    • Memory controller configuration initialized based on parameter
      @@ -698,6 +695,7 @@
    • DRAM timing requirements
      */
      const Tick M5_CLASS_VAR_USED tCK;
  • const Tick tCS;
    const Tick tCL;
    const Tick tBURST;
    const Tick tBURST_MIN;
    @@ -781,7 +779,7 @@

    struct DRAMStats : public Stats::Group
    {

  •    DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram);
    
  •    DRAMStats(DRAMInterface &dram);
    
        void regStats() override;
        void resetStats() override;
    

@@ -798,10 +796,12 @@

        // Latencies summed over all requests
        Stats::Scalar totQLat;
+        Stats::Scalar totBusLat;
        Stats::Scalar totMemAccLat;

        // Average latencies per request
        Stats::Formula avgQLat;
+        Stats::Formula avgBusLat;
        Stats::Formula avgMemAccLat;

        // Row hit count and rate

@@ -817,6 +817,11 @@
        // Average bandwidth
        Stats::Formula avgRdBW;
        Stats::Formula avgWrBW;
+        Stats::Formula peakBW;
+        // bus utilization
+        Stats::Formula busUtil;
+        Stats::Formula busUtilRead;
+        Stats::Formula busUtilWrite;
        Stats::Formula pageHitRate;
    };
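
Moving these formulas into the per-interface stats lets utilization be
reported against each interface's own peak bandwidth. Illustratively, the
generic relationship looks like the sketch below (an assumed, simplified
form, not necessarily the model's exact formula):

    def bus_util_pct(avg_rd_bw, avg_wr_bw, peak_bw):
        # utilization = achieved bandwidth / theoretical peak, in percent
        return 100.0 * (avg_rd_bw + avg_wr_bw) / peak_bw

    print(bus_util_pct(6.4e9, 3.2e9, 12.8e9))  # 75.0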
    

@@ -828,16 +833,28 @@
    std::vector<Rank*> ranks;

  public:
+    /**
+     * Buffer sizes for read and write queues in the controller
+     * These are passed to the controller on instantiation
+     * Defining them here allows for buffers to be resized based
+     * on memory type / configuration.
+     */
+    const uint32_t readBufferSize;
+    const uint32_t writeBufferSize;
+
+    /** Setting a pointer to the controller */
+    void setCtrl(DRAMCtrl* _ctrl) { ctrl = _ctrl; }
+
+    /**
     * Initialize the DRAM interface and verify parameters
-     * @param range is the address range for this interface
     */
-    void init(AddrRange range);
+    void init() override;

    /**
     * Iterate through dram ranks and instantiate per rank startup routine
     */
-    void startupRanks();
+    void startup() override;

    /**
     * Iterate through dram ranks to exit self-refresh in order to drain
@@ -861,15 +878,26 @@
    void suspend();

    /**
+     * Get an address in a dense range which starts from 0. The input
+     * address is the physical address of the request in an address
+     * space that contains other SimObjects apart from this
+     * controller.
+     *
+     * @param addr The input address which should be in the addrRange
+     * @return An address in the continuous range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr) { return range.getOffset(addr); }
+
+    /**
     * @return number of bytes in a burst for this interface
     */
-    uint32_t bytesPerBurst() const { return burstSize; };
+    uint32_t bytesPerBurst() const { return burstSize; }

    /**
     * @return number of ranks per channel for this interface
     */
-    uint32_t numRanks() const { return ranksPerChannel; };
+    uint32_t numRanks() const { return ranksPerChannel; }

    /**
     * @return time to send a burst of data
@@ -879,7 +907,8 @@
    /**
     * @return time to send a burst of data without gaps
     */
-    Tick burstDataDelay() const
+    Tick
+    burstDataDelay() const
    {
        return (burstInterleave ? tBURST_MAX / 2 : tBURST);
    }
@@ -893,7 +922,14 @@
    /**
     * @return additional bus turnaround required for read-to-write
     */
-    Tick minRdToWr() const { return tRTW; };
+    Tick minRdToWr() const { return tRTW; }
+
+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() const { return tCS; }

    /**
     * Function to calculate RAS cycle time for use within and
@@ -957,7 +993,8 @@
     *                     This requires the DRAM to be in the
     *                     REF IDLE state
     */
-    bool burstReady(uint8_t rank) const
+    bool
+    burstReady(uint8_t rank) const
    {
        return ranks[rank]->inRefIdleState();
    }
@@ -979,7 +1016,7 @@
    /**
     * @param rank Specifies rank associated with read burst
     */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);

    /**
     * Check the refresh state to determine if refresh needs
@@ -989,8 +1026,7 @@
     */
    void checkRefreshState(uint8_t rank);

-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
};

/**
@@ -1141,20 +1177,6 @@
    void accessAndRespond(PacketPtr pkt, Tick static_latency);

    /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The input address which should be in the addrRange
-     * @return An address in the continuous range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
     * The memory scheduler/arbiter - picks which request needs to
     * go next, based on the specified policy such as FCFS or FR-FCFS
     * and moves it to the head of the queue.
@@ -1237,6 +1259,11 @@
    std::unordered_multiset<Tick> burstTicks;

    /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
     * The following are basic design parameters of the memory
     * controller, and are initialized based on parameter values.
     * The rowsPerBank is determined based on the capacity, number of
@@ -1251,12 +1278,6 @@
    uint32_t readsThisTime;

    /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
     * Memory controller configuration initialized based on parameter
     * values.
     */
@@ -1310,10 +1331,6 @@
        // Average queue lengths
        Stats::Average avgRdQLen;
        Stats::Average avgWrQLen;
-        // Latencies summed over all requests
-        Stats::Scalar totBusLat;
-        // Average latencies per request
-        Stats::Formula avgBusLat;

        Stats::Scalar numRdRetry;
        Stats::Scalar numWrRetry;

@@ -1324,21 +1341,12 @@
        Stats::Histogram rdPerTurnAround;
        Stats::Histogram wrPerTurnAround;

-        Stats::Scalar bytesRead;
        Stats::Scalar bytesReadWrQ;
-        Stats::Scalar bytesWritten;
        Stats::Scalar bytesReadSys;
        Stats::Scalar bytesWrittenSys;
        // Average bandwidth
-        Stats::Formula avgRdBW;
-        Stats::Formula avgWrBW;
        Stats::Formula avgRdBWSys;
        Stats::Formula avgWrBWSys;
-        Stats::Formula peakBW;
-        // bus utilization
-        Stats::Formula busUtil;
-        Stats::Formula busUtilRead;
-        Stats::Formula busUtilWrite;

        Stats::Scalar totGap;
        Stats::Formula avgGap;

@@ -1367,11 +1375,6 @@
    CtrlStats stats;

    /**
-     * Create pointer to interface to the actual media
-     */
-    DRAMInterface* dram;
-
-    /**
     * Upstream caches need this packet until true is returned, so
     * hold it for deletion until a subsequent call
     */
@@ -1449,13 +1452,6 @@
    void restartScheduler(Tick tick) { schedule(nextReqEvent, tick); }

    /**
-     * Determine the required delay for an access to a different rank
-     *
-     * @return required rank to rank delay
-     */
-    Tick rankDelay() const { return tCS; }
-
-    /**
     * Check the current direction of the memory channel
     * @param next_state Check either the current or next bus state

diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc
index 13551a0..96dcb55 100644
--- a/src/mem/drampower.cc
+++ b/src/mem/drampower.cc
@@ -40,13 +40,13 @@
#include "base/intmath.hh"
#include "sim/core.hh"

-DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) :
+DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) :
powerlib(libDRAMPower(getMemSpec(p), include_io))
{
}

Data::MemArchitectureSpec
-DRAMPower::getArchParams(const DRAMCtrlParams* p)
+DRAMPower::getArchParams(const DRAMInterfaceParams* p)
{
Data::MemArchitectureSpec archSpec;
archSpec.burstLength = p->burst_length;
@@ -68,7 +68,7 @@
}

Data::MemTimingSpec
-DRAMPower::getTimingParams(const DRAMCtrlParams* p)
+DRAMPower::getTimingParams(const DRAMInterfaceParams* p)
{
// Set the values that are used for power calculations and ignore
// the ones only used by the controller functionality in DRAMPower
@@ -100,7 +100,7 @@
}

Data::MemPowerSpec
-DRAMPower::getPowerParams(const DRAMCtrlParams* p)
+DRAMPower::getPowerParams(const DRAMInterfaceParams* p)
{
// All DRAMPower currents are in mA
Data::MemPowerSpec powerSpec;
@@ -132,7 +132,7 @@
}

Data::MemorySpecification
-DRAMPower::getMemSpec(const DRAMCtrlParams* p)
+DRAMPower::getMemSpec(const DRAMInterfaceParams* p)
{
Data::MemorySpecification memSpec;
memSpec.memArchSpec = getArchParams(p);
@@ -142,7 +142,18 @@
}

bool
-DRAMPower::hasTwoVDD(const DRAMCtrlParams* p)
+DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p)
{
return p->VDD2 == 0 ? false : true;
}
+
+uint8_t
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
+{
+    uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
+    uint8_t data_rate = p->burst_length / burst_cycles;
+    // 4 for GDDR5
+    if (data_rate != 1 && data_rate != 2 && data_rate != 4 && data_rate != 8)
+        fatal("Got unexpected data rate %d, should be 1 or 2 or 4 or 8\n",
+              data_rate);
+    return data_rate;
+}
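
As a sanity check of this computation, take the stock DDR3-1600 interface
from this change: tCK = 1.25 ns, tBURST (and thus tBURST_MAX) = 5 ns, burst
length 8. That gives divCeil(5, 1.25) = 4 bus clocks and 8 / 4 = 2 beats per
clock, i.e. a DDR device. The same arithmetic in a minimal Python sketch
(the function name is illustrative, not part of the change):

    import math

    def data_rate(tBURST_MAX_ns, tCK_ns, burst_length):
        # Beats per bus clock, mirroring DRAMPower::getDataRate
        burst_cycles = math.ceil(tBURST_MAX_ns / tCK_ns)
        rate = burst_length // burst_cycles
        assert rate in (1, 2, 4, 8), "unexpected data rate"
        return rate

    print(data_rate(5.0, 1.25, 8))  # DDR3-1600: 4 cycles, 8 beats -> 2 (DDR)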
    diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
    index da24bca..da68a78 100644
    --- a/src/mem/drampower.hh
    +++ b/src/mem/drampower.hh
    @@ -44,7 +44,7 @@
    #define MEM_DRAM_POWER_HH

#include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"

/**
 * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,38 +57,44 @@

    /**
     * Transform the architecture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
     */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+                                     const DRAMInterfaceParams* p);

    /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
     * the memSpec of DRAMPower
     */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);

    /**
     * Transforms the power and current parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
     */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);

+    /**
+     * Determine data rate, either one or two.
+     */
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);
+
    /**
     * Determine if DRAM has two voltage domains (or one)
     */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);

    /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
     */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);

  public:

    // Instance of DRAMPower Library
    libDRAMPower powerlib;

-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);

};

diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -34,18 +34,21 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from m5.params import *
-from m5.objects.AbstractMemory import AbstractMemory
+from m5.proxy import *
+from m5.objects.ClockedObject import ClockedObject
from m5.objects.QoSTurnaround import *

# QoS Queue Selection policy used to select packets among same-QoS queues
class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"]

-class QoSMemCtrl(AbstractMemory):
+class QoSMemCtrl(ClockedObject):
    type = 'QoSMemCtrl'
    cxx_header = "mem/qos/mem_ctrl.hh"
    cxx_class = 'QoS::MemCtrl'
    abstract = True

+    system = Param.System(Parent.any,
+                          "System that the controller belongs to.")
+
    ##### QoS support parameters ####

    # Number of priorities in the system
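
Because the controller is no longer an AbstractMemory, it now picks up its
System through this explicit proxy parameter; the default Parent.any
resolves when the object hierarchy is instantiated, so config scripts need
no change. A minimal, illustrative sketch:

    from m5.objects import *

    system = System()
    # Any QoSMemCtrl subclass placed under 'system' resolves its
    # 'system' parameter to this System object via Parent.any.
    system.mem_ctrl = QoSMemSinkCtrl()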
    

diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py
index 6c4f263..fafac64 100644
--- a/src/mem/qos/QoSMemSinkCtrl.py
+++ b/src/mem/qos/QoSMemSinkCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -37,6 +37,7 @@

from m5.params import *
from m5.objects.QoSMemCtrl import *
+from m5.objects.QoSMemSinkInterface import *

class QoSMemSinkCtrl(QoSMemCtrl):
    type = 'QoSMemSinkCtrl'
@@ -44,6 +45,10 @@
    cxx_class = "QoS::MemSinkCtrl"
    port = ResponsePort("Response ports")

+    interface = Param.QoSMemSinkInterface(QoSMemSinkInterface(),
+                                          "Interface to memory")
+
    # the basic configuration of the controller architecture, note
    # that each entry corresponds to a burst for the specific DRAM
    # configuration (e.g. x32 with burst length 8 is 32 bytes) and not
    # the cacheline size or request/packet size

@@ -59,5 +64,3 @@

    # response latency - time to issue a response once a request is
    # serviced
    response_latency = Param.Latency("20ns", "Memory response latency")
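
With the interface exposed as a parameter, the backing memory of the sink is
sized on the interface (an AbstractMemory) rather than on the controller. A
hypothetical override in a config script:

    from m5.objects import *

    ctrl = QoSMemSinkCtrl()
    # The address range now lives on the QoSMemSinkInterface, which
    # inherits it from AbstractMemory.
    ctrl.interface = QoSMemSinkInterface(range = AddrRange('512MB'))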

diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py
new file mode 100644
index 0000000..5c79f64
--- /dev/null
+++ b/src/mem/qos/QoSMemSinkInterface.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from m5.objects.AbstractMemory import AbstractMemory
+
+class QoSMemSinkInterface(AbstractMemory):
+    type = 'QoSMemSinkInterface'
+    cxx_header = "mem/qos/mem_sink.hh"

diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript
index f8601b6..1d90f9c 100644
--- a/src/mem/qos/SConscript
+++ b/src/mem/qos/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall

@@ -37,6 +37,7 @@

SimObject('QoSMemCtrl.py')
SimObject('QoSMemSinkCtrl.py')
+SimObject('QoSMemSinkInterface.py')
SimObject('QoSPolicy.py')
SimObject('QoSTurnaround.py')

diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc
index 50e6035..190960b 100644
--- a/src/mem/qos/mem_ctrl.cc
+++ b/src/mem/qos/mem_ctrl.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited
+ * Copyright (c) 2017-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -42,7 +42,7 @@
namespace QoS {

MemCtrl::MemCtrl(const QoSMemCtrlParams * p)
-    : AbstractMemory(p),
+    : ClockedObject(p),
      policy(p->qos_policy),
      turnPolicy(p->qos_turnaround_policy),
      queuePolicy(QueuePolicy::create(p)),
@@ -51,7 +51,8 @@
      qosSyncroScheduler(p->qos_syncro_scheduler),
      totalReadQueueSize(0), totalWriteQueueSize(0),
      busState(READ), busStateNext(READ),
-      stats(*this)
+      stats(*this),
+      _system(p->system)
{
    // Set the priority policy
    if (policy) {
@@ -77,12 +78,6 @@
{}

void
-MemCtrl::init()
-{
-    AbstractMemory::init();
-}
-
-void
MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos,
                    Addr addr, uint64_t entries)
{
diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh
index 0e29fcc..5d7c9d6 100644
--- a/src/mem/qos/mem_ctrl.hh
+++ b/src/mem/qos/mem_ctrl.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited
+ * Copyright (c) 2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -36,10 +36,10 @@
 */

    #include "debug/QOS.hh"
    -#include "mem/abstract_mem.hh"
    -#include "mem/qos/q_policy.hh"
    #include "mem/qos/policy.hh"
    +#include "mem/qos/q_policy.hh"
    #include "params/QoSMemCtrl.hh"
    +#include "sim/clocked_object.hh"
    #include "sim/system.hh"

    #include <unordered_map>
    @@ -56,7 +56,7 @@

    • which support QoS - it provides access to a set of QoS
    • scheduling policies
      /
      -class MemCtrl: public AbstractMemory
      +class MemCtrl : public ClockedObject
      {
      public:
      /
      * Bus Direction */
@@ -151,6 +151,9 @@
        Stats::Scalar numStayWriteState;
    } stats;

+    /** Pointer to the System object */
+    System* _system;
+
    /**
     * Initializes dynamically counters and
     * statistics for a given Master

@@ -266,11 +269,6 @@
    virtual ~MemCtrl();

    /**
-     * Initializes this object
-     */
-    void init() override;
-
-    /**
     * Gets the current bus state
     * @return current bus state
@@ -346,6 +344,10 @@
     * @return total number of priority levels
     */
    uint8_t numPriorities() const { return _numPriorities; }
+
+    /** Read the system pointer
+     * @return pointer to the system object */
+    System* system() const { return _system; }
};

template<typename Queues>
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..dbdf548 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
      #include "debug/Drain.hh"
      #include "debug/QOS.hh"
      #include "mem_sink.hh"
      +#include "params/QoSMemSinkInterface.hh"
      #include "sim/system.hh"

namespace QoS {
@@ -50,12 +51,15 @@
    memoryPacketSize(p->memory_packet_size),
    readBufferSize(p->read_buffer_size),
    writeBufferSize(p->write_buffer_size), port(name() + ".port", *this),
+    interface(p->interface),
    retryRdReq(false), retryWrReq(false), nextRequest(0),
    nextReqEvent(this)
{
    // Resize read and write queue to allocate space
    // for configured QoS priorities
    readQueue.resize(numPriorities());
    writeQueue.resize(numPriorities());
+
+    interface->setMemCtrl(this);
}

MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
             "%s Should not see packets where cache is responding\n",
             func);

-    access(pkt);
+    interface->access(pkt);
    return responseLatency;
}

@@ -101,7 +105,7 @@
{
    pkt->pushLabel(name());

-    functionalAccess(pkt);
+    interface->functionalAccess(pkt);

    pkt->popLabel();
}

@@ -279,7 +283,7 @@

    // Do the actual memory access which also turns the packet
    // into a response
-    access(pkt);
+    interface->access(pkt);

    // Log the response
    logResponse(pkt->isRead()? READ : WRITE,

@@ -351,7 +355,7 @@
MemSinkCtrl::MemoryPort::getAddrRanges() const
{
    AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.interface->getAddrRange());
    return ranges;
}

@@ -390,3 +394,13 @@
    return new QoS::MemSinkCtrl(this);
}

+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
    diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
    index 9a51269..5f6c1be 100644
    --- a/src/mem/qos/mem_sink.hh
    +++ b/src/mem/qos/mem_sink.hh
    @@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
 * All rights reserved
 * The license below extends only to copyright in the software and shall
      @@ -41,10 +41,14 @@
      #ifndef MEM_QOS_MEM_SINK_HH
      #define MEM_QOS_MEM_SINK_HH

+#include "mem/abstract_mem.hh"
#include "mem/qos/mem_ctrl.hh"
#include "mem/qport.hh"
#include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
namespace QoS {

/**
@@ -163,6 +167,11 @@
    /** Memory slave port */
    MemoryPort port;

+    /**
+     * Create pointer to interface of actual media
+     */
+    QoSMemSinkInterface* const interface;
+
    /** Read request pending */
    bool retryRdReq;
    

@@ -244,4 +253,17 @@

} // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Setting a pointer to the interface */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl) { ctrl = _ctrl; };
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
#endif /* MEM_QOS_MEM_SINK_HH */

diff --git a/tests/gem5/configs/base_config.py b/tests/gem5/configs/base_config.py
index b5bddf4..cbea768 100644
--- a/tests/gem5/configs/base_config.py
+++ b/tests/gem5/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall

@@ -220,7 +220,12 @@
        super(BaseSESystem, self).init_system(system)

    def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                        membus = SystemXBar(),
                        mem_mode = self.mem_mode,
                        multi_thread = (self.num_threads > 1))
    

@@ -272,8 +277,16 @@
        else:
            # create the memory controllers and connect them, stick with
            # the physmem name to avoid bumping all the reference stats
-            system.physmem = [self.mem_class(range = r)
-                              for r in system.mem_ranges]
+            if issubclass(self.mem_class, m5.objects.DRAMInterface):
+                mem_ctrls = []
+                for r in system.mem_ranges:
+                    mem_ctrl = DRAMCtrl()
+                    mem_ctrl.dram = self.mem_class(range = r)
+                    mem_ctrls.append(mem_ctrl)
+                system.physmem = mem_ctrls
+            else:
+                system.physmem = [self.mem_class(range = r)
+                                  for r in system.mem_ranges]
            for i in range(len(system.physmem)):
                system.physmem[i].port = system.membus.master
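
The same pattern works in any script that builds one controller per memory
range; a standalone sketch of the new scheme (the helper name is
illustrative, not part of this change):

    from m5.objects import *

    def make_dram_ctrls(system, intf_class = DDR3_1600_8x8):
        # One DRAMCtrl per range, each driving its own DRAM interface
        ctrls = []
        for r in system.mem_ranges:
            ctrl = DRAMCtrl()
            ctrl.dram = intf_class(range = r)
            ctrls.append(ctrl)
        return ctrls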
    

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 13
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-Reviewer: Daniel Carvalho <odanrc(a)yahoo.com.br>
Gerrit-Reviewer: Jason Lowe-Power <power.jg(a)gmail.com>
Gerrit-Reviewer: John Alsop <johnathan.alsop(a)amd.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba(a)amd.com>
Gerrit-Reviewer: Nikos Nikoleris <nikos.nikoleris(a)arm.com>
Gerrit-Reviewer: Srikant Bharadwaj <srikant.bharadwaj(a)amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro(a)google.com>
Gerrit-MessageType: merged

Jason Lowe-Power has submitted this change. (
https://gem5-review.googlesource.com/c/public/gem5/+/28968 )

Change subject: mem: Make MemCtrl a ClockedObject
......................................................................

mem: Make MemCtrl a ClockedObject

Made DRAMCtrl a ClockedObject, with DRAMInterface
defined as an AbstractMemory. The address
ranges are now defined per interface. Currently
the model only includes a DRAMInterface but this
can be expanded for other media types.

The controller object includes a parameter to the
interface, which is setup when gem5 is configured.

Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28968
Reviewed-by: Jason Lowe-Power <power.jg(a)gmail.com>
Maintainer: Jason Lowe-Power <power.jg(a)gmail.com>
Tested-by: kokoro <noreply+kokoro(a)google.com>
---
M configs/common/MemConfig.py
M configs/dram/low_power_sweep.py
M configs/dram/sweep.py
M configs/example/memcheck.py
M configs/learning_gem5/part1/simple.py
M configs/learning_gem5/part1/two_level.py
M configs/learning_gem5/part2/simple_cache.py
M configs/learning_gem5/part2/simple_memobj.py
M configs/learning_gem5/part3/simple_ruby.py
M configs/ruby/Ruby.py
M src/mem/DRAMCtrl.py
A src/mem/DRAMInterface.py
M src/mem/SConscript
M src/mem/dram_ctrl.cc
M src/mem/dram_ctrl.hh
M src/mem/drampower.cc
M src/mem/drampower.hh
M src/mem/qos/QoSMemCtrl.py
M src/mem/qos/QoSMemSinkCtrl.py
A src/mem/qos/QoSMemSinkInterface.py
M src/mem/qos/SConscript
M src/mem/qos/mem_ctrl.cc
M src/mem/qos/mem_ctrl.hh
M src/mem/qos/mem_sink.cc
M src/mem/qos/mem_sink.hh
M tests/gem5/configs/base_config.py
26 files changed, 1,913 insertions(+), 1,736 deletions(-)

Approvals:
  Jason Lowe-Power: Looks good to me, approved; Looks good to me, approved
  kokoro: Regressions pass
[The merged patch set's full diff repeats the review diff above; only the hunks for the two config files new to this patch set are reproduced here.]

diff --git a/configs/example/memcheck.py b/configs/example/memcheck.py
index 6d80d60..6bccd54 100644
--- a/configs/example/memcheck.py
+++ b/configs/example/memcheck.py
@@ -217,7 +217,7 @@
proto_tester = TrafficGen(config_file = cfg_file_path)

# Set up the system along with a DRAM controller
-system = System(physmem = DDR3_1600_8x8())
+system = System(physmem = DRAMCtrl(dram = DDR3_1600_8x8()))

system.voltage_domain = VoltageDomain(voltage = '1V')

diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 9bceaa3..9f400a8 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -130,15 +130,16 @@
    dir_ranges = []
    for r in system.mem_ranges:
        mem_type = ObjectList.mem_list.get(options.mem_type)
-        mem_ctrl = MemConfig.create_mem_ctrl(mem_type, r, index,
+        dram_intf = MemConfig.create_mem_intf(mem_type, r, index,
            options.num_dirs, int(math.log(options.num_dirs, 2)),
            intlv_size, options.xor_low_bit)
+        mem_ctrl = m5.objects.DRAMCtrl(dram = dram_intf)

        if options.access_backing_store:
            mem_ctrl.kvm_map=False

        mem_ctrls.append(mem_ctrl)
-        dir_ranges.append(mem_ctrl.range)
+        dir_ranges.append(mem_ctrl.dram.range)

        if crossbar != None:
            mem_ctrl.port = crossbar.master
-# Total channel capacity is 4GB -# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel -class DDR4_2400_4x16(DDR4_2400_16x4): - # 4x16 configuration, 4 devices each with an 16-bit interface - device_bus_width = 16 - - # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) - device_rowbuffer_size = '2kB' - - # 4x16 configuration, so 4 devices - devices_per_rank = 4 - - # Single rank for x16 - ranks_per_channel = 1 - - # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups - # Set to 2 for x16 case - bank_groups_per_rank = 2 - - # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all - # configurations). Currently we do not capture the additional - # constraints incurred by the bank groups - banks_per_rank = 8 - - # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) - tRRD = '5.3ns' - - # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) - tRRD_L = '6.4ns'; - - tXAW = '30ns' - - # Current values from datasheet - IDD0 = '80mA' - IDD02 = '4mA' - IDD2N = '34mA' - IDD3N = '47mA' - IDD4W = '228mA' - IDD4R = '243mA' - IDD5 = '280mA' - IDD3P1 = '41mA' - -# A single LPDDR2-S4 x32 interface (one command/address bus), with -# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) -# in a 1x32 configuration. -class LPDDR2_S4_1066_1x32(DRAMCtrl): - # No DLL in LPDDR2 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR2_S4 is a BL4 and BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 1KB - # (this depends on the memory density) - device_rowbuffer_size = '1kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR2-S4 has 8 banks in all configurations - banks_per_rank = 8 - - # 533 MHz - tCK = '1.876ns' - - # Fixed at 15 ns - tRCD = '15ns' - - # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time - tCL = '15ns' - - # Pre-charge one bank 15 ns (all banks 18 ns) - tRP = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - tRTP = '7.5ns' - - # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. - # Note this is a BL8 DDR device. - # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '7.5ns' - - # LPDDR2-S4, 4 Gbit - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns - tRTW = '3.75ns' - - # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns - tCS = '3.75ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of density, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '15mA' - IDD02 = '70mA' - IDD2N = '2mA' - IDD2N2 = '30mA' - IDD3N = '2.5mA' - IDD3N2 = '30mA' - IDD4W = '10mA' - IDD4W2 = '190mA' - IDD4R = '3mA' - IDD4R2 = '220mA' - IDD5 = '40mA' - IDD52 = '150mA' - IDD3P1 = '1.2mA' - IDD3P12 = '8mA' - IDD2P1 = '0.6mA' - IDD2P12 = '0.8mA' - IDD6 = '1mA' - IDD62 = '3.2mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single WideIO x128 interface (one command and address bus), with -# default timings based on an estimated WIO-200 8 Gbit part. 
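Before the WideIO class, it may help to make the geometry parameters concrete: the controller's access granularity is the interface width times the burst length, and requests larger than one burst are split, as the LPDDR2 comment above notes. A small plain-Python sketch (the helper name is illustrative, not gem5 code):

def burst_size_bytes(device_bus_width, devices_per_rank, burst_length):
    # interface width in bytes, times beats per burst
    return (device_bus_width * devices_per_rank // 8) * burst_length

# DDR4_2400_4x16 above: 4 x16 devices -> x64 bus, BL8 -> 64 bytes per burst
print(burst_size_bytes(16, 4, 8))        # -> 64
# LPDDR2_S4_1066_1x32 above: one x32 device, BL8 -> 32 bytes per burst,
# so a 64-byte cache line is broken into two bursts by the controller
print(64 // burst_size_bytes(32, 1, 8))  # -> 2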
-class WideIO_200_1x128(DRAMCtrl): - # No DLL for WideIO - dll = False - - # size of device - device_size = '1024MB' - - # 1x128 configuration, 1 device with a 128-bit interface - device_bus_width = 128 - - # This is a BL4 device - burst_length = 4 - - # Each device has a page (row buffer) size of 4KB - # (this depends on the memory density) - device_rowbuffer_size = '4kB' - - # 1x128 configuration, so 1 device - devices_per_rank = 1 - - # Use one rank for a one-high die stack - ranks_per_channel = 1 - - # WideIO has 4 banks in all configurations - banks_per_rank = 4 - - # 200 MHz - tCK = '5ns' - - # WIO-200 - tRCD = '18ns' - tCL = '18ns' - tRP = '18ns' - tRAS = '42ns' - tWR = '15ns' - # Read to precharge is same as the burst - tRTP = '20ns' - - # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. - # Note this is a BL4 SDR device. - tBURST = '20ns' - - # WIO 8 Gb - tRFC = '210ns' - - # WIO 8 Gb, <=85C, half for >85C - tREFI = '3.9us' - - # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns - tWTR = '15ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns - tRTW = '10ns' - - # Default different rank bus delay to 2 CK, @200 MHz = 10 ns - tCS = '10ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Two instead of four activation window - tXAW = '50ns' - activation_limit = 2 - - # The WideIO specification does not provide current information - -# A single LPDDR3 x32 interface (one command/address bus), with -# default timings based on a LPDDR3-1600 4 Gbit part (Micron -# EDF8132A1MC) in a 1x32 configuration. -class LPDDR3_1600_1x32(DRAMCtrl): - # No DLL for LPDDR3 - dll = False - - # size of device - device_size = '512MB' - - # 1x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # LPDDR3 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 4KB - device_rowbuffer_size = '4kB' - - # 1x32 configuration, so 1 device - devices_per_rank = 1 - - # Technically the datasheet is a dual-rank package, but for - # comparison with the LPDDR2 config we stick to a single rank - ranks_per_channel = 1 - - # LPDDR3 has 8 banks in all configurations - banks_per_rank = 8 - - # 800 MHz - tCK = '1.25ns' - - tRCD = '18ns' - - # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time - tCL = '15ns' - - tRAS = '42ns' - tWR = '15ns' - - # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns - tRTP = '7.5ns' - - # Pre-charge one bank 18 ns (all banks 21 ns) - tRP = '18ns' - - # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. - # Note this is a BL8 DDR device. 
- # Requests larger than 32 bytes are broken down into multiple requests - # in the controller - tBURST = '5ns' - - # LPDDR3, 4 Gb - tRFC = '130ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '7.5ns' - - # self refresh exit time - tXS = '140ns' - - # Irrespective of speed grade, tWTR is 7.5 ns - tWTR = '7.5ns' - - # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns - tRTW = '2.5ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # Activate to activate irrespective of density and speed grade - tRRD = '10.0ns' - - # Irrespective of size, tFAW is 50 ns - tXAW = '50ns' - activation_limit = 4 - - # Current values from datasheet - IDD0 = '8mA' - IDD02 = '60mA' - IDD2N = '0.8mA' - IDD2N2 = '26mA' - IDD3N = '2mA' - IDD3N2 = '34mA' - IDD4W = '2mA' - IDD4W2 = '190mA' - IDD4R = '2mA' - IDD4R2 = '230mA' - IDD5 = '28mA' - IDD52 = '150mA' - IDD3P1 = '1.4mA' - IDD3P12 = '11mA' - IDD2P1 = '0.8mA' - IDD2P12 = '1.8mA' - IDD6 = '0.5mA' - IDD62 = '1.8mA' - VDD = '1.8V' - VDD2 = '1.2V' - -# A single GDDR5 x64 interface, with -# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix -# H5GQ1H24AFR) in a 2x32 configuration. -class GDDR5_4000_2x32(DRAMCtrl): - # size of device - device_size = '128MB' - - # 2x32 configuration, 1 device with a 32-bit interface - device_bus_width = 32 - - # GDDR5 is a BL8 device - burst_length = 8 - - # Each device has a page (row buffer) size of 2Kbits (256Bytes) - device_rowbuffer_size = '256B' - - # 2x32 configuration, so 2 devices - devices_per_rank = 2 - - # assume single rank - ranks_per_channel = 1 - - # GDDR5 has 4 bank groups - bank_groups_per_rank = 4 - - # GDDR5 has 16 banks with 4 bank groups - banks_per_rank = 16 - - # 1000 MHz - tCK = '1ns' - - # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz - # Data bus runs @2000 Mhz => DDR ( data runs at 4000 MHz ) - # 8 beats at 4000 MHz = 2 beats at 1000 MHz - # tBURST is equivalent to the CAS-to-CAS delay (tCCD) - # With bank group architectures, tBURST represents the CAS-to-CAS - # delay for bursts to different bank groups (tCCD_S) - tBURST = '2ns' - - # @1000MHz data rate, tCCD_L is 3 CK - # CAS-to-CAS delay for bursts to the same bank group - # tBURST is equivalent to tCCD_S; no explicit parameter required - # for CAS-to-CAS delay for bursts to different bank groups - tCCD_L = '3ns'; - - tRCD = '12ns' - - # tCL is not directly found in datasheet and assumed equal tRCD - tCL = '12ns' - - tRP = '12ns' - tRAS = '28ns' - - # RRD_S (different bank group) - # RRD_S is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD = '6ns' - - # RRD_L (same bank group) - # RRD_L is 5.5 ns in datasheet. - # rounded to the next multiple of tCK - tRRD_L = '6ns' - - tXAW = '23ns' - - # tXAW < 4 x tRRD. - # Therefore, activation limit is set to 0 - activation_limit = 0 - - tRFC = '65ns' - tWR = '12ns' - - # Here using the average of WTR_S and WTR_L - tWTR = '5ns' - - # Read-to-Precharge 2 CK - tRTP = '2ns' - - # Assume 2 cycles - tRTW = '2ns' - -# A single HBM x128 interface (one command and address bus), with -# default timings based on data publically released -# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014), -# IDD measurement values, and by extrapolating data from other classes. -# Architecture values based on published HBM spec -# A 4H stack is defined, 2Gb per die for a total of 1GB of memory. 
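The GDDR5 class above is the clearest example of bank-group timing: consecutive column commands to the same bank group must be spaced by tCCD_L, while commands to different bank groups only pay tBURST (i.e. tCCD_S). A minimal plain-Python sketch of that selection using the GDDR5 numbers (an illustrative helper, not the controller's actual scheduling code):

def cas_to_cas_gap_ns(same_bank_group, tBURST_ns, tCCD_L_ns):
    # the same bank group pays the longer tCCD_L; different groups pay
    # tCCD_S, which the model expresses through tBURST
    return tCCD_L_ns if same_bank_group else tBURST_ns

print(cas_to_cas_gap_ns(True, 2.0, 3.0))    # same group  -> 3.0 ns
print(cas_to_cas_gap_ns(False, 2.0, 3.0))   # cross group -> 2.0 ns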
-class HBM_1000_4H_1x128(DRAMCtrl): - # HBM gen1 supports up to 8 128-bit physical channels - # Configuration defines a single channel, with the capacity - # set to (full_ stack_capacity / 8) based on 2Gb dies - # To use all 8 channels, set 'channels' parameter to 8 in - # system configuration - - # 128-bit interface legacy mode - device_bus_width = 128 - - # HBM supports BL4 and BL2 (legacy mode only) - burst_length = 4 - - # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack; - # with 8 channels, 128MB per channel - device_size = '128MB' - - device_rowbuffer_size = '2kB' - - # 1x128 configuration - devices_per_rank = 1 - - # HBM does not have a CS pin; set rank to 1 - ranks_per_channel = 1 - - # HBM has 8 or 16 banks depending on capacity - # 2Gb dies have 8 banks - banks_per_rank = 8 - - # depending on frequency, bank groups may be required - # will always have 4 bank groups when enabled - # current specifications do not define the minimum frequency for - # bank group architecture - # setting bank_groups_per_rank to 0 to disable until range is defined - bank_groups_per_rank = 0 - - # 500 MHz for 1Gbps DDR data rate - tCK = '2ns' - - # use values from IDD measurement in JEDEC spec - # use tRP value for tRCD and tCL similar to other classes - tRP = '15ns' - tRCD = '15ns' - tCL = '15ns' - tRAS = '33ns' - - # BL2 and BL4 supported, default to BL4 - # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns - tBURST = '4ns' - - # value for 2Gb device from JEDEC spec - tRFC = '160ns' - - # value for 2Gb device from JEDEC spec - tREFI = '3.9us' - - # extrapolate the following from LPDDR configs, using ns values - # to minimize burst length, prefetch differences - tWR = '18ns' - tRTP = '7.5ns' - tWTR = '10ns' - - # start with 2 cycles turnaround, similar to other memory classes - # could be more with variations across the stack - tRTW = '4ns' - - # single rank device, set to 0 - tCS = '0ns' - - # from MemCon example, tRRD is 4ns with 2ns tCK - tRRD = '4ns' - - # from MemCon example, tFAW is 30ns with 2ns tCK - tXAW = '30ns' - activation_limit = 4 - - # 4tCK - tXP = '8ns' - - # start with tRFC + tXP -> 160ns + 8ns = 168ns - tXS = '168ns' - -# A single HBM x64 interface (one command and address bus), with -# default timings based on HBM gen1 and data publically released -# A 4H stack is defined, 8Gb per die for a total of 4GB of memory. -# Note: This defines a pseudo-channel with a unique controller -# instantiated per pseudo-channel -# Stay at same IO rate (1Gbps) to maintain timing relationship with -# HBM gen1 class (HBM_1000_4H_x128) where possible -class HBM_1000_4H_1x64(HBM_1000_4H_1x128): - # For HBM gen2 with pseudo-channel mode, configure 2X channels. 
- # Configuration defines a single pseudo channel, with the capacity - # set to (full_stack_capacity / 16) based on 8Gb dies - # To use all 16 pseudo channels, set 'channels' parameter to 16 in - # system configuration - - # 64-bit pseudo-channel interface - device_bus_width = 64 - - # HBM pseudo-channel only supports BL4 - burst_length = 4 - - # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack; - # with 16 channels, 256MB per channel - device_size = '256MB' - - # page size is halved with pseudo-channel; maintaining the same number - # of rows per pseudo-channel with 2X banks across 2 channels - device_rowbuffer_size = '1kB' - - # HBM has 8 or 16 banks depending on capacity - # Starting with 4Gb dies, 16 banks are defined - banks_per_rank = 16 - - # reset tRFC for larger, 8Gb device - # use HBM1 4Gb value as a starting point - tRFC = '260ns' - - # start with tRFC + tXP -> 260ns + 8ns = 268ns - tXS = '268ns' - # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns - tCS = '2ns' - tREFI = '3.9us' - - # active powerdown and precharge powerdown exit time - tXP = '10ns' - - # self refresh exit time - tXS = '65ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_5500_1x16_BG_BL32(DRAMCtrl): - - # Increase buffer size to account for more bank resources - read_buffer_size = 64 - - # Set page policy to better suit DMC Huxley - page_policy = 'close_adaptive' - - # 16-bit channel interface - device_bus_width = 16 - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL32 for higher command bandwidth - burst_length = 32 - - # size of device in bytes - device_size = '1GB' - - # 2kB page with BG mode - device_rowbuffer_size = '2kB' - - # Use a 1x16 configuration - devices_per_rank = 1 - - # Use a single rank - ranks_per_channel = 1 - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Initial configuration will have 16 banks with Bank Group Arch - # to maximize resources and enable higher data rates - banks_per_rank = 16 - bank_groups_per_rank = 4 - - # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK - tCK = '1.455ns' - - # Greater of 2 CK or 18ns - tRCD = '18ns' - - # Base RL is 16 CK @ 687.5 MHz = 23.28ns - tCL = '23.280ns' - - # Greater of 2 CK or 18ns - tRP = '18ns' - - # Greater of 3 CK or 42ns - tRAS = '42ns' - - # Greater of 3 CK or 34ns - tWR = '34ns' - - # active powerdown and precharge powerdown exit time - # Greater of 3 CK or 7ns - tXP = '7ns' - - # self refresh exit time (tRFCab + 7.5ns) - tXS = '217.5ns' - - # Greater of 2 CK or 7.5 ns minus 2 CK - tRTP = '4.59ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. 
- # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz - tBURST = '8.73ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz - tBURST_MIN = '2.91ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 687.5 MHz - tBURST_MAX = '11.64ns' - - # 8 CK @ 687.5 MHz - tCCD_L = "11.64ns" - - # LPDDR5, 8 Gbit/channel for 280ns tRFCab - tRFC = '210ns' - tREFI = '3.9us' - - # Greater of 4 CK or 6.25 ns - tWTR = '6.25ns' - # Greater of 4 CK or 12 ns - tWTR_L = '12ns' - - # Required RD-to-WR timing is RL+ BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # Result is and additional 1 CK is required - tRTW = '1.455ns' - - # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns - tCS = '2.91ns' - - # 2 CK - tPPD = '2.91ns' - - # Greater of 2 CK or 5 ns - tRRD = '5ns' - tRRD_L = '5ns' - - # With Bank Group Arch mode tFAW is 20 ns - tXAW = '20ns' - activation_limit = 4 - - # at 5Gbps, 4:1 WCK to CK ratio required - # 2 data beats per WCK (DDR) -> 8 per CK - beats_per_clock = 8 - - # 2 cycles required to send activate command - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - two_cycle_activate = True - tAAD = '11.640ns' - - data_clock_sync = True - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '2.91ns' - tBURST_MIN = '2.91ns' - # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST_MAX = '5.82ns' - - # 4 CK @ 687.5 MHz - tCCD_L = "5.82ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# Starting with 5.5Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio - tBURST = '5.82ns' - tBURST_MIN = '5.82ns' - tBURST_MAX = '5.82ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture -# burst of 32, which means bursts can be interleaved -class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32): - - # 
6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK - tCK = '1.25ns' - - # Base RL is 17 CK @ 800 MHz = 21.25ns - tCL = '21.25ns' - - # With BG architecture, burst of 32 transferred in two 16-beat - # sub-bursts, with a 16-beat gap in between. - # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz - tBURST = '7.5ns' - # can interleave a Bstof32 from another bank group at tBURST_MIN - # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz - tBURST_MIN = '2.5ns' - # tBURST_MAX is the maximum burst delay for same bank group timing - # this is 8 CK @ 800 MHz - tBURST_MAX = '10ns' - - # 8 CK @ 800 MHz - tCCD_L = "10ns" - - # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL - # tWCKDQ0/tCK will be 1 CK for most cases - # For gem5 RL = WL and BL/n is already accounted for with tBURST - # The result is that an additional 1 CK is required - tRTW = '1.25ns' - - # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns - tCS = '2.5ns' - - # 2 CK - tPPD = '2.5ns' - - # 2 command phases can be sent back-to-back or - # with a gap up to tAAD = 8 CK - tAAD = '10ns' - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on initial -# JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 16-bank mode with bank-group architecture, burst of 16 -class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32): - - # LPDDR5 is a BL16 or BL32 device - # With BG mode, BL16 and BL32 are supported - # Use BL16 for smaller access granularity - burst_length = 16 - - # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio - tBURST = '2.5ns' - tBURST_MIN = '2.5ns' - # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST_MAX = '5ns' - - # 4 CK @ 800 MHz - tCCD_L = "5ns" - - -# A single LPDDR5 x16 interface (one command/address bus) -# for a single x16 channel with default timings based on -# initial JEDEC specification -# 6.4Gbps data rates and 8Gbit die -# Configuring for 8-bank mode, burst of 32 -class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32): - - # 4kB page with 8B mode - device_rowbuffer_size = '4kB' - - # LPDDR5 supports configurable bank options - # 8B : BL32, all frequencies - # 16B : BL32 or BL16, <=3.2Gbps - # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps - # Select 8B - banks_per_rank = 8 - bank_groups_per_rank = 0 - - # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio - tBURST = '5ns' - tBURST_MIN = '5ns' - tBURST_MAX = '5ns' - - # Greater of 4 CK or 12 ns - tWTR = '12ns' - - # Greater of 2 CK or 10 ns - tRRD = '10ns' - - # With 8B mode tFAW is 40 ns - tXAW = '40ns' - activation_limit = 4 - - # Reset BG arch timing for 8B mode - tCCD_L = "0ns" - tRRD_L = "0ns" - tWTR_L = "0ns" diff --git a/src/mem/DRAMInterface.py b/src/mem/DRAMInterface.py new file mode 100644 index 0000000..f571920 --- /dev/null +++ b/src/mem/DRAMInterface.py @@ -0,0 +1,1473 @@ +# Copyright (c) 2012-2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. 
You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2013 Amin Farmahini-Farahani +# Copyright (c) 2015 University of Kaiserslautern +# Copyright (c) 2015 The University of Bologna +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * + +from m5.objects.AbstractMemory import AbstractMemory + +# Enum for the address mapping. With Ch, Ra, Ba, Ro and Co denoting +# channel, rank, bank, row and column, respectively, and going from +# MSB to LSB. Available are RoRaBaChCo and RoRaBaCoCh, that are +# suitable for an open-page policy, optimising for sequential accesses +# hitting in the open row. For a closed-page policy, RoCoRaBaCh +# maximises parallelism. +class AddrMap(Enum): vals = ['RoRaBaChCo', 'RoRaBaCoCh', 'RoCoRaBaCh'] + +# Enum for the page policy, either open, open_adaptive, close, or +# close_adaptive. +class PageManage(Enum): vals = ['open', 'open_adaptive', 'close', + 'close_adaptive'] + +class DRAMInterface(AbstractMemory): + type = 'DRAMInterface' + cxx_header = "mem/dram_ctrl.hh" + + # Allow the interface to set required controller buffer sizes + # each entry corresponds to a burst for the specific DRAM + # configuration (e.g. 
x32 with burst length 8 is 32 bytes) and not + the cacheline size or request/packet size + write_buffer_size = Param.Unsigned(64, "Number of write queue entries") + read_buffer_size = Param.Unsigned(32, "Number of read queue entries") + + # scheduler, address map and page policy + addr_mapping = Param.AddrMap('RoRaBaCoCh', "Address mapping policy") + page_policy = Param.PageManage('open_adaptive', "Page management policy") + + # enforce a limit on the number of accesses per row + max_accesses_per_row = Param.Unsigned(16, "Max accesses per row before " + "closing"); + + # size of DRAM chip in bytes + device_size = Param.MemorySize("Size of DRAM chip") + # the physical organisation of the DRAM + device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\ + "device/chip") + burst_length = Param.Unsigned("Burst length (BL) in beats") + device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\ + "device/chip") + devices_per_rank = Param.Unsigned("Number of devices/chips per rank") + ranks_per_channel = Param.Unsigned("Number of ranks per channel") + + # default to 0 bank groups per rank, indicating bank group architecture + # is not used + # update per memory class when bank group architecture is supported + bank_groups_per_rank = Param.Unsigned(0, "Number of bank groups per rank") + banks_per_rank = Param.Unsigned("Number of banks per rank") + + # Enable DRAM powerdown states if True. This is False by default due to + # performance being lower when enabled + enable_dram_powerdown = Param.Bool(False, "Enable powerdown states") + + # For power modelling we need to know if the DRAM has a DLL or not + dll = Param.Bool(True, "DRAM has DLL or not") + + # In addition to the core power, DRAMPower provides the possibility to + # include RD/WR termination and IO power. This calculation assumes some + # default values. The integration of DRAMPower with gem5 does not include + # IO and RD/WR termination power by default. This might be added as an + # additional feature in the future. + + # timing behaviour and constraints - all in nanoseconds + + # the base clock period of the DRAM + tCK = Param.Latency("Clock period") + + # the amount of time in nanoseconds from issuing an activate command + # to the data being available in the row buffer for a read/write + tRCD = Param.Latency("RAS to CAS delay") + + # the time from issuing a read/write command to seeing the actual data + tCL = Param.Latency("CAS latency") + + # minimum time between a precharge and subsequent activate + tRP = Param.Latency("Row precharge time") + + # minimum time between an activate and a precharge to the same row + tRAS = Param.Latency("ACT to PRE delay") + + # minimum time between a write data transfer and a precharge + tWR = Param.Latency("Write recovery time") + + # minimum time between a read and precharge command + tRTP = Param.Latency("Read to precharge") + + # time to complete a burst transfer, typically the burst length + # divided by two due to the DDR bus, but by making it a parameter + # it is easier to also evaluate SDR memories like WideIO. + # This parameter has to account for burst length. 
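As a worked example of the conversion this comment block describes: tBURST is the burst length in beats, divided by the beats transferred per clock, times tCK. A plain-Python sketch (the helper name is illustrative; the example values appear in the DDR3-1600 and WideIO classes in this file):

def tburst_ns(burst_length, beats_per_clock, tck_ns):
    # beats -> interface clocks -> time
    return burst_length / beats_per_clock * tck_ns

print(tburst_ns(8, 2, 1.25))  # DDR3-1600, DDR bus: 4 CK * 1.25 ns -> 5.0
print(tburst_ns(4, 1, 5.0))   # WideIO-200, SDR bus: 4 CK * 5 ns   -> 20.0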
+ # Read/Write requests with data size larger than one full burst are broken + # down into multiple requests in the controller + # tBURST is equivalent to the CAS-to-CAS delay (tCCD) + # With bank group architectures, tBURST represents the CAS-to-CAS + # delay for bursts to different bank groups (tCCD_S) + tBURST = Param.Latency("Burst duration " + "(typically burst length / 2 cycles)") + + # tBURST_MAX is the column array cycle delay required before the next + # access, which can be greater than tBURST when the memory access time + # is greater than tBURST + tBURST_MAX = Param.Latency(Self.tBURST, "Column access delay") + + # tBURST_MIN is the minimum delay between bursts, which could be less than + # tBURST when interleaving is supported + tBURST_MIN = Param.Latency(Self.tBURST, "Minimum delay between bursts") + + # CAS-to-CAS delay for bursts to the same bank group + # only utilized with bank group architectures; set to 0 for default case + # tBURST is equivalent to tCCD_S; no explicit parameter required + # for CAS-to-CAS delay for bursts to different bank groups + tCCD_L = Param.Latency("0ns", "Same bank group CAS to CAS delay") + + # Write-to-Write delay for bursts to the same bank group + # only utilized with bank group architectures; set to 0 for default case + # This will be used to enable different same bank group delays + # for writes versus reads + tCCD_L_WR = Param.Latency(Self.tCCD_L, + "Same bank group Write to Write delay") + + # time taken to complete one refresh cycle (N rows in all banks) + tRFC = Param.Latency("Refresh cycle time") + + # refresh command interval, how often a "ref" command needs + # to be sent. It is 7.8 us for a 64ms refresh requirement + tREFI = Param.Latency("Refresh command interval") + + # write-to-read, same rank turnaround penalty + tWTR = Param.Latency("Write to read, same rank switching time") + + # write-to-read, same rank turnaround penalty for same bank group + tWTR_L = Param.Latency(Self.tWTR, "Write to read, same rank switching " + "time, same bank group") + + # read-to-write, same rank turnaround penalty + tRTW = Param.Latency("Read to write, same rank switching time") + + # rank-to-rank bus delay penalty + # this does not correlate to a memory timing parameter and encompasses: + # 1) RD-to-RD, 2) WR-to-WR, 3) RD-to-WR, and 4) WR-to-RD + # different rank bus delay + tCS = Param.Latency("Rank to rank switching time") + + # minimum precharge to precharge delay time + tPPD = Param.Latency("0ns", "PRE to PRE delay") + + # maximum delay between two-cycle ACT command phases + tAAD = Param.Latency(Self.tCK, + "Maximum delay between two-cycle ACT commands") + + two_cycle_activate = Param.Bool(False, + "Two cycles required to send activate") + + # minimum row activate to row activate delay time + tRRD = Param.Latency("ACT to ACT delay") + + # only utilized with bank group architectures; set to 0 for default case + tRRD_L = Param.Latency("0ns", "Same bank group ACT to ACT delay") + + # time window in which a maximum number of activates are allowed + # to take place, set to 0 to disable + tXAW = Param.Latency("X activation window") + activation_limit = Param.Unsigned("Max number of activates in window") + + # time to exit power-down mode + # Exit power-down to next valid command delay + tXP = Param.Latency("0ns", "Power-up Delay") + + # Exit Powerdown to commands requiring a locked DLL + tXPDLL = Param.Latency("0ns", "Power-up Delay with locked DLL") + + # time to exit self-refresh mode + tXS = Param.Latency("0ns", "Self-refresh exit latency") + + # 
time to exit self-refresh mode with locked DLL + tXSDLL = Param.Latency("0ns", "Self-refresh exit latency DLL") + + # number of data beats per clock. with DDR, default is 2, one per edge + beats_per_clock = Param.Unsigned(2, "Data beats per clock") + + data_clock_sync = Param.Bool(False, "Synchronization commands required") + + # Currently rolled into other params + ###################################################################### + + # tRC - assumed to be tRAS + tRP + + # Power Behaviour and Constraints + # DRAMs like LPDDR and WideIO have 2 external voltage domains. These are + # defined as VDD and VDD2. Each current is defined for each voltage domain + # separately. For example, current IDD0 is active-precharge current for + # voltage domain VDD and current IDD02 is active-precharge current for + # voltage domain VDD2. + # By default all currents are set to 0mA. Users who are only interested in + # the performance of DRAMs can leave them at 0. + + # Operating 1 Bank Active-Precharge current + IDD0 = Param.Current("0mA", "Active precharge current") + + # Operating 1 Bank Active-Precharge current multiple voltage Range + IDD02 = Param.Current("0mA", "Active precharge current VDD2") + + # Precharge Power-down Current: Slow exit + IDD2P0 = Param.Current("0mA", "Precharge Powerdown slow") + + # Precharge Power-down Current: Slow exit multiple voltage Range + IDD2P02 = Param.Current("0mA", "Precharge Powerdown slow VDD2") + + # Precharge Power-down Current: Fast exit + IDD2P1 = Param.Current("0mA", "Precharge Powerdown fast") + + # Precharge Power-down Current: Fast exit multiple voltage Range + IDD2P12 = Param.Current("0mA", "Precharge Powerdown fast VDD2") + + # Precharge Standby current + IDD2N = Param.Current("0mA", "Precharge Standby current") + + # Precharge Standby current multiple voltage range + IDD2N2 = Param.Current("0mA", "Precharge Standby current VDD2") + + # Active Power-down current: slow exit + IDD3P0 = Param.Current("0mA", "Active Powerdown slow") + + # Active Power-down current: slow exit multiple voltage range + IDD3P02 = Param.Current("0mA", "Active Powerdown slow VDD2") + + # Active Power-down current : fast exit + IDD3P1 = Param.Current("0mA", "Active Powerdown fast") + + # Active Power-down current : fast exit multiple voltage range + IDD3P12 = Param.Current("0mA", "Active Powerdown fast VDD2") + + # Active Standby current + IDD3N = Param.Current("0mA", "Active Standby current") + + # Active Standby current multiple voltage range + IDD3N2 = Param.Current("0mA", "Active Standby current VDD2") + + # Burst Read Operating Current + IDD4R = Param.Current("0mA", "READ current") + + # Burst Read Operating Current multiple voltage range + IDD4R2 = Param.Current("0mA", "READ current VDD2") + + # Burst Write Operating Current + IDD4W = Param.Current("0mA", "WRITE current") + + # Burst Write Operating Current multiple voltage range + IDD4W2 = Param.Current("0mA", "WRITE current VDD2") + + # Refresh Current + IDD5 = Param.Current("0mA", "Refresh current") + + # Refresh Current multiple voltage range + IDD52 = Param.Current("0mA", "Refresh current VDD2") + + # Self-Refresh Current + IDD6 = Param.Current("0mA", "Self-refresh Current") + + # Self-Refresh Current multiple voltage range + IDD62 = Param.Current("0mA", "Self-refresh Current VDD2") + + # Main voltage range of the DRAM + VDD = Param.Voltage("0V", "Main Voltage Range") + + # Second voltage range defined by some DRAMs + VDD2 = Param.Voltage("0V", "2nd Voltage Range") + +# A single DDR3-1600 x64 channel (one command and 
address bus), with +# timings based on a DDR3-1600 4 Gbit datasheet (Micron MT41J512M8) in +# an 8x8 configuration. +class DDR3_1600_8x8(DRAMInterface): + # size of device in bytes + device_size = '512MB' + + # 8x8 configuration, 8 devices each with an 8-bit interface + device_bus_width = 8 + + # DDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) + device_rowbuffer_size = '1kB' + + # 8x8 configuration, so 8 devices + devices_per_rank = 8 + + # Use two ranks + ranks_per_channel = 2 + + # DDR3 has 8 banks in all configurations + banks_per_rank = 8 + + # 800 MHz + tCK = '1.25ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz + tBURST = '5ns' + + # DDR3-1600 11-11-11 + tRCD = '13.75ns' + tCL = '13.75ns' + tRP = '13.75ns' + tRAS = '35ns' + tRRD = '6ns' + tXAW = '30ns' + activation_limit = 4 + tRFC = '260ns' + + tWR = '15ns' + + # Greater of 4 CK or 7.5 ns + tWTR = '7.5ns' + + # Greater of 4 CK or 7.5 ns + tRTP = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns + tRTW = '2.5ns' + + # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns + tCS = '2.5ns' + + # <=85C, half for >85C + tREFI = '7.8us' + + # active powerdown and precharge powerdown exit time + tXP = '6ns' + + # self refresh exit time + tXS = '270ns' + + # Current values from datasheet Die Rev E,J + IDD0 = '55mA' + IDD2N = '32mA' + IDD3N = '38mA' + IDD4W = '125mA' + IDD4R = '157mA' + IDD5 = '235mA' + IDD3P1 = '38mA' + IDD2P1 = '32mA' + IDD6 = '20mA' + VDD = '1.5V' + +# A single HMC-2500 x32 model based on: +# [1] DRAMSpec: a high-level DRAM bank modelling tool +# developed at the University of Kaiserslautern. This high level tool +# uses RC (resistance-capacitance) and CV (capacitance-voltage) models to +# estimate the DRAM bank latency and power numbers. +# [2] High performance AXI-4.0 based interconnect for extensible smart memory +# cubes (E. Azarkhish et. al) +# Assumed for the HMC model is a 30 nm technology node. +# The modelled HMC consists of 4 Gbit layers which sum up to 2GB of memory (4 +# layers). +# Each layer has 16 vaults and each vault consists of 2 banks per layer. +# In order to be able to use the same controller used for 2D DRAM generations +# for HMC, the following analogy is done: +# Channel (DDR) => Vault (HMC) +# device_size (DDR) => size of a single layer in a vault +# ranks per channel (DDR) => number of layers +# banks per rank (DDR) => banks per layer +# devices per rank (DDR) => devices per layer ( 1 for HMC). +# The parameters for which no input is available are inherited from the DDR3 +# configuration. +# This configuration includes the latencies from the DRAM to the logic layer +# of the HMC +class HMC_2500_1x32(DDR3_1600_8x8): + # size of device + # two banks per device with each bank 4MB [2] + device_size = '8MB' + + # 1x32 configuration, 1 device with 32 TSVs [2] + device_bus_width = 32 + + # HMC is a BL8 device [2] + burst_length = 8 + + # Each device has a page (row buffer) size of 256 bytes [2] + device_rowbuffer_size = '256B' + + # 1x32 configuration, so 1 device [2] + devices_per_rank = 1 + + # 4 layers so 4 ranks [2] + ranks_per_channel = 4 + + # HMC has 2 banks per layer [2] + # Each layer represents a rank. With 4 layers and 8 banks in total, each + # layer has 2 banks; thus 2 banks per rank. 
+ banks_per_rank = 2 + + # 1250 MHz [2] + tCK = '0.8ns' + + # 8 beats across an x32 interface translates to 4 clocks @ 1250 MHz + tBURST = '3.2ns' + + # Values using DRAMSpec HMC model [1] + tRCD = '10.2ns' + tCL = '9.9ns' + tRP = '7.7ns' + tRAS = '21.6ns' + + # tRRD depends on the power supply network for each vendor. + # We assume a tRRD of a double bank approach to be equal to 4 clock + # cycles (Assumption) + tRRD = '3.2ns' + + # activation limit is set to 0 since there are only 2 banks per vault + # layer. + activation_limit = 0 + + # Values using DRAMSpec HMC model [1] + tRFC = '59ns' + tWR = '8ns' + tRTP = '4.9ns' + + # Default different rank bus delay assumed to 1 CK for TSVs, @1250 MHz = + # 0.8 ns (Assumption) + tCS = '0.8ns' + + # Value using DRAMSpec HMC model [1] + tREFI = '3.9us' + + # The default page policy in the vault controllers is simple closed page + # [2]; nevertheless, the 'close' policy opens and closes the row multiple + # times for bursts larger than 32 bytes. For this reason we use + # 'close_adaptive' + page_policy = 'close_adaptive' + + # RoCoRaBaCh resembles the default address mapping in HMC + addr_mapping = 'RoCoRaBaCh' + + # These parameters do not directly correlate with buffer_size in real + # hardware. Nevertheless, their value has been tuned to achieve a + # bandwidth similar to the cycle-accurate model in [2] + write_buffer_size = 32 + read_buffer_size = 32 + +# A single DDR3-2133 x64 channel refining a selected subset of the +# options for the DDR3-1600 configuration, based on the same DDR3-1600 +# 4 Gbit datasheet (Micron MT41J512M8). Most parameters are kept +# consistent across the two configurations. +class DDR3_2133_8x8(DDR3_1600_8x8): + # 1066 MHz + tCK = '0.938ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 1066 MHz + tBURST = '3.752ns' + + # DDR3-2133 14-14-14 + tRCD = '13.09ns' + tCL = '13.09ns' + tRP = '13.09ns' + tRAS = '33ns' + tRRD = '5ns' + tXAW = '25ns' + + # Current values from datasheet + IDD0 = '70mA' + IDD2N = '37mA' + IDD3N = '44mA' + IDD4W = '157mA' + IDD4R = '191mA' + IDD5 = '250mA' + IDD3P1 = '44mA' + IDD2P1 = '43mA' + IDD6 = '20mA' + VDD = '1.5V' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A2G4) +# in a 16x4 configuration. +# Total channel capacity is 32GB +# 16 devices/rank * 2 ranks/channel * 1GB/device = 32GB/channel +class DDR4_2400_16x4(DRAMInterface): + # size of device + device_size = '1GB' + + # 16x4 configuration, 16 devices each with a 4-bit interface + device_bus_width = 4 + + # DDR4 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 512 bytes (1K columns x4) + device_rowbuffer_size = '512B' + + # 16x4 configuration, so 16 devices + devices_per_rank = 16 + + # Match our DDR3 configurations, which are dual rank + ranks_per_channel = 2 + + # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups + # Set to 4 for x4 case + bank_groups_per_rank = 4 + + # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all + # configurations). 
Currently we do not capture the additional + # constraints incurred by the bank groups + banks_per_rank = 16 + + # override the default buffer sizes and go for something larger to + # accommodate the larger bank count + write_buffer_size = 128 + read_buffer_size = 64 + + # 1200 MHz + tCK = '0.833ns' + + # 8 beats across an x64 interface translates to 4 clocks @ 1200 MHz + # tBURST is equivalent to the CAS-to-CAS delay (tCCD) + # With bank group architectures, tBURST represents the CAS-to-CAS + # delay for bursts to different bank groups (tCCD_S) + tBURST = '3.332ns' + + # @2400 data rate, tCCD_L is 6 CK + # CAS-to-CAS delay for bursts to the same bank group + # tBURST is equivalent to tCCD_S; no explicit parameter required + # for CAS-to-CAS delay for bursts to different bank groups + tCCD_L = '5ns'; + + # DDR4-2400 17-17-17 + tRCD = '14.16ns' + tCL = '14.16ns' + tRP = '14.16ns' + tRAS = '32ns' + + # RRD_S (different bank group) for 512B page is MAX(4 CK, 3.3ns) + tRRD = '3.332ns' + + # RRD_L (same bank group) for 512B page is MAX(4 CK, 4.9ns) + tRRD_L = '4.9ns'; + + # tFAW for 512B page is MAX(16 CK, 13ns) + tXAW = '13.328ns' + activation_limit = 4 + # tRFC is 350ns + tRFC = '350ns' + + tWR = '15ns' + + # Here using the average of WTR_S and WTR_L + tWTR = '5ns' + + # Greater of 4 CK or 7.5 ns + tRTP = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @1200 MHz = 1.666 ns + tRTW = '1.666ns' + + # Default different rank bus delay to 2 CK, @1200 MHz = 1.666 ns + tCS = '1.666ns' + + # <=85C, half for >85C + tREFI = '7.8us' + + # active powerdown and precharge powerdown exit time + tXP = '6ns' + + # self refresh exit time + # exit delay to ACT, PRE, PREALL, REF, SREF Enter, and PD Enter is: + # tRFC + 10ns = 340ns + tXS = '340ns' + + # Current values from datasheet + IDD0 = '43mA' + IDD02 = '3mA' + IDD2N = '34mA' + IDD3N = '38mA' + IDD3N2 = '3mA' + IDD4W = '103mA' + IDD4R = '110mA' + IDD5 = '250mA' + IDD3P1 = '32mA' + IDD2P1 = '25mA' + IDD6 = '30mA' + VDD = '1.2V' + VDD2 = '2.5V' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A1G8) +# in an 8x8 configuration. +# Total channel capacity is 16GB +# 8 devices/rank * 2 ranks/channel * 1GB/device = 16GB/channel +class DDR4_2400_8x8(DDR4_2400_16x4): + # 8x8 configuration, 8 devices each with an 8-bit interface + device_bus_width = 8 + + # Each device has a page (row buffer) size of 1 Kbyte (1K columns x8) + device_rowbuffer_size = '1kB' + + # 8x8 configuration, so 8 devices + devices_per_rank = 8 + + # RRD_L (same bank group) for 1K page is MAX(4 CK, 4.9ns) + tRRD_L = '4.9ns'; + + tXAW = '21ns' + + # Current values from datasheet + IDD0 = '48mA' + IDD3N = '43mA' + IDD4W = '123mA' + IDD4R = '135mA' + IDD3P1 = '37mA' + +# A single DDR4-2400 x64 channel (one command and address bus), with +# timings based on a DDR4-2400 8 Gbit datasheet (Micron MT40A512M16) +# in an 4x16 configuration. 
+# Total channel capacity is 4GB +# 4 devices/rank * 1 ranks/channel * 1GB/device = 4GB/channel +class DDR4_2400_4x16(DDR4_2400_16x4): + # 4x16 configuration, 4 devices each with an 16-bit interface + device_bus_width = 16 + + # Each device has a page (row buffer) size of 2 Kbyte (1K columns x16) + device_rowbuffer_size = '2kB' + + # 4x16 configuration, so 4 devices + devices_per_rank = 4 + + # Single rank for x16 + ranks_per_channel = 1 + + # DDR4 has 2 (x16) or 4 (x4 and x8) bank groups + # Set to 2 for x16 case + bank_groups_per_rank = 2 + + # DDR4 has 16 banks(x4,x8) and 8 banks(x16) (4 bank groups in all + # configurations). Currently we do not capture the additional + # constraints incurred by the bank groups + banks_per_rank = 8 + + # RRD_S (different bank group) for 2K page is MAX(4 CK, 5.3ns) + tRRD = '5.3ns' + + # RRD_L (same bank group) for 2K page is MAX(4 CK, 6.4ns) + tRRD_L = '6.4ns'; + + tXAW = '30ns' + + # Current values from datasheet + IDD0 = '80mA' + IDD02 = '4mA' + IDD2N = '34mA' + IDD3N = '47mA' + IDD4W = '228mA' + IDD4R = '243mA' + IDD5 = '280mA' + IDD3P1 = '41mA' + +# A single LPDDR2-S4 x32 interface (one command/address bus), with +# default timings based on a LPDDR2-1066 4 Gbit part (Micron MT42L128M32D1) +# in a 1x32 configuration. +class LPDDR2_S4_1066_1x32(DRAMInterface): + # No DLL in LPDDR2 + dll = False + + # size of device + device_size = '512MB' + + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR2_S4 is a BL4 and BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1KB + # (this depends on the memory density) + device_rowbuffer_size = '1kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 + + # Use a single rank + ranks_per_channel = 1 + + # LPDDR2-S4 has 8 banks in all configurations + banks_per_rank = 8 + + # 533 MHz + tCK = '1.876ns' + + # Fixed at 15 ns + tRCD = '15ns' + + # 8 CK read latency, 4 CK write latency @ 533 MHz, 1.876 ns cycle time + tCL = '15ns' + + # Pre-charge one bank 15 ns (all banks 18 ns) + tRP = '15ns' + + tRAS = '42ns' + tWR = '15ns' + + tRTP = '7.5ns' + + # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. + # Note this is a BL8 DDR device. + # Requests larger than 32 bytes are broken down into multiple requests + # in the controller + tBURST = '7.5ns' + + # LPDDR2-S4, 4 Gbit + tRFC = '130ns' + tREFI = '3.9us' + + # active powerdown and precharge powerdown exit time + tXP = '7.5ns' + + # self refresh exit time + tXS = '140ns' + + # Irrespective of speed grade, tWTR is 7.5 ns + tWTR = '7.5ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @533 MHz = 3.75 ns + tRTW = '3.75ns' + + # Default different rank bus delay to 2 CK, @533 MHz = 3.75 ns + tCS = '3.75ns' + + # Activate to activate irrespective of density and speed grade + tRRD = '10.0ns' + + # Irrespective of density, tFAW is 50 ns + tXAW = '50ns' + activation_limit = 4 + + # Current values from datasheet + IDD0 = '15mA' + IDD02 = '70mA' + IDD2N = '2mA' + IDD2N2 = '30mA' + IDD3N = '2.5mA' + IDD3N2 = '30mA' + IDD4W = '10mA' + IDD4W2 = '190mA' + IDD4R = '3mA' + IDD4R2 = '220mA' + IDD5 = '40mA' + IDD52 = '150mA' + IDD3P1 = '1.2mA' + IDD3P12 = '8mA' + IDD2P1 = '0.6mA' + IDD2P12 = '0.8mA' + IDD6 = '1mA' + IDD62 = '3.2mA' + VDD = '1.8V' + VDD2 = '1.2V' + +# A single WideIO x128 interface (one command and address bus), with +# default timings based on an estimated WIO-200 8 Gbit part. 
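To make the AddrMap options defined near the top of DRAMInterface.py concrete, the toy decode below shows how moving the bank and channel bits toward the LSBs changes which resource consecutive bursts hit. The field widths are invented for the example; this is not gem5's actual decode logic:

def decode(addr, field_order_msb_to_lsb, widths):
    # peel fields off the address starting from the LSB
    fields, shift = {}, 0
    for name in reversed(field_order_msb_to_lsb):
        w = widths[name]
        fields[name] = (addr >> shift) & ((1 << w) - 1)
        shift += w
    return fields

widths = {'Ro': 16, 'Co': 6, 'Ra': 1, 'Ba': 3, 'Ch': 1}
# RoRaBaChCo: column bits lowest -> sequential bursts stay in one open row
print(decode(0x12345, ['Ro', 'Ra', 'Ba', 'Ch', 'Co'], widths))
# RoCoRaBaCh: bank/channel bits lowest -> sequential bursts spread across banks
print(decode(0x12345, ['Ro', 'Co', 'Ra', 'Ba', 'Ch'], widths))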
+class WideIO_200_1x128(DRAMInterface): + # No DLL for WideIO + dll = False + + # size of device + device_size = '1024MB' + + # 1x128 configuration, 1 device with a 128-bit interface + device_bus_width = 128 + + # This is a BL4 device + burst_length = 4 + + # Each device has a page (row buffer) size of 4KB + # (this depends on the memory density) + device_rowbuffer_size = '4kB' + + # 1x128 configuration, so 1 device + devices_per_rank = 1 + + # Use one rank for a one-high die stack + ranks_per_channel = 1 + + # WideIO has 4 banks in all configurations + banks_per_rank = 4 + + # 200 MHz + tCK = '5ns' + + # WIO-200 + tRCD = '18ns' + tCL = '18ns' + tRP = '18ns' + tRAS = '42ns' + tWR = '15ns' + # Read to precharge is same as the burst + tRTP = '20ns' + + # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. + # Note this is a BL4 SDR device. + tBURST = '20ns' + + # WIO 8 Gb + tRFC = '210ns' + + # WIO 8 Gb, <=85C, half for >85C + tREFI = '3.9us' + + # Greater of 2 CK or 15 ns, 2 CK @ 200 MHz = 10 ns + tWTR = '15ns' + + # Default same rank rd-to-wr bus turnaround to 2 CK, @200 MHz = 10 ns + tRTW = '10ns' + + # Default different rank bus delay to 2 CK, @200 MHz = 10 ns + tCS = '10ns' + + # Activate to activate irrespective of density and speed grade + tRRD = '10.0ns' + + # Two instead of four activation window + tXAW = '50ns' + activation_limit = 2 + + # The WideIO specification does not provide current information + +# A single LPDDR3 x32 interface (one command/address bus), with +# default timings based on a LPDDR3-1600 4 Gbit part (Micron +# EDF8132A1MC) in a 1x32 configuration. +class LPDDR3_1600_1x32(DRAMInterface): + # No DLL for LPDDR3 + dll = False + + # size of device + device_size = '512MB' + + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 4KB + device_rowbuffer_size = '4kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 + + # Technically the datasheet is a dual-rank package, but for + # comparison with the LPDDR2 config we stick to a single rank + ranks_per_channel = 1 + + # LPDDR3 has 8 banks in all configurations + banks_per_rank = 8 + + # 800 MHz + tCK = '1.25ns' + + tRCD = '18ns' + + # 12 CK read latency, 6 CK write latency @ 800 MHz, 1.25 ns cycle time + tCL = '15ns' + + tRAS = '42ns' + tWR = '15ns' + + # Greater of 4 CK or 7.5 ns, 4 CK @ 800 MHz = 5 ns + tRTP = '7.5ns' + + # Pre-charge one bank 18 ns (all banks 21 ns) + tRP = '18ns' + + # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. + # Note this is a BL8 DDR device. 
+    # Requests larger than 32 bytes are broken down into multiple requests
+    # in the controller
+    tBURST = '5ns'
+
+    # LPDDR3, 4 Gb
+    tRFC = '130ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '7.5ns'
+
+    # self refresh exit time
+    tXS = '140ns'
+
+    # Irrespective of speed grade, tWTR is 7.5 ns
+    tWTR = '7.5ns'
+
+    # Default same rank rd-to-wr bus turnaround to 2 CK, @800 MHz = 2.5 ns
+    tRTW = '2.5ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # Activate to activate irrespective of density and speed grade
+    tRRD = '10.0ns'
+
+    # Irrespective of size, tFAW is 50 ns
+    tXAW = '50ns'
+    activation_limit = 4
+
+    # Current values from datasheet
+    IDD0 = '8mA'
+    IDD02 = '60mA'
+    IDD2N = '0.8mA'
+    IDD2N2 = '26mA'
+    IDD3N = '2mA'
+    IDD3N2 = '34mA'
+    IDD4W = '2mA'
+    IDD4W2 = '190mA'
+    IDD4R = '2mA'
+    IDD4R2 = '230mA'
+    IDD5 = '28mA'
+    IDD52 = '150mA'
+    IDD3P1 = '1.4mA'
+    IDD3P12 = '11mA'
+    IDD2P1 = '0.8mA'
+    IDD2P12 = '1.8mA'
+    IDD6 = '0.5mA'
+    IDD62 = '1.8mA'
+    VDD = '1.8V'
+    VDD2 = '1.2V'
+
+# A single GDDR5 x64 interface, with
+# default timings based on a GDDR5-4000 1 Gbit part (SK Hynix
+# H5GQ1H24AFR) in a 2x32 configuration.
+class GDDR5_4000_2x32(DRAMInterface):
+    # size of device
+    device_size = '128MB'
+
+    # 2x32 configuration, 2 devices each with a 32-bit interface
+    device_bus_width = 32
+
+    # GDDR5 is a BL8 device
+    burst_length = 8
+
+    # Each device has a page (row buffer) size of 2 Kbits (256 bytes)
+    device_rowbuffer_size = '256B'
+
+    # 2x32 configuration, so 2 devices
+    devices_per_rank = 2
+
+    # assume single rank
+    ranks_per_channel = 1
+
+    # GDDR5 has 4 bank groups
+    bank_groups_per_rank = 4
+
+    # GDDR5 has 16 banks with 4 bank groups
+    banks_per_rank = 16
+
+    # 1000 MHz
+    tCK = '1ns'
+
+    # 8 beats across an x64 interface translates to 2 clocks @ 1000 MHz
+    # Data bus runs @2000 MHz => DDR ( data runs at 4000 MHz )
+    # 8 beats at 4000 MHz = 2 beats at 1000 MHz
+    # tBURST is equivalent to the CAS-to-CAS delay (tCCD)
+    # With bank group architectures, tBURST represents the CAS-to-CAS
+    # delay for bursts to different bank groups (tCCD_S)
+    tBURST = '2ns'
+
+    # @1000MHz data rate, tCCD_L is 3 CK
+    # CAS-to-CAS delay for bursts to the same bank group
+    # tBURST is equivalent to tCCD_S; no explicit parameter required
+    # for CAS-to-CAS delay for bursts to different bank groups
+    tCCD_L = '3ns'
+
+    tRCD = '12ns'
+
+    # tCL is not directly found in datasheet and assumed equal to tRCD
+    tCL = '12ns'
+
+    tRP = '12ns'
+    tRAS = '28ns'
+
+    # RRD_S (different bank group)
+    # RRD_S is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD = '6ns'
+
+    # RRD_L (same bank group)
+    # RRD_L is 5.5 ns in datasheet.
+    # rounded to the next multiple of tCK
+    tRRD_L = '6ns'
+
+    tXAW = '23ns'
+
+    # tXAW < 4 x tRRD.
+    # Therefore, activation limit is set to 0
+    activation_limit = 0
+
+    tRFC = '65ns'
+    tWR = '12ns'
+
+    # Here using the average of WTR_S and WTR_L
+    tWTR = '5ns'
+
+    # Read-to-Precharge 2 CK
+    tRTP = '2ns'
+
+    # Assume 2 cycles
+    tRTW = '2ns'
+
+    # Default different rank bus delay to 2 CK, @1000 MHz = 2 ns
+    tCS = '2ns'
+    tREFI = '3.9us'
+
+    # active powerdown and precharge powerdown exit time
+    tXP = '10ns'
+
+    # self refresh exit time
+    tXS = '65ns'
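The GDDR5 timings above also exercise the new DRAMPower glue: the
getDataRate() helper added to drampower.cc later in this patch derives beats
per clock from burst_length, tBURST_MAX and tCK, and GDDR5 is the
quad-data-rate case its comment calls out. A plain-Python sketch of that
calculation (it assumes tBURST_MAX defaults to tBURST when a class does not
set it explicitly):

    from math import ceil

    burst_length = 8
    tCK = 1.0          # ns, 1000 MHz command clock
    tBURST_MAX = 2.0   # ns, assumed equal to tBURST above

    burst_cycles = ceil(tBURST_MAX / tCK)     # divCeil in the C++ code
    data_rate = burst_length // burst_cycles  # beats per clock
    assert data_rate == 4                     # the "4 for GDDR5" case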
+# A single HBM x128 interface (one command and address bus), with
+# default timings based on data publicly released
+# ("HBM: Memory Solution for High Performance Processors", MemCon, 2014),
+# IDD measurement values, and by extrapolating data from other classes.
+# Architecture values based on published HBM spec
+# A 4H stack is defined, 2Gb per die for a total of 1GB of memory.
+class HBM_1000_4H_1x128(DRAMInterface):
+    # HBM gen1 supports up to 8 128-bit physical channels
+    # Configuration defines a single channel, with the capacity
+    # set to (full_stack_capacity / 8) based on 2Gb dies
+    # To use all 8 channels, set 'channels' parameter to 8 in
+    # system configuration
+
+    # 128-bit interface legacy mode
+    device_bus_width = 128
+
+    # HBM supports BL4 and BL2 (legacy mode only)
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 2Gb dies is 1GB per stack;
+    # with 8 channels, 128MB per channel
+    device_size = '128MB'
+
+    device_rowbuffer_size = '2kB'
+
+    # 1x128 configuration
+    devices_per_rank = 1
+
+    # HBM does not have a CS pin; set rank to 1
+    ranks_per_channel = 1
+
+    # HBM has 8 or 16 banks depending on capacity
+    # 2Gb dies have 8 banks
+    banks_per_rank = 8
+
+    # depending on frequency, bank groups may be required
+    # will always have 4 bank groups when enabled
+    # current specifications do not define the minimum frequency for
+    # bank group architecture
+    # setting bank_groups_per_rank to 0 to disable until range is defined
+    bank_groups_per_rank = 0
+
+    # 500 MHz for 1Gbps DDR data rate
+    tCK = '2ns'
+
+    # use values from IDD measurement in JEDEC spec
+    # use tRP value for tRCD and tCL similar to other classes
+    tRP = '15ns'
+    tRCD = '15ns'
+    tCL = '15ns'
+    tRAS = '33ns'
+
+    # BL2 and BL4 supported, default to BL4
+    # DDR @ 500 MHz means 4 * 2ns / 2 = 4ns
+    tBURST = '4ns'
+
+    # value for 2Gb device from JEDEC spec
+    tRFC = '160ns'
+
+    # value for 2Gb device from JEDEC spec
+    tREFI = '3.9us'
+
+    # extrapolate the following from LPDDR configs, using ns values
+    # to minimize burst length, prefetch differences
+    tWR = '18ns'
+    tRTP = '7.5ns'
+    tWTR = '10ns'
+
+    # start with 2 cycles turnaround, similar to other memory classes
+    # could be more with variations across the stack
+    tRTW = '4ns'
+
+    # single rank device, set to 0
+    tCS = '0ns'
+
+    # from MemCon example, tRRD is 4ns with 2ns tCK
+    tRRD = '4ns'
+
+    # from MemCon example, tFAW is 30ns with 2ns tCK
+    tXAW = '30ns'
+    activation_limit = 4
+
+    # 4tCK
+    tXP = '8ns'
+
+    # start with tRFC + tXP -> 160ns + 8ns = 168ns
+    tXS = '168ns'
+
+# A single HBM x64 interface (one command and address bus), with
+# default timings based on HBM gen1 and data publicly released
+# A 4H stack is defined, 8Gb per die for a total of 4GB of memory.
+# Note: This defines a pseudo-channel with a unique controller
+# instantiated per pseudo-channel
+# Stay at same IO rate (1Gbps) to maintain timing relationship with
+# HBM gen1 class (HBM_1000_4H_1x128) where possible
+class HBM_1000_4H_1x64(HBM_1000_4H_1x128):
+    # For HBM gen2 with pseudo-channel mode, configure 2X channels.
+    # Configuration defines a single pseudo channel, with the capacity
+    # set to (full_stack_capacity / 16) based on 8Gb dies
+    # To use all 16 pseudo channels, set 'channels' parameter to 16 in
+    # system configuration
+
+    # 64-bit pseudo-channel interface
+    device_bus_width = 64
+
+    # HBM pseudo-channel only supports BL4
+    burst_length = 4
+
+    # size of channel in bytes, 4H stack of 8Gb dies is 4GB per stack;
+    # with 16 channels, 256MB per channel
+    device_size = '256MB'
+
+    # page size is halved with pseudo-channel; maintaining the same number
+    # of rows per pseudo-channel with 2X banks across 2 channels
+    device_rowbuffer_size = '1kB'
+
+    # HBM has 8 or 16 banks depending on capacity
+    # Starting with 4Gb dies, 16 banks are defined
+    banks_per_rank = 16
+
+    # reset tRFC for larger, 8Gb device
+    # use HBM1 4Gb value as a starting point
+    tRFC = '260ns'
+
+    # start with tRFC + tXP -> 260ns + 8ns = 268ns
+    tXS = '268ns'
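The pseudo-channel capacity comments above can be checked with some quick
arithmetic (plain Python, illustration only): a 4-high stack of 8 Gbit dies
is 4GB, and splitting it into 16 pseudo-channels leaves 256MB behind each
controller, which is exactly the device_size set above.

    dies_per_stack = 4
    gbit_per_die = 8
    pseudo_channels = 16

    stack_gbyte = dies_per_stack * gbit_per_die / 8          # 4 GB per stack
    per_channel_mbyte = stack_gbyte * 1024 / pseudo_channels
    assert per_channel_mbyte == 256                          # device_size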
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_5500_1x16_BG_BL32(DRAMInterface):
+
+    # Increase buffer size to account for more bank resources
+    read_buffer_size = 64
+
+    # Set page policy to better suit DMC Huxley
+    page_policy = 'close_adaptive'
+
+    # 16-bit channel interface
+    device_bus_width = 16
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL32 for higher command bandwidth
+    burst_length = 32
+
+    # size of device in bytes
+    device_size = '1GB'
+
+    # 2kB page with BG mode
+    device_rowbuffer_size = '2kB'
+
+    # Use a 1x16 configuration
+    devices_per_rank = 1
+
+    # Use a single rank
+    ranks_per_channel = 1
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Initial configuration will have 16 banks with Bank Group Arch
+    # to maximize resources and enable higher data rates
+    banks_per_rank = 16
+    bank_groups_per_rank = 4
+
+    # 5.5Gb/s DDR with 4:1 WCK:CK ratio for 687.5 MHz CK
+    tCK = '1.455ns'
+
+    # Greater of 2 CK or 18ns
+    tRCD = '18ns'
+
+    # Base RL is 16 CK @ 687.5 MHz = 23.28ns
+    tCL = '23.280ns'
+
+    # Greater of 2 CK or 18ns
+    tRP = '18ns'
+
+    # Greater of 3 CK or 42ns
+    tRAS = '42ns'
+
+    # Greater of 3 CK or 34ns
+    tWR = '34ns'
+
+    # active powerdown and precharge powerdown exit time
+    # Greater of 3 CK or 7ns
+    tXP = '7ns'
+
+    # self refresh exit time (tRFCab + 7.5ns)
+    tXS = '217.5ns'
+
+    # Greater of 2 CK or 7.5 ns minus 2 CK
+    tRTP = '4.59ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 687.5 MHz
+    tBURST = '8.73ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @2.75 GHz or 2 CK @ 687.5 MHz
+    tBURST_MIN = '2.91ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 687.5 MHz
+    tBURST_MAX = '11.64ns'
+
+    # 8 CK @ 687.5 MHz
+    tCCD_L = "11.64ns"
+
+    # LPDDR5, 8 Gbit/channel for 210ns tRFCab
+    tRFC = '210ns'
+    tREFI = '3.9us'
+
+    # Greater of 4 CK or 6.25 ns
+    tWTR = '6.25ns'
+    # Greater of 4 CK or 12 ns
+    tWTR_L = '12ns'
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.455ns'
+
+    # Default different rank bus delay to 2 CK, @687.5 MHz = 2.91 ns
+    tCS = '2.91ns'
+
+    # 2 CK
+    tPPD = '2.91ns'
+
+    # Greater of 2 CK or 5 ns
+    tRRD = '5ns'
+    tRRD_L = '5ns'
+
+    # With Bank Group Arch mode tFAW is 20 ns
+    tXAW = '20ns'
+    activation_limit = 4
+
+    # at 5.5Gbps, 4:1 WCK to CK ratio required
+    # 2 data beats per WCK (DDR) -> 8 per CK
+    beats_per_clock = 8
+
+    # 2 cycles required to send activate command
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    two_cycle_activate = True
+    tAAD = '11.640ns'
+
+    data_clock_sync = True
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_5500_1x16_BG_BL16(LPDDR5_5500_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '2.91ns'
+    tBURST_MIN = '2.91ns'
+    # For Bstof16 with BG arch, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST_MAX = '5.82ns'
+
+    # 4 CK @ 687.5 MHz
+    tCCD_L = "5.82ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# Starting with 5.5Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_5500_1x16_8B_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 687.5 MHz with 4:1 clock ratio
+    tBURST = '5.82ns'
+    tBURST_MIN = '5.82ns'
+    tBURST_MAX = '5.82ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
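The LPDDR5-6400 variants below repeat the burst arithmetic of the 5500
classes at a 1.25 ns CK, so it is worth seeing once where the BL32 numbers
come from. A small plain-Python check using the LPDDR5_5500_1x16_BG_BL32
values above:

    tCK = 1.455                # ns at 687.5 MHz CK
    sub_burst_ck = 2           # 16 beats = 8 WCK = 2 CK at a 4:1 WCK:CK ratio

    tBURST_MIN = sub_burst_ck * tCK      # 2.91 ns, interleaved bank groups
    tBURST = 3 * sub_burst_ck * tCK      # 8.73 ns, sub-burst + gap + sub-burst
    tBURST_MAX = 4 * sub_burst_ck * tCK  # 11.64 ns, same bank group, no overlap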
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture
+# burst of 32, which means bursts can be interleaved
+class LPDDR5_6400_1x16_BG_BL32(LPDDR5_5500_1x16_BG_BL32):
+
+    # 6.4Gb/s DDR with 4:1 WCK:CK ratio for 800 MHz CK
+    tCK = '1.25ns'
+
+    # Base RL is 17 CK @ 800 MHz = 21.25ns
+    tCL = '21.25ns'
+
+    # With BG architecture, burst of 32 transferred in two 16-beat
+    # sub-bursts, with a 16-beat gap in between.
+    # Each 16-beat sub-burst is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    # tBURST is the delay to transfer the Bstof32 = 6 CK @ 800 MHz
+    tBURST = '7.5ns'
+    # can interleave a Bstof32 from another bank group at tBURST_MIN
+    # 16-beats is 8 WCK @3.2 GHz or 2 CK @ 800 MHz
+    tBURST_MIN = '2.5ns'
+    # tBURST_MAX is the maximum burst delay for same bank group timing
+    # this is 8 CK @ 800 MHz
+    tBURST_MAX = '10ns'
+
+    # 8 CK @ 800 MHz
+    tCCD_L = "10ns"
+
+    # Required RD-to-WR timing is RL + BL/n + tWCKDQ0/tCK - WL
+    # tWCKDQ0/tCK will be 1 CK for most cases
+    # For gem5 RL = WL and BL/n is already accounted for with tBURST
+    # The result is that an additional 1 CK is required
+    tRTW = '1.25ns'
+
+    # Default different rank bus delay to 2 CK, @800 MHz = 2.5 ns
+    tCS = '2.5ns'
+
+    # 2 CK
+    tPPD = '2.5ns'
+
+    # 2 command phases can be sent back-to-back or
+    # with a gap up to tAAD = 8 CK
+    tAAD = '10ns'
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on initial
+# JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 16-bank mode with bank-group architecture, burst of 16
+class LPDDR5_6400_1x16_BG_BL16(LPDDR5_6400_1x16_BG_BL32):
+
+    # LPDDR5 is a BL16 or BL32 device
+    # With BG mode, BL16 and BL32 are supported
+    # Use BL16 for smaller access granularity
+    burst_length = 16
+
+    # For Bstof16 with BG arch, 2 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '2.5ns'
+    tBURST_MIN = '2.5ns'
+    # For Bstof16 with BG arch, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST_MAX = '5ns'
+
+    # 4 CK @ 800 MHz
+    tCCD_L = "5ns"
+
+
+# A single LPDDR5 x16 interface (one command/address bus)
+# for a single x16 channel with default timings based on
+# initial JEDEC specification
+# 6.4Gbps data rates and 8Gbit die
+# Configuring for 8-bank mode, burst of 32
+class LPDDR5_6400_1x16_8B_BL32(LPDDR5_6400_1x16_BG_BL32):
+
+    # 4kB page with 8B mode
+    device_rowbuffer_size = '4kB'
+
+    # LPDDR5 supports configurable bank options
+    # 8B : BL32, all frequencies
+    # 16B : BL32 or BL16, <=3.2Gbps
+    # 16B with Bank Group Arch (4B/BG): BL32 or BL16, >3.2Gbps
+    # Select 8B
+    banks_per_rank = 8
+    bank_groups_per_rank = 0
+
+    # For Bstof32 with 8B mode, 4 CK @ 800 MHz with 4:1 clock ratio
+    tBURST = '5ns'
+    tBURST_MIN = '5ns'
+    tBURST_MAX = '5ns'
+
+    # Greater of 4 CK or 12 ns
+    tWTR = '12ns'
+
+    # Greater of 2 CK or 10 ns
+    tRRD = '10ns'
+
+    # With 8B mode tFAW is 40 ns
+    tXAW = '40ns'
+    activation_limit = 4
+
+    # Reset BG arch timing for 8B mode
+    tCCD_L = "0ns"
+    tRRD_L = "0ns"
+    tWTR_L = "0ns"
diff --git a/src/mem/SConscript b/src/mem/SConscript
index 2fe179d..ceeed98 100644
--- a/src/mem/SConscript
+++ b/src/mem/SConscript
@@ -1,6 +1,6 @@
 # -*- mode:python -*-
 #
-# Copyright (c) 2018-2019 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved
 #
 # The license below extends only to copyright in the software and shall
@@ -47,6 +47,7 @@
 SimObject('AddrMapper.py')
 SimObject('Bridge.py')
 SimObject('DRAMCtrl.py')
+SimObject('DRAMInterface.py')
 SimObject('ExternalMaster.py')
 SimObject('ExternalSlave.py')
 SimObject('MemObject.py')
diff --git a/src/mem/dram_ctrl.cc b/src/mem/dram_ctrl.cc
index b646581..4055505 100644
--- a/src/mem/dram_ctrl.cc
+++ b/src/mem/dram_ctrl.cc
@@ -47,6 +47,7 @@
 #include "debug/DRAMState.hh"
#include "debug/Drain.hh" #include "debug/QOS.hh" +#include "params/DRAMInterface.hh" #include "sim/system.hh" using namespace std; @@ -58,12 +59,13 @@ retryRdReq(false), retryWrReq(false), nextReqEvent([this]{ processNextReqEvent(); }, name()), respondEvent([this]{ processRespondEvent(); }, name()), - readBufferSize(p->read_buffer_size), - writeBufferSize(p->write_buffer_size), + dram(p->dram), + readBufferSize(dram->readBufferSize), + writeBufferSize(dram->writeBufferSize), writeHighThreshold(writeBufferSize * p->write_high_thresh_perc / 100.0), writeLowThreshold(writeBufferSize * p->write_low_thresh_perc / 100.0), minWritesPerSwitch(p->min_writes_per_switch), - writesThisTime(0), readsThisTime(0), tCS(p->tCS), + writesThisTime(0), readsThisTime(0), memSchedPolicy(p->mem_sched_policy), frontendLatency(p->static_frontend_latency), backendLatency(p->static_backend_latency), @@ -74,37 +76,23 @@ readQueue.resize(p->qos_priorities); writeQueue.resize(p->qos_priorities); + dram->setCtrl(this); + // perform a basic check of the write thresholds if (p->write_low_thresh_perc >= p->write_high_thresh_perc) fatal("Write buffer low threshold %d must be smaller than the " "high threshold %d\n", p->write_low_thresh_perc, p->write_high_thresh_perc); - - // determine the rows per bank by looking at the total capacity - uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); - - DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, - AbstractMemory::size()); - - // create a DRAM interface - // will only populate the ranks if DRAM is configured - dram = new DRAMInterface(*this, p, capacity, range); - DPRINTF(DRAM, "Created DRAM interface \n"); } void DRAMCtrl::init() { - MemCtrl::init(); - if (!port.isConnected()) { fatal("DRAMCtrl %s is unconnected!\n", name()); } else { port.sendRangeChange(); } - - dram->init(range); - } void @@ -114,8 +102,6 @@ isTimingMode = system()->isTimingMode(); if (isTimingMode) { - dram->startupRanks(); - // shift the bus busy time sufficiently far ahead that we never // have to worry about negative values when computing the time for // the next request, this will add an insignificant bubble at the @@ -133,7 +119,7 @@ "is responding"); // do the actual memory access and turn the packet into a response - access(pkt); + dram->access(pkt); Tick latency = 0; if (pkt->hasData()) { @@ -263,7 +249,7 @@ // address of first DRAM packet is kept unaliged. Subsequent DRAM packets // are aligned to burst size boundaries. This is to ensure we accurately // check read packets against packets in write queue. - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; unsigned pktsServicedByWrQ = 0; BurstHelper* burst_helper = NULL; @@ -363,7 +349,7 @@ // if the request size is larger than burst size, the pkt is split into // multiple DRAM packets - const Addr base_addr = getCtrlAddr(pkt->getAddr()); + const Addr base_addr = dram->getCtrlAddr(pkt->getAddr()); Addr addr = base_addr; uint32_t burstSize = dram->bytesPerBurst(); for (int cnt = 0; cnt < pktCount; ++cnt) { @@ -526,7 +512,7 @@ DRAMPacket* dram_pkt = respQueue.front(); // media specific checks and functions when read response is complete - dram->respondEventDRAM(dram_pkt->rank); + dram->respondEvent(dram_pkt->rank); if (dram_pkt->burstHelper) { // it is a split packet @@ -727,12 +713,12 @@ void DRAMCtrl::accessAndRespond(PacketPtr pkt, Tick static_latency) { - DPRINTF(DRAM, "Responding to Address %lld.. 
",pkt->getAddr()); + DPRINTF(DRAM, "Responding to Address %lld.. \n",pkt->getAddr()); bool needsResponse = pkt->needsResponse(); // do the actual memory access which also turns the packet into a // response - access(pkt); + dram->access(pkt); // turn packet around to go back to requester if response expected if (needsResponse) { @@ -877,9 +863,9 @@ // if not, shift to next burst window Tick act_at; if (twoCycleActivate) - act_at = ctrl.verifyMultiCmd(act_tick, tAAD); + act_at = ctrl->verifyMultiCmd(act_tick, tAAD); else - act_at = ctrl.verifySingleCmd(act_tick); + act_at = ctrl->verifySingleCmd(act_tick); DPRINTF(DRAM, "Activate at tick %d\n", act_at); @@ -997,7 +983,7 @@ // Issuing an explicit PRE command // Verify that we have command bandwidth to issue the precharge // if not, shift to next burst window - pre_at = ctrl.verifySingleCmd(pre_tick); + pre_at = ctrl->verifySingleCmd(pre_tick); // enforce tPPD for (int i = 0; i < banksPerRank; i++) { rank_ref.banks[i].preAllowedAt = std::max(pre_at + tPPD, @@ -1096,9 +1082,9 @@ // verify that we have command bandwidth to issue the burst // if not, shift to next burst window if (dataClockSync && ((cmd_at - rank_ref.lastBurstTick) > clkResyncDelay)) - cmd_at = ctrl.verifyMultiCmd(cmd_at, tCK); + cmd_at = ctrl->verifyMultiCmd(cmd_at, tCK); else - cmd_at = ctrl.verifySingleCmd(cmd_at); + cmd_at = ctrl->verifySingleCmd(cmd_at); // if we are interleaving bursts, ensure that // 1) we don't double interleave on next burst issue @@ -1196,7 +1182,7 @@ bool got_more_hits = false; bool got_bank_conflict = false; - for (uint8_t i = 0; i < ctrl.numPriorities(); ++i) { + for (uint8_t i = 0; i < ctrl->numPriorities(); ++i) { auto p = queue[i].begin(); // keep on looking until we find a hit or reach the end of the // queue @@ -1267,6 +1253,7 @@ // Update latency stats stats.totMemAccLat += dram_pkt->readyTime - dram_pkt->entryTime; stats.totQLat += cmd_at - dram_pkt->entryTime; + stats.totBusLat += tBURST; } else { // Schedule write done event to decrement event count // after the readyTime has been reached @@ -1350,13 +1337,9 @@ // Update latency stats stats.masterReadTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; - - stats.bytesRead += dram->bytesPerBurst(); - stats.totBusLat += dram->burstDelay(); stats.masterReadBytes[dram_pkt->masterId()] += dram_pkt->size; } else { ++writesThisTime; - stats.bytesWritten += dram->bytesPerBurst(); stats.masterWriteBytes[dram_pkt->masterId()] += dram_pkt->size; stats.masterWriteTotalLat[dram_pkt->masterId()] += dram_pkt->readyTime - dram_pkt->entryTime; @@ -1458,8 +1441,9 @@ // Figure out which read request goes next // If we are changing command type, incorporate the minimum - // bus turnaround delay which will be tCS (different rank) case - to_read = chooseNext((*queue), switched_cmd_type ? tCS : 0); + // bus turnaround delay which will be rank to rank delay + to_read = chooseNext((*queue), switched_cmd_type ? + dram->rankDelay() : 0); if (to_read != queue->end()) { // candidate read found @@ -1538,7 +1522,8 @@ // If we are changing command type, incorporate the minimum // bus turnaround delay to_write = chooseNext((*queue), - switched_cmd_type ? std::min(dram->minRdToWr(), tCS) : 0); + switched_cmd_type ? 
std::min(dram->minRdToWr(), + dram->rankDelay()) : 0); if (to_write != queue->end()) { write_found = true; @@ -1611,11 +1596,8 @@ } } -DRAMInterface::DRAMInterface(DRAMCtrl& _ctrl, - const DRAMCtrlParams* _p, - const uint64_t capacity, - const AddrRange range) - : SimObject(_p), ctrl(_ctrl), +DRAMInterface::DRAMInterface(const DRAMInterfaceParams* _p) + : AbstractMemory(_p), addrMapping(_p->addr_mapping), burstSize((_p->devices_per_rank * _p->burst_length * _p->device_bus_width) / 8), @@ -1630,7 +1612,7 @@ bankGroupsPerRank(_p->bank_groups_per_rank), bankGroupArch(_p->bank_groups_per_rank > 0), banksPerRank(_p->banks_per_rank), rowsPerBank(0), - tCK(_p->tCK), tCL(_p->tCL), tBURST(_p->tBURST), + tCK(_p->tCK), tCS(_p->tCS), tCL(_p->tCL), tBURST(_p->tBURST), tBURST_MIN(_p->tBURST_MIN), tBURST_MAX(_p->tBURST_MAX), tRTW(_p->tRTW), tCCD_L_WR(_p->tCCD_L_WR), tCCD_L(_p->tCCD_L), tRCD(_p->tRCD), tRP(_p->tRP), tRAS(_p->tRAS), tWR(_p->tWR), tRTP(_p->tRTP), @@ -1646,13 +1628,15 @@ wrToRdDly(tCL + tBURST + _p->tWTR), rdToWrDly(tBURST + tRTW), wrToRdDlySameBG(tCL + _p->tBURST_MAX + _p->tWTR_L), rdToWrDlySameBG(tRTW + _p->tBURST_MAX), - rankToRankDly(ctrl.rankDelay() + tBURST), + rankToRankDly(tCS + tBURST), pageMgmt(_p->page_policy), maxAccessesPerRow(_p->max_accesses_per_row), timeStampOffset(0), activeRank(0), enableDRAMPowerdown(_p->enable_dram_powerdown), lastStatsResetTick(0), - stats(_ctrl, *this) + stats(*this), + readBufferSize(_p->read_buffer_size), + writeBufferSize(_p->write_buffer_size) { fatal_if(!isPowerOf2(burstSize), "DRAM burst size %d is not allowed, " "must be a power of two\n", burstSize); @@ -1664,7 +1648,7 @@ for (int i = 0; i < ranksPerChannel; i++) { DPRINTF(DRAM, "Creating DRAM rank %d \n", i); - Rank* rank = new Rank(ctrl, _p, i, *this); + Rank* rank = new Rank(_p, i, *this); ranks.push_back(rank); } @@ -1672,6 +1656,11 @@ uint64_t deviceCapacity = deviceSize / (1024 * 1024) * devicesPerRank * ranksPerChannel; + uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); + + DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, + AbstractMemory::size()); + // if actual DRAM size does not match memory capacity in system warn! 
if (deviceCapacity != capacity / (1024 * 1024)) warn("DRAM device capacity (%d Mbytes) does not match the " @@ -1726,8 +1715,10 @@ } void -DRAMInterface::init(AddrRange range) +DRAMInterface::init() { + AbstractMemory::init(); + // a bit of sanity checks on the interleaving, save it for here to // ensure that the system pointer is initialised if (range.interleaved()) { @@ -1749,7 +1740,7 @@ // channel striping has to be done at a granularity that // is equal or larger to a cache line - if (ctrl.system()->cacheLineSize() > range.granularity()) { + if (system()->cacheLineSize() > range.granularity()) { fatal("Channel interleaving of %s must be at least as large " "as the cache line size\n", name()); } @@ -1766,10 +1757,12 @@ } void -DRAMInterface::startupRanks() +DRAMInterface::startup() { - // timestamp offset should be in clock cycles for DRAMPower - timeStampOffset = divCeil(curTick(), tCK); + if (system()->isTimingMode()) { + // timestamp offset should be in clock cycles for DRAMPower + timeStampOffset = divCeil(curTick(), tCK); + } for (auto r : ranks) { r->startup(curTick() + tREFI - tRP); @@ -1815,7 +1808,7 @@ } void -DRAMInterface::respondEventDRAM(uint8_t rank) +DRAMInterface::respondEvent(uint8_t rank) { Rank& rank_ref = *ranks[rank]; @@ -1956,7 +1949,7 @@ std::max(ranks[i]->banks[j].preAllowedAt, curTick()) + tRP; // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl.inReadBusState(false) ? + const Tick col_allowed_at = ctrl->inReadBusState(false) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1996,9 +1989,15 @@ return make_pair(bank_mask, hidden_bank_prep); } -DRAMInterface::Rank::Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, - DRAMInterface& _dram) - : EventManager(&_ctrl), ctrl(_ctrl), dram(_dram), +DRAMInterface* +DRAMInterfaceParams::create() +{ + return new DRAMInterface(this); +} + +DRAMInterface::Rank::Rank(const DRAMInterfaceParams* _p, + int _rank, DRAMInterface& _dram) + : EventManager(&_dram), dram(_dram), pwrStateTrans(PWR_IDLE), pwrStatePostRefresh(PWR_IDLE), pwrStateTick(0), refreshDueAt(0), pwrState(PWR_IDLE), refreshState(REF_IDLE), inLowPowerState(false), rank(_rank), @@ -2011,7 +2010,7 @@ refreshEvent([this]{ processRefreshEvent(); }, name()), powerEvent([this]{ processPowerEvent(); }, name()), wakeUpEvent([this]{ processWakeUpEvent(); }, name()), - stats(_ctrl, *this) + stats(_dram, *this) { for (int b = 0; b < _p->banks_per_rank; b++) { banks[b].bank = b; @@ -2062,8 +2061,10 @@ DRAMInterface::Rank::isQueueEmpty() const { // check commmands in Q based on current bus direction - bool no_queued_cmds = (ctrl.inReadBusState(true) && (readEntries == 0)) - || (ctrl.inWriteBusState(true) && (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && + (readEntries == 0)) + || (dram.ctrl->inWriteBusState(true) && + (writeEntries == 0)); return no_queued_cmds; } @@ -2187,7 +2188,7 @@ // if a request is at the moment being handled and this request is // accessing the current rank then wait for it to finish if ((rank == dram.activeRank) - && (ctrl.requestEventScheduled())) { + && (dram.ctrl->requestEventScheduled())) { // hand control over to the request loop until it is // evaluated next DPRINTF(DRAM, "Refresh awaiting draining\n"); @@ -2262,7 +2263,7 @@ // or have outstanding ACT,RD/WR,Auto-PRE sequence scheduled // should have outstanding precharge or read response event assert(prechargeEvent.scheduled() || - 
ctrl.respondEventScheduled()); + dram.ctrl->respondEventScheduled()); // will start refresh when pwrState transitions to IDLE } @@ -2322,8 +2323,8 @@ assert(!powerEvent.scheduled()); - if ((ctrl.drainState() == DrainState::Draining) || - (ctrl.drainState() == DrainState::Drained)) { + if ((dram.ctrl->drainState() == DrainState::Draining) || + (dram.ctrl->drainState() == DrainState::Drained)) { // if draining, do not re-enter low-power mode. // simply go to IDLE and wait schedulePowerEvent(PWR_IDLE, curTick()); @@ -2548,10 +2549,10 @@ } // completed refresh event, ensure next request is scheduled - if (!ctrl.requestEventScheduled()) { + if (!dram.ctrl->requestEventScheduled()) { DPRINTF(DRAM, "Scheduling next request after refreshing" " rank %d\n", rank); - ctrl.restartScheduler(curTick()); + dram.ctrl->restartScheduler(curTick()); } } @@ -2610,8 +2611,8 @@ // bypass auto-refresh and go straight to SREF, where memory // will issue refresh immediately upon entry if (pwrStatePostRefresh == PWR_PRE_PDN && isQueueEmpty() && - (ctrl.drainState() != DrainState::Draining) && - (ctrl.drainState() != DrainState::Drained) && + (dram.ctrl->drainState() != DrainState::Draining) && + (dram.ctrl->drainState() != DrainState::Drained) && dram.enableDRAMPowerdown) { DPRINTF(DRAMState, "Rank %d bypassing refresh and transitioning " "to self refresh at %11u tick\n", rank, curTick()); @@ -2712,7 +2713,7 @@ bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (ctrl.inWriteBusState(true) && (writeEntries != 0)); + (dram.ctrl->inWriteBusState(true) && (writeEntries != 0)); } DRAMCtrl::CtrlStats::CtrlStats(DRAMCtrl &_ctrl) @@ -2723,15 +2724,15 @@ ADD_STAT(writeReqs, "Number of write requests accepted"), ADD_STAT(readBursts, - "Number of DRAM read bursts, " + "Number of controller read bursts, " "including those serviced by the write queue"), ADD_STAT(writeBursts, - "Number of DRAM write bursts, " + "Number of controller write bursts, " "including those merged in the write queue"), ADD_STAT(servicedByWrQ, - "Number of DRAM read bursts serviced by the write queue"), + "Number of controller read bursts serviced by the write queue"), ADD_STAT(mergedWrBursts, - "Number of DRAM write bursts merged with an existing one"), + "Number of controller write bursts merged with an existing one"), ADD_STAT(neitherReadNorWriteReqs, "Number of requests that are neither read nor write"), @@ -2739,9 +2740,6 @@ ADD_STAT(avgRdQLen, "Average read queue length when enqueuing"), ADD_STAT(avgWrQLen, "Average write queue length when enqueuing"), - ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), - ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), - ADD_STAT(numRdRetry, "Number of times read queue was full causing retry"), ADD_STAT(numWrRetry, "Number of times write queue was full causing retry"), @@ -2756,22 +2754,13 @@ ADD_STAT(wrPerTurnAround, "Writes before turning the bus around for reads"), - ADD_STAT(bytesRead, "Total number of bytes read from memory"), ADD_STAT(bytesReadWrQ, "Total number of bytes read from write queue"), - ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(bytesReadSys, "Total read bytes from the system interface side"), ADD_STAT(bytesWrittenSys, "Total written bytes from the system interface side"), - ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiByte/s"), - ADD_STAT(avgWrBW, "Average achieved write bandwidth in MiByte/s"), ADD_STAT(avgRdBWSys, "Average system read bandwidth in MiByte/s"), ADD_STAT(avgWrBWSys, "Average system 
write bandwidth in MiByte/s"), - ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), - - ADD_STAT(busUtil, "Data bus utilization in percentage"), - ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), - ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), ADD_STAT(totGap, "Total gap between requests"), ADD_STAT(avgGap, "Average gap between requests"), @@ -2803,12 +2792,11 @@ { using namespace Stats; - assert(ctrl._system); - const auto max_masters = ctrl._system->maxMasters(); + assert(ctrl.system()); + const auto max_masters = ctrl.system()->maxMasters(); avgRdQLen.precision(2); avgWrQLen.precision(2); - avgBusLat.precision(2); readPktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); writePktSize.init(ceilLog2(ctrl.dram->bytesPerBurst()) + 1); @@ -2823,14 +2811,9 @@ .init(ctrl.writeBufferSize) .flags(nozero); - avgRdBW.precision(2); - avgWrBW.precision(2); avgRdBWSys.precision(2); avgWrBWSys.precision(2); - peakBW.precision(2); - busUtil.precision(2); avgGap.precision(2); - busUtilWrite.precision(2); // per-master bytes read and written to memory masterReadBytes @@ -2862,9 +2845,6 @@ .flags(nonan) .precision(2); - busUtilRead - .precision(2); - masterWriteRate .flags(nozero | nonan) .precision(12); @@ -2878,7 +2858,7 @@ .precision(2); for (int i = 0; i < max_masters; i++) { - const std::string master = ctrl._system->getMasterName(i); + const std::string master = ctrl.system()->getMasterName(i); masterReadBytes.subname(i, master); masterReadRate.subname(i, master); masterWriteBytes.subname(i, master); @@ -2892,22 +2872,11 @@ } // Formula stats - avgBusLat = totBusLat / (readBursts - servicedByWrQ); - - avgRdBW = (bytesRead / 1000000) / simSeconds; - avgWrBW = (bytesWritten / 1000000) / simSeconds; avgRdBWSys = (bytesReadSys / 1000000) / simSeconds; avgWrBWSys = (bytesWrittenSys / 1000000) / simSeconds; - peakBW = (SimClock::Frequency / ctrl.dram->burstDataDelay()) * - ctrl.dram->bytesPerBurst() / 1000000; - - busUtil = (avgRdBW + avgWrBW) / peakBW * 100; avgGap = totGap / (readReqs + writeReqs); - busUtilRead = avgRdBW / peakBW * 100; - busUtilWrite = avgWrBW / peakBW * 100; - masterReadRate = masterReadBytes / simSeconds; masterWriteRate = masterWriteBytes / simSeconds; masterReadAvgLat = masterReadTotalLat / masterReadAccesses; @@ -2920,8 +2889,8 @@ dram.lastStatsResetTick = curTick(); } -DRAMInterface::DRAMStats::DRAMStats(DRAMCtrl &_ctrl, DRAMInterface &_dram) - : Stats::Group(&_ctrl, csprintf("dram").c_str()), +DRAMInterface::DRAMStats::DRAMStats(DRAMInterface &_dram) + : Stats::Group(&_dram), dram(_dram), ADD_STAT(readBursts, "Number of DRAM read bursts"), @@ -2931,10 +2900,13 @@ ADD_STAT(perBankWrBursts, "Per bank write bursts"), ADD_STAT(totQLat, "Total ticks spent queuing"), + ADD_STAT(totBusLat, "Total ticks spent in databus transfers"), ADD_STAT(totMemAccLat, "Total ticks spent from burst creation until serviced " "by the DRAM"), + ADD_STAT(avgQLat, "Average queueing delay per DRAM burst"), + ADD_STAT(avgBusLat, "Average bus latency per DRAM burst"), ADD_STAT(avgMemAccLat, "Average memory access latency per DRAM burst"), ADD_STAT(readRowHits, "Number of row buffer hits during reads"), @@ -2947,6 +2919,12 @@ ADD_STAT(bytesWritten, "Total number of bytes written to DRAM"), ADD_STAT(avgRdBW, "Average DRAM read bandwidth in MiBytes/s"), ADD_STAT(avgWrBW, "Average DRAM write bandwidth in MiBytes/s"), + ADD_STAT(peakBW, "Theoretical peak bandwidth in MiByte/s"), + + ADD_STAT(busUtil, "Data bus utilization in percentage"), + 
ADD_STAT(busUtilRead, "Data bus utilization in percentage for reads"), + ADD_STAT(busUtilWrite, "Data bus utilization in percentage for writes"), + ADD_STAT(pageHitRate, "Row buffer hit rate, read and write combined") { @@ -2958,6 +2936,7 @@ using namespace Stats; avgQLat.precision(2); + avgBusLat.precision(2); avgMemAccLat.precision(2); readRowHitRate.precision(2); @@ -2971,10 +2950,16 @@ dram.maxAccessesPerRow : dram.rowBufferSize) .flags(nozero); + peakBW.precision(2); + busUtil.precision(2); + busUtilWrite.precision(2); + busUtilRead.precision(2); + pageHitRate.precision(2); // Formula stats avgQLat = totQLat / readBursts; + avgBusLat = totBusLat / readBursts; avgMemAccLat = totMemAccLat / readBursts; readRowHitRate = (readRowHits / readBursts) * 100; @@ -2982,13 +2967,19 @@ avgRdBW = (bytesRead / 1000000) / simSeconds; avgWrBW = (bytesWritten / 1000000) / simSeconds; + peakBW = (SimClock::Frequency / dram.burstDataDelay()) * + dram.bytesPerBurst() / 1000000; + + busUtil = (avgRdBW + avgWrBW) / peakBW * 100; + busUtilRead = avgRdBW / peakBW * 100; + busUtilWrite = avgWrBW / peakBW * 100; pageHitRate = (writeRowHits + readRowHits) / (writeBursts + readBursts) * 100; } -DRAMInterface::RankStats::RankStats(DRAMCtrl &_ctrl, Rank &_rank) - : Stats::Group(&_ctrl, csprintf("dram_rank%d", _rank.rank).c_str()), +DRAMInterface::RankStats::RankStats(DRAMInterface &_dram, Rank &_rank) + : Stats::Group(&_dram, csprintf("rank%d", _rank.rank).c_str()), rank(_rank), ADD_STAT(actEnergy, "Energy for activate commands per rank (pJ)"), @@ -3047,7 +3038,7 @@ DRAMCtrl::recvFunctional(PacketPtr pkt) { // rely on the abstract memory - functionalAccess(pkt); + dram->functionalAccess(pkt); } Port & @@ -3093,6 +3084,7 @@ // if we switched to timing mode, kick things into action, // and behave as if we restored from a checkpoint startup(); + dram->startup(); } else if (isTimingMode && !system()->isTimingMode()) { // if we switch from timing mode, stop the refresh events to // not cause issues with KVM @@ -3112,7 +3104,7 @@ DRAMCtrl::MemoryPort::getAddrRanges() const { AddrRangeList ranges; - ranges.push_back(ctrl.getAddrRange()); + ranges.push_back(ctrl.dram->getAddrRange()); return ranges; } diff --git a/src/mem/dram_ctrl.hh b/src/mem/dram_ctrl.hh index dc030b1..417e935 100644 --- a/src/mem/dram_ctrl.hh +++ b/src/mem/dram_ctrl.hh @@ -55,12 +55,15 @@ #include "enums/AddrMap.hh" #include "enums/MemSched.hh" #include "enums/PageManage.hh" +#include "mem/abstract_mem.hh" #include "mem/drampower.hh" #include "mem/qos/mem_ctrl.hh" #include "mem/qport.hh" #include "params/DRAMCtrl.hh" #include "sim/eventq.hh" +class DRAMInterfaceParams; + /** * A basic class to track the bank state, i.e. what row is * currently open (if any), when is the bank free to accept a new @@ -242,7 +245,7 @@ * The DRAMInterface includes a class for individual ranks * and per rank functions. 
*/ -class DRAMInterface : public SimObject +class DRAMInterface : public AbstractMemory { private: /** @@ -342,7 +345,7 @@ class Rank; struct RankStats : public Stats::Group { - RankStats(DRAMCtrl &ctrl, Rank &rank); + RankStats(DRAMInterface &dram, Rank &rank); void regStats() override; void resetStats() override; @@ -408,13 +411,6 @@ */ class Rank : public EventManager { - protected: - - /** - * A reference to the parent DRAMCtrl instance - */ - DRAMCtrl& ctrl; - private: /** @@ -534,10 +530,10 @@ */ Tick lastBurstTick; - Rank(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p, int _rank, + Rank(const DRAMInterfaceParams* _p, int _rank, DRAMInterface& _dram); - const std::string name() const { return csprintf("dram_%d", rank); } + const std::string name() const { return csprintf("%d", rank); } /** * Kick off accounting for power and refresh states and @@ -659,15 +655,16 @@ * @param next Memory Command * @return true if timeStamp of Command 1 < timeStamp of Command 2 */ - static bool sortTime(const Command& cmd, const Command& cmd_next) + static bool + sortTime(const Command& cmd, const Command& cmd_next) { return cmd.timeStamp < cmd_next.timeStamp; - }; + } /** - * A reference to the parent DRAMCtrl instance + * A pointer to the parent DRAMCtrl instance */ - DRAMCtrl& ctrl; + DRAMCtrl* ctrl; /** * Memory controller configuration initialized based on parameter @@ -698,6 +695,7 @@ * DRAM timing requirements */ const Tick M5_CLASS_VAR_USED tCK; + const Tick tCS; const Tick tCL; const Tick tBURST; const Tick tBURST_MIN; @@ -781,7 +779,7 @@ struct DRAMStats : public Stats::Group { - DRAMStats(DRAMCtrl &ctrl, DRAMInterface &dram); + DRAMStats(DRAMInterface &dram); void regStats() override; void resetStats() override; @@ -798,10 +796,12 @@ // Latencies summed over all requests Stats::Scalar totQLat; + Stats::Scalar totBusLat; Stats::Scalar totMemAccLat; // Average latencies per request Stats::Formula avgQLat; + Stats::Formula avgBusLat; Stats::Formula avgMemAccLat; // Row hit count and rate @@ -817,6 +817,11 @@ // Average bandwidth Stats::Formula avgRdBW; Stats::Formula avgWrBW; + Stats::Formula peakBW; + // bus utilization + Stats::Formula busUtil; + Stats::Formula busUtilRead; + Stats::Formula busUtilWrite; Stats::Formula pageHitRate; }; @@ -828,16 +833,28 @@ std::vector<Rank*> ranks; public: + + /** + * Buffer sizes for read and write queues in the controller + * These are passed to the controller on instantiation + * Defining them here allows for buffers to be resized based + * on memory type / configuration. + */ + const uint32_t readBufferSize; + const uint32_t writeBufferSize; + + /** Setting a pointer to the controller */ + void setCtrl(DRAMCtrl* _ctrl) { ctrl = _ctrl; } + /** * Initialize the DRAM interface and verify parameters - * @param range is the address range for this interface */ - void init(AddrRange range); + void init() override; /** * Iterate through dram ranks and instantiate per rank startup routine */ - void startupRanks(); + void startup() override; /** * Iterate through dram ranks to exit self-refresh in order to drain @@ -861,15 +878,26 @@ void suspend(); /** + * Get an address in a dense range which starts from 0. The input + * address is the physical address of the request in an address + * space that contains other SimObjects apart from this + * controller. 
+     *
+     * @param addr The input address which should be in the addrRange
+     * @return An address in the continuous range [0, max)
+     */
+    Addr getCtrlAddr(Addr addr) { return range.getOffset(addr); }
+
+    /**
+     * @return number of bytes in a burst for this interface
+     */
-    uint32_t bytesPerBurst() const { return burstSize; };
+    uint32_t bytesPerBurst() const { return burstSize; }
 
     /**
      *
     * @return number of ranks per channel for this interface
      */
-    uint32_t numRanks() const { return ranksPerChannel; };
+    uint32_t numRanks() const { return ranksPerChannel; }
 
     /*
      * @return time to send a burst of data
      */
@@ -879,7 +907,8 @@
     /*
      * @return time to send a burst of data without gaps
      */
-    Tick burstDataDelay() const
+    Tick
+    burstDataDelay() const
     {
         return (burstInterleave ? tBURST_MAX / 2 : tBURST);
     }
@@ -893,7 +922,14 @@
      *
      * @return additional bus turnaround required for read-to-write
      */
-    Tick minRdToWr() const { return tRTW; };
+    Tick minRdToWr() const { return tRTW; }
+
+    /**
+     * Determine the required delay for an access to a different rank
+     *
+     * @return required rank to rank delay
+     */
+    Tick rankDelay() const { return tCS; }
 
     /*
      * Function to calulate RAS cycle time for use within and
@@ -957,7 +993,8 @@
      * This requires the DRAM to be in the
      * REF IDLE state
      */
-    bool burstReady(uint8_t rank) const
+    bool
+    burstReady(uint8_t rank) const
     {
         return ranks[rank]->inRefIdleState();
     }
@@ -979,7 +1016,7 @@
      *
      * @param rank Specifies rank associated with read burst
      */
-    void respondEventDRAM(uint8_t rank);
+    void respondEvent(uint8_t rank);
 
     /**
      * Check the refresh state to determine if refresh needs
@@ -989,8 +1026,7 @@
      */
     void checkRefreshState(uint8_t rank);
 
-    DRAMInterface(DRAMCtrl& _ctrl, const DRAMCtrlParams* _p,
-                  uint64_t capacity, AddrRange range);
+    DRAMInterface(const DRAMInterfaceParams* _p);
 };
 
 /**
@@ -1141,20 +1177,6 @@
     void accessAndRespond(PacketPtr pkt, Tick static_latency);
 
     /**
-     * Get an address in a dense range which starts from 0. The input
-     * address is the physical address of the request in an address
-     * space that contains other SimObjects apart from this
-     * controller.
-     *
-     * @param addr The intput address which should be in the addrRange
-     * @return An address in the continues range [0, max)
-     */
-    Addr getCtrlAddr(Addr addr)
-    {
-        return range.getOffset(addr);
-    }
-
-    /**
      * The memory schduler/arbiter - picks which request needs to
      * go next, based on the specified policy such as FCFS or FR-FCFS
      * and moves it to the head of the queue.
@@ -1237,6 +1259,11 @@
     std::unordered_multiset<Tick> burstTicks;
 
     /**
+     * Create pointer to interface of the actual dram media
+     */
+    DRAMInterface* const dram;
+
+    /**
      * The following are basic design parameters of the memory
     * controller, and are initialized based on parameter values.
      * The rowsPerBank is determined based on the capacity, number of
@@ -1251,12 +1278,6 @@
     uint32_t readsThisTime;
 
     /**
-     * Basic memory timing parameters initialized based on parameter
-     * values. These will be used across memory interfaces.
-     */
-    const Tick tCS;
-
-    /**
      * Memory controller configuration initialized based on parameter
      * values.
*/ @@ -1310,10 +1331,6 @@ // Average queue lengths Stats::Average avgRdQLen; Stats::Average avgWrQLen; - // Latencies summed over all requests - Stats::Scalar totBusLat; - // Average latencies per request - Stats::Formula avgBusLat; Stats::Scalar numRdRetry; Stats::Scalar numWrRetry; @@ -1324,21 +1341,12 @@ Stats::Histogram rdPerTurnAround; Stats::Histogram wrPerTurnAround; - Stats::Scalar bytesRead; Stats::Scalar bytesReadWrQ; - Stats::Scalar bytesWritten; Stats::Scalar bytesReadSys; Stats::Scalar bytesWrittenSys; // Average bandwidth - Stats::Formula avgRdBW; - Stats::Formula avgWrBW; Stats::Formula avgRdBWSys; Stats::Formula avgWrBWSys; - Stats::Formula peakBW; - // bus utilization - Stats::Formula busUtil; - Stats::Formula busUtilRead; - Stats::Formula busUtilWrite; Stats::Scalar totGap; Stats::Formula avgGap; @@ -1367,11 +1375,6 @@ CtrlStats stats; /** - * Create pointer to interfasce to the actual media - */ - DRAMInterface* dram; - - /** * Upstream caches need this packet until true is returned, so * hold it for deletion until a subsequent call */ @@ -1449,13 +1452,6 @@ void restartScheduler(Tick tick) { schedule(nextReqEvent, tick); } /** - * Determine the required delay for an access to a different rank - * - * @return required rank to rank delay - */ - Tick rankDelay() const { return tCS; } - - /** * Check the current direction of the memory channel * * @param next_state Check either the current or next bus state diff --git a/src/mem/drampower.cc b/src/mem/drampower.cc index 13551a0..96dcb55 100644 --- a/src/mem/drampower.cc +++ b/src/mem/drampower.cc @@ -40,13 +40,13 @@ #include "base/intmath.hh" #include "sim/core.hh" -DRAMPower::DRAMPower(const DRAMCtrlParams* p, bool include_io) : +DRAMPower::DRAMPower(const DRAMInterfaceParams* p, bool include_io) : powerlib(libDRAMPower(getMemSpec(p), include_io)) { } Data::MemArchitectureSpec -DRAMPower::getArchParams(const DRAMCtrlParams* p) +DRAMPower::getArchParams(const DRAMInterfaceParams* p) { Data::MemArchitectureSpec archSpec; archSpec.burstLength = p->burst_length; @@ -68,7 +68,7 @@ } Data::MemTimingSpec -DRAMPower::getTimingParams(const DRAMCtrlParams* p) +DRAMPower::getTimingParams(const DRAMInterfaceParams* p) { // Set the values that are used for power calculations and ignore // the ones only used by the controller functionality in DRAMPower @@ -100,7 +100,7 @@ } Data::MemPowerSpec -DRAMPower::getPowerParams(const DRAMCtrlParams* p) +DRAMPower::getPowerParams(const DRAMInterfaceParams* p) { // All DRAMPower currents are in mA Data::MemPowerSpec powerSpec; @@ -132,7 +132,7 @@ } Data::MemorySpecification -DRAMPower::getMemSpec(const DRAMCtrlParams* p) +DRAMPower::getMemSpec(const DRAMInterfaceParams* p) { Data::MemorySpecification memSpec; memSpec.memArchSpec = getArchParams(p); @@ -142,7 +142,18 @@ } bool -DRAMPower::hasTwoVDD(const DRAMCtrlParams* p) +DRAMPower::hasTwoVDD(const DRAMInterfaceParams* p) { return p->VDD2 == 0 ? 
false : true;
 }
+
+uint8_t
+DRAMPower::getDataRate(const DRAMInterfaceParams* p)
+{
+    uint32_t burst_cycles = divCeil(p->tBURST_MAX, p->tCK);
+    uint8_t data_rate = p->burst_length / burst_cycles;
+    // 4 for GDDR5
+    if (data_rate != 1 && data_rate != 2 && data_rate != 4 && data_rate != 8)
+        fatal("Got unexpected data rate %d, should be 1 or 2 or 4 or 8\n",
+              data_rate);
+    return data_rate;
+}
diff --git a/src/mem/drampower.hh b/src/mem/drampower.hh
index da24bca..da68a78 100644
--- a/src/mem/drampower.hh
+++ b/src/mem/drampower.hh
@@ -44,7 +44,7 @@
 #define __MEM_DRAM_POWER_HH__
 
 #include "libdrampower/LibDRAMPower.h"
-#include "params/DRAMCtrl.hh"
+#include "params/DRAMInterface.hh"
 
 /**
  * DRAMPower is a standalone tool which calculates the power consumed by a
@@ -57,38 +57,44 @@
 
     /**
      * Transform the architechture parameters defined in
-     * DRAMCtrlParams to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemArchitectureSpec getArchParams(const DRAMCtrlParams* p);
+    static Data::MemArchitectureSpec getArchParams(
+        const DRAMInterfaceParams* p);
 
     /**
-     * Transforms the timing parameters defined in DRAMCtrlParams to
+     * Transforms the timing parameters defined in DRAMInterfaceParams to
      * the memSpec of DRAMPower
      */
-    static Data::MemTimingSpec getTimingParams(const DRAMCtrlParams* p);
+    static Data::MemTimingSpec getTimingParams(const DRAMInterfaceParams* p);
 
     /**
      * Transforms the power and current parameters defined in
-     * DRAMCtrlParam to the memSpec of DRAMPower
+     * DRAMInterfaceParams to the memSpec of DRAMPower
      */
-    static Data::MemPowerSpec getPowerParams(const DRAMCtrlParams* p);
+    static Data::MemPowerSpec getPowerParams(const DRAMInterfaceParams* p);
+
+    /**
+     * Determine the data rate in beats per clock (1, 2, 4, or 8).
+     */
+    static uint8_t getDataRate(const DRAMInterfaceParams* p);
 
     /**
      * Determine if DRAM has two voltage domains (or one)
      */
-    static bool hasTwoVDD(const DRAMCtrlParams* p);
+    static bool hasTwoVDD(const DRAMInterfaceParams* p);
 
     /**
-     * Return an instance of MemSpec based on the DRAMCtrlParams
+     * Return an instance of MemSpec based on the DRAMInterfaceParams
      */
-    static Data::MemorySpecification getMemSpec(const DRAMCtrlParams* p);
+    static Data::MemorySpecification getMemSpec(const DRAMInterfaceParams* p);
 
   public:
 
     // Instance of DRAMPower Library
     libDRAMPower powerlib;
 
-    DRAMPower(const DRAMCtrlParams* p, bool include_io);
+    DRAMPower(const DRAMInterfaceParams* p, bool include_io);
 };
diff --git a/src/mem/qos/QoSMemCtrl.py b/src/mem/qos/QoSMemCtrl.py
index 1cd3f0b..f55105b 100644
--- a/src/mem/qos/QoSMemCtrl.py
+++ b/src/mem/qos/QoSMemCtrl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 ARM Limited
+# Copyright (c) 2018-2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -34,18 +34,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.params import * -from m5.objects.AbstractMemory import AbstractMemory +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject from m5.objects.QoSTurnaround import * # QoS Queue Selection policy used to select packets among same-QoS queues class QoSQPolicy(Enum): vals = ["fifo", "lifo", "lrg"] -class QoSMemCtrl(AbstractMemory): +class QoSMemCtrl(ClockedObject): type = 'QoSMemCtrl' cxx_header = "mem/qos/mem_ctrl.hh" cxx_class = 'QoS::MemCtrl' abstract = True + system = Param.System(Parent.any, "System that the controller belongs to.") + ##### QoS support parameters #### # Number of priorities in the system diff --git a/src/mem/qos/QoSMemSinkCtrl.py b/src/mem/qos/QoSMemSinkCtrl.py index 6c4f263..fafac64 100644 --- a/src/mem/qos/QoSMemSinkCtrl.py +++ b/src/mem/qos/QoSMemSinkCtrl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ from m5.params import * from m5.objects.QoSMemCtrl import * +from m5.objects.QoSMemSinkInterface import * class QoSMemSinkCtrl(QoSMemCtrl): type = 'QoSMemSinkCtrl' @@ -44,6 +45,10 @@ cxx_class = "QoS::MemSinkCtrl" port = ResponsePort("Response ports") + + interface = Param.QoSMemSinkInterface(QoSMemSinkInterface(), + "Interface to memory") + # the basic configuration of the controller architecture, note # that each entry corresponds to a burst for the specific DRAM # configuration (e.g. x32 with burst length 8 is 32 bytes) and not @@ -59,5 +64,3 @@ # response latency - time to issue a response once a request is serviced response_latency = Param.Latency("20ns", "Memory response latency") - - diff --git a/src/mem/qos/QoSMemSinkInterface.py b/src/mem/qos/QoSMemSinkInterface.py new file mode 100644 index 0000000..5c79f64 --- /dev/null +++ b/src/mem/qos/QoSMemSinkInterface.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.objects.AbstractMemory import AbstractMemory + +class QoSMemSinkInterface(AbstractMemory): + type = 'QoSMemSinkInterface' + cxx_header = "mem/qos/mem_sink.hh" diff --git a/src/mem/qos/SConscript b/src/mem/qos/SConscript index f8601b6..1d90f9c 100644 --- a/src/mem/qos/SConscript +++ b/src/mem/qos/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2018 ARM Limited +# Copyright (c) 2018-2020 ARM Limited # All rights reserved # # The license below extends only to copyright in the software and shall @@ -37,6 +37,7 @@ SimObject('QoSMemCtrl.py') SimObject('QoSMemSinkCtrl.py') +SimObject('QoSMemSinkInterface.py') SimObject('QoSPolicy.py') SimObject('QoSTurnaround.py') diff --git a/src/mem/qos/mem_ctrl.cc b/src/mem/qos/mem_ctrl.cc index 50e6035..190960b 100644 --- a/src/mem/qos/mem_ctrl.cc +++ b/src/mem/qos/mem_ctrl.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited + * Copyright (c) 2017-2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -42,7 +42,7 @@ namespace QoS { MemCtrl::MemCtrl(const QoSMemCtrlParams * p) - : AbstractMemory(p), + : ClockedObject(p), policy(p->qos_policy), turnPolicy(p->qos_turnaround_policy), queuePolicy(QueuePolicy::create(p)), @@ -51,7 +51,8 @@ qosSyncroScheduler(p->qos_syncro_scheduler), totalReadQueueSize(0), totalWriteQueueSize(0), busState(READ), busStateNext(READ), - stats(*this) + stats(*this), + _system(p->system) { // Set the priority policy if (policy) { @@ -77,12 +78,6 @@ {} void -MemCtrl::init() -{ - AbstractMemory::init(); -} - -void MemCtrl::logRequest(BusState dir, MasterID m_id, uint8_t qos, Addr addr, uint64_t entries) { diff --git a/src/mem/qos/mem_ctrl.hh b/src/mem/qos/mem_ctrl.hh index 0e29fcc..5d7c9d6 100644 --- a/src/mem/qos/mem_ctrl.hh +++ b/src/mem/qos/mem_ctrl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited + * Copyright (c) 2020 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -36,10 +36,10 @@ */ #include "debug/QOS.hh" -#include "mem/abstract_mem.hh" -#include "mem/qos/q_policy.hh" #include "mem/qos/policy.hh" +#include "mem/qos/q_policy.hh" #include "params/QoSMemCtrl.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include <unordered_map> @@ -56,7 +56,7 @@ * which support QoS - it provides access to a set of QoS * scheduling policies */ -class MemCtrl: public AbstractMemory +class MemCtrl : public ClockedObject { public: /** Bus Direction */ @@ -151,6 +151,9 @@ Stats::Scalar numStayWriteState; } stats; + /** Pointer to the System object */ + System* _system; + /** * Initializes dynamically counters and * statistics for a given Master @@ -266,11 +269,6 @@ virtual ~MemCtrl(); /** - * Initializes this object - */ - void init() override; - - /** * Gets the current bus state * * @return current bus state @@ -346,6 +344,10 @@ * @return total number of priority levels */ uint8_t numPriorities() const { return _numPriorities; } + + /** read the system pointer + * 
diff --git a/src/mem/qos/mem_sink.cc b/src/mem/qos/mem_sink.cc
index 1f104e4..dbdf548 100644
--- a/src/mem/qos/mem_sink.cc
+++ b/src/mem/qos/mem_sink.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -40,6 +40,7 @@
 #include "debug/Drain.hh"
 #include "debug/QOS.hh"
 #include "mem_sink.hh"
+#include "params/QoSMemSinkInterface.hh"
 #include "sim/system.hh"

 namespace QoS {
@@ -50,12 +51,15 @@
     memoryPacketSize(p->memory_packet_size),
     readBufferSize(p->read_buffer_size),
     writeBufferSize(p->write_buffer_size),
     port(name() + ".port", *this),
+    interface(p->interface),
     retryRdReq(false), retryWrReq(false),
     nextRequest(0), nextReqEvent(this)
 {
     // Resize read and write queue to allocate space
     // for configured QoS priorities
     readQueue.resize(numPriorities());
     writeQueue.resize(numPriorities());
+
+    interface->setMemCtrl(this);
 }

 MemSinkCtrl::~MemSinkCtrl()
@@ -92,7 +96,7 @@
              "%s Should not see packets where cache is responding\n",
              __func__);

-    access(pkt);
+    interface->access(pkt);

     return responseLatency;
 }
@@ -101,7 +105,7 @@
 {
     pkt->pushLabel(name());

-    functionalAccess(pkt);
+    interface->functionalAccess(pkt);

     pkt->popLabel();
 }
@@ -279,7 +283,7 @@

     // Do the actual memory access which also turns the packet
     // into a response
-    access(pkt);
+    interface->access(pkt);

     // Log the response
     logResponse(pkt->isRead()? READ : WRITE,
@@ -351,7 +355,7 @@
 MemSinkCtrl::MemoryPort::getAddrRanges() const
 {
     AddrRangeList ranges;
-    ranges.push_back(memory.getAddrRange());
+    ranges.push_back(memory.interface->getAddrRange());

     return ranges;
 }
@@ -390,3 +394,13 @@

     return new QoS::MemSinkCtrl(this);
 }
+QoSMemSinkInterface::QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p)
+    : AbstractMemory(_p)
+{
+}
+
+QoSMemSinkInterface*
+QoSMemSinkInterfaceParams::create()
+{
+    return new QoSMemSinkInterface(this);
+}
diff --git a/src/mem/qos/mem_sink.hh b/src/mem/qos/mem_sink.hh
index 9a51269..5f6c1be 100644
--- a/src/mem/qos/mem_sink.hh
+++ b/src/mem/qos/mem_sink.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited
+ * Copyright (c) 2018-2020 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -41,10 +41,14 @@
 #ifndef __MEM_QOS_MEM_SINK_HH__
 #define __MEM_QOS_MEM_SINK_HH__

+#include "mem/abstract_mem.hh"
 #include "mem/qos/mem_ctrl.hh"
 #include "mem/qport.hh"
 #include "params/QoSMemSinkCtrl.hh"

+class QoSMemSinkInterfaceParams;
+class QoSMemSinkInterface;
+
 namespace QoS {

 /**
@@ -163,6 +167,11 @@
     /** Memory slave port */
     MemoryPort port;

+    /**
+     * Pointer to the interface of the actual memory media
+     */
+    QoSMemSinkInterface* const interface;
+
     /** Read request pending */
     bool retryRdReq;

@@ -244,4 +253,17 @@

 } // namespace QoS

+class QoSMemSinkInterface : public AbstractMemory
+{
+  public:
+    /** Set a back-pointer to the memory controller */
+    void setMemCtrl(QoS::MemSinkCtrl* _ctrl) { ctrl = _ctrl; }
+
+    /** Pointer to the controller */
+    QoS::MemSinkCtrl* ctrl;
+
+    QoSMemSinkInterface(const QoSMemSinkInterfaceParams* _p);
+};
+
+
 #endif /* __MEM_QOS_MEM_SINK_HH__ */
diff --git a/tests/gem5/configs/base_config.py b/tests/gem5/configs/base_config.py
index b5bddf4..cbea768 100644
--- a/tests/gem5/configs/base_config.py
+++ b/tests/gem5/configs/base_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012-2013, 2017-2018 ARM Limited
+# Copyright (c) 2012-2013, 2017-2018, 2020 ARM Limited
 # All rights reserved.
 #
 # The license below extends only to copyright in the software and shall
@@ -220,7 +220,12 @@
         super(BaseSESystem, self).init_system(system)

     def create_system(self):
-        system = System(physmem = self.mem_class(),
+        if issubclass(self.mem_class, m5.objects.DRAMInterface):
+            mem_ctrl = DRAMCtrl()
+            mem_ctrl.dram = self.mem_class()
+        else:
+            mem_ctrl = self.mem_class()
+        system = System(physmem = mem_ctrl,
                         membus = SystemXBar(),
                         mem_mode = self.mem_mode,
                         multi_thread = (self.num_threads > 1))
@@ -272,8 +277,16 @@
         else:
             # create the memory controllers and connect them, stick with
             # the physmem name to avoid bumping all the reference stats
-            system.physmem = [self.mem_class(range = r)
-                              for r in system.mem_ranges]
+            if issubclass(self.mem_class, m5.objects.DRAMInterface):
+                mem_ctrls = []
+                for r in system.mem_ranges:
+                    mem_ctrl = DRAMCtrl()
+                    mem_ctrl.dram = self.mem_class(range = r)
+                    mem_ctrls.append(mem_ctrl)
+                system.physmem = mem_ctrls
+            else:
+                system.physmem = [self.mem_class(range = r)
+                                  for r in system.mem_ranges]

             for i in range(len(system.physmem)):
                 system.physmem[i].port = system.membus.master
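The base_config.py hunk above shows the migration pattern config scripts
follow after this change: a DRAM class is instantiated as an interface
hanging off a DRAMCtrl instead of serving as the controller itself. A
minimal sketch (DDR3_1600_8x8 stands in for any DRAMInterface subclass;
the range is arbitrary):

    from m5.objects import DRAMCtrl, DDR3_1600_8x8, AddrRange

    mem_ctrl = DRAMCtrl()
    # The address range now lives on the interface, not the controller
    mem_ctrl.dram = DDR3_1600_8x8(range=AddrRange('512MB'))
    # The controller still owns the request port, e.g.:
    # mem_ctrl.port = system.membus.master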
--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/28968
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I6a368b845d574a713c7196c5671188ca8c1dc5e8
Gerrit-Change-Number: 28968
Gerrit-PatchSet: 13
Gerrit-Owner: Wendy Elsasser <wendy.elsasser(a)arm.com>
Gerrit-Reviewer: Daniel Carvalho <odanrc(a)yahoo.com.br>
Gerrit-Reviewer: Jason Lowe-Power <power.jg(a)gmail.com>
Gerrit-Reviewer: John Alsop <johnathan.alsop(a)amd.com>
Gerrit-Reviewer: Matthew Poremba <matthew.poremba(a)amd.com>
Gerrit-Reviewer: Nikos Nikoleris <nikos.nikoleris(a)arm.com>
Gerrit-Reviewer: Srikant Bharadwaj <srikant.bharadwaj(a)amd.com>
Gerrit-Reviewer: kokoro <noreply+kokoro(a)google.com>
Gerrit-MessageType: merged