gem5-users@gem5.org

The gem5 Users mailing list

View all threads

Exception when running libtorch simulation in SE mode

CV
Caio Vieira
Fri, Jul 14, 2023 10:02 AM

Hi everyone,

I'm trying to execute gem5 simulations using libtorch in SE mode.
However, I get the following error message:

--- Error message ---
...
terminate called after throwing an instance of 'std::runtime_error'
  what():  expected eof but found 'ident' here:
aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[]
params, bool has_biases, int num_layers, float dropout, bool train, bool
bidirectional, bool batch_first, *, ScalarType? dtype=None, bool
use_dynamic=False) dy
namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
                          ~~~~~ <--- HERE

build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
rt_sigprocmask(...)
      (further warnings will be suppressed)
build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...)
      (further warnings will be suppressed)
build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred:
fault (General-Protection) detected @ PC
(0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
Memory Usage: 11842716 KBytes
Program aborted at tick 294083905383
--- BEGIN LIBC BACKTRACE ---
...

The simulation fails before the first line of the main function. I
believe that it is failing to load the libtorch library.
Unfortunately, it is not possible to build libtorch with "-static" since
their static build has been broken for quite a long
time: https://github.com/pytorch/pytorch/issues/21737
I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested
using different GCC versions to build the simulated binary.

For anyone interested in reproducing the error, I'm sending a "setup.sh"
script to create a minimal reproducible environment.
Simply copy and paste the script below and name it as "setup.sh" in a
new directory, then:

source setup.sh
cmake -B build -S .
cmake --build build
./<gem5> config.py build/main

Best regards,
Caio Vieira

--- setup.sh ---

#!/bin/bash

# Bash script to create minimal reproducible environment for libtorch simulation
# bug. This script creates necessary files such as a CMakeLists.txt and a minimal
# main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it
# in a "_deps" folder. Steps to reproduce the bug:
#   ./<this-script>
#   cmake -B build -S .
#   cmake --build build
#   ./<gem5> config.py build/main

function create_cmake() {
    cat > CMakeLists.txt <<- \EOF
cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

# Download and manage libtorch dependency

set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
    file(DOWNLOAD
https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip

        "${DEPENDENCY_DIR}/libtorch.zip")
    file(ARCHIVE_EXTRACT
        INPUT "${DEPENDENCY_DIR}/libtorch.zip"
        DESTINATION "${DEPENDENCY_DIR}")
    file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
endif()
set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

project(main)

# Create executable

add_executable(${CMAKE_PROJECT_NAME} main.cpp)
target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC
${TORCH_INCLUDE_DIRS})
target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
message("Torch Libraries: ${TORCH_LIBRARIES}")
EOF
}

function create_main() {
    cat > main.cpp <<- \EOF
#include <iostream>
#include <torch/torch.h>

int main(int argc, char *argv[]) {
    std::cout << "Hello World" << std::endl;
}
EOF
}

function create_gem5_config() {
    cat > config.py <<- \EOF

# This script must be executed by gem5
# Usage: ./<gem5> <this-script> <binary>

import sys

from gem5.utils.requires import ISA
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory.single_channel import SingleChannelDDR3_1600
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.components.processors.cpu_types import CPUTypes
from gem5.resources.resource import CustomResource
from gem5.simulate.simulator import Simulator

if len(sys.argv) == 1:
    print("Provide a binary as argument", file=sys.stderr)
    sys.exit(1)

cache_hierarchy = NoCache()
memory = SingleChannelDDR3_1600("8GiB")
processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1,
isa=ISA.X86)

board = SimpleBoard(
        clk_freq="3GHz",
        processor=processor,
        memory=memory,
        cache_hierarchy=cache_hierarchy
)

binary = CustomResource(sys.argv[1])
board.set_se_binary_workload(binary)

simulator = Simulator(board=board)
simulator.run()
EOF
}

create_cmake
create_main
create_gem5_config

Hi everyone, I'm trying to execute gem5 simulations using libtorch in SE mode. However, I get the following error message: --- Error message --- ... terminate called after throwing an instance of 'std::runtime_error'   what():  expected eof but found 'ident' here: aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)                           ~~~~~ <--- HERE build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigprocmask(...)       (further warnings will be suppressed) build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...)       (further warnings will be suppressed) build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) Memory Usage: 11842716 KBytes Program aborted at tick 294083905383 --- BEGIN LIBC BACKTRACE --- ... The simulation fails before the first line of the main function. I believe that it is failing to load the libtorch library. Unfortunately, it is not possible to build libtorch with "-static" since their static builds is broken for quiet a long time: https://github.com/pytorch/pytorch/issues/21737 I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using different GCC versions to build the simulated binary. For anyone interested in reproducing the error, I'm sending a "setup.sh" script to create a minimal reproducible environment. Simply copy and paste the script below and name it as "setup.sh" in a new directory, then: source setup.sh cmake --B build -S . cmake --build build ./<gem5> config.py build/main Best regards, Caio Vieira --- setup.sh --- #!/bin/bash # Bash script to create minimal reproducible environment for libtorch simulation # bug. 
This script creates necessary files such as a CMakeLists.txt and a minimal # main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it # in a ""_deps"" folder. Steps to reproduce the bug: # ./<this-script> # cmake -B build -S . # cmake --build build # ./<gem5> config.py build/main function create_cmake() {     cat > CMakeLists.txt <<- \EOF cmake_minimum_required(VERSION 3.22 FATAL_ERROR) # Download and manage libtorch dependency set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")     file(DOWNLOAD https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip         "${DEPENDENCY_DIR}/libtorch.zip")     file(ARCHIVE_EXTRACT         INPUT "${DEPENDENCY_DIR}/libtorch.zip"         DESTINATION "${DEPENDENCY_DIR}")     file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") endif() set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") find_package(Torch REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") project(main) # Create executable add_executable(${CMAKE_PROJECT_NAME} main.cpp) target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC ${TORCH_INCLUDE_DIRS}) target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) message("Torch Libraries: ${TORCH_LIBRARIES}") EOF } function create_main() {     cat > main.cpp <<- \EOF #include <iostream> #include <torch/torch.h> int main(int argc, char *argv[]) {     std::cout << "Hello World" << std::endl; } EOF } function create_gem5_config() {     cat > config.py <<- \EOF # This script must be executed by gem5 # Usage: ./<gem5> <this-script> <binary> import sys from gem5.utils.requires import ISA from gem5.components.boards.simple_board import SimpleBoard from gem5.components.cachehierarchies.classic.no_cache import NoCache from gem5.components.memory.single_channel import SingleChannelDDR3_1600 from 
gem5.components.processors.simple_processor import SimpleProcessor from gem5.components.processors.cpu_types import CPUTypes from gem5.resources.resource import CustomResource from gem5.simulate.simulator import Simulator if len(sys.argv) == 1:     print("Provide a binary as argument", file=sys.stderr)     sys.exit(1) cache_hierarchy = NoCache() memory = SingleChannelDDR3_1600("8GiB") processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, isa=ISA.X86) board = SimpleBoard(         clk_freq="3GHz",         processor=processor,         memory=memory,         cache_hierarchy=cache_hierarchy ) binary = CustomResource(sys.argv[1]) board.set_se_binary_workload(binary) simulator = Simulator(board=board) simulator.run() EOF } create_cmake create_main create_gem5_config
BB
Bobby Bruce
Wed, Jul 19, 2023 12:55 AM

I’m afraid I don’t know exactly what’s causing this error, but just to make sure: does the binary you built and passed as a CustomResource execute on your host? This looks like an error coming from PyTorch, not the simulator. That being said, I don’t understand why "build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC" is occurring afterwards either; that could also be the issue. Personally, I’m always a bit scared of linking to dynamic libraries on the host as well.

If you want to get around this the annoying advice is to use FS mode. It’s slower, and requires creation of a disk image, but it isn’t nearly as error-prone as SE mode. If your binary works on your host then you should be able to get it to work in FS mode. Using checkpoints and (if you have the right hardware and are using X86) KVM cores can speed things up for you too.

Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want to simulate a GPU too?

--
Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net

On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users gem5-users@gem5.org wrote:

Hi everyone,

I'm trying to execute gem5 simulations using libtorch in SE mode. However, I get the following error message:

--- Error message ---
...
terminate called after throwing an instance of 'std::runtime_error'
what():  expected eof but found 'ident' here:
aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy
namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
~~~~~ <--- HERE

build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigprocmask(...)
(further warnings will be suppressed)
build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...)
(further warnings will be suppressed)
build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
Memory Usage: 11842716 KBytes
Program aborted at tick 294083905383
--- BEGIN LIBC BACKTRACE ---
...

The simulation fails before the first line of the main function. I believe that it is failing to load the libtorch library.
Unfortunately, it is not possible to build libtorch with "-static" since their static build has been broken for quite a long
time: https://github.com/pytorch/pytorch/issues/21737
I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using different GCC versions to build the simulated binary.

For anyone interested in reproducing the error, I'm sending a "setup.sh" script to create a minimal reproducible environment.
Simply copy and paste the script below and name it as "setup.sh" in a new directory, then:

source setup.sh
cmake -B build -S .
cmake --build build
./<gem5> config.py build/main

Best regards,
Caio Vieira

--- setup.sh ---

#!/bin/bash

# Bash script to create minimal reproducible environment for libtorch simulation
# bug. This script creates necessary files such as a CMakeLists.txt and a minimal
# main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it
# in a "_deps" folder. Steps to reproduce the bug:
#   ./<this-script>
#   cmake -B build -S .
#   cmake --build build
#   ./<gem5> config.py build/main

function create_cmake() {
cat > CMakeLists.txt <<- \EOF
cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

# Download and manage libtorch dependency

set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
file(DOWNLOAD
https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip
"${DEPENDENCY_DIR}/libtorch.zip")
file(ARCHIVE_EXTRACT
INPUT "${DEPENDENCY_DIR}/libtorch.zip"
DESTINATION "${DEPENDENCY_DIR}")
file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
endif()
set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

project(main)

# Create executable

add_executable(${CMAKE_PROJECT_NAME} main.cpp)
target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC ${TORCH_INCLUDE_DIRS})
target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
message("Torch Libraries: ${TORCH_LIBRARIES}")
EOF
}

function create_main() {
cat > main.cpp <<- \EOF
#include <iostream>
#include <torch/torch.h>

int main(int argc, char *argv[]) {
std::cout << "Hello World" << std::endl;
}
EOF
}

function create_gem5_config() {
cat > config.py <<- \EOF

# This script must be executed by gem5
# Usage: ./<gem5> <this-script> <binary>

import sys

from gem5.utils.requires import ISA
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory.single_channel import SingleChannelDDR3_1600
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.components.processors.cpu_types import CPUTypes
from gem5.resources.resource import CustomResource
from gem5.simulate.simulator import Simulator

if len(sys.argv) == 1:
print("Provide a binary as argument", file=sys.stderr)
sys.exit(1)

cache_hierarchy = NoCache()
memory = SingleChannelDDR3_1600("8GiB")
processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, isa=ISA.X86)

board = SimpleBoard(
clk_freq="3GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy
)

binary = CustomResource(sys.argv[1])
board.set_se_binary_workload(binary)

simulator = Simulator(board=board)
simulator.run()
EOF
}

create_cmake
create_main
create_gem5_config


gem5-users mailing list -- gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org

I’m afraid I don’t know exactly what’s causing this error, but just to make sure, the binary you built and as a `CustomResource` executes on your host? This looks like an error coming from PyTorch, not the simulator. That being said, I don’t understand why "build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC “ is occurring after either, that could also be the issue. Personally, I’m always a bit scared linking to dynamic libraries on the host as well, If you want to get around this the annoying advice is to use FS mode. It’s slower, and requires creation of a disk image, but it isn’t nearly as error-prone as SE mode. If your binary works on your host then you should be able to get it to work in FS mode. Using checkpoints and (if you have the right hardware and are using X86) KVM cores can speed things up for you too. Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want to simulate a GPU too? -- Dr. Bobby R. Bruce Room 3050, Kemper Hall, UC Davis Davis, CA, 95616 web: https://www.bobbybruce.net > On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users <gem5-users@gem5.org> wrote: > > > Hi everyone, > > I'm trying to execute gem5 simulations using libtorch in SE mode. However, I get the following error message: > > --- Error message --- > ... > terminate called after throwing an instance of 'std::runtime_error' > what(): expected eof but found 'ident' here: > aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy > namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor) > ~~~~~ <--- HERE > > build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigprocmask(...) > (further warnings will be suppressed) > build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...) 
> (further warnings will be suppressed) > build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) > Memory Usage: 11842716 KBytes > Program aborted at tick 294083905383 > --- BEGIN LIBC BACKTRACE --- > ... > > The simulation fails before the first line of the main function. I believe that it is failing to load the libtorch library. > Unfortunately, it is not possible to build libtorch with "-static" since their static builds is broken for quiet a long > time: https://github.com/pytorch/pytorch/issues/21737 > I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using different GCC versions to build the simulated binary. > > For anyone interested in reproducing the error, I'm sending a "setup.sh" script to create a minimal reproducible environment. > Simply copy and paste the script below and name it as "setup.sh" in a new directory, then: > > source setup.sh > cmake --B build -S . > cmake --build build > ./<gem5> config.py build/main > > Best regards, > Caio Vieira > > --- setup.sh --- > > #!/bin/bash > > # Bash script to create minimal reproducible environment for libtorch simulation > # bug. This script creates necessary files such as a CMakeLists.txt and a minimal > # main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it > # in a ""_deps"" folder. Steps to reproduce the bug: > # ./<this-script> > # cmake -B build -S . 
> # cmake --build build > # ./<gem5> config.py build/main > > function create_cmake() { > cat > CMakeLists.txt <<- \EOF > cmake_minimum_required(VERSION 3.22 FATAL_ERROR) > > # Download and manage libtorch dependency > set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") > > file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") > if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch") > file(DOWNLOAD > https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip > "${DEPENDENCY_DIR}/libtorch.zip") > file(ARCHIVE_EXTRACT > INPUT "${DEPENDENCY_DIR}/libtorch.zip" > DESTINATION "${DEPENDENCY_DIR}") > file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") > endif() > set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") > find_package(Torch REQUIRED) > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") > > project(main) > > # Create executable > add_executable(${CMAKE_PROJECT_NAME} main.cpp) > target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC ${TORCH_INCLUDE_DIRS}) > target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) > set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) > message("Torch Libraries: ${TORCH_LIBRARIES}") > EOF > } > > function create_main() { > cat > main.cpp <<- \EOF > #include <iostream> > #include <torch/torch.h> > > int main(int argc, char *argv[]) { > std::cout << "Hello World" << std::endl; > } > EOF > } > > function create_gem5_config() { > cat > config.py <<- \EOF > # This script must be executed by gem5 > # Usage: ./<gem5> <this-script> <binary> > > import sys > > from gem5.utils.requires import ISA > from gem5.components.boards.simple_board import SimpleBoard > from gem5.components.cachehierarchies.classic.no_cache import NoCache > from gem5.components.memory.single_channel import SingleChannelDDR3_1600 > from gem5.components.processors.simple_processor import SimpleProcessor > from gem5.components.processors.cpu_types import CPUTypes > from gem5.resources.resource import CustomResource > from 
gem5.simulate.simulator import Simulator > > if len(sys.argv) == 1: > print("Provide a binary as argument", file=sys.stderr) > sys.exit(1) > > cache_hierarchy = NoCache() > memory = SingleChannelDDR3_1600("8GiB") > processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, isa=ISA.X86) > > board = SimpleBoard( > clk_freq="3GHz", > processor=processor, > memory=memory, > cache_hierarchy=cache_hierarchy > ) > > binary = CustomResource(sys.argv[1]) > board.set_se_binary_workload(binary) > > simulator = Simulator(board=board) > simulator.run() > EOF > } > > create_cmake > create_main > create_gem5_config > _______________________________________________ > gem5-users mailing list -- gem5-users@gem5.org > To unsubscribe send an email to gem5-users-leave@gem5.org
MS
Matt Sinclair
Wed, Jul 19, 2023 1:08 AM

For what it's worth, one of the students working with me (Marco, CC'd) is
having the same failure right now for the head of develop (plus this fix:
https://github.com/gem5/gem5/pull/99), except for a tiny GPU microbenchmark
that definitely is not using PyTorch or any higher level library.

We are working on getting a backtrace to understand what's going on for us
(and then push a fix as applicable), and it's possible our problems have
the same symptom but a different root cause.  But just wanted to chime in
that there are multiple cases where this error is happening on develop
right now with SE mode.

Matt

On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users <
gem5-users@gem5.org> wrote:

I’m afraid I don’t know exactly what’s causing this error, but just to
make sure, the binary you built and as a CustomResource executes on your
host? This looks like an error coming from PyTorch, not the simulator. That
being said, I don’t understand why "build/X86/sim/faults.cc:61: panic:
panic condition !FullSystem occurred: fault (General-Protection) detected @
PC “ is occurring after either, that could also be the issue. Personally,
I’m always a bit scared linking to dynamic libraries on the host as well,

If you want to get around this the annoying advice is to use FS mode. It’s
slower, and requires creation of a disk image, but it isn’t nearly as
error-prone as SE mode. If your binary works on your host then you should
be able to get it to work in FS mode. Using checkpoints and (if you have
the right hardware and are using X86) KVM cores can speed things up for you
too.

Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want
to simulate a GPU too?

--
Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net

On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users <
gem5-users@gem5.org> wrote:

Hi everyone,

I'm trying to execute gem5 simulations using libtorch in SE mode. However,
I get the following error message:

--- Error message ---
...
terminate called after throwing an instance of 'std::runtime_error'
what():  expected eof but found 'ident' here:
aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[]
params, bool has_biases, int num_layers, float dropout, bool train, bool
bidirectional, bool batch_first, *, ScalarType? dtype=None, bool
use_dynamic=False) dy
namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
~~~~~ <--- HERE

build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
rt_sigprocmask(...)
(further warnings will be suppressed)
build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...)
(further warnings will be suppressed)
build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred:
fault (General-Protection) detected @ PC
(0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
Memory Usage: 11842716 KBytes
Program aborted at tick 294083905383
--- BEGIN LIBC BACKTRACE ---
...

The simulation fails before the first line of the main function. I believe
that it is failing to load the libtorch library.
Unfortunately, it is not possible to build libtorch with "-static" since
their static build has been broken for quite a long
time: https://github.com/pytorch/pytorch/issues/21737
I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using
different GCC versions to build the simulated binary.

For anyone interested in reproducing the error, I'm sending a "setup.sh"
script to create a minimal reproducible environment.
Simply copy and paste the script below and name it as "setup.sh" in a new
directory, then:

source setup.sh
cmake -B build -S .
cmake --build build
./<gem5> config.py build/main

Best regards,
Caio Vieira

--- setup.sh ---

#!/bin/bash

# Bash script to create minimal reproducible environment for libtorch simulation
# bug. This script creates necessary files such as a CMakeLists.txt and a minimal
# main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it
# in a "_deps" folder. Steps to reproduce the bug:
#   ./<this-script>
#   cmake -B build -S .
#   cmake --build build
#   ./<gem5> config.py build/main

function create_cmake() {
cat > CMakeLists.txt <<- \EOF
cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

# Download and manage libtorch dependency

set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
file(DOWNLOAD

https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip
"${DEPENDENCY_DIR}/libtorch.zip")
file(ARCHIVE_EXTRACT
INPUT "${DEPENDENCY_DIR}/libtorch.zip"
DESTINATION "${DEPENDENCY_DIR}")
file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
endif()
set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

project(main)

# Create executable

add_executable(${CMAKE_PROJECT_NAME} main.cpp)
target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC
${TORCH_INCLUDE_DIRS})
target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
message("Torch Libraries: ${TORCH_LIBRARIES}")
EOF
}

function create_main() {
cat > main.cpp <<- \EOF
#include <iostream>
#include <torch/torch.h>

int main(int argc, char *argv[]) {
std::cout << "Hello World" << std::endl;
}
EOF
}

function create_gem5_config() {
cat > config.py <<- \EOF

# This script must be executed by gem5
# Usage: ./<gem5> <this-script> <binary>

import sys

from gem5.utils.requires import ISA
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory.single_channel import SingleChannelDDR3_1600
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.components.processors.cpu_types import CPUTypes
from gem5.resources.resource import CustomResource
from gem5.simulate.simulator import Simulator

if len(sys.argv) == 1:
print("Provide a binary as argument", file=sys.stderr)
sys.exit(1)

cache_hierarchy = NoCache()
memory = SingleChannelDDR3_1600("8GiB")
processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1,
isa=ISA.X86)

board = SimpleBoard(
clk_freq="3GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy
)

binary = CustomResource(sys.argv[1])
board.set_se_binary_workload(binary)

simulator = Simulator(board=board)
simulator.run()
EOF
}

create_cmake
create_main
create_gem5_config


gem5-users mailing list -- gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org


gem5-users mailing list -- gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org

For what it's worth, one of the students working with me (Marco, CC'd) is having the same failure right now for the head of develop (plus this fix: https://github.com/gem5/gem5/pull/99), except for a tiny GPU microbenchmark that definitely is not using PyTorch or any higher level library. We are working on getting a backtrace to understand what's going on for us (and then push a fix as applicable), and it's possible our problems have the same symptom but a different root cause. But just wanted to chime in that there are multiple cases where this error is happening on develop right now with SE mode. Matt On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users < gem5-users@gem5.org> wrote: > I’m afraid I don’t know exactly what’s causing this error, but just to > make sure, the binary you built and as a `CustomResource` executes on your > host? This looks like an error coming from PyTorch, not the simulator. That > being said, I don’t understand why "build/X86/sim/faults.cc:61: panic: > panic condition !FullSystem occurred: fault (General-Protection) detected @ > PC “ is occurring after either, that could also be the issue. Personally, > I’m always a bit scared linking to dynamic libraries on the host as well, > > If you want to get around this the annoying advice is to use FS mode. It’s > slower, and requires creation of a disk image, but it isn’t nearly as > error-prone as SE mode. If your binary works on your host then you should > be able to get it to work in FS mode. Using checkpoints and (if you have > the right hardware and are using X86) KVM cores can speed things up for you > too. > > Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want > to simulate a GPU too? > > -- > Dr. Bobby R. 
Bruce > Room 3050, > Kemper Hall, UC Davis > Davis, > CA, 95616 > > web: https://www.bobbybruce.net > > On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users < > gem5-users@gem5.org> wrote: > > > Hi everyone, > > I'm trying to execute gem5 simulations using libtorch in SE mode. However, > I get the following error message: > > --- Error message --- > ... > terminate called after throwing an instance of 'std::runtime_error' > what(): expected eof but found 'ident' here: > aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] > params, bool has_biases, int num_layers, float dropout, bool train, bool > bidirectional, bool batch_first, *, ScalarType? dtype=None, bool > use_dynamic=False) dy > namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor) > ~~~~~ <--- HERE > > build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall > rt_sigprocmask(...) > (further warnings will be suppressed) > build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...) > (further warnings will be suppressed) > build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: > fault (General-Protection) detected @ PC > (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) > Memory Usage: 11842716 KBytes > Program aborted at tick 294083905383 > --- BEGIN LIBC BACKTRACE --- > ... > > The simulation fails before the first line of the main function. I believe > that it is failing to load the libtorch library. > Unfortunately, it is not possible to build libtorch with "-static" since > their static builds is broken for quiet a long > time: https://github.com/pytorch/pytorch/issues/21737 > I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using > different GCC versions to build the simulated binary. > > For anyone interested in reproducing the error, I'm sending a "setup.sh" > script to create a minimal reproducible environment. 
> Simply copy and paste the script below and name it as "setup.sh" in a new > directory, then: > > source setup.sh > cmake --B build -S . > cmake --build build > ./<gem5> config.py build/main > > Best regards, > Caio Vieira > > --- setup.sh --- > > #!/bin/bash > > # Bash script to create minimal reproducible environment for libtorch > simulation > # bug. This script creates necessary files such as a CMakeLists.txt and a > minimal > # main.cpp. The CMakeLists.txt file downloads and manages libtorch by > saving it > # in a ""_deps"" folder. Steps to reproduce the bug: > # ./<this-script> > # cmake -B build -S . > # cmake --build build > # ./<gem5> config.py build/main > > function create_cmake() { > cat > CMakeLists.txt <<- \EOF > cmake_minimum_required(VERSION 3.22 FATAL_ERROR) > > # Download and manage libtorch dependency > set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") > > file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") > if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch") > file(DOWNLOAD > > https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip > "${DEPENDENCY_DIR}/libtorch.zip") > file(ARCHIVE_EXTRACT > INPUT "${DEPENDENCY_DIR}/libtorch.zip" > DESTINATION "${DEPENDENCY_DIR}") > file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") > endif() > set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") > find_package(Torch REQUIRED) > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") > > project(main) > > # Create executable > add_executable(${CMAKE_PROJECT_NAME} main.cpp) > target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC > ${TORCH_INCLUDE_DIRS}) > target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) > set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) > message("Torch Libraries: ${TORCH_LIBRARIES}") > EOF > } > > function create_main() { > cat > main.cpp <<- \EOF > #include <iostream> > #include <torch/torch.h> > > int main(int argc, char *argv[]) { > std::cout << "Hello World" << std::endl; > } > 
EOF > } > > function create_gem5_config() { > cat > config.py <<- \EOF > # This script must be executed by gem5 > # Usage: ./<gem5> <this-script> <binary> > > import sys > > from gem5.utils.requires import ISA > from gem5.components.boards.simple_board import SimpleBoard > from gem5.components.cachehierarchies.classic.no_cache import NoCache > from gem5.components.memory.single_channel import SingleChannelDDR3_1600 > from gem5.components.processors.simple_processor import SimpleProcessor > from gem5.components.processors.cpu_types import CPUTypes > from gem5.resources.resource import CustomResource > from gem5.simulate.simulator import Simulator > > if len(sys.argv) == 1: > print("Provide a binary as argument", file=sys.stderr) > sys.exit(1) > > cache_hierarchy = NoCache() > memory = SingleChannelDDR3_1600("8GiB") > processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, > isa=ISA.X86) > > board = SimpleBoard( > clk_freq="3GHz", > processor=processor, > memory=memory, > cache_hierarchy=cache_hierarchy > ) > > binary = CustomResource(sys.argv[1]) > board.set_se_binary_workload(binary) > > simulator = Simulator(board=board) > simulator.run() > EOF > } > > create_cmake > create_main > create_gem5_config > _______________________________________________ > gem5-users mailing list -- gem5-users@gem5.org > To unsubscribe send an email to gem5-users-leave@gem5.org > > > _______________________________________________ > gem5-users mailing list -- gem5-users@gem5.org > To unsubscribe send an email to gem5-users-leave@gem5.org >
CV
Caio Vieira
Wed, Jul 19, 2023 8:12 AM

Hi,

Yes, the binary works correctly on my host. Also, I was able to load it
in an Ubuntu 18 FS simulation using gem5.

Before the panic condition is triggered, an exception is thrown, which
does not occur when running on host. I tried to debug gem5 by running it
on gdb and using "catch throw" to intercept exceptions. However, the gdb
only stopped at the panic condition and not when the exception was
thrown, which makes me believe that the exception was thrown by libtorch.

"Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you
want to simulate a GPU too?"

Maybe in the future. At the moment I'm focused on getting a basic
simulation of CPU+libtorch working :)

On 19.07.23 02:55, Bobby Bruce via gem5-users wrote:

I’m afraid I don’t know exactly what’s causing this error, but just to
make sure: does the binary you built and passed as a CustomResource
execute on your host? This looks like an error coming from PyTorch, not
the simulator. That being said, I don’t understand why
"build/X86/sim/faults.cc:61: panic: panic condition !FullSystem
occurred: fault (General-Protection) detected @ PC" is occurring
afterwards either; that could also be the issue. Personally, I’m always
a bit scared of linking to dynamic libraries on the host as well.

If you want to get around this the annoying advice is to use FS mode.
It’s slower, and requires creation of a disk image, but it isn’t
nearly as error-prone as SE mode. If your binary works on your host
then you should be able to get it to work in FS mode. Using
checkpoints and (if you have the right hardware and are using X86) KVM
cores can speed things up for you too.

Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you
want to simulate a GPU too?

--
Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net

On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users
gem5-users@gem5.org wrote:

Hi everyone,

I'm trying to execute gem5 simulations using libtorch in SE mode.
However, I get the following error message:

--- Error message ---
...
terminate called after throwing an instance of 'std::runtime_error'
  what():  expected eof but found 'ident' here:
aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[]
params, bool has_biases, int num_layers, float dropout, bool train,
bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool
use_dynamic=False) dy
namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
                          ~~~~~ <--- HERE

build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
rt_sigprocmask(...)
      (further warnings will be suppressed)
build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
rt_sigaction(...)
      (further warnings will be suppressed)
build/X86/sim/faults.cc:61: panic: panic condition !FullSystem
occurred: fault (General-Protection) detected @ PC
(0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
Memory Usage: 11842716 KBytes
Program aborted at tick 294083905383
--- BEGIN LIBC BACKTRACE ---
...

The simulation fails before the first line of the main function. I
believe that it is failing to load the libtorch library.
Unfortunately, it is not possible to build libtorch with "-static"
since their static builds have been broken for quite a long
time: https://github.com/pytorch/pytorch/issues/21737
I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested
using different GCC versions to build the simulated binary.

For anyone interested in reproducing the error, I'm sending a
"setup.sh" script to create a minimal reproducible environment.
Simply copy and paste the script below and name it as "setup.sh" in a
new directory, then:

source setup.sh
cmake -B build -S .
cmake --build build
./<gem5> config.py build/main

Best regards,
Caio Vieira

--- setup.sh ---

#!/bin/bash

# Bash script to create minimal reproducible environment for libtorch simulation
# bug. This script creates necessary files such as a CMakeLists.txt and a minimal
# main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it
# in a ""_deps"" folder. Steps to reproduce the bug:
#   ./<this-script>
#   cmake -B build -S .
#   cmake --build build
#   ./<gem5> config.py build/main

function create_cmake() {
    cat > CMakeLists.txt <<- \EOF
cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

# Download and manage libtorch dependency

set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
    file(DOWNLOAD
https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip

        "${DEPENDENCY_DIR}/libtorch.zip")
    file(ARCHIVE_EXTRACT
        INPUT "${DEPENDENCY_DIR}/libtorch.zip"
        DESTINATION "${DEPENDENCY_DIR}")
    file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
endif()
set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

project(main)

# Create executable

add_executable(${CMAKE_PROJECT_NAME} main.cpp)
target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC
${TORCH_INCLUDE_DIRS})
target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
message("Torch Libraries: ${TORCH_LIBRARIES}")
EOF
}

function create_main() {
    cat > main.cpp <<- \EOF
#include <iostream>
#include <torch/torch.h>

int main(int argc, char *argv[]) {
    std::cout << "Hello World" << std::endl;
}
EOF
}

function create_gem5_config() {
    cat > config.py <<- \EOF

# This script must be executed by gem5
# Usage: ./<gem5> <this-script> <binary>

import sys

from gem5.utils.requires import ISA
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory.single_channel import SingleChannelDDR3_1600
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.components.processors.cpu_types import CPUTypes
from gem5.resources.resource import CustomResource
from gem5.simulate.simulator import Simulator

if len(sys.argv) == 1:
    print("Provide a binary as argument", file=sys.stderr)
    sys.exit(1)

cache_hierarchy = NoCache()
memory = SingleChannelDDR3_1600("8GiB")
processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1,
isa=ISA.X86)

board = SimpleBoard(
        clk_freq="3GHz",
        processor=processor,
        memory=memory,
        cache_hierarchy=cache_hierarchy
)

binary = CustomResource(sys.argv[1])
board.set_se_binary_workload(binary)

simulator = Simulator(board=board)
simulator.run()
EOF
}

create_cmake
create_main
create_gem5_config


gem5-users mailing list -- gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org


gem5-users mailing list -- gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org

Hi, Yes, the binary works correctly on my host. Also, I was able to load it in an Ubuntu 18 FS simulation using gem5. Before the panic condition is triggered, an exception is thrown, which does not occur when running on host. I tried to debug gem5 by running it on gdb and using "catch throw" to intercept exceptions. However, the gdb only stopped at the panic condition and not when the exception was thrown, which makes me believe that the exception was thrown by libtorch. "Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want to simulate a GPU too?" Maybe in the future. At the moment I'm focused on getting a basic simulation of CPU+libtorch working :) On 19.07.23 02:55, Bobby Bruce via gem5-users wrote: > I’m afraid I don’t know exactly what’s causing this error, but just to > make sure, the binary you built and as a `CustomResource` executes on > your host? This looks like an error coming from PyTorch, not the > simulator. That being said, I don’t understand why > "build/X86/sim/faults.cc:61: panic: panic condition !FullSystem > occurred: fault (General-Protection) detected @ PC “ is occurring > after either, that could also be the issue. Personally, I’m always a > bit scared linking to dynamic libraries on the host as well, > > If you want to get around this the annoying advice is to use FS mode. > It’s slower, and requires creation of a disk image, but it isn’t > nearly as error-prone as SE mode. If your binary works on your host > then you should be able to get it to work in FS mode. Using > checkpoints and (if you have the right hardware and are using X86) KVM > cores can speed things up for you too. > > Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you > want to simulate a GPU too? > > -- > Dr. Bobby R. 
Bruce > Room 3050, > Kemper Hall, UC Davis > Davis, > CA, 95616 > > web: https://www.bobbybruce.net > >> On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users >> <gem5-users@gem5.org> wrote: >> >> >> Hi everyone, >> >> I'm trying to execute gem5 simulations using libtorch in SE mode. >> However, I get the following error message: >> >> --- Error message --- >> ... >> terminate called after throwing an instance of 'std::runtime_error' >>   what():  expected eof but found 'ident' here: >> aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] >> params, bool has_biases, int num_layers, float dropout, bool train, >> bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool >> use_dynamic=False) dy >> namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor) >>                           ~~~~~ <--- HERE >> >> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall >> rt_sigprocmask(...) >>       (further warnings will be suppressed) >> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall >> rt_sigaction(...) >>       (further warnings will be suppressed) >> build/X86/sim/faults.cc:61: panic: panic condition !FullSystem >> occurred: fault (General-Protection) detected @ PC >> (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) >> Memory Usage: 11842716 KBytes >> Program aborted at tick 294083905383 >> --- BEGIN LIBC BACKTRACE --- >> ... >> >> The simulation fails before the first line of the main function. I >> believe that it is failing to load the libtorch library. >> Unfortunately, it is not possible to build libtorch with "-static" >> since their static builds is broken for quiet a long >> time: https://github.com/pytorch/pytorch/issues/21737 >> I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested >> using different GCC versions to build the simulated binary. >> >> For anyone interested in reproducing the error, I'm sending a >> "setup.sh" script to create a minimal reproducible environment. 
>> Simply copy and paste the script below and name it as "setup.sh" in a >> new directory, then: >> >> source setup.sh >> cmake --B build -S . >> cmake --build build >> ./<gem5> config.py build/main >> >> Best regards, >> Caio Vieira >> >> --- setup.sh --- >> >> #!/bin/bash >> >> # Bash script to create minimal reproducible environment for libtorch >> simulation >> # bug. This script creates necessary files such as a CMakeLists.txt >> and a minimal >> # main.cpp. The CMakeLists.txt file downloads and manages libtorch by >> saving it >> # in a ""_deps"" folder. Steps to reproduce the bug: >> # ./<this-script> >> # cmake -B build -S . >> # cmake --build build >> # ./<gem5> config.py build/main >> >> function create_cmake() { >>     cat > CMakeLists.txt <<- \EOF >> cmake_minimum_required(VERSION 3.22 FATAL_ERROR) >> >> # Download and manage libtorch dependency >> set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") >> >> file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") >> if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch") >>     file(DOWNLOAD >> https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip >> >>         "${DEPENDENCY_DIR}/libtorch.zip") >>     file(ARCHIVE_EXTRACT >>         INPUT "${DEPENDENCY_DIR}/libtorch.zip" >>         DESTINATION "${DEPENDENCY_DIR}") >>     file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") >> endif() >> set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") >> find_package(Torch REQUIRED) >> set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") >> >> project(main) >> >> # Create executable >> add_executable(${CMAKE_PROJECT_NAME} main.cpp) >> target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC >> ${TORCH_INCLUDE_DIRS}) >> target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) >> set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) >> message("Torch Libraries: ${TORCH_LIBRARIES}") >> EOF >> } >> >> function create_main() { >>     cat > main.cpp <<- \EOF >> #include <iostream> >> 
#include <torch/torch.h> >> >> int main(int argc, char *argv[]) { >>     std::cout << "Hello World" << std::endl; >> } >> EOF >> } >> >> function create_gem5_config() { >>     cat > config.py <<- \EOF >> # This script must be executed by gem5 >> # Usage: ./<gem5> <this-script> <binary> >> >> import sys >> >> from gem5.utils.requires import ISA >> from gem5.components.boards.simple_board import SimpleBoard >> from gem5.components.cachehierarchies.classic.no_cache import NoCache >> from gem5.components.memory.single_channel import SingleChannelDDR3_1600 >> from gem5.components.processors.simple_processor import SimpleProcessor >> from gem5.components.processors.cpu_types import CPUTypes >> from gem5.resources.resource import CustomResource >> from gem5.simulate.simulator import Simulator >> >> if len(sys.argv) == 1: >>     print("Provide a binary as argument", file=sys.stderr) >>     sys.exit(1) >> >> cache_hierarchy = NoCache() >> memory = SingleChannelDDR3_1600("8GiB") >> processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, >> isa=ISA.X86) >> >> board = SimpleBoard( >>         clk_freq="3GHz", >>         processor=processor, >>         memory=memory, >>         cache_hierarchy=cache_hierarchy >> ) >> >> binary = CustomResource(sys.argv[1]) >> board.set_se_binary_workload(binary) >> >> simulator = Simulator(board=board) >> simulator.run() >> EOF >> } >> >> create_cmake >> create_main >> create_gem5_config >> _______________________________________________ >> gem5-users mailing list -- gem5-users@gem5.org >> To unsubscribe send an email to gem5-users-leave@gem5.org > > > _______________________________________________ > gem5-users mailing list --gem5-users@gem5.org > To unsubscribe send an email togem5-users-leave@gem5.org
BB
Bobby Bruce
Wed, Jul 19, 2023 1:52 PM

I tried to reproduce this for myself but failed to do so (using v23.0.0.1, build/ALL/gem5.opt). Attached is a tarball of the files I used. The steps I carried out can be found in the "README.md" file. I'm not sure what I'm missing, but my execution prints "Hello World" successfully.

The host was an X86 Ubuntu 20.04.6 OS and everything was compiled with GCC 9.4.0.

Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net

On Jul 18, 2023, at 6:08 PM, Matt Sinclair mattdsinclair.wisc@gmail.com wrote:

For what it's worth, one of the students working with me (Marco, CC'd) is having the same failure right now for the head of develop (plus this fix: https://github.com/gem5/gem5/pull/99), except for a tiny GPU microbenchmark that definitely is not using PyTorch or any higher level library.

We are working on getting a backtrace to understand what's going on for us (and then push a fix as applicable), and it's possible our problems have the same symptom but a different root cause.  But just wanted to chime in that there are multiple cases where this error is happening on develop right now with SE mode.

Matt

On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users <gem5-users@gem5.org mailto:gem5-users@gem5.org> wrote:

I’m afraid I don’t know exactly what’s causing this error, but just to make sure: does the binary you built and passed as a CustomResource execute on your host? This looks like an error coming from PyTorch, not the simulator. That being said, I don’t understand why "build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC" is occurring afterwards either; that could also be the issue. Personally, I’m always a bit scared of linking to dynamic libraries on the host as well.

If you want to get around this the annoying advice is to use FS mode. It’s slower, and requires creation of a disk image, but it isn’t nearly as error-prone as SE mode. If your binary works on your host then you should be able to get it to work in FS mode. Using checkpoints and (if you have the right hardware and are using X86) KVM cores can speed things up for you too.

Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want to simulate a GPU too?

--
Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net https://www.bobbybruce.net/

On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users <gem5-users@gem5.org mailto:gem5-users@gem5.org> wrote:

Hi everyone,

I'm trying to execute gem5 simulations using libtorch in SE mode. However, I get the following error message:

--- Error message ---
...
terminate called after throwing an instance of 'std::runtime_error'
what():  expected eof but found 'ident' here:
aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy
namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
~~~~~ <--- HERE

build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigprocmask(...)
(further warnings will be suppressed)
build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...)
(further warnings will be suppressed)
build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
Memory Usage: 11842716 KBytes
Program aborted at tick 294083905383
--- BEGIN LIBC BACKTRACE ---
...

The simulation fails before the first line of the main function. I believe that it is failing to load the libtorch library.
Unfortunately, it is not possible to build libtorch with "-static" since their static builds have been broken for quite a long
time: https://github.com/pytorch/pytorch/issues/21737
I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using different GCC versions to build the simulated binary.

For anyone interested in reproducing the error, I'm sending a "setup.sh" script to create a minimal reproducible environment.
Simply copy and paste the script below and name it as "setup.sh" in a new directory, then:

source setup.sh
cmake -B build -S .
cmake --build build
./<gem5> config.py build/main

Best regards,
Caio Vieira

--- setup.sh ---

#!/bin/bash

# Bash script to create minimal reproducible environment for libtorch simulation
# bug. This script creates necessary files such as a CMakeLists.txt and a minimal
# main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it
# in a ""_deps"" folder. Steps to reproduce the bug:
#   ./<this-script>
#   cmake -B build -S .
#   cmake --build build
#   ./<gem5> config.py build/main

function create_cmake() {
cat > CMakeLists.txt <<- \EOF
cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

# Download and manage libtorch dependency

set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
file(DOWNLOAD
https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip
"${DEPENDENCY_DIR}/libtorch.zip")
file(ARCHIVE_EXTRACT
INPUT "${DEPENDENCY_DIR}/libtorch.zip"
DESTINATION "${DEPENDENCY_DIR}")
file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
endif()
set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

project(main)

# Create executable

add_executable(${CMAKE_PROJECT_NAME} main.cpp)
target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC ${TORCH_INCLUDE_DIRS})
target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
message("Torch Libraries: ${TORCH_LIBRARIES}")
EOF
}

function create_main() {
cat > main.cpp <<- \EOF
#include <iostream>
#include <torch/torch.h>

int main(int argc, char *argv[]) {
std::cout << "Hello World" << std::endl;
}
EOF
}

function create_gem5_config() {
cat > config.py <<- \EOF

# This script must be executed by gem5
# Usage: ./<gem5> <this-script> <binary>

import sys

from gem5.utils.requires import ISA
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory.single_channel import SingleChannelDDR3_1600
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.components.processors.cpu_types import CPUTypes
from gem5.resources.resource import CustomResource
from gem5.simulate.simulator import Simulator

if len(sys.argv) == 1:
print("Provide a binary as argument", file=sys.stderr)
sys.exit(1)

cache_hierarchy = NoCache()
memory = SingleChannelDDR3_1600("8GiB")
processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, isa=ISA.X86)

board = SimpleBoard(
clk_freq="3GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy
)

binary = CustomResource(sys.argv[1])
board.set_se_binary_workload(binary)

simulator = Simulator(board=board)
simulator.run()
EOF
}

create_cmake
create_main
create_gem5_config


gem5-users mailing list -- gem5-users@gem5.org mailto:gem5-users@gem5.org
To unsubscribe send an email to gem5-users-leave@gem5.org mailto:gem5-users-leave@gem5.org

I tried to reproduce this for myself but I failed to do so (using v23.0.0.1, `build/ALL/gem5.opt`).. Attached is a tarball of the files i used. The steps I carried out can be found on the the "README.md" file. I'm not sure what I'm missing but my execution prints "Hello World" successfully. The host was an X86 Ubuntu 20.04.6 OS and everything was compiled if GCC 9.4.0.  -- Dr. Bobby R. Bruce Room 3050, Kemper Hall, UC Davis Davis, CA, 95616 web: https://www.bobbybruce.net > On Jul 18, 2023, at 6:08 PM, Matt Sinclair <mattdsinclair.wisc@gmail.com> wrote: > > For what it's worth, one of the students working with me (Marco, CC'd) is having the same failure right now for the head of develop (plus this fix: https://github.com/gem5/gem5/pull/99), except for a tiny GPU microbenchmark that definitely is not using PyTorch or any higher level library. > > We are working on getting a backtrace to understand what's going on for us (and then push a fix as applicable), and it's possible our problems have the same symptom but a different root cause. But just wanted to chime in that there are multiple cases where this error is happening on develop right now with SE mode. > > Matt > > > On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users <gem5-users@gem5.org <mailto:gem5-users@gem5.org>> wrote: >> I’m afraid I don’t know exactly what’s causing this error, but just to make sure, the binary you built and as a `CustomResource` executes on your host? This looks like an error coming from PyTorch, not the simulator. That being said, I don’t understand why "build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC “ is occurring after either, that could also be the issue. Personally, I’m always a bit scared linking to dynamic libraries on the host as well, >> >> If you want to get around this the annoying advice is to use FS mode. 
It’s slower, and requires creation of a disk image, but it isn’t nearly as error-prone as SE mode. If your binary works on your host then you should be able to get it to work in FS mode. Using checkpoints and (if you have the right hardware and are using X86) KVM cores can speed things up for you too. >> >> Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t you want to simulate a GPU too? >> >> -- >> Dr. Bobby R. Bruce >> Room 3050, >> Kemper Hall, UC Davis >> Davis, >> CA, 95616 >> >> web: https://www.bobbybruce.net <https://www.bobbybruce.net/> >> >>> On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users <gem5-users@gem5.org <mailto:gem5-users@gem5.org>> wrote: >>> >>> >>> Hi everyone, >>> >>> I'm trying to execute gem5 simulations using libtorch in SE mode. However, I get the following error message: >>> >>> --- Error message --- >>> ... >>> terminate called after throwing an instance of 'std::runtime_error' >>> what(): expected eof but found 'ident' here: >>> aten::quantized_lstm.inpr input, Tensor[] orch.classes.rnn.CellPara[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy >>> namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor) >>> ~~~~~ <--- HERE >>> >>> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigprocmask(...) >>> (further warnings will be suppressed) >>> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall rt_sigaction(...) >>> (further warnings will be suppressed) >>> build/X86/sim/faults.cc:61: panic: panic condition !FullSystem occurred: fault (General-Protection) detected @ PC (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) >>> Memory Usage: 11842716 KBytes >>> Program aborted at tick 294083905383 >>> --- BEGIN LIBC BACKTRACE --- >>> ... >>> >>> The simulation fails before the first line of the main function. I believe that it is failing to load the libtorch library. 
>>> Unfortunately, it is not possible to build libtorch with "-static" since their static builds is broken for quiet a long >>> time: https://github.com/pytorch/pytorch/issues/21737 >>> I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also tested using different GCC versions to build the simulated binary. >>> >>> For anyone interested in reproducing the error, I'm sending a "setup.sh" script to create a minimal reproducible environment. >>> Simply copy and paste the script below and name it as "setup.sh" in a new directory, then: >>> >>> source setup.sh >>> cmake --B build -S . >>> cmake --build build >>> ./<gem5> config.py build/main >>> >>> Best regards, >>> Caio Vieira >>> >>> --- setup.sh --- >>> >>> #!/bin/bash >>> >>> # Bash script to create minimal reproducible environment for libtorch simulation >>> # bug. This script creates necessary files such as a CMakeLists.txt and a minimal >>> # main.cpp. The CMakeLists.txt file downloads and manages libtorch by saving it >>> # in a ""_deps"" folder. Steps to reproduce the bug: >>> # ./<this-script> >>> # cmake -B build -S . 
>>> # cmake --build build >>> # ./<gem5> config.py build/main >>> >>> function create_cmake() { >>> cat > CMakeLists.txt <<- \EOF >>> cmake_minimum_required(VERSION 3.22 FATAL_ERROR) >>> >>> # Download and manage libtorch dependency >>> set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") >>> >>> file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") >>> if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch") >>> file(DOWNLOAD >>> https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip >>> "${DEPENDENCY_DIR}/libtorch.zip") >>> file(ARCHIVE_EXTRACT >>> INPUT "${DEPENDENCY_DIR}/libtorch.zip" >>> DESTINATION "${DEPENDENCY_DIR}") >>> file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") >>> endif() >>> set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") >>> find_package(Torch REQUIRED) >>> set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") >>> >>> project(main) >>> >>> # Create executable >>> add_executable(${CMAKE_PROJECT_NAME} main.cpp) >>> target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC ${TORCH_INCLUDE_DIRS}) >>> target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) >>> set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) >>> message("Torch Libraries: ${TORCH_LIBRARIES}") >>> EOF >>> } >>> >>> function create_main() { >>> cat > main.cpp <<- \EOF >>> #include <iostream> >>> #include <torch/torch.h> >>> >>> int main(int argc, char *argv[]) { >>> std::cout << "Hello World" << std::endl; >>> } >>> EOF >>> } >>> >>> function create_gem5_config() { >>> cat > config.py <<- \EOF >>> # This script must be executed by gem5 >>> # Usage: ./<gem5> <this-script> <binary> >>> >>> import sys >>> >>> from gem5.utils.requires import ISA >>> from gem5.components.boards.simple_board import SimpleBoard >>> from gem5.components.cachehierarchies.classic.no_cache import NoCache >>> from gem5.components.memory.single_channel import SingleChannelDDR3_1600 >>> from gem5.components.processors.simple_processor import SimpleProcessor >>> from 
gem5.components.processors.cpu_types import CPUTypes >>> from gem5.resources.resource import CustomResource >>> from gem5.simulate.simulator import Simulator >>> >>> if len(sys.argv) == 1: >>> print("Provide a binary as argument", file=sys.stderr) >>> sys.exit(1) >>> >>> cache_hierarchy = NoCache() >>> memory = SingleChannelDDR3_1600("8GiB") >>> processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, num_cores=1, isa=ISA.X86) >>> >>> board = SimpleBoard( >>> clk_freq="3GHz", >>> processor=processor, >>> memory=memory, >>> cache_hierarchy=cache_hierarchy >>> ) >>> >>> binary = CustomResource(sys.argv[1]) >>> board.set_se_binary_workload(binary) >>> >>> simulator = Simulator(board=board) >>> simulator.run() >>> EOF >>> } >>> >>> create_cmake >>> create_main >>> create_gem5_config >>> _______________________________________________ >>> gem5-users mailing list -- gem5-users@gem5.org <mailto:gem5-users@gem5.org> >>> To unsubscribe send an email to gem5-users-leave@gem5.org <mailto:gem5-users-leave@gem5.org> >> >> _______________________________________________ >> gem5-users mailing list -- gem5-users@gem5.org <mailto:gem5-users@gem5.org> >> To unsubscribe send an email to gem5-users-leave@gem5.org <mailto:gem5-users-leave@gem5.org>
CV
Caio Vieira
Fri, Jul 21, 2023 8:49 AM

I can confirm that gem5 can load a libtorch binary on v23.0.0.1. I was
able to run the binary on an Ubuntu 22.04.1 host.

Also, I've run git bisect to find the commit that fixes the bug. Here is
the git bisect log:


# old: [5fa484e2e02604ad3a5bf01f35ad1f97ca6d17b8] misc: Merge the v22.1 release staging into stable
git bisect old 5fa484e2e02604ad3a5bf01f35ad1f97ca6d17b8
# new: [af72b9ba580546ac12ce05bfaac3fd53fa8699f4] misc: Update RELEASE-NOTES.md for v23.0.0.1 hotfix
git bisect new af72b9ba580546ac12ce05bfaac3fd53fa8699f4
# old: [6841e1aa5a1738961940fece2b35baf77c8c224d] stdlib: Fix bug in MESI_Three_Level_Cache initialization
git bisect old 6841e1aa5a1738961940fece2b35baf77c8c224d
# new: [e90bd5feb9a7d6672b231190783433bf3f7d6706] configs: Add `--with-pmu` option to the simple Arm FS configs
git bisect new e90bd5feb9a7d6672b231190783433bf3f7d6706
# new: [f9cf3de711d59bc3a81bb8d49f1408b1f6349a7b] mem: Use HostSocket in the SharedMemoryServer.
git bisect new f9cf3de711d59bc3a81bb8d49f1408b1f6349a7b
# new: [c8abd97584d4dffe32c21773b98b41fa991f4231] tests: Disable the looppoint checkpoint tests
git bisect new c8abd97584d4dffe32c21773b98b41fa991f4231
# new: [7b39a7f14e6d5132961e3e568b121d7fd7422f90] misc: Rename DEBUG macro into GEM5_DEBUG
git bisect new 7b39a7f14e6d5132961e3e568b121d7fd7422f90
# new: [a589d7b5697b3fbe61e1842e1831aef50aa96f32] arch-x86: Add instructions from SSE4.1 set.
git bisect new a589d7b5697b3fbe61e1842e1831aef50aa96f32
# old: [1bb8cd3d44c563877d486953f0534c4dc9daa9e1] sim: Switch from EventWrapper to MemberEventWrapper before deprec
git bisect old 1bb8cd3d44c563877d486953f0534c4dc9daa9e1
# old: [ba19f967d7529542f790bcd15a2746e399591fdf] sim: Use ref constructor of MemberEventWrapper everywhere
git bisect old ba19f967d7529542f790bcd15a2746e399591fdf
# old: [99852d56876eb3b0e26ec2a15752321a4a047ebd] sim: Deprecate pointer version of MemberEventWrapper constructor
git bisect old 99852d56876eb3b0e26ec2a15752321a4a047ebd
# first new commit: [a589d7b5697b3fbe61e1842e1831aef50aa96f32] arch-x86: Add instructions from SSE4.1 set.

I can confirm that the commit "arch-x86: Add instructions from SSE4.1
set." fixed the bug. I've used 2 simple scripts in my bisect session:
bisect.sh and build.sh (at the end of this e-mail). I've extended my
generated working directory by adding the gem5 repository. So it looks like:

  • CMakeLists.txt *
  • config.py *
  • main.cpp *
  • setup.sh
  • gem5

* Files marked with an asterisk are generated by "setup.sh".

I've placed bisect.sh and build.sh inside the gem5 directory and then ran
./bisect.sh. I've used 20 jobs for scons in build.sh, but I recommend
changing it according to your machine.

--- bisect.sh ---

#!/bin/bash

git bisect start

# Old commit that is incapable of running libtorch
git bisect old v22.1.0.0
# Commit that is able to run libtorch
git bisect new v23.0.0.1
git bisect run ./build.sh

--- build.sh ---

#!/bin/bash

# Must build from a fresh build directory. If not, then bisect will find the
# wrong commit.
rm -rf build

# Build gem5. If the build fails, return the special code 125 to git bisect so
# it will know that this commit must be skipped.
scons --ignore-style build/X86/gem5.opt -j 20 --gold-linker || exit 125

# If running the binary results in an error (exit code != 0), then return 0 to
# git so it will know that the current commit is old.
./build/X86/gem5.opt ../config.py ../build/main || exit 0

# If the binary was run successfully, then the current analyzed commit is able
# to run libtorch.
exit 1

On 19.07.23 15:52, Bobby Bruce via gem5-users wrote:

I tried to reproduce this for myself but I failed to do so (using
v23.0.0.1, build/ALL/gem5.opt). Attached is a tarball of the files
I used. The steps I carried out can be found in the "README.md"
file. I'm not sure what I'm missing but my execution prints "Hello
World" successfully.

The host was an X86 Ubuntu 20.04.6 OS and everything was compiled with
GCC 9.4.0.

--
Dr. Bobby R. Bruce
Room 3050,
Kemper Hall, UC Davis
Davis,
CA, 95616

web: https://www.bobbybruce.net

On Jul 18, 2023, at 6:08 PM, Matt Sinclair
mattdsinclair.wisc@gmail.com wrote:

For what it's worth, one of the students working with me (Marco,
CC'd) is having the same failure right now for the head of develop
(plus this fix: https://github.com/gem5/gem5/pull/99), except for a
tiny GPU microbenchmark that definitely is not using PyTorch or any
higher level library.

We are working on getting a backtrace to understand what's going on
for us (and then push a fix as applicable), and it's possible our
problems have the same symptom but a different root cause. But just
wanted to chime in that there are multiple cases where this error is
happening on develop right now with SE mode.

Matt

On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users
gem5-users@gem5.org wrote:

 I’m afraid I don’t know exactly what’s causing this error, but
 just to make sure, the binary you built and as a `CustomResource`
 executes on your host? This looks like an error coming from
 PyTorch, not the simulator. That being said, I don’t understand
 why "build/X86/sim/faults.cc:61: panic: panic condition
 !FullSystem occurred: fault (General-Protection) detected @ PC “
 is occurring after either, that could also be the issue.
 Personally, I’m always a bit scared linking to dynamic libraries
 on the host as well,

 If you want to get around this the annoying advice is to use FS
 mode. It’s slower, and requires creation of a disk image, but it
 isn’t nearly as error-prone as SE mode. If your binary works on
 your host then you should be able to get it to work in FS mode.
 Using checkpoints and (if you have the right hardware and are
 using X86) KVM cores can speed things up for you too.

 Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t
 you want to simulate a GPU too?

 --
 Dr. Bobby R. Bruce
 Room 3050,
 Kemper Hall, UC Davis
 Davis,
 CA, 95616

 web: https://www.bobbybruce.net
 On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users
 <gem5-users@gem5.org> wrote:


 Hi everyone,

 I'm trying to execute gem5 simulations using libtorch in SE
 mode. However, I get the following error message:

 --- Error message ---
 ...
 terminate called after throwing an instance of 'std::runtime_error'
   what():  expected eof but found 'ident' here:
 aten::quantized_lstm.inpr input, Tensor[]
 orch.classes.rnn.CellPara[] params, bool has_biases, int
 num_layers, float dropout, bool train, bool bidirectional, bool
 batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy
 namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor)
                           ~~~~~ <--- HERE

 build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
 rt_sigprocmask(...)
       (further warnings will be suppressed)
 build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall
 rt_sigaction(...)
       (further warnings will be suppressed)
 build/X86/sim/faults.cc:61: panic: panic condition !FullSystem
 occurred: fault (General-Protection) detected @ PC
 (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1)
 Memory Usage: 11842716 KBytes
 Program aborted at tick 294083905383
 --- BEGIN LIBC BACKTRACE ---
 ...

 The simulation fails before the first line of the main function.
 I believe that it is failing to load the libtorch library.
 Unfortunately, it is not possible to build libtorch with
 "-static" since their static builds have been broken for quite a long
 time: https://github.com/pytorch/pytorch/issues/21737
 I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also
 tested using different GCC versions to build the simulated binary.

 For anyone interested in reproducing the error, I'm sending a
 "setup.sh" script to create a minimal reproducible environment.
 Simply copy and paste the script below and name it as "setup.sh"
 in a new directory, then:

 source setup.sh
 cmake -B build -S .
 cmake --build build
 ./<gem5> config.py build/main

 Best regards,
 Caio Vieira

 --- setup.sh ---

 #!/bin/bash

 # Bash script to create minimal reproducible environment for
 libtorch simulation
 # bug. This script creates necessary files such as a
 CMakeLists.txt and a minimal
 # main.cpp. The CMakeLists.txt file downloads and manages
 libtorch by saving it
 # in a ""_deps"" folder. Steps to reproduce the bug:
 # ./<this-script>
 # cmake -B build -S .
 # cmake --build build
 # ./<gem5> config.py build/main

 function create_cmake() {
     cat > CMakeLists.txt <<- \EOF
 cmake_minimum_required(VERSION 3.22 FATAL_ERROR)

 # Download and manage libtorch dependency
 set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps")

 file(MAKE_DIRECTORY "${DEPENDENCY_DIR}")
 if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch")
     file(DOWNLOAD
 https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip

 "${DEPENDENCY_DIR}/libtorch.zip")
     file(ARCHIVE_EXTRACT
         INPUT "${DEPENDENCY_DIR}/libtorch.zip"
         DESTINATION "${DEPENDENCY_DIR}")
     file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip")
 endif()
 set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch")
 find_package(Torch REQUIRED)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

 project(main)

 # Create executable
 add_executable(${CMAKE_PROJECT_NAME} main.cpp)
 target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC
 ${TORCH_INCLUDE_DIRS})
 target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES})
 set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14)
 message("Torch Libraries: ${TORCH_LIBRARIES}")
 EOF
 }

 function create_main() {
     cat > main.cpp <<- \EOF
 #include <iostream>
 #include <torch/torch.h>

 int main(int argc, char *argv[]) {
     std::cout << "Hello World" << std::endl;
 }
 EOF
 }

 function create_gem5_config() {
     cat > config.py <<- \EOF
 # This script must be executed by gem5
 # Usage: ./<gem5> <this-script> <binary>

 import sys

 from gem5.utils.requires import ISA
 from gem5.components.boards.simple_board import SimpleBoard
 from gem5.components.cachehierarchies.classic.no_cache import
 NoCache
 from gem5.components.memory.single_channel import
 SingleChannelDDR3_1600
 from gem5.components.processors.simple_processor import
 SimpleProcessor
 from gem5.components.processors.cpu_types import CPUTypes
 from gem5.resources.resource import CustomResource
 from gem5.simulate.simulator import Simulator

 if len(sys.argv) == 1:
     print("Provide a binary as argument", file=sys.stderr)
     sys.exit(1)

 cache_hierarchy = NoCache()
 memory = SingleChannelDDR3_1600("8GiB")
 processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC,
 num_cores=1, isa=ISA.X86)

 board = SimpleBoard(
         clk_freq="3GHz",
         processor=processor,
         memory=memory,
 cache_hierarchy=cache_hierarchy
 )

 binary = CustomResource(sys.argv[1])
 board.set_se_binary_workload(binary)

 simulator = Simulator(board=board)
 simulator.run()
 EOF
 }

 create_cmake
 create_main
 create_gem5_config
 _______________________________________________
 gem5-users mailing list -- gem5-users@gem5.org
 To unsubscribe send an email to gem5-users-leave@gem5.org
 _______________________________________________
 gem5-users mailing list -- gem5-users@gem5.org
 To unsubscribe send an email to gem5-users-leave@gem5.org
I can confirm that gem5 can load a libtorch binary on v23.0.0.1. I was able to run the binary on a Ubuntu 22.04.1. Also, I've run git bisect to find the commit that fixes the bug. Here is the git bisect log: --- # old: [5fa484e2e02604ad3a5bf01f35ad1f97ca6d17b8] misc: Merge the v22.1 release staging into stable git bisect old 5fa484e2e02604ad3a5bf01f35ad1f97ca6d17b8 # new: [af72b9ba580546ac12ce05bfaac3fd53fa8699f4] misc: Update RELEASE-NOTES.md for v23.0.0.1 hotfix git bisect new af72b9ba580546ac12ce05bfaac3fd53fa8699f4 # old: [6841e1aa5a1738961940fece2b35baf77c8c224d] stdlib: Fix bug in MESI_Three_Level_Cache initialization git bisect old 6841e1aa5a1738961940fece2b35baf77c8c224d # new: [e90bd5feb9a7d6672b231190783433bf3f7d6706] configs: Add `--with-pmu` option to the simple Arm FS configs git bisect new e90bd5feb9a7d6672b231190783433bf3f7d6706 # new: [f9cf3de711d59bc3a81bb8d49f1408b1f6349a7b] mem: Use HostSocket in the SharedMemoryServer. git bisect new f9cf3de711d59bc3a81bb8d49f1408b1f6349a7b # new: [c8abd97584d4dffe32c21773b98b41fa991f4231] tests: Disable the looppoint checkpoint tests git bisect new c8abd97584d4dffe32c21773b98b41fa991f4231 # new: [7b39a7f14e6d5132961e3e568b121d7fd7422f90] misc: Rename DEBUG macro into GEM5_DEBUG git bisect new 7b39a7f14e6d5132961e3e568b121d7fd7422f90 # new: [a589d7b5697b3fbe61e1842e1831aef50aa96f32] arch-x86: Add instructions from SSE4.1 set. 
git bisect new a589d7b5697b3fbe61e1842e1831aef50aa96f32 # old: [1bb8cd3d44c563877d486953f0534c4dc9daa9e1] sim: Switch from EventWrapper to MemberEventWrapper before deprec git bisect old 1bb8cd3d44c563877d486953f0534c4dc9daa9e1 # old: [ba19f967d7529542f790bcd15a2746e399591fdf] sim: Use ref constructor of MemberEventWrapper everywhere git bisect old ba19f967d7529542f790bcd15a2746e399591fdf # old: [99852d56876eb3b0e26ec2a15752321a4a047ebd] sim: Deprecate pointer version of MemberEventWrapper constructor git bisect old 99852d56876eb3b0e26ec2a15752321a4a047ebd # first new commit: [a589d7b5697b3fbe61e1842e1831aef50aa96f32] arch-x86: Add instructions from SSE4.1 set. --- I can confirm that the commit "arch-x86: Add instructions from SSE4.1 set." fixed the bug. I've used 2 simple scripts in my bisect session: bisect.sh and build.sh (at the end of this e-mail). I've extended my generated working directory by adding the gem5 repository. So it looks like: - CMakeLists.txt * - config.py * - main.cpp * - setup.sh - gem5 * files are generated by "setup.sh" I've placed bisect.sh and build.sh inside gem5 directory and then ./bisect.sh. I've used 20 jobs in scons in build.sh, but I recommend changing it according to your machine. --- bisect.sh --- #!/bin/bash git bisect start # Old commit that is incapable of running libtorch git bisect old v22.1.0.0 # Commit that is able to run libtorch git bisect new v23.0.0.1 git bisect run ./build.sh --- build.sh --- #!/bin/bash # Must build from a fresh build directory. If not, then bisect will find the # wrong commit. rm -rf build # Build gem5. If the build fails, return the special code 125 to git bisect so # it will know that this commit must be skipped. scons --ignore-style build/X86/gem5.opt -j 20 --gold-linker || exit 125 # If running the binary results in an error (exit code != 0), then return 0 to # git so it will know that the current commit is old. 
./build/X86/gem5.opt ../config.py ../build/main || exit 0 # If the binary was run successfully, then the current analyzed commit is able # to run libtorch. exit 1 On 19.07.23 15:52, Bobby Bruce via gem5-users wrote: > I tried to reproduce this for myself but I failed to do so (using > v23.0.0.1, `build/ALL/gem5.opt`).. Attached is a tarball of the files > i used. The steps I carried out can be found on the the "README.md" > file. I'm not sure what I'm missing but my execution prints "Hello > World" successfully. > > The host was an X86 Ubuntu 20.04.6 OS and everything was compiled if > GCC 9.4.0. > > > > -- > Dr. Bobby R. Bruce > Room 3050, > Kemper Hall, UC Davis > Davis, > CA, 95616 > > web: https://www.bobbybruce.net > >> On Jul 18, 2023, at 6:08 PM, Matt Sinclair >> <mattdsinclair.wisc@gmail.com> wrote: >> >> For what it's worth, one of the students working with me (Marco, >> CC'd) is having the same failure right now for the head of develop >> (plus this fix: https://github.com/gem5/gem5/pull/99), except for a >> tiny GPU microbenchmark that definitely is not using PyTorch or any >> higher level library. >> >> We are working on getting a backtrace to understand what's going on >> for us (and then push a fix as applicable), and it's possible our >> problems have the same symptom but a different root cause. But just >> wanted to chime in that there are multiple cases where this error is >> happening on develop right now with SE mode. >> >> Matt >> >> >> On Tue, Jul 18, 2023 at 7:58 PM Bobby Bruce via gem5-users >> <gem5-users@gem5.org> wrote: >> >> I’m afraid I don’t know exactly what’s causing this error, but >> just to make sure, the binary you built and as a `CustomResource` >> executes on your host? This looks like an error coming from >> PyTorch, not the simulator. 
That being said, I don’t understand >> why "build/X86/sim/faults.cc:61: panic: panic condition >> !FullSystem occurred: fault (General-Protection) detected @ PC “ >> is occurring after either, that could also be the issue. >> Personally, I’m always a bit scared linking to dynamic libraries >> on the host as well, >> >> If you want to get around this the annoying advice is to use FS >> mode. It’s slower, and requires creation of a disk image, but it >> isn’t nearly as error-prone as SE mode. If your binary works on >> your host then you should be able to get it to work in FS mode. >> Using checkpoints and (if you have the right hardware and are >> using X86) KVM cores can speed things up for you too. >> >> Also, as a sidenote: If you’re wanting to simulate PyTorch, don’t >> you want to simulate a GPU too? >> >> -- >> Dr. Bobby R. Bruce >> Room 3050, >> Kemper Hall, UC Davis >> Davis, >> CA, 95616 >> >> web: https://www.bobbybruce.net >> >>> On Jul 14, 2023, at 3:02 AM, Caio Vieira via gem5-users >>> <gem5-users@gem5.org> wrote: >>> >>> >>> Hi everyone, >>> >>> I'm trying to execute gem5 simulations using libtorch in SE >>> mode. However, I get the following error message: >>> >>> --- Error message --- >>> ... >>> terminate called after throwing an instance of 'std::runtime_error' >>>   what():  expected eof but found 'ident' here: >>> aten::quantized_lstm.inpr input, Tensor[] >>> orch.classes.rnn.CellPara[] params, bool has_biases, int >>> num_layers, float dropout, bool train, bool bidirectional, bool >>> batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) dy >>> namic=False) -> (Tensor, Tensor, Tenso Tensor, Tensor) >>>                           ~~~~~ <--- HERE >>> >>> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall >>> rt_sigprocmask(...) >>>       (further warnings will be suppressed) >>> build/X86/sim/syscall_emul.cc:86: warn: ignoring syscall >>> rt_sigaction(...) 
>>>       (further warnings will be suppressed) >>> build/X86/sim/faults.cc:61: panic: panic condition !FullSystem >>> occurred: fault (General-Protection) detected @ PC >>> (0x7fff7a3d5898=>0x7fff7a3d5899).(0=>1) >>> Memory Usage: 11842716 KBytes >>> Program aborted at tick 294083905383 >>> --- BEGIN LIBC BACKTRACE --- >>> ... >>> >>> The simulation fails before the first line of the main function. >>> I believe that it is failing to load the libtorch library. >>> Unfortunately, it is not possible to build libtorch with >>> "-static" since their static builds is broken for quiet a long >>> time: https://github.com/pytorch/pytorch/issues/21737 >>> I've tested with gem5 v22.1.0.0 and also 22.0.0.2. I've also >>> tested using different GCC versions to build the simulated binary. >>> >>> For anyone interested in reproducing the error, I'm sending a >>> "setup.sh" script to create a minimal reproducible environment. >>> Simply copy and paste the script below and name it as "setup.sh" >>> in a new directory, then: >>> >>> source setup.sh >>> cmake --B build -S . >>> cmake --build build >>> ./<gem5> config.py build/main >>> >>> Best regards, >>> Caio Vieira >>> >>> --- setup.sh --- >>> >>> #!/bin/bash >>> >>> # Bash script to create minimal reproducible environment for >>> libtorch simulation >>> # bug. This script creates necessary files such as a >>> CMakeLists.txt and a minimal >>> # main.cpp. The CMakeLists.txt file downloads and manages >>> libtorch by saving it >>> # in a ""_deps"" folder. Steps to reproduce the bug: >>> # ./<this-script> >>> # cmake -B build -S . 
>>> # cmake --build build >>> # ./<gem5> config.py build/main >>> >>> function create_cmake() { >>>     cat > CMakeLists.txt <<- \EOF >>> cmake_minimum_required(VERSION 3.22 FATAL_ERROR) >>> >>> # Download and manage libtorch dependency >>> set(DEPENDENCY_DIR "${CMAKE_CURRENT_LIST_DIR}/_deps") >>> >>> file(MAKE_DIRECTORY "${DEPENDENCY_DIR}") >>> if(NOT EXISTS "${DEPENDENCY_DIR}/libtorch") >>>     file(DOWNLOAD >>> https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip >>> >>> "${DEPENDENCY_DIR}/libtorch.zip") >>>     file(ARCHIVE_EXTRACT >>>         INPUT "${DEPENDENCY_DIR}/libtorch.zip" >>>         DESTINATION "${DEPENDENCY_DIR}") >>>     file(REMOVE "${DEPENDENCY_DIR}/libtorch.zip") >>> endif() >>> set(CMAKE_PREFIX_PATH "${DEPENDENCY_DIR}/libtorch") >>> find_package(Torch REQUIRED) >>> set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") >>> >>> project(main) >>> >>> # Create executable >>> add_executable(${CMAKE_PROJECT_NAME} main.cpp) >>> target_include_directories(${CMAKE_PROJECT_NAME} PUBLIC >>> ${TORCH_INCLUDE_DIRS}) >>> target_link_libraries(${CMAKE_PROJECT_NAME} ${TORCH_LIBRARIES}) >>> set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 14) >>> message("Torch Libraries: ${TORCH_LIBRARIES}") >>> EOF >>> } >>> >>> function create_main() { >>>     cat > main.cpp <<- \EOF >>> #include <iostream> >>> #include <torch/torch.h> >>> >>> int main(int argc, char *argv[]) { >>>     std::cout << "Hello World" << std::endl; >>> } >>> EOF >>> } >>> >>> function create_gem5_config() { >>>     cat > config.py <<- \EOF >>> # This script must be executed by gem5 >>> # Usage: ./<gem5> <this-script> <binary> >>> >>> import sys >>> >>> from gem5.utils.requires import ISA >>> from gem5.components.boards.simple_board import SimpleBoard >>> from gem5.components.cachehierarchies.classic.no_cache import >>> NoCache >>> from gem5.components.memory.single_channel import >>> SingleChannelDDR3_1600 >>> from 
gem5.components.processors.simple_processor import >>> SimpleProcessor >>> from gem5.components.processors.cpu_types import CPUTypes >>> from gem5.resources.resource import CustomResource >>> from gem5.simulate.simulator import Simulator >>> >>> if len(sys.argv) == 1: >>>     print("Provide a binary as argument", file=sys.stderr) >>>     sys.exit(1) >>> >>> cache_hierarchy = NoCache() >>> memory = SingleChannelDDR3_1600("8GiB") >>> processor = SimpleProcessor(cpu_type=CPUTypes.ATOMIC, >>> num_cores=1, isa=ISA.X86) >>> >>> board = SimpleBoard( >>>         clk_freq="3GHz", >>>         processor=processor, >>>         memory=memory, >>> cache_hierarchy=cache_hierarchy >>> ) >>> >>> binary = CustomResource(sys.argv[1]) >>> board.set_se_binary_workload(binary) >>> >>> simulator = Simulator(board=board) >>> simulator.run() >>> EOF >>> } >>> >>> create_cmake >>> create_main >>> create_gem5_config >>> _______________________________________________ >>> gem5-users mailing list -- gem5-users@gem5.org >>> To unsubscribe send an email to gem5-users-leave@gem5.org >> >> _______________________________________________ >> gem5-users mailing list -- gem5-users@gem5.org >> To unsubscribe send an email to gem5-users-leave@gem5.org >> >