Unverified Commit 8d724968 by Andrew Reusch Committed by GitHub

[RUNTIME][uTVM] AutoTVM + uTVM for Cortex-M7 (#5417)

* Prototype for micro TVM.

* Cleanup and sync micro tvm prototype.

* Use /std:c++14 with MSVC.

 * Per tqchen: project has already moved to C++14
 * Presubmit failed for code that built locally on gcc.

* fix ASF lint, and fix add_asf_header too

* Compiles with USE_MICRO=OFF.

* Cleanup TargetPtr and word size representations.

* fix compile warning

* address logan's comments

* address logan and liangfu comments

* address thierry's comments

* address u99127, liangfu, tmoreau89 comments

Co-authored-by: Logan Weber <weberlo@cs.washington.edu>
parent ae89afe0
Subproject commit 981b1c32f91668e669ee376856f92f36cfd2a351 Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28
...@@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake) ...@@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake)
include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
include(cmake/modules/contrib/CoreML.cmake) include(cmake/modules/contrib/CoreML.cmake)
include(CheckCXXCompilerFlag)
if(NOT MSVC) if(NOT MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
message(STATUS "Build with c++14")
set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD 14)
else()
check_cxx_compiler_flag("/std:c++14" SUPPORT_CXX14)
set(CMAKE_CXX_FLAGS "/std:c++14 ${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_STANDARD 14)
endif() endif()
add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
......
...@@ -73,7 +73,8 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc ...@@ -73,7 +73,8 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
cpplint: cpplint:
python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include; python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \ python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \
include src \
examples/extension/src examples/graph_executor/src examples/extension/src examples/graph_executor/src
pylint: pylint:
......
...@@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope"; ...@@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope";
constexpr const char* reduce_scope = "reduce_scope"; constexpr const char* reduce_scope = "reduce_scope";
/*! \brief Mark region is guarded by the pragma extension */ /*! \brief Mark region is guarded by the pragma extension */
constexpr const char* pragma_scope_prefix = "pragma_"; constexpr const char* pragma_scope_prefix = "pragma_";
/*! \brief Import C source or file into the final code gen module */
constexpr const char* pragma_import_c = "pragma_import_c";
/*! \brief Import llvm source or file into the final code gen module */ /*! \brief Import llvm source or file into the final code gen module */
constexpr const char* pragma_import_llvm = "pragma_import_llvm"; constexpr const char* pragma_import_llvm = "pragma_import_llvm";
/*! \brief Try to modify the AST to support Tensor Core */ /*! \brief Try to modify the AST to support Tensor Core */
......
...@@ -145,7 +145,7 @@ class LocalExecutor(executor.Executor): ...@@ -145,7 +145,7 @@ class LocalExecutor(executor.Executor):
if not self.do_fork: if not self.do_fork:
return LocalFutureNoFork(func(*args, **kwargs)) return LocalFutureNoFork(func(*args, **kwargs))
queue = Queue(2) queue = Queue(2) # Size of 2 to avoid a race condition with size 1.
process = Process(target=call_with_timeout, process = Process(target=call_with_timeout,
args=(queue, self.timeout, func, args, kwargs)) args=(queue, self.timeout, func, args, kwargs))
process.start() process.start()
......
...@@ -245,6 +245,8 @@ class RPCRunner(Runner): ...@@ -245,6 +245,8 @@ class RPCRunner(Runner):
if 'cuda' in self.task.target.keys: if 'cuda' in self.task.target.keys:
kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.')) kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
if self.task.target.device_name == 'micro_dev':
kwargs.setdefault('build_option', {})['disable_vectorize'] = True
return kwargs return kwargs
...@@ -273,7 +275,8 @@ class RPCRunner(Runner): ...@@ -273,7 +275,8 @@ class RPCRunner(Runner):
if isinstance(res, Exception): # executor error or timeout if isinstance(res, Exception): # executor error or timeout
results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT, results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
self.timeout, time.time())) self.timeout, time.time()))
else: raise Exception(f'encountered exception during measurement: {results}')
results.append(res) results.append(res)
return results return results
......
...@@ -48,6 +48,7 @@ def _lower(mod, ...@@ -48,6 +48,7 @@ def _lower(mod,
grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
grc.codegen(mod["main"]) grc.codegen(mod["main"])
return return
# default case # default case
# Try graph codegen first to extract autotvm tasks. # Try graph codegen first to extract autotvm tasks.
# If failed to compile, then fallback to use VM compiler. # If failed to compile, then fallback to use VM compiler.
......
...@@ -149,7 +149,7 @@ def progress_bar(total, prefix='', si_prefix='G'): ...@@ -149,7 +149,7 @@ def progress_bar(total, prefix='', si_prefix='G'):
if res.error_no == 0: if res.error_no == 0:
flops = inp.task.flop / np.mean(res.costs) flops = inp.task.flop / np.mean(res.costs)
if logger.level < logging.DEBUG: # only print progress bar in non-debug mode if not logger.isEnabledFor(logging.DEBUG): # only print progress bar in non-debug mode
ctx.cur_flops = flops ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops ctx.best_flops = tuner.best_flops
......
...@@ -50,7 +50,11 @@ class GATuner(Tuner): ...@@ -50,7 +50,11 @@ class GATuner(Tuner):
# space info # space info
self.space = task.config_space self.space = task.config_space
self.dims = [len(x) for x in self.space.space_map.values()] self.dim_keys = []
self.dims = []
for k, v in self.space.space_map.items():
self.dim_keys.append(k)
self.dims.append(len(v))
self.visited = set([]) self.visited = set([])
...@@ -123,7 +127,7 @@ class GATuner(Tuner): ...@@ -123,7 +127,7 @@ class GATuner(Tuner):
if len(self.visited) < len(self.space): if len(self.visited) < len(self.space):
while knob2point(tmp_gene, self.dims) in self.visited: while knob2point(tmp_gene, self.dims) in self.visited:
j = np.random.randint(len(self.dims)) j = np.random.randint(len(self.dims))
tmp_gene[j] = np.random.randint(self.dims[j]) tmp_gene[j] = np.random.randint(self.dims[j]) # pylint: disable=invalid-sequence-index
next_genes.append(tmp_gene) next_genes.append(tmp_gene)
self.visited.add(knob2point(tmp_gene, self.dims)) self.visited.add(knob2point(tmp_gene, self.dims))
else: else:
......
...@@ -21,7 +21,9 @@ import subprocess ...@@ -21,7 +21,9 @@ import subprocess
import tvm._ffi import tvm._ffi
from . import util from . import util
# TODO does this file still belong in `contrib`. is it too µTVM-specific?
# TODO shouldn't need so many `ALIGN` directives
RELOCATION_LD_SCRIPT_TEMPLATE = """ RELOCATION_LD_SCRIPT_TEMPLATE = """
/* linker symbol for use in UTVMInit */ /* linker symbol for use in UTVMInit */
_utvm_stack_pointer_init = 0x{stack_pointer_init:x}; _utvm_stack_pointer_init = 0x{stack_pointer_init:x};
...@@ -118,7 +120,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): ...@@ -118,7 +120,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
size of the section in bytes size of the section in bytes
""" """
if not os.path.isfile(binary_path): if not os.path.isfile(binary_path):
raise RuntimeError("no such file \"{}\"".format(binary_path)) raise RuntimeError('no such file "{}"'.format(binary_path))
# We use the "-A" flag here to get the ".rodata" section's size, which is # We use the "-A" flag here to get the ".rodata" section's size, which is
# not included by default. # not included by default.
size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path]) size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path])
...@@ -160,6 +162,10 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): ...@@ -160,6 +162,10 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
# padding for most cases, but symbols can be arbitrarily large, so this # padding for most cases, but symbols can be arbitrarily large, so this
# isn't bulletproof. # isn't bulletproof.
return section_size + 32 return section_size + 32
# NOTE: in the past, section_size has been wrong on x86. it may be
# inconsistent. TODO: maybe stop relying on `*size` to give us the size and
# instead read the section with `*objcopy` and count the bytes.
return section_size return section_size
...@@ -206,11 +212,13 @@ def tvm_callback_relocate_binary( ...@@ -206,11 +212,13 @@ def tvm_callback_relocate_binary(
rel_bin : bytearray rel_bin : bytearray
the relocated binary the relocated binary
""" """
assert text_start < rodata_start < data_start < bss_start < stack_end
stack_pointer_init = stack_end - word_size stack_pointer_init = stack_end - word_size
ld_script_contents = "" ld_script_contents = ""
# TODO(weberlo): There should be a better way to configure this for different archs. # TODO(weberlo): There should be a better way to configure this for different archs.
# TODO is this line even necessary?
if "riscv" in toolchain_prefix: if "riscv" in toolchain_prefix:
ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n" ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n'
ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format( ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format(
word_size=word_size, word_size=word_size,
text_start=text_start, text_start=text_start,
...@@ -221,7 +229,7 @@ def tvm_callback_relocate_binary( ...@@ -221,7 +229,7 @@ def tvm_callback_relocate_binary(
tmp_dir = util.tempdir() tmp_dir = util.tempdir()
rel_obj_path = tmp_dir.relpath("relocated.obj") rel_obj_path = tmp_dir.relpath("relocated.obj")
rel_ld_script_path = tmp_dir.relpath("relocated.lds") rel_ld_script_path = tmp_dir.relpath("relocate.lds")
with open(rel_ld_script_path, "w") as f: with open(rel_ld_script_path, "w") as f:
f.write(ld_script_contents) f.write(ld_script_contents)
run_cmd([ run_cmd([
...@@ -229,8 +237,23 @@ def tvm_callback_relocate_binary( ...@@ -229,8 +237,23 @@ def tvm_callback_relocate_binary(
binary_path, binary_path,
"-T", rel_ld_script_path, "-T", rel_ld_script_path,
"-o", rel_obj_path]) "-o", rel_obj_path])
with open(rel_obj_path, "rb") as f: with open(rel_obj_path, "rb") as f:
rel_bin = bytearray(f.read()) rel_bin = bytearray(f.read())
gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR")
if gdb_init_dir is not None:
gdb_init_path = f"{gdb_init_dir}/.gdbinit"
with open(gdb_init_path, "r") as f:
gdbinit_contents = f.read().split("\n")
new_contents = []
for line in gdbinit_contents:
new_contents.append(line)
if line.startswith("target"):
new_contents.append(f"add-symbol-file {rel_obj_path}")
with open(gdb_init_path, "w") as f:
f.write("\n".join(new_contents))
return rel_bin return rel_bin
......
...@@ -20,6 +20,7 @@ from __future__ import absolute_import ...@@ -20,6 +20,7 @@ from __future__ import absolute_import
import argparse import argparse
import ast import ast
import json
import multiprocessing import multiprocessing
import sys import sys
import logging import logging
...@@ -41,7 +42,7 @@ def main(args): ...@@ -41,7 +42,7 @@ def main(args):
tracker_addr = (url, port) tracker_addr = (url, port)
if not args.key: if not args.key:
raise RuntimeError( raise RuntimeError(
"Need key to present type of resource when tracker is available") 'Need key to present type of resource when tracker is available')
else: else:
tracker_addr = None tracker_addr = None
...@@ -75,8 +76,8 @@ def init_utvm(args): ...@@ -75,8 +76,8 @@ def init_utvm(args):
dev_config = json.load(dev_conf_file) dev_config = json.load(dev_conf_file)
else: else:
dev_config_args = ast.literal_eval(args.utvm_dev_config_args) dev_config_args = ast.literal_eval(args.utvm_dev_config_args)
default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config'] generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['generate_config']
dev_config = default_config_func(*dev_config_args) dev_config = generate_config_func(*dev_config_args)
if args.utvm_dev_config or args.utvm_dev_id: if args.utvm_dev_config or args.utvm_dev_id:
# add MicroTVM overrides # add MicroTVM overrides
...@@ -100,8 +101,8 @@ if __name__ == "__main__": ...@@ -100,8 +101,8 @@ if __name__ == "__main__":
parser.add_argument('--port-end', type=int, default=9199, parser.add_argument('--port-end', type=int, default=9199,
help='The end search port of the RPC') help='The end search port of the RPC')
parser.add_argument('--tracker', type=str, parser.add_argument('--tracker', type=str,
help="The address of RPC tracker in host:port format. " help=("The address of RPC tracker in host:port format. "
"e.g. (10.77.1.234:9190)") "e.g. (10.77.1.234:9190)"))
parser.add_argument('--key', type=str, default="", parser.add_argument('--key', type=str, default="",
help="The key used to identify the device type in tracker.") help="The key used to identify the device type in tracker.")
parser.add_argument('--silent', action='store_true', parser.add_argument('--silent', action='store_true',
...@@ -115,12 +116,19 @@ if __name__ == "__main__": ...@@ -115,12 +116,19 @@ if __name__ == "__main__":
parser.add_argument('--custom-addr', type=str, parser.add_argument('--custom-addr', type=str,
help="Custom IP Address to Report to RPC Tracker") help="Custom IP Address to Report to RPC Tracker")
parser.add_argument('--utvm-dev-config', type=str, parser.add_argument('--utvm-dev-config', type=str,
help='JSON config file for the target device (if using MicroTVM)') help=('JSON config file for the target device (if using MicroTVM). '
parser.add_argument('--utvm-dev-id', type=str, 'This file should contain serialized output similar to that returned '
help='Unique ID for the target device (if using MicroTVM)') "from the device module's generate_config. Can't be specified when "
'--utvm-dev-config-args is specified.'))
parser.add_argument('--utvm-dev-config-args', type=str, parser.add_argument('--utvm-dev-config-args', type=str,
help=('Python list of literals required to generate a default' help=("Arguments to the device module's generate_config function. "
' MicroTVM config (if --utvm-dev-id is specified)')) 'Must be a python literal parseable by literal_eval. If specified, '
"the device configuration is generated using the device module's "
"generate_config. Can't be specified when --utvm-dev-config is "
"specified."))
parser.add_argument('--utvm-dev-id', type=str,
help=('Unique ID for the target device (if using MicroTVM). Should '
'match the name of a module underneath tvm.micro.device).'))
parser.set_defaults(fork=True) parser.set_defaults(fork=True)
args = parser.parse_args() args = parser.parse_args()
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
"""MicroTVM module for bare-metal backends""" """MicroTVM module for bare-metal backends"""
from ..contrib import binutil from ..contrib import binutil
from .base import Session, create_micro_mod, cross_compiler from .base import DEVICE_SECTIONS
from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir from .base import Session, create_micro_mod, cross_compiler, LibType
from .base import get_micro_host_driven_dir, get_micro_device_dir
from . import device from . import device
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import os import os
import re
import sys import sys
from enum import Enum from enum import Enum
...@@ -28,6 +29,18 @@ import tvm._ffi ...@@ -28,6 +29,18 @@ import tvm._ffi
from tvm.contrib import util as _util from tvm.contrib import util as _util
from tvm.contrib import cc as _cc from tvm.contrib import cc as _cc
# all sections that comprise a device's memory layout, in order from lowest
# starting address to highest
DEVICE_SECTIONS = [
"text",
"rodata",
"data",
"bss",
"args",
"heap",
"workspace",
"stack",
]
class LibType(Enum): class LibType(Enum):
"""Enumeration of library types that can be compiled and loaded onto a device""" """Enumeration of library types that can be compiled and loaded onto a device"""
...@@ -51,9 +64,9 @@ class Session: ...@@ -51,9 +64,9 @@ class Session:
.. code-block:: python .. code-block:: python
c_mod = ... # some module generated with "c" as the target c_mod = ... # some module generated with "c" as the target
dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666) dev_config = micro.device.arm.stm32f746xx.default_config('127.0.0.1', 6666)
with tvm.micro.Session(dev_config) as sess: with tvm.micro.Session(dev_config) as sess:
micro_mod = create_micro_mod(c_mod, dev_config) micro_mod = sess.create_micro_mod(c_mod)
""" """
def __init__(self, config): def __init__(self, config):
...@@ -62,19 +75,20 @@ class Session: ...@@ -62,19 +75,20 @@ class Session:
# grab a binutil instance from the ID in the config # grab a binutil instance from the ID in the config
dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"]) dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"])
self.create_micro_lib = dev_funcs["create_micro_lib"]
self.toolchain_prefix = config["toolchain_prefix"] self.toolchain_prefix = config["toolchain_prefix"]
self.mem_layout = config["mem_layout"] self.mem_layout = config["mem_layout"]
self.word_size = config["word_size"] self.word_size_bits = config["word_size_bits"]
self.thumb_mode = config["thumb_mode"] self.thumb_mode = config["thumb_mode"]
self.use_device_timer = config["use_device_timer"]
self.comms_method = config["comms_method"] self.comms_method = config["comms_method"]
# First, find and compile runtime library. # First, find and compile runtime library.
runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c") runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c")
tmp_dir = _util.tempdir() tmp_dir = _util.tempdir()
runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj")
self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME) options = ["-I{}".format(get_micro_host_driven_dir())]
#input(f"check {runtime_obj_path}: ") dev_funcs["create_micro_lib"](
runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options)
comms_method = config["comms_method"] comms_method = config["comms_method"]
if comms_method == "openocd": if comms_method == "openocd":
...@@ -86,6 +100,8 @@ class Session: ...@@ -86,6 +100,8 @@ class Session:
else: else:
raise RuntimeError(f"unknown communication method: f{self.comms_method}") raise RuntimeError(f"unknown communication method: f{self.comms_method}")
assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \
"not all sections have an assigned memory layout"
self.module = _CreateSession( self.module = _CreateSession(
comms_method, comms_method,
runtime_obj_path, runtime_obj_path,
...@@ -106,12 +122,15 @@ class Session: ...@@ -106,12 +122,15 @@ class Session:
self.mem_layout["workspace"]["size"], self.mem_layout["workspace"]["size"],
self.mem_layout["stack"].get("start", 0), self.mem_layout["stack"].get("start", 0),
self.mem_layout["stack"]["size"], self.mem_layout["stack"]["size"],
self.word_size, self.word_size_bits,
self.thumb_mode, self.thumb_mode,
self.use_device_timer,
server_addr, server_addr,
server_port) server_port)
self._enter = self.module["enter"] self._enter = self.module["enter"]
self._exit = self.module["exit"] self._exit = self.module["exit"]
self.get_last_batch_time = self.module["get_last_batch_time"]
self.get_last_batch_cycles = self.module["get_last_batch_cycles"]
def _check_system(self): def _check_system(self):
"""Check if the user's system is supported by MicroTVM. """Check if the user's system is supported by MicroTVM.
...@@ -119,7 +138,7 @@ class Session: ...@@ -119,7 +138,7 @@ class Session:
Raises error if not supported. Raises error if not supported.
""" """
if not sys.platform.startswith("linux"): if not sys.platform.startswith("linux"):
raise RuntimeError("MicroTVM is currently only supported on Linux hosts") raise RuntimeError("MicroTVM is currently only supported on Linux")
# TODO(weberlo): Add 32-bit support. # TODO(weberlo): Add 32-bit support.
# It's primarily the compilation pipeline that isn't compatible. # It's primarily the compilation pipeline that isn't compatible.
if sys.maxsize <= 2**32: if sys.maxsize <= 2**32:
...@@ -133,44 +152,91 @@ class Session: ...@@ -133,44 +152,91 @@ class Session:
self._exit() self._exit()
def create_micro_mod(c_mod, dev_config): def _calc_max_workspace_usage(src):
# TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size)
alloc_re = re.compile(
r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*')
free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*')
max_usage = 0
alloc_map = {}
for line in src.split("\n"):
if line.strip().startswith("//"):
continue
match = alloc_re.match(line)
if match is not None:
alloc_map[match.group(1)] = int(match.group(3))
max_usage = max(max_usage, sum(alloc_map.values()))
else:
match = free_re.match(line)
if match is not None:
print(alloc_map)
del alloc_map[match.group(2)]
return max_usage
def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None,
lib_include_paths=None):
"""Produces a micro module from a given module. """Produces a micro module from a given module.
Parameters Parameters
---------- ----------
c_mod : tvm.runtime.Module c_mod : tvm.module.Module
module with "c" as its target backend module with "c" as its target backend
dev_config : Dict[str, Any] lib_src_paths: TODO
MicroTVM config dict for the target device TODO
lib_headers: TODO
TODO
lib_include_paths: TODO
TODO
Return Return
------ ------
micro_mod : tvm.runtim.Module micro_mod : tvm.module.Module
micro module for the target device micro module for the target device
""" """
temp_dir = _util.tempdir() temp_dir = _util.tempdir()
lib_obj_path = temp_dir.relpath("dev_lib.obj") lib_obj_path = temp_dir.relpath("dev_lib.obj")
# TODO use dev config to dispatch on the type of C codegen to run through
# (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV)
c_mod.export_library( c_mod.export_library(
lib_obj_path, lib_obj_path,
fcompile=cross_compiler(dev_config, LibType.OPERATOR)) fcompile=cross_compiler(
dev_config,
LibType.OPERATOR,
lib_src_paths=lib_src_paths,
lib_headers=lib_headers,
lib_include_paths=lib_include_paths))
micro_mod = tvm.runtime.load_module(lib_obj_path) micro_mod = tvm.runtime.load_module(lib_obj_path)
return micro_mod return micro_mod
def cross_compiler(dev_config, lib_type): def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None,
"""Create a cross-compile function that wraps `create_lib` for a `Binutil` instance. lib_include_paths=None):
"""Create a cross compile function that wraps `create_lib` for a `Binutil` instance.
For use in `tvm.runtime.Module.export_library`. For use in `tvm.runtime.Module.export_library`.
Parameters Parameters
---------- ----------
dev_config : Dict[str, Any] create_micro_lib : func
MicroTVM config dict for the target device function for creating MicroTVM libraries for a specific device (e.g.,
`tvm.micro.device.get_device_funcs('arm.stm32f746xx')['create_micro_lib']`)
lib_type : micro.LibType lib_type : micro.LibType
whether to compile a MicroTVM runtime or operator library whether to compile a MicroTVM runtime or operator library
lib_src_paths: TODO
TODO
lib_headers: TODO
e.g., `['cmsis_gcc.h', 'arm_math.h']`
lib_include_paths: TODO
TODO
Return Return
------ ------
func : Callable[[str, str, Optional[str]], None] func : Callable[[str, str, Optional[str]], None]
...@@ -183,16 +249,49 @@ def cross_compiler(dev_config, lib_type): ...@@ -183,16 +249,49 @@ def cross_compiler(dev_config, lib_type):
c_mod = ... # some module generated with "c" as the target c_mod = ... # some module generated with "c" as the target
fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR) fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR)
c_mod.export_library("dev_lib.obj", fcompile=fcompile) c_mod.export_library('dev_lib.obj', fcompile=fcompile)
""" """
dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id']) assert (lib_headers is None) == (lib_include_paths is None), \
create_micro_lib = dev_funcs['create_micro_lib'] "must specify both `lib_headers` and `lib_include_paths` or neither"
if lib_src_paths is None:
lib_src_paths = []
if lib_include_paths is None:
lib_include_paths = []
include_options = []
for include_path in lib_include_paths:
include_options.append("-I")
include_options.append(include_path)
create_micro_lib = tvm.micro.device.get_device_funcs(
dev_config["device_id"])["create_micro_lib"]
mem_layout = dev_config["mem_layout"]
def compile_func(obj_path, src_path, **kwargs): def compile_func(obj_path, src_path, **kwargs):
if isinstance(obj_path, list): if isinstance(obj_path, list):
obj_path = obj_path[0] obj_path = obj_path[0]
if isinstance(src_path, list): if isinstance(src_path, list):
src_path = src_path[0] src_path = src_path[0]
create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None)) options = kwargs.get("options", [])
options += include_options
# check that workspace allocations don't exceed available workspace memory
with open(src_path) as f:
src_contents = f.read()
max_ws_usage = _calc_max_workspace_usage(src_contents)
available_mem = mem_layout["workspace"]["size"]
if max_ws_usage > available_mem:
raise RuntimeError(f"workspace allocations in library ({max_ws_usage}) "
f"exceed available memory ({available_mem})")
# inject headers into new source path, if requested
if lib_headers:
headers_to_inject = "\n".join(map(lambda s: f"#include <{s}>", lib_headers)) + "\n"
new_src_contents = headers_to_inject + src_contents
tmp_dir = _util.tempdir()
src_path = tmp_dir.relpath(os.path.basename(src_path))
with open(src_path, "w") as f:
f.write(new_src_contents)
create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths)
return _cc.cross_compiler(compile_func, output_format="obj") return _cc.cross_compiler(compile_func, output_format="obj")
......
...@@ -16,7 +16,8 @@ ...@@ -16,7 +16,8 @@
# under the License. # under the License.
"""Device-specific configuration for MicroTVM""" """Device-specific configuration for MicroTVM"""
from .base import register_device, get_device_funcs, create_micro_lib_base from .base import create_micro_lib_base, gen_mem_layout
from .base import MemConstraint, register_device, get_device_funcs
from . import host from . import host
from . import arm from . import arm
from . import riscv_spike from . import riscv_spike
...@@ -14,13 +14,32 @@ ...@@ -14,13 +14,32 @@
# KIND, either express or implied. See the License for the # KIND, either express or implied. See the License for the
# specific language governing permissions and limitations # specific language governing permissions and limitations
# under the License. # under the License.
"""Compilation and config definitions for ARM STM32F746XX devices""" """Compilation and config definitions for Arm STM32F746XX devices"""
from .. import create_micro_lib_base, register_device import os
from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "arm.stm32f746xx" DEVICE_ID = "arm.stm32f746xx"
TOOLCHAIN_PREFIX = "arm-none-eabi-" TOOLCHAIN_PREFIX = "arm-none-eabi-"
WORD_SIZE_BITS = 32
#
# [Device Memory Layout]
# RAM (rwx) : START = 0x20000000, LENGTH = 320K
# Flash (rx) : START = 0x8000000, LENGTH = 1024K
#
BASE_ADDR = 0x20000000
AVAILABLE_MEM = 320000
DEFAULT_SECTION_CONSTRAINTS = {
"text": (18000, MemConstraint.ABSOLUTE_BYTES),
"rodata": (100, MemConstraint.ABSOLUTE_BYTES),
"data": (100, MemConstraint.ABSOLUTE_BYTES),
"bss": (600, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (100.0, MemConstraint.WEIGHT),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (32, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None): def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options """Wrapper over `create_micro_lib_base` to add device-specific options
Parameters Parameters
...@@ -36,23 +55,40 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): ...@@ -36,23 +55,40 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]] options : Optional[List[str]]
additional options to pass to GCC additional options to pass to GCC
lib_src_paths : Optional[List[str]]
TODO
""" """
if options is None: if options is None:
options = [] options = []
else:
options = list(options)
options += [ options += [
# TODO(weberlo): make a debug flag
"-O2",
"-mcpu=cortex-m7", "-mcpu=cortex-m7",
"-mlittle-endian", "-mlittle-endian",
"-mfloat-abi=hard", "-mfloat-abi=hard",
"-mfpu=fpv5-sp-d16", "-mfpu=fpv5-sp-d16",
"-mthumb", "-mthumb",
"-ffast-math",
"-gdwarf-5", "-gdwarf-5",
"-DARM_MATH_CM7",
"-D__FPU_PRESENT=1U",
"-DARM_MATH_DSP",
"-Wno-unused-variable",
"-Wno-unused-parameter",
"-I{}".format(os.environ["CMSIS_ST_PATH"]),
"-I{}/Core/Include".format(os.environ["CMSIS_ST_PATH"])
] ]
create_micro_lib_base( create_micro_lib_base(
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
lib_src_paths=lib_src_paths)
def default_config(server_addr, server_port): def generate_config(server_addr, server_port, section_constraints=None):
"""Generates a default configuration for ARM STM32F746XX devices """Generates a configuration for Arm STM32F746XX devices
Parameters Parameters
---------- ----------
...@@ -62,55 +98,23 @@ def default_config(server_addr, server_port): ...@@ -62,55 +98,23 @@ def default_config(server_addr, server_port):
server_port : int server_port : int
port of OpenOCD server to connect to port of OpenOCD server to connect to
section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
maps section name to the quantity of available memory
Return Return
------ ------
config : Dict[str, Any] config : Dict[str, Any]
MicroTVM config dict for this device MicroTVM config dict for this device
""" """
if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
return { return {
"device_id": DEVICE_ID, "device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX, "toolchain_prefix": TOOLCHAIN_PREFIX,
# "mem_layout": gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints),
# [Device Memory Layout] "word_size_bits": WORD_SIZE_BITS,
# RAM (rwx) : START = 0x20000000, LENGTH = 320K
# FLASH (rx) : START = 0x8000000, LENGTH = 1024K
#
"mem_layout": {
"text": {
"start": 0x20000180,
"size": 20480,
},
"rodata": {
"start": 0x20005180,
"size": 20480,
},
"data": {
"start": 0x2000a180,
"size": 768,
},
"bss": {
"start": 0x2000a480,
"size": 768,
},
"args": {
"start": 0x2000a780,
"size": 1280,
},
"heap": {
"start": 0x2000ac80,
"size": 262144,
},
"workspace": {
"start": 0x2004ac80,
"size": 20480,
},
"stack": {
"start": 0x2004fc80,
"size": 80,
},
},
"word_size": 4,
"thumb_mode": True, "thumb_mode": True,
"use_device_timer": True,
"comms_method": "openocd", "comms_method": "openocd",
"server_addr": server_addr, "server_addr": server_addr,
"server_port": server_port, "server_port": server_port,
...@@ -119,5 +123,5 @@ def default_config(server_addr, server_port): ...@@ -119,5 +123,5 @@ def default_config(server_addr, server_port):
register_device(DEVICE_ID, { register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib, "create_micro_lib": create_micro_lib,
"default_config": default_config, "generate_config": generate_config,
}) })
...@@ -17,12 +17,13 @@ ...@@ -17,12 +17,13 @@
"""Base definitions for MicroTVM config""" """Base definitions for MicroTVM config"""
import glob import glob
import os import os
from pathlib import Path import enum
import pathlib
from tvm.contrib import util as _util from tvm.contrib import util as _util
from tvm.contrib.binutil import run_cmd from tvm.contrib.binutil import run_cmd
from tvm._ffi.libinfo import find_include_path from tvm._ffi.libinfo import find_include_path
from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir from tvm.micro import DEVICE_SECTIONS, LibType, get_micro_host_driven_dir, get_micro_device_dir
_DEVICE_REGISTRY = {} _DEVICE_REGISTRY = {}
...@@ -38,7 +39,7 @@ def register_device(device_id, device_funcs): ...@@ -38,7 +39,7 @@ def register_device(device_id, device_funcs):
dictionary with compilation and config generation functions as values dictionary with compilation and config generation functions as values
""" """
if device_id in _DEVICE_REGISTRY: if device_id in _DEVICE_REGISTRY:
raise RuntimeError(f"\"{device_id}\" already exists in the device registry") raise RuntimeError(f'"{device_id}" already exists in the device registry')
_DEVICE_REGISTRY[device_id] = device_funcs _DEVICE_REGISTRY[device_id] = device_funcs
...@@ -56,7 +57,7 @@ def get_device_funcs(device_id): ...@@ -56,7 +57,7 @@ def get_device_funcs(device_id):
dictionary with compilation and config generation functions as values dictionary with compilation and config generation functions as values
""" """
if device_id not in _DEVICE_REGISTRY: if device_id not in _DEVICE_REGISTRY:
raise RuntimeError(f"\"{device_id}\" does not exist in the binutil registry") raise RuntimeError(f'"{device_id}" does not exist in the binutil registry')
device_funcs = _DEVICE_REGISTRY[device_id] device_funcs = _DEVICE_REGISTRY[device_id]
return device_funcs return device_funcs
...@@ -67,7 +68,9 @@ def create_micro_lib_base( ...@@ -67,7 +68,9 @@ def create_micro_lib_base(
toolchain_prefix, toolchain_prefix,
device_id, device_id,
lib_type, lib_type,
options=None): options=None,
lib_src_paths=None,
):
"""Compiles code into a binary for the target micro device. """Compiles code into a binary for the target micro device.
Parameters Parameters
...@@ -92,7 +95,12 @@ def create_micro_lib_base( ...@@ -92,7 +95,12 @@ def create_micro_lib_base(
options : List[str] options : List[str]
additional options to pass to GCC additional options to pass to GCC
lib_src_paths : Optional[List[str]]
paths to additional source files to be compiled into the library
""" """
# look at these (specifically `strip`):
# https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size
base_compile_cmd = [ base_compile_cmd = [
f"{toolchain_prefix}gcc", f"{toolchain_prefix}gcc",
"-std=c11", "-std=c11",
...@@ -100,7 +108,6 @@ def create_micro_lib_base( ...@@ -100,7 +108,6 @@ def create_micro_lib_base(
"-Wextra", "-Wextra",
"--pedantic", "--pedantic",
"-c", "-c",
"-O0",
"-g", "-g",
"-nostartfiles", "-nostartfiles",
"-nodefaultlibs", "-nodefaultlibs",
...@@ -114,40 +121,48 @@ def create_micro_lib_base( ...@@ -114,40 +121,48 @@ def create_micro_lib_base(
src_paths = [] src_paths = []
include_paths = find_include_path() + [get_micro_host_driven_dir()] include_paths = find_include_path() + [get_micro_host_driven_dir()]
tmp_dir = _util.tempdir() tmp_dir = _util.tempdir()
# we might transform the src path in one of the branches below # we need to create a new src file in the operator branch
new_in_src_path = in_src_path new_in_src_path = in_src_path
if lib_type == LibType.RUNTIME: if lib_type == LibType.RUNTIME:
dev_dir = _get_device_source_dir(device_id) dev_dir = _get_device_source_dir(device_id)
dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]") dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
# there needs to at least be a utvm_timer.c file # there needs to at least be a utvm_timer.c file
assert dev_src_paths assert dev_src_paths
assert "utvm_timer.c" in map(os.path.basename, dev_src_paths) assert "utvm_timer.c" in map(os.path.basename, dev_src_paths)
src_paths += dev_src_paths src_paths += dev_src_paths
elif lib_type == LibType.OPERATOR: elif lib_type == LibType.OPERATOR:
# create a temporary copy of the source, so we can inject the dev lib # create a temporary copy of the operator source, so we can inject the dev lib
# header without modifying the original. # header without modifying the original.
temp_src_path = tmp_dir.relpath("temp.c") temp_src_path = tmp_dir.relpath("temp.c")
with open(in_src_path, "r") as f: with open(in_src_path, "r") as f:
src_lines = f.read().splitlines() src_lines = f.read().splitlines()
src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"')
with open(temp_src_path, "w") as f: with open(temp_src_path, "w") as f:
f.write("\n".join(src_lines)) f.write("\n".join(src_lines))
new_in_src_path = temp_src_path new_in_src_path = temp_src_path
base_compile_cmd += ["-c"]
else: else:
raise RuntimeError("unknown lib type") raise RuntimeError("unknown lib type")
src_paths += [new_in_src_path] src_paths += [new_in_src_path]
# add any src paths required by the operator
if lib_src_paths is not None:
src_paths += lib_src_paths
# print(f"include paths: {include_paths}")
for path in include_paths: for path in include_paths:
base_compile_cmd += ["-I", path] base_compile_cmd += ["-I", path]
prereq_obj_paths = [] prereq_obj_paths = []
# print(src_paths)
for src_path in src_paths: for src_path in src_paths:
curr_obj_path = Path(src_path).with_suffix(".o").name curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix(".o").name)
assert curr_obj_path not in prereq_obj_paths assert curr_obj_path not in prereq_obj_paths
prereq_obj_paths.append(curr_obj_path) prereq_obj_paths.append(curr_obj_path)
curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path] curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
# TODO(weberlo): make compilation fail if there are any warnings
run_cmd(curr_compile_cmd) run_cmd(curr_compile_cmd)
ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"] ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
...@@ -156,6 +171,65 @@ def create_micro_lib_base( ...@@ -156,6 +171,65 @@ def create_micro_lib_base(
run_cmd(ld_cmd) run_cmd(ld_cmd)
# TODO we shouldn't need an enum for this. too much bureaucracy.
class MemConstraint(enum.Enum):
"""Represents a constraint on the device's memory layout"""
ABSOLUTE_BYTES = 0
WEIGHT = 1
def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints):
"""Template function to generate memory layout for devices.
Parameters
----------
base_addr: Number
The address where usable memory begins on this device.
available_mem: Number
Available memory at base_addr, given in bytes.
word_size_bits: Number
Number of bits in one word on this device.
section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
maps section name to the quantity of available memory
"""
assert word_size_bits in (32, 64), "only 32- or 64-bit devices are supported now"
word_size_bytes = word_size_bits // 8
byte_sum = sum(x[0]
for x in section_constraints.values()
if x[1] == MemConstraint.ABSOLUTE_BYTES)
weight_sum = sum(x[0]
for x in section_constraints.values()
if x[1] == MemConstraint.WEIGHT)
assert byte_sum <= available_mem
available_weight_mem = available_mem - byte_sum
res = {}
curr_addr = base_addr
for section in DEVICE_SECTIONS:
(val, cons_type) = section_constraints[section]
if cons_type == MemConstraint.ABSOLUTE_BYTES:
assert val % word_size_bytes == 0, \
f"constraint {val} for {section} section is not word-aligned"
size = val
res[section] = {
"start": curr_addr,
"size": size,
}
else:
size = int((val / weight_sum) * available_weight_mem)
size = (size // word_size_bytes) * word_size_bytes
res[section] = {
"start": curr_addr,
"size": size,
}
curr_addr += size
return res
def _get_device_source_dir(device_id): def _get_device_source_dir(device_id):
"""Grabs the source directory for device-specific uTVM files""" """Grabs the source directory for device-specific uTVM files"""
dev_subdir = "/".join(device_id.split(".")) dev_subdir = "/".join(device_id.split("."))
......
...@@ -17,12 +17,26 @@ ...@@ -17,12 +17,26 @@
"""Compilation and config definitions for the host emulated device""" """Compilation and config definitions for the host emulated device"""
import sys import sys
from . import create_micro_lib_base, register_device from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "host" DEVICE_ID = "host"
TOOLCHAIN_PREFIX = "" TOOLCHAIN_PREFIX = ""
WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32
def create_micro_lib(obj_path, src_path, lib_type, options=None): # we pretend we only have 320kb in the default case, so we can use `gen_mem_layout`
DEFAULT_AVAILABLE_MEM = 3200000
DEFAULT_SECTION_CONSTRAINTS = {
"text": (20480, MemConstraint.ABSOLUTE_BYTES),
"rodata": (20480, MemConstraint.ABSOLUTE_BYTES),
"data": (768, MemConstraint.ABSOLUTE_BYTES),
"bss": (4096, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (262144, MemConstraint.ABSOLUTE_BYTES),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (80, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options """Wrapper over `create_micro_lib_base` to add device-specific options
Parameters Parameters
...@@ -38,59 +52,65 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): ...@@ -38,59 +52,65 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]] options : Optional[List[str]]
additional options to pass to GCC additional options to pass to GCC
lib_src_paths : Optional[List[str]]
paths to additional source files to be compiled into the library
""" """
if options is None: if options is None:
options = [] options = []
else:
options = list(options)
# Cannot increase optimization level on host due to code loading method.
options.append("-O0")
if sys.maxsize > 2**32 and sys.platform.startswith("linux"): if sys.maxsize > 2**32 and sys.platform.startswith("linux"):
options += ["-mcmodel=large"] options += ["-mcmodel=large"]
create_micro_lib_base( create_micro_lib_base(
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options) obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
lib_src_paths=lib_src_paths)
def default_config(): def generate_config(available_mem=None, section_constraints=None):
"""Generates a default configuration for the host emulated device """Generates a configuration for the host emulated device
Parameters
----------
available_mem: int
number of RW bytes available for use on device
section_constraints: Optional[Dict[str, Dict[Number, MemConstraint]]]
maps section name to the quantity of available memory
Return Return
------ ------
config : Dict[str, Any] config : Dict[str, Any]
MicroTVM config dict for this device MicroTVM config dict for this device
""" """
if available_mem is None:
available_mem = DEFAULT_AVAILABLE_MEM
if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE_BITS, section_constraints)
# TODO the host emulated device is an outlier, since we don't know how what
# its base address will be until we've created it in the C++. is there any
# way to change the infrastructure around this so it's not so much of an
# outlier?
# need to zero out all start addresses, because they don't make sense for a
# host device (the memory region is allocated in the backend)
for section in mem_layout:
mem_layout[section]["start"] = 0
return { return {
"device_id": DEVICE_ID, "device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX, "toolchain_prefix": TOOLCHAIN_PREFIX,
"mem_layout": { "mem_layout": mem_layout,
"text": { "word_size_bits": WORD_SIZE_BITS,
"size": 20480,
},
"rodata": {
"size": 20480,
},
"data": {
"size": 768,
},
"bss": {
"size": 768,
},
"args": {
"size": 1280,
},
"heap": {
"size": 262144,
},
"workspace": {
"size": 20480,
},
"stack": {
"size": 80,
},
},
"word_size": 8 if sys.maxsize > 2**32 else 4,
"thumb_mode": False, "thumb_mode": False,
"use_device_timer": False,
"comms_method": "host", "comms_method": "host",
} }
register_device(DEVICE_ID, { register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib, "create_micro_lib": create_micro_lib,
"default_config": default_config, "generate_config": generate_config,
}) })
...@@ -15,14 +15,25 @@ ...@@ -15,14 +15,25 @@
# specific language governing permissions and limitations # specific language governing permissions and limitations
# under the License. # under the License.
"""Compilation and config definitions for Spike, a RISC-V functional ISA simulator""" """Compilation and config definitions for Spike, a RISC-V functional ISA simulator"""
from collections import OrderedDict
from . import create_micro_lib_base, register_device from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "riscv_spike" DEVICE_ID = "riscv_spike"
TOOLCHAIN_PREFIX = "riscv64-unknown-elf-" TOOLCHAIN_PREFIX = "riscv64-unknown-elf-"
WORD_SIZE_BITS = 64
def create_micro_lib(obj_path, src_path, lib_type, options=None): DEFAULT_SECTION_CONSTRAINTS = {
"text": (18000, MemConstraint.ABSOLUTE_BYTES),
"rodata": (128, MemConstraint.ABSOLUTE_BYTES),
"data": (128, MemConstraint.ABSOLUTE_BYTES),
"bss": (2048, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (100.0, MemConstraint.WEIGHT),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (32, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options """Wrapper over `create_micro_lib_base` to add device-specific options
Parameters Parameters
...@@ -38,6 +49,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): ...@@ -38,6 +49,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]] options : Optional[List[str]]
additional options to pass to GCC additional options to pass to GCC
lib_src_paths : Optional[List[str]]
TODO
""" """
create_micro_lib_base( create_micro_lib_base(
obj_path, obj_path,
...@@ -45,11 +59,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None): ...@@ -45,11 +59,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
TOOLCHAIN_PREFIX, TOOLCHAIN_PREFIX,
DEVICE_ID, DEVICE_ID,
lib_type, lib_type,
options=options) options=options,
lib_src_paths=lib_src_paths
)
def default_config(base_addr, server_addr, server_port): def generate_config(base_addr, available_mem, server_addr, server_port, section_constraints=None):
"""Generates a default configuration for Spike """Generates a configuration for Spike
Parameters Parameters
---------- ----------
...@@ -62,56 +78,31 @@ def default_config(base_addr, server_addr, server_port): ...@@ -62,56 +78,31 @@ def default_config(base_addr, server_addr, server_port):
server_port : int server_port : int
port of OpenOCD server to connect to port of OpenOCD server to connect to
TODO correct type annotation?
section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]]
TODO
Return Return
------ ------
config : Dict[str, Any] config : Dict[str, Any]
MicroTVM config dict for this device MicroTVM config dict for this device
""" """
res = { if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
return {
"device_id": DEVICE_ID, "device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX, "toolchain_prefix": TOOLCHAIN_PREFIX,
"mem_layout": OrderedDict([ "mem_layout": gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints),
("text", { "word_size_bits": WORD_SIZE_BITS,
"size": 20480, "thumb_mode": False,
}), "use_device_timer": False,
("rodata", {
"size": 20480,
}),
("data", {
"size": 768,
}),
("bss", {
"size": 768,
}),
("args", {
"size": 1280,
}),
("heap", {
"size": 262144,
}),
("workspace", {
"size": 20480,
}),
("stack", {
"size": 80,
}),
]),
"word_size": 4,
"thumb_mode": True,
"comms_method": "openocd", "comms_method": "openocd",
"server_addr": server_addr, "server_addr": server_addr,
"server_port": server_port, "server_port": server_port,
} }
# generate section start addresses from the given `base_addr`
curr_offset = 0
mem_layout = res["mem_layout"]
for region_dict in mem_layout.values():
region_dict["start"] = base_addr + curr_offset
curr_offset += region_dict["size"]
return res
register_device(DEVICE_ID, { register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib, "create_micro_lib": create_micro_lib,
"default_config": default_config, "generate_config": generate_config,
}) })
...@@ -151,7 +151,9 @@ FUNC_OPS = { ...@@ -151,7 +151,9 @@ FUNC_OPS = {
"nn.dropout": op.nn.dropout_raw, "nn.dropout": op.nn.dropout_raw,
"zeros": op.zeros, "zeros": op.zeros,
"split": op.split, "split": op.split,
"cast": op.cast "cast": op.cast,
"clip": op.clip,
"right_shift": op.right_shift,
} }
TYPE_PREFIXES = [ TYPE_PREFIXES = [
......
...@@ -20,24 +20,25 @@ import re ...@@ -20,24 +20,25 @@ import re
import logging import logging
import topi import topi
from ....target import arm_isa
from .generic import * from .generic import *
from .. import op as _op from .. import op as _op
logger = logging.getLogger('strategy') logger = logging.getLogger('strategy')
@schedule_injective.register("arm_cpu") @schedule_injective.register(["arm_cpu", "micro_dev"])
def schedule_injective_arm_cpu(_, outs, target): def schedule_injective_arm_cpu(_, outs, target):
"""schedule injective ops for arm cpu""" """schedule injective ops for arm cpu"""
with target: with target:
return topi.arm_cpu.schedule_injective(outs) return topi.arm_cpu.schedule_injective(outs)
@schedule_concatenate.register("arm_cpu") @schedule_concatenate.register(["arm_cpu", "micro_dev"])
def schedule_concatenate_arm_cpu(_, outs, target): def schedule_concatenate_arm_cpu(_, outs, target):
"""schedule concatenate for arm cpu""" """schedule concatenate for arm cpu"""
with target: with target:
return topi.arm_cpu.schedule_concatenate(outs) return topi.arm_cpu.schedule_concatenate(outs)
@conv2d_strategy.register("arm_cpu") @conv2d_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d arm cpu strategy""" """conv2d arm cpu strategy"""
strategy = _op.OpStrategy() strategy = _op.OpStrategy()
...@@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): ...@@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
if dilation_h < 1 or dilation_w < 1: if dilation_h < 1 or dilation_w < 1:
raise ValueError("dilation should be positive value") raise ValueError("dilation should be positive value")
isa = arm_isa.IsaAnalyzer(target)
if groups == 1: if groups == 1:
if layout == "NCHW": if layout == "NCHW":
if kernel_layout == "OIHW": if kernel_layout == "OIHW":
...@@ -102,12 +105,23 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): ...@@ -102,12 +105,23 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
name="conv2d_hwcn.generic") name="conv2d_hwcn.generic")
elif layout == "NHWC": elif layout == "NHWC":
assert kernel_layout == "HWIO" channels = data.shape[3]
if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI":
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd),
name='conv2d_direct_simd.micro_dev')
elif kernel_layout == "HWIO":
strategy.add_implementation( strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
name="conv2d_nhwc_spatial_pack.arm_cpu") name="conv2d_nhwc_spatial_pack.arm_cpu")
else: else:
raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC".
format(kernel_layout))
else:
raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout))
elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
if layout == "NCHW": if layout == "NCHW":
...@@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out ...@@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out
format(layout)) format(layout))
return strategy return strategy
@conv2d_transpose_strategy.register("arm_cpu") @conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d_transpose arm cpu strategy""" """conv2d_transpose arm cpu strategy"""
layout = attrs.data_layout layout = attrs.data_layout
......
...@@ -325,7 +325,10 @@ class Server(object): ...@@ -325,7 +325,10 @@ class Server(object):
key="", key="",
load_library=None, load_library=None,
custom_addr=None, custom_addr=None,
silent=False): silent=False,
utvm_dev_id=None,
utvm_dev_config_args=None,
):
try: try:
if base._ServerLoop is None: if base._ServerLoop is None:
raise RuntimeError("Please compile with USE_RPC=1") raise RuntimeError("Please compile with USE_RPC=1")
...@@ -355,6 +358,10 @@ class Server(object): ...@@ -355,6 +358,10 @@ class Server(object):
cmd += ["--custom-addr", custom_addr] cmd += ["--custom-addr", custom_addr]
if silent: if silent:
cmd += ["--silent"] cmd += ["--silent"]
if utvm_dev_id is not None:
assert utvm_dev_config_args is not None
cmd += [f"--utvm-dev-id={utvm_dev_id}"]
cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"]
# prexec_fn is not thread safe and may result in deadlock. # prexec_fn is not thread safe and may result in deadlock.
# python 3.2 introduced the start_new_session parameter as # python 3.2 introduced the start_new_session parameter as
......
...@@ -109,7 +109,6 @@ class Module(object): ...@@ -109,7 +109,6 @@ class Module(object):
# pylint: disable=not-callable # pylint: disable=not-callable
return self.entry_func(*args) return self.entry_func(*args)
def __repr__(self): def __repr__(self):
return "Module(%s, %x)" % (self.type_key, self.handle.value) return "Module(%s, %x)" % (self.type_key, self.handle.value)
......
...@@ -219,7 +219,7 @@ def context(dev_type, dev_id=0): ...@@ -219,7 +219,7 @@ def context(dev_type, dev_id=0):
""" """
if isinstance(dev_type, string_types): if isinstance(dev_type, string_types):
if '-device=micro_dev' in dev_type: if '-device=micro_dev' in dev_type:
dev_type = 'micro_dev' dev_type = TVMContext.STR2MASK['micro_dev']
else: else:
dev_type = dev_type.split()[0] dev_type = dev_type.split()[0]
if dev_type not in TVMContext.STR2MASK: if dev_type not in TVMContext.STR2MASK:
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Defines functions to analyze available opcodes in the ARM ISA."""
ARM_ISA_MAP = {
'armv7e-m': ['SMLAD'],
}
class IsaAnalyzer(object):
def __init__(self, target):
self.target = target
# TODO: actually parse -mcpu
arch = 'armv7e-m'
self._isa_map = ARM_ISA_MAP[arch]
def __contains__(self, instruction):
return instruction in self._isa_map
...@@ -272,7 +272,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs, ...@@ -272,7 +272,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs,
Target target_host_val = target_host; Target target_host_val = target_host;
if (!target_host.defined()) { if (!target_host.defined()) {
for (const auto& it : inputs) { for (const auto& it : inputs) {
if (it.first->device_type == kDLCPU) { if (it.first->device_type == kDLCPU || it.first->device_type == kDLMicroDev) {
target_host_val = it.first; target_host_val = it.first;
break; break;
} }
......
...@@ -17,11 +17,6 @@ ...@@ -17,11 +17,6 @@
* under the License. * under the License.
*/ */
/*!
* \file utvm_init.s
* \brief uTVM init definition for STM32F746XX-series boards
*/
.syntax unified .syntax unified
.cpu cortex-m7 .cpu cortex-m7
.fpu softvfp .fpu softvfp
......
...@@ -29,100 +29,51 @@ extern "C" { ...@@ -29,100 +29,51 @@ extern "C" {
#include <stdint.h> #include <stdint.h>
#include "utvm_runtime.h" #include "utvm_runtime.h"
// NOTE: This expects ST CMSIS to be in your include path.
// Download STM32CubeF7 here:
// https://www.st.com/content/st_com/en/products/embedded-software/mcu-mpu-embedded-software/stm32-embedded-software/stm32cube-mcu-mpu-packages/stm32cubef7.html
// and add Drivers/CMSIS to your C include path.
#include "Device/ST/STM32F7xx/Include/stm32f746xx.h"
// There are two implementations of cycle counters on the STM32F7X: SysTick and
// CYCCNT. SysTick is preferred, as it gives better error handling, but the
// counter is only 24 bits wide. If a larger timer is needed, use the CYCCNT
// implementation, which has a 32-bit counter.
#define USE_SYSTICK
#ifdef USE_SYSTICK #define utvm_SystemCoreClock 216000000UL
#define SYST_CSR (*((volatile uint32_t *) 0xE000E010))
#define SYST_RVR (*((volatile uint32_t *) 0xE000E014))
#define SYST_CVR (*((volatile uint32_t *) 0xE000E018))
#define SYST_CALIB (*((volatile uint32_t *) 0xE000E01C))
#define SYST_CSR_ENABLE 0
#define SYST_CSR_TICKINT 1
#define SYST_CSR_CLKSOURCE 2
#define SYST_COUNTFLAG 16
#define SYST_CALIB_NOREF 31
#define SYST_CALIB_SKEW 30
uint32_t start_time = 0;
uint32_t stop_time = 0;
int32_t UTVMTimerStart() { int32_t UTVMTimerStart() {
SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE); UTVMTimerReset();
// wait until timer starts TIM2->CR1 =
while (SYST_CVR == 0) {} TIM_CR1_CEN; // Start counter
start_time = SYST_CVR; return UTVM_ERR_OK;
return 0;
}
void UTVMTimerStop() {
SYST_CSR = 0;
stop_time = SYST_CVR;
}
void UTVMTimerReset() {
SYST_CSR = 0;
// maximum reload value (24-bit)
SYST_RVR = (~((uint32_t) 0)) >> 8;
SYST_CVR = 0;
} }
uint32_t UTVMTimerRead() { uint32_t UTVMTimerStop(int32_t* err) {
if (SYST_CSR & SYST_COUNTFLAG) { TIM2->CR1 &= TIM_CR1_CEN;
TVMAPISetLastError("timer overflowed"); if (TIM2->SR & TIM_SR_UIF_Msk) {
return -1; *err = UTVM_ERR_TIMER_OVERFLOW;
} else { return 0;
return start_time - stop_time;
} }
*err = UTVM_ERR_OK;
uint32_t tim_cnt = TIM2->CNT;
uint32_t millis = tim_cnt / (utvm_SystemCoreClock / 1000);
uint32_t micros =
(tim_cnt - (millis * (utvm_SystemCoreClock / 1000))) /
(utvm_SystemCoreClock / 1000000);
return millis * 1000 + micros;
} }
#else // !USE_SYSTICK
#define DWT_CTRL (*((volatile uint32_t *) 0xE0001000))
#define DWT_CYCCNT (*((volatile uint32_t *) 0xE0001004))
#define DWT_CTRL_NOCYCCNT 25
#define DWT_CTRL_CYCCNTENA 0
uint32_t start_time = 0;
uint32_t stop_time = 0;
void UTVMTimerReset() { void UTVMTimerReset() {
DWT_CYCCNT = 0; RCC->APB1RSTR |= RCC_APB1RSTR_TIM2RST; // Hold TIM2 in reset
} RCC->DCKCFGR1 = (RCC->DCKCFGR1 & ~RCC_DCKCFGR1_TIMPRE_Msk); // disable 2x clock boost to TIM2
RCC->CFGR = (RCC->CFGR & ~RCC_CFGR_PPRE1_Msk); // No AHB clock division to APB1 (1:1).
int32_t UTVMTimerStart() { RCC->APB1ENR |= RCC_APB1ENR_TIM2EN; // Enable TIM2 clock.
if (DWT_CTRL & DWT_CTRL_NOCYCCNT) { RCC->APB1RSTR &= ~RCC_APB1RSTR_TIM2RST; // Exit TIM2 reset.
TVMAPISetLastError("cycle counter not implemented on device");
return -1; DBGMCU->APB1FZ |= DBGMCU_APB1_FZ_DBG_TIM2_STOP; // stop TIM2 clock during debug halt.
TIM2->ARR = 0xffffffff;
if (TIM2->SR & TIM_SR_UIF_Msk) {
for (;;) ;
} }
start_time = DWT_CYCCNT;
DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA);
} }
void UTVMTimerStop() {
stop_time = DWT_CYCCNT;
DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
}
int32_t UTVMTimerRead() {
if (stop_time > stop_time) {
return stop_time - start_time;
} else {
uint32_t largest = ~0;
return (largest - start_time) + stop_time;
}
}
#endif // USE_SYSTICK
#ifdef __cplusplus #ifdef __cplusplus
} // TVM_EXTERN_C } // TVM_EXTERN_C
#endif #endif
...@@ -22,26 +22,16 @@ ...@@ -22,26 +22,16 @@
* \brief uTVM timer API stubs for the host emulated device * \brief uTVM timer API stubs for the host emulated device
*/ */
#ifdef __cplusplus #include <stdint.h>
extern "C" {
#endif
#include "utvm_runtime.h" #include "utvm_runtime.h"
// TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c // TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c
int32_t UTVMTimerStart() { int32_t UTVMTimerStart() {
return 0; return UTVM_ERR_OK;
} }
void UTVMTimerStop() { } uint32_t UTVMTimerStop(int32_t* err) {
*err = UTVM_ERR_OK;
void UTVMTimerReset() { } return 0;
uint32_t UTVMTimerRead() {
return 1;
} }
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
UTVMInit:
/* set stack pointer */
la sp, _utvm_stack_pointer_init
call UTVMMain
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file utvm_timer.c
* \brief uTVM timer API stubs for Spike
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "utvm_runtime.h"
int32_t UTVMTimerStart() {
return UTVM_ERR_OK;
}
uint32_t UTVMTimerStop(int32_t* err) {
*err = UTVM_ERR_OK;
return 0;
}
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
...@@ -32,10 +32,11 @@ extern "C" { ...@@ -32,10 +32,11 @@ extern "C" {
#include <stdint.h> #include <stdint.h>
#include <stddef.h> #include <stddef.h>
void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = // TODO(weberlo, areusch): compiler errors say volatile qualifier is discarded.
(void *(*)(int, int, uint64_t, int, int)) NULL; // should we just get rid of em?
int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL;
void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL;
void (* volatile TVMAPISetLastError_)(const char*) = NULL;
void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
int dtype_code_hint, int dtype_bits_hint) { int dtype_code_hint, int dtype_bits_hint) {
...@@ -51,6 +52,41 @@ void TVMAPISetLastError(const char* msg) { ...@@ -51,6 +52,41 @@ void TVMAPISetLastError(const char* msg) {
(*TVMAPISetLastError_)(msg); (*TVMAPISetLastError_)(msg);
} }
void *memset(void *s, int c, size_t n) {
char *p = (char*) s; // NOLINT(readability/casting): linter is configured for c++
while (n > 0) {
*p = (char) c; // NOLINT(readability/casting): linter is configured for c++
p++;
n--;
}
return s;
}
void *memmove(void *to, const void *from, size_t n) {
// TODO(weberlo, areusch): will need to factor memmove calls into workspace size calculation
// NOLINTNEXTLINE(readability/casting): linter is configured for c++
char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8);
if (temp == NULL) {
return NULL;
}
const char *from_pp = (char*) from; // NOLINT(readability/casting): linter is configured for c++
for (size_t i = 0; i < n; i++) {
temp[i] = from_pp[i];
}
char *to_pp = (char*) to; // NOLINT(readability/casting): linter is configured for c++
for (size_t i = 0; i < n; i++) {
to_pp[i] = temp[i];
}
// NOLINTNEXTLINE(readability/casting): linter is configured for c++
if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) {
return NULL;
}
return to;
}
#ifdef __cplusplus #ifdef __cplusplus
} // TVM_EXTERN_C } // TVM_EXTERN_C
#endif #endif
...@@ -34,89 +34,148 @@ extern "C" { ...@@ -34,89 +34,148 @@ extern "C" {
#include "utvm_runtime.h" #include "utvm_runtime.h"
// Task pointers must be patched before calling a function. // TODO(weberlo, areusch): move defines into header
UTVMTask utvm_task = { // TODO(weberlo, areusch): unify TASK_QUEUE_SIZE and MicroSession::kTaskQueueCapacity.
.func = NULL, #define TASK_QUEUE_SIZE 20
.arg_values = NULL, volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { };
.arg_type_codes = NULL, volatile uint32_t utvm_num_tasks = 0;
.num_args = 0, volatile uint32_t utvm_task_times[TASK_QUEUE_SIZE] = { };
};
size_t utvm_word_size = 0; // NOLINT(*)
// These pointers are patched at load time to point to the workspace section. // These pointers are patched at load time to point to the workspace section.
char* utvm_workspace_start = NULL; // NOLINT(*) volatile char* utvm_workspace_start = NULL; // NOLINT(*)
char* utvm_workspace_end = NULL; // NOLINT(*) volatile char* utvm_workspace_end = NULL; // NOLINT(*)
char* utvm_workspace_curr = NULL; // NOLINT(*) volatile char* utvm_workspace_curr = NULL; // NOLINT(*)
#define MAX_WS_ALLOCS 10
volatile char* utvm_alloc_ends[MAX_WS_ALLOCS] = {}; // NOLINT(*)
volatile uint32_t utvm_alloc_idx = 0;
// Keep track of how many active allocations there are on the workspace. // Keep track of how many active allocations there are on the workspace.
size_t utvm_num_active_allocs = 0; volatile uint32_t utvm_num_active_allocs = 0;
volatile uint32_t utvm_word_size = 0;
const char* utvm_last_error = NULL; // NOLINT(*) volatile int32_t utvm_last_error = 0; // NOLINT(*)
int32_t utvm_return_code = 0; // NOLINT(*)
uint32_t utvm_task_time = 0; volatile uint32_t utvm_done = 0;
// Gets called by UTVMInit, after device-specific initialization is finished. // Gets called by UTVMInit, after device-specific initialization is finished.
void UTVMMain() { void UTVMMain() {
utvm_done = 0;
// loss of precision should be fine here, since we only care about the lower bits
if (((uint32_t) utvm_workspace_start) % utvm_word_size) {
utvm_last_error = UTVM_ERR_WS_UNALIGNED_START;
UTVMDone();
return;
}
utvm_workspace_curr = utvm_workspace_start; utvm_workspace_curr = utvm_workspace_start;
utvm_num_active_allocs = 0; utvm_num_active_allocs = 0;
utvm_last_error = NULL; // NOLINT(*) utvm_alloc_idx = 0;
utvm_return_code = 0; utvm_last_error = UTVM_ERR_NOT_FINISHED;
utvm_task_time = 0; for (uint32_t i = 0; i < utvm_num_tasks; i++) {
UTVMTimerReset(); int32_t err = UTVM_ERR_OK;
int32_t err = UTVMTimerStart(); utvm_task_times[i] = 0;
err = UTVMTimerStart();
if (err < 0) { if (err < 0) {
utvm_return_code = err; utvm_last_error = err;
UTVMDone(); UTVMDone();
return;
}
err = utvm_tasks[i].func(
(void*) utvm_tasks[i].arg_values, // NOLINT(*)
(void*) utvm_tasks[i].arg_type_codes, // NOLINT(*)
utvm_tasks[i].num_args);
if (err < 0) {
UTVMDone();
return;
}
utvm_task_times[i] = UTVMTimerStop(&err);
if (err < 0) {
utvm_last_error = err;
UTVMDone();
return;
}
}
if (utvm_last_error == UTVM_ERR_NOT_FINISHED) {
utvm_last_error = UTVM_ERR_OK;
} }
utvm_return_code = utvm_task.func(
(void*) utvm_task.arg_values, // NOLINT(*)
(void*) utvm_task.arg_type_codes, // NOLINT(*)
utvm_task.num_args);
UTVMTimerStop();
utvm_task_time = UTVMTimerRead();
UTVMDone(); UTVMDone();
} }
// We use a dummy function to signal execution is finished for device // We use a dummy function to signal execution is finished for device
// backends which require breakpoints. // backends which require breakpoints.
void UTVMDone() { } void __attribute__((noinline)) UTVMDone() {
utvm_done = 1;
}
#define ALIGNED_UP(x, word_size) \
((((word_size) - (((uintptr_t) (x)) % (word_size))) % (word_size)) + (x))
void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
int dtype_code_hint, int dtype_bits_hint) { int dtype_code_hint, int dtype_bits_hint) {
// Align up to 8 bytes. if (size == 0) {
utvm_workspace_curr += utvm_last_error = UTVM_ERR_WS_ZERO_SIZE_ALLOC;
(utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size; // NOLINT(*) return NULL;
if (utvm_workspace_curr + size > utvm_workspace_end) { }
size_t alloc_requested_bytes = size;
size_t alloc_size_words = (alloc_requested_bytes + utvm_word_size - 1) / utvm_word_size;
size_t alloc_size_bytes = alloc_size_words * utvm_word_size;
// Align up to the target word size.
if (utvm_workspace_curr + alloc_size_bytes > utvm_workspace_end) {
// Out of space in workspace. // Out of space in workspace.
utvm_last_error = UTVM_ERR_WS_OUT_OF_SPACE;
return NULL;
}
if (utvm_alloc_idx == MAX_WS_ALLOCS - 1) {
// Exceeded number of allocs we can keep track of.
utvm_last_error = UTVM_ERR_WS_TOO_MANY_ALLOCS;
return NULL; return NULL;
} }
void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*)
utvm_workspace_curr += size; utvm_workspace_curr = utvm_workspace_curr + alloc_size_bytes;
// store the *end* of the alloc, so we can restore the WS pointer when freeing
utvm_alloc_ends[utvm_alloc_idx] = utvm_workspace_curr;
utvm_alloc_idx++;
utvm_num_active_allocs++; utvm_num_active_allocs++;
return ret_ptr; return ret_ptr;
} }
int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
utvm_num_active_allocs--; // TODO(weberlo, areusch): add dev type check
if (utvm_num_active_allocs < 0) { if (utvm_num_active_allocs == 0) {
TVMAPISetLastError("free called with no active workspace allocations"); TVMAPISetLastError("free called with no active workspace allocations");
// Reset allocations and workspace (for future task executions). // Reset allocations and workspace (for future task executions).
utvm_num_active_allocs = 0; utvm_num_active_allocs = 0;
utvm_workspace_curr = utvm_workspace_start; utvm_workspace_curr = utvm_workspace_start;
utvm_last_error = UTVM_ERR_WS_DOUBLE_FREE;
return -1; return -1;
} else if (utvm_num_active_allocs == 0) { } else {
// No more allocations. Reset workspace. utvm_num_active_allocs--;
if (ptr == utvm_workspace_start) {
// it's the first allocation
utvm_alloc_ends[0] = NULL;
} else {
for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) {
if (utvm_alloc_ends[i] == ptr) {
utvm_alloc_ends[i + 1] = NULL;
break;
}
}
}
while (utvm_alloc_idx > 0 && utvm_alloc_ends[utvm_alloc_idx - 1] == NULL) {
utvm_alloc_idx--;
}
if (utvm_alloc_idx == 0) {
utvm_workspace_curr = utvm_workspace_start; utvm_workspace_curr = utvm_workspace_start;
return 0;
} else { } else {
// TODO(weberlo, areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in
// this branch?
utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1];
}
return 0; return 0;
} }
} }
void TVMAPISetLastError(const char* msg) { void TVMAPISetLastError(const char* msg) { }
utvm_last_error = msg;
}
#ifdef __cplusplus #ifdef __cplusplus
} // TVM_EXTERN_C } // TVM_EXTERN_C
......
...@@ -33,6 +33,22 @@ extern "C" { ...@@ -33,6 +33,22 @@ extern "C" {
#include <tvm/runtime/c_backend_api.h> #include <tvm/runtime/c_backend_api.h>
/*! /*!
* \brief TODO
*/
enum UTVMReturnCode {
UTVM_ERR_OK = 0,
UTVM_ERR_NOT_FINISHED = -1,
UTVM_ERR_TIMER_NOT_IMPLEMENTED = -2,
UTVM_ERR_TIMER_OVERFLOW = -3,
UTVM_ERR_WS_DOUBLE_FREE = -4,
UTVM_ERR_WS_OUT_OF_SPACE = -5,
UTVM_ERR_WS_TOO_MANY_ALLOCS = -6,
UTVM_ERR_WS_ZERO_SIZE_ALLOC = -7,
UTVM_ERR_WS_UNALIGNED_START = -8,
UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE = -9,
};
/*!
* \brief Task structure for uTVM * \brief Task structure for uTVM
*/ */
typedef struct { typedef struct {
...@@ -46,20 +62,46 @@ typedef struct { ...@@ -46,20 +62,46 @@ typedef struct {
int32_t num_args; int32_t num_args;
} UTVMTask; } UTVMTask;
/*!
* \brief microTVM processor startup.
* Expected to reset the stack pointer, configure any hardware required to support the CRT
* (i.e. FPU), and then jump to UTVMMain.
*/
extern void UTVMInit(); extern void UTVMInit();
extern void UTVMTimerReset(); /*!
* \brief Start the on-device timer.
* \return UTVMReturnCode indicating the outcome of the operation.
*/
extern int32_t UTVMTimerStart(); extern int32_t UTVMTimerStart();
extern void UTVMTimerStop(); /*!
* \brief Stop the on-device timer.
extern uint32_t UTVMTimerRead(); * TODO(areusch): Use an SI specification of timer units here.
* \param err Receives a UTVMReturnCode indicating the outcome of the operation.
* \return elapsed time since UTVMTimerStart returned, in device timer ticks.
*/
extern uint32_t UTVMTimerStop(int32_t* err);
/*!
* \brief Main entry point for UTVM runtime.
* Waits for "go" signal, then executes tasks and reports result. Should never return.
*/
void UTVMMain(); void UTVMMain();
/*!
* \brief Function entered when UTVMMain is complete.
* Should never return. The host sets a breakpoint here to detect end of computation.
*/
void UTVMDone(); void UTVMDone();
// GCC -O3 begins to inject memset and memmove calls, so we provide impls in
// the runtime for this case and for general usage.
void *memset(void *s, int c, size_t n);
void *memmove(void *to, const void *from, size_t n);
#ifdef __cplusplus #ifdef __cplusplus
} // TVM_EXTERN_C } // TVM_EXTERN_C
#endif #endif
......
...@@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice { ...@@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice {
* \brief constructor to initialize on-host memory region to act as device * \brief constructor to initialize on-host memory region to act as device
* \param num_bytes size of the emulated on-device memory region * \param num_bytes size of the emulated on-device memory region
*/ */
explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) { explicit HostLowLevelDevice(size_t num_bytes, TargetPtr* base_addr) : size_(num_bytes) {
size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize;
// TODO(weberlo): Set permissions per section (e.g., read-write perms for // TODO(weberlo): Set permissions per section (e.g., read-write perms for
// the heap, execute perms for text, etc.). // the heap, execute perms for text, etc.).
int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE;
base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0); base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0);
*base_addr = base_addr_; *base_addr = TargetPtr(TargetWordSize(sizeof(size_t) * 8),
reinterpret_cast<uint64_t>(base_addr_));
} }
/*! /*!
...@@ -60,16 +61,16 @@ class HostLowLevelDevice final : public LowLevelDevice { ...@@ -60,16 +61,16 @@ class HostLowLevelDevice final : public LowLevelDevice {
munmap(base_addr_, size_); munmap(base_addr_, size_);
} }
void Read(DevPtr addr, void* buf, size_t num_bytes) { void Read(TargetPtr addr, void* buf, size_t num_bytes) {
std::memcpy(buf, addr.cast_to<void*>(), num_bytes); std::memcpy(buf, addr.cast_to<void*>(), num_bytes);
} }
void Write(DevPtr addr, const void* buf, size_t num_bytes) { void Write(TargetPtr addr, const void* buf, size_t num_bytes) {
std::memcpy(addr.cast_to<void*>(), buf, num_bytes); std::memcpy(addr.cast_to<void*>(), buf, num_bytes);
} }
void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) {
reinterpret_cast<void (*)(void)>(func_addr.value().val64)(); reinterpret_cast<void (*)(void)>(func_addr.value().uint64())();
} }
const char* device_type() const final { const char* device_type() const final {
...@@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice { ...@@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice {
size_t size_; size_t size_;
}; };
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) { const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
std::shared_ptr<LowLevelDevice> lld = TargetPtr* base_addr) {
std::make_shared<HostLowLevelDevice>(num_bytes, base_addr); std::shared_ptr<LowLevelDevice> lld = std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
return lld; return lld;
} }
......
...@@ -45,7 +45,7 @@ class LowLevelDevice { ...@@ -45,7 +45,7 @@ class LowLevelDevice {
* \param buffer on-host buffer to be read into * \param buffer on-host buffer to be read into
* \param num_bytes number of bytes to read * \param num_bytes number of bytes to read
*/ */
virtual void Read(DevPtr addr, virtual void Read(TargetPtr addr,
void* buffer, void* buffer,
size_t num_bytes) = 0; size_t num_bytes) = 0;
...@@ -55,7 +55,7 @@ class LowLevelDevice { ...@@ -55,7 +55,7 @@ class LowLevelDevice {
* \param buffer host buffer to write from * \param buffer host buffer to write from
* \param num_bytes number of bytes to write * \param num_bytes number of bytes to write
*/ */
virtual void Write(DevPtr addr, virtual void Write(TargetPtr addr,
const void* buffer, const void* buffer,
size_t num_bytes) = 0; size_t num_bytes) = 0;
...@@ -64,7 +64,7 @@ class LowLevelDevice { ...@@ -64,7 +64,7 @@ class LowLevelDevice {
* \param func_addr offset of the init stub function * \param func_addr offset of the init stub function
* \param breakpoint_addr address at which to stop function execution * \param breakpoint_addr address at which to stop function execution
*/ */
virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0; virtual void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) = 0;
/*! /*!
* \brief getter function for low-level device type * \brief getter function for low-level device type
...@@ -78,7 +78,8 @@ class LowLevelDevice { ...@@ -78,7 +78,8 @@ class LowLevelDevice {
* \param num_bytes size of the memory region * \param num_bytes size of the memory region
* \param base_addr pointer to write the host device's resulting base address into * \param base_addr pointer to write the host device's resulting base address into
*/ */
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr); const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
TargetPtr* base_addr);
/*! /*!
* \brief connect to OpenOCD and create an OpenOCD low-level device * \brief connect to OpenOCD and create an OpenOCD low-level device
......
...@@ -51,18 +51,18 @@ const char* SectionToString(SectionKind section) { ...@@ -51,18 +51,18 @@ const char* SectionToString(SectionKind section) {
std::string RelocateBinarySections( std::string RelocateBinarySections(
const std::string& binary_path, const std::string& binary_path,
size_t word_size, TargetWordSize word_size,
DevPtr text_start, TargetPtr text_start,
DevPtr rodata_start, TargetPtr rodata_start,
DevPtr data_start, TargetPtr data_start,
DevPtr bss_start, TargetPtr bss_start,
DevPtr stack_end, TargetPtr stack_end,
const std::string& toolchain_prefix) { const std::string& toolchain_prefix) {
const auto* f = Registry::Get("tvm_callback_relocate_binary"); const auto* f = Registry::Get("tvm_callback_relocate_binary");
CHECK(f != nullptr) CHECK(f != nullptr)
<< "Require tvm_callback_relocate_binary to exist in registry"; << "Require tvm_callback_relocate_binary to exist in registry";
std::string relocated_bin = (*f)(binary_path, std::string relocated_bin = (*f)(binary_path,
word_size, word_size.bytes(),
text_start.cast_to<uint64_t>(), text_start.cast_to<uint64_t>(),
rodata_start.cast_to<uint64_t>(), rodata_start.cast_to<uint64_t>(),
data_start.cast_to<uint64_t>(), data_start.cast_to<uint64_t>(),
...@@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary, ...@@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary,
size_t GetSectionSize(const std::string& binary_path, size_t GetSectionSize(const std::string& binary_path,
SectionKind section, SectionKind section,
const std::string& toolchain_prefix, const std::string& toolchain_prefix,
size_t align) { TargetWordSize word_size) {
CHECK(section == SectionKind::kText || section == SectionKind::kRodata || CHECK(section == SectionKind::kText || section == SectionKind::kRodata ||
section == SectionKind::kData || section == SectionKind::kBss) section == SectionKind::kData || section == SectionKind::kBss)
<< "GetSectionSize requires section to be one of text, rodata, data, or bss."; << "GetSectionSize requires section to be one of text, rodata, data, or bss.";
...@@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path, ...@@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path,
CHECK(f != nullptr) CHECK(f != nullptr)
<< "Require tvm_callback_get_section_size to exist in registry"; << "Require tvm_callback_get_section_size to exist in registry";
int size = (*f)(binary_path, SectionToString(section), toolchain_prefix); int size = (*f)(binary_path, SectionToString(section), toolchain_prefix);
return UpperAlignValue(size, align); return UpperAlignValue(size, word_size.bytes());
} }
} // namespace runtime } // namespace runtime
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
...@@ -52,28 +53,115 @@ enum class SectionKind : size_t { ...@@ -52,28 +53,115 @@ enum class SectionKind : size_t {
kNumKinds, kNumKinds,
}; };
/*! \brief union for storing values on varying target word sizes */ /*! \brief data type for word sizes */
union TargetVal { class TargetWordSize {
/*! \brief 32-bit pointer */ public:
uint32_t val32; explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} {
/*! \brief 64-bit pointer */ CHECK(word_size_bits == 32 || word_size_bits == 64)
uint64_t val64; << "only 32-bit and 64-bit are supported now";
}
size_t bytes() const {
return word_size_bits_ / 8;
}
size_t bits() const {
return word_size_bits_;
}
private:
size_t word_size_bits_;
}; };
/*! \brief absolute device address */
class DevPtr { /*! \brief class for storing values on varying target word sizes */
class TargetVal {
private:
size_t width_bits_;
uint64_t value_;
public: public:
/*! \brief construct a device address with value `value` */ /*! \brief construct a TargetVal matching the size of the given integral argument */
explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {} template<typename T, typename U = typename std::enable_if<std::is_integral<T>::value, T>::type>
explicit constexpr TargetVal(T value) : TargetVal(sizeof(T) * 8, value) {}
/*! \brief construct an uninitialized value */
TargetVal() : width_bits_{0}, value_{0} {}
/*! \brief construct a TargetVal with explicit size and value */
TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} {
CHECK(width_bits >= 8 &&
width_bits <= 64 &&
(width_bits & (width_bits - 1)) == 0)
<< "width_bits must be a power of 2 in [8, 64], got " << width_bits;
value_ = value & Bitmask();
}
bool IsInitialized() const { return width_bits_ != 0; }
size_t width_bits() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
return width_bits_;
}
uint64_t Bitmask() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
if (width_bits_ == 64) {
return ~0UL;
} else {
return (1UL << width_bits_) - 1;
}
}
/*! \brief default constructor */ uint32_t uint32() const {
DevPtr() : value_(TargetVal { .val64 = 0 }) {} CHECK(IsInitialized()) << "TargetVal is not initialized";
CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is "
<< width_bits_;
return uint32_t(value_ & Bitmask());
}
uint64_t uint64() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
return value_;
}
TargetVal& operator=(const TargetVal& other) {
CHECK(other.IsInitialized()) << "Cannot assign an uninitialized TargetVal";
if (!IsInitialized()) {
width_bits_ = other.width_bits_;
}
CHECK(width_bits_ >= other.width_bits_)
<< "Cannot assign TargetVal with width " << other.width_bits_
<< "bits to TargetVal with width " << width_bits_ << "bits";
value_ = other.value_ & Bitmask();
return *this;
}
};
// TODO(weberlo, areusch): just get rid of `TargetPtr`.
/*! \brief absolute device address */
class TargetPtr {
public:
/*! \brief construct a device address with variable-length value `value` */
TargetPtr(TargetWordSize word_size, std::uint64_t value) :
value_(TargetVal(word_size.bits(), value)) {}
/*! \brief construct a null address */ /*! \brief construct a null address */
explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {} TargetPtr(TargetWordSize word_size, std::nullptr_t value) :
value_{TargetVal(word_size.bits(), 0)} {}
/*! \brief construct an uninitialized pointer whose word_size can be changed once */
TargetPtr() = default;
/*! \brief construct a device address using the given TargetVal */
explicit TargetPtr(const TargetVal& value) : value_{value} {}
/*! \brief destructor */ /*! \brief destructor */
~DevPtr() {} ~TargetPtr() {}
/*! /*!
* \brief get value of pointer * \brief get value of pointer
...@@ -86,33 +174,33 @@ class DevPtr { ...@@ -86,33 +174,33 @@ class DevPtr {
* \return casted result * \return casted result
*/ */
template <typename T> template <typename T>
T cast_to() const { return reinterpret_cast<T>(value_.val64); } T cast_to() const { return reinterpret_cast<T>(value_.uint64()); }
/*! \brief check if location is null */ /*! \brief check if location is null */
bool operator==(std::nullptr_t) const { return value_.val64 == 0; } bool operator==(std::nullptr_t) const { return value_.uint64() == 0; }
/*! \brief check if location is not null */ /*! \brief check if location is not null */
bool operator!=(std::nullptr_t) const { return value_.val64 != 0; } bool operator!=(std::nullptr_t) const { return value_.uint64() != 0; }
/*! \brief add an integer to this absolute address to get a larger absolute address */ /*! \brief add an integer to this absolute address to get a larger absolute address */
DevPtr operator+(size_t n) const { TargetPtr operator+(size_t n) const {
return DevPtr(value_.val64 + n); return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() + n);
} }
/*! \brief mutably add an integer to this absolute address */ /*! \brief mutably add an integer to this absolute address */
DevPtr& operator+=(size_t n) { TargetPtr& operator+=(size_t n) {
value_.val64 += n; value_ = TargetVal(value_.width_bits(), value_.uint64() + n);
return *this; return *this;
} }
/*! \brief subtract an integer from this absolute address to get a smaller absolute address */ /*! \brief subtract an integer from this absolute address to get a smaller absolute address */
DevPtr operator-(size_t n) const { TargetPtr operator-(size_t n) const {
return DevPtr(value_.val64 - n); return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() - n);
} }
/*! \brief mutably subtract an integer from this absolute address */ /*! \brief mutably subtract an integer from this absolute address */
DevPtr& operator-=(size_t n) { TargetPtr& operator-=(size_t n) {
value_.val64 -= n; value_ = TargetVal(value_.width_bits(), value_.uint64() - n);
return *this; return *this;
} }
...@@ -137,7 +225,8 @@ class SymbolMap { ...@@ -137,7 +225,8 @@ class SymbolMap {
* \param toolchain_prefix prefix of compiler toolchain to use * \param toolchain_prefix prefix of compiler toolchain to use
*/ */
SymbolMap(const std::string& binary, SymbolMap(const std::string& binary,
const std::string& toolchain_prefix) { const std::string& toolchain_prefix,
TargetWordSize word_size) {
const auto* f = Registry::Get("tvm_callback_get_symbol_map"); const auto* f = Registry::Get("tvm_callback_get_symbol_map");
CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry";
TVMByteArray arr; TVMByteArray arr;
...@@ -152,7 +241,7 @@ class SymbolMap { ...@@ -152,7 +241,7 @@ class SymbolMap {
stream >> name; stream >> name;
stream >> std::hex >> addr; stream >> std::hex >> addr;
while (stream) { while (stream) {
map_[name] = DevPtr(addr); map_.emplace(std::make_pair(name, TargetPtr(word_size, addr)));
stream >> name; stream >> name;
stream >> std::hex >> addr; stream >> std::hex >> addr;
} }
...@@ -163,7 +252,7 @@ class SymbolMap { ...@@ -163,7 +252,7 @@ class SymbolMap {
* \param name name of the symbol * \param name name of the symbol
* \return on-device offset of the symbol * \return on-device offset of the symbol
*/ */
DevPtr operator[](const std::string& name) const { TargetPtr operator[](const std::string& name) const {
auto result = map_.find(name); auto result = map_.find(name);
CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map";
return result->second; return result->second;
...@@ -173,15 +262,21 @@ class SymbolMap { ...@@ -173,15 +262,21 @@ class SymbolMap {
return map_.find(name) != map_.end(); return map_.find(name) != map_.end();
} }
void Dump(std::ostream& stream) const {
for (auto e : map_) {
stream << "Entry:" << e.first << std::endl;
}
}
private: private:
/*! \brief backing map */ /*! \brief backing map */
std::unordered_map<std::string, DevPtr> map_; std::unordered_map<std::string, TargetPtr> map_;
}; };
/*! \brief struct containing start and size of a device memory region */ /*! \brief struct containing start and size of a device memory region */
struct DevMemRegion { struct DevMemRegion {
/*! \brief section start offset */ /*! \brief section start offset */
DevPtr start; TargetPtr start;
/*! \brief size of section */ /*! \brief size of section */
size_t size; size_t size;
}; };
...@@ -239,12 +334,12 @@ const char* SectionToString(SectionKind section); ...@@ -239,12 +334,12 @@ const char* SectionToString(SectionKind section);
*/ */
std::string RelocateBinarySections( std::string RelocateBinarySections(
const std::string& binary_path, const std::string& binary_path,
size_t word_size, TargetWordSize word_size,
DevPtr text_start, TargetPtr text_start,
DevPtr rodata_start, TargetPtr rodata_start,
DevPtr data_start, TargetPtr data_start,
DevPtr bss_start, TargetPtr bss_start,
DevPtr stack_end, TargetPtr stack_end,
const std::string& toolchain_prefix); const std::string& toolchain_prefix);
/*! /*!
...@@ -263,13 +358,13 @@ std::string ReadSection(const std::string& binary, ...@@ -263,13 +358,13 @@ std::string ReadSection(const std::string& binary,
* \param binary input binary contents * \param binary input binary contents
* \param section section type * \param section section type
* \param toolchain_prefix prefix of compiler toolchain to use * \param toolchain_prefix prefix of compiler toolchain to use
* \param align alignment of the returned size (default: 8) * \param word_size word size of the target, for alignment
* \return size of the section if it exists, 0 otherwise * \return size of the section if it exists, 0 otherwise
*/ */
size_t GetSectionSize(const std::string& binary_name, size_t GetSectionSize(const std::string& binary_name,
SectionKind section, SectionKind section,
const std::string& toolchain_prefix, const std::string& toolchain_prefix,
size_t align); TargetWordSize word_size);
} // namespace runtime } // namespace runtime
} // namespace tvm } // namespace tvm
......
...@@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI { ...@@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI {
size_t alignment, size_t alignment,
DLDataType type_hint) final { DLDataType type_hint) final {
ObjectPtr<MicroSession>& session = MicroSession::Current(); ObjectPtr<MicroSession>& session = MicroSession::Current();
void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to<void*>(); TargetPtr data = session->AllocateInSection(SectionKind::kHeap, nbytes);
CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap";
MicroDevSpace* dev_space = new MicroDevSpace(); return reinterpret_cast<void*>(new MicroDevSpace{data, session});
dev_space->data = data;
dev_space->session = session;
return static_cast<void*>(dev_space);
} }
void FreeDataSpace(TVMContext ctx, void* ptr) final { void FreeDataSpace(TVMContext ctx, void* ptr) final {
MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr); MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr);
dev_space->session->FreeInSection( dev_space->session->FreeInSection(SectionKind::kHeap, dev_space->data);
SectionKind::kHeap, DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
delete dev_space; delete dev_space;
} }
...@@ -77,7 +73,6 @@ class MicroDeviceAPI final : public DeviceAPI { ...@@ -77,7 +73,6 @@ class MicroDeviceAPI final : public DeviceAPI {
std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type); std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type);
if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) {
// Copying from the device to the device. // Copying from the device to the device.
MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from)); MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to)); MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
CHECK(from_space->session == to_space->session) CHECK(from_space->session == to_space->session)
...@@ -87,58 +82,65 @@ class MicroDeviceAPI final : public DeviceAPI { ...@@ -87,58 +82,65 @@ class MicroDeviceAPI final : public DeviceAPI {
CHECK(ctx_from.device_id == ctx_to.device_id) CHECK(ctx_from.device_id == ctx_to.device_id)
<< "can only copy between the same micro device"; << "can only copy between the same micro device";
ObjectPtr<MicroSession>& session = from_space->session; ObjectPtr<MicroSession>& session = from_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device(); const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
std::vector<uint8_t> buffer(size); std::vector<uint8_t> buffer(size);
lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size); lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size);
lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size); lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size);
} else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) {
// Reading from the device. // Reading from the device.
MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from)); MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
ObjectPtr<MicroSession>& session = from_space->session; ObjectPtr<MicroSession>& session = from_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device(); const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
DevPtr from_dev_addr = GetDevLoc(from_space, from_offset); TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
void* to_host_ptr = GetHostLoc(to, to_offset); void* to_host_ptr = GetHostLoc(to, to_offset);
lld->Read(from_dev_addr, to_host_ptr, size); lld->Read(from_dev_addr, to_host_ptr, size);
} else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) {
// Writing to the device. // Writing to the device.
MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to)); MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
ObjectPtr<MicroSession>& session = to_space->session; ObjectPtr<MicroSession>& session = to_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device(); const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
void* from_host_ptr = GetHostLoc(from, from_offset); void* from_host_ptr = GetHostLoc(from, from_offset);
DevPtr to_dev_addr = GetDevLoc(to_space, to_offset); TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
lld->Write(to_dev_addr, from_host_ptr, size); lld->Write(to_dev_addr, from_host_ptr, size);
} else { } else {
LOG(FATAL) << "Expect copy from/to micro device or between micro device\n"; LOG(FATAL) << "Expect copy from/to micro device or between micro device\n";
} }
} }
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
MicroSession::Current()->FlushTaskQueue();
} }
void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final {
CHECK(false) << "the on-device workspace allocator isn't aware of this function";
ObjectPtr<MicroSession>& session = MicroSession::Current(); ObjectPtr<MicroSession>& session = MicroSession::Current();
void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to<void*>(); TargetPtr data = session->AllocateInSection(SectionKind::kWorkspace, size);
CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; CHECK(data.value().uint64() != 0)
MicroDevSpace* dev_space = new MicroDevSpace(); << "unable to allocate " << size << " bytes on device workspace";
dev_space->data = data; return static_cast<void*>(new MicroDevSpace{data, session});
dev_space->session = session;
return static_cast<void*>(dev_space);
} }
void FreeWorkspace(TVMContext ctx, void* data) final { void FreeWorkspace(TVMContext ctx, void* data) final {
CHECK(false) << "the on-device workspace allocator isn't aware of this function";
MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data); MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
ObjectPtr<MicroSession>& session = dev_space->session; ObjectPtr<MicroSession>& session = dev_space->session;
session->FreeInSection(SectionKind::kWorkspace, session->FreeInSection(SectionKind::kWorkspace, dev_space->data);
DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
delete dev_space; delete dev_space;
} }
...@@ -152,8 +154,8 @@ class MicroDeviceAPI final : public DeviceAPI { ...@@ -152,8 +154,8 @@ class MicroDeviceAPI final : public DeviceAPI {
} }
private: private:
DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) { TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
return DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset); return dev_space->data + offset;
} }
void* GetHostLoc(const void* ptr, size_t offset) { void* GetHostLoc(const void* ptr, size_t offset) {
......
...@@ -54,6 +54,8 @@ class MicroModuleNode final : public ModuleNode { ...@@ -54,6 +54,8 @@ class MicroModuleNode final : public ModuleNode {
* \param binary_path path of the binary to be loaded * \param binary_path path of the binary to be loaded
*/ */
void InitMicroModule(const std::string& binary_path) { void InitMicroModule(const std::string& binary_path) {
// std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl;
// std::cout << " start" << std::endl;
session_ = MicroSession::Current(); session_ = MicroSession::Current();
symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map; symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map;
} }
...@@ -67,26 +69,26 @@ class MicroModuleNode final : public ModuleNode { ...@@ -67,26 +69,26 @@ class MicroModuleNode final : public ModuleNode {
class MicroWrappedFunc { class MicroWrappedFunc {
public: public:
MicroWrappedFunc(ObjectPtr<MicroSession> session, MicroWrappedFunc(ObjectPtr<MicroSession> session,
DevPtr func_ptr) { TargetPtr func_ptr) {
session_ = session; session_ = session;
func_ptr_ = func_ptr; func_ptr_ = func_ptr;
} }
void operator()(TVMArgs args, TVMRetValue* rv) const { void operator()(TVMArgs args, TVMRetValue* rv) const {
*rv = session_->PushToExecQueue(func_ptr_, args); session_->PushToTaskQueue(func_ptr_, args);
} }
private: private:
/*! \brief reference to the session for this function (to keep the session alive) */ /*! \brief reference to the session for this function (to keep the session alive) */
ObjectPtr<MicroSession> session_; ObjectPtr<MicroSession> session_;
/*! \brief offset of the function to be called */ /*! \brief offset of the function to be called */
DevPtr func_ptr_; TargetPtr func_ptr_;
}; };
PackedFunc MicroModuleNode::GetFunction( PackedFunc MicroModuleNode::GetFunction(
const std::string& name, const std::string& name,
const ObjectPtr<Object>& sptr_to_self) { const ObjectPtr<Object>& sptr_to_self) {
DevPtr func_ptr; TargetPtr func_ptr;
if (name == tvm::runtime::symbol::tvm_module_main) { if (name == tvm::runtime::symbol::tvm_module_main) {
if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) { if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) {
func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main]; func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main];
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ #ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
#define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ #define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
#include <string>
#include <unordered_map> #include <unordered_map>
#include "micro_common.h" #include "micro_common.h"
...@@ -38,15 +39,18 @@ class MicroSectionAllocator { ...@@ -38,15 +39,18 @@ class MicroSectionAllocator {
* \brief constructor that specifies section boundaries * \brief constructor that specifies section boundaries
* \param region location and size of the section on the device * \param region location and size of the section on the device
*/ */
explicit MicroSectionAllocator(DevMemRegion region, size_t word_size) explicit MicroSectionAllocator(std::string section_name,
: start_addr_(region.start), DevMemRegion region,
TargetWordSize word_size)
: section_name_(section_name),
start_addr_(region.start),
size_(0), size_(0),
capacity_(region.size), capacity_(region.size),
word_size_(word_size) { word_size_(word_size) {
CHECK_EQ(start_addr_.value().val64 % word_size, 0) CHECK_EQ(start_addr_.value().uint64() % word_size.bytes(), 0)
<< "micro section start not aligned to " << word_size << " bytes"; << "micro section start not aligned to " << word_size.bytes() << " bytes";
CHECK_EQ(capacity_ % word_size, 0) CHECK_EQ(capacity_ % word_size.bytes(), 0)
<< "micro section end not aligned to " << word_size << " bytes"; << "micro section end not aligned to " << word_size.bytes() << " bytes";
} }
/*! /*!
...@@ -56,17 +60,18 @@ class MicroSectionAllocator { ...@@ -56,17 +60,18 @@ class MicroSectionAllocator {
/*! /*!
* \brief memory allocator * \brief memory allocator
* \param size size of allocated memory in bytes * \param alloc_size size of allocated memory in bytes
* \return pointer to allocated memory region in section, nullptr if out of space * \return pointer to allocated memory region in section, nullptr if out of space
*/ */
DevPtr Allocate(size_t size) { TargetPtr Allocate(size_t size) {
size_ = UpperAlignValue(size_, word_size_); size_ = UpperAlignValue(size_, word_size_.bytes());
CHECK(size_ + size < capacity_) CHECK(size_ + size < capacity_)
<< "cannot alloc " << size << " bytes in section with start_addr " << << "cannot alloc " << size << " bytes in section \""
start_addr_.cast_to<void*>(); << section_name_ << "\" (start_addr=" << start_addr_.cast_to<void*>()
DevPtr alloc_addr = start_addr_ + size_; << ", used=" << size_ << ", capacity=" << capacity_ << ")";
TargetPtr alloc_addr = start_addr_ + size_;
size_ += size; size_ += size;
alloc_map_[alloc_addr.value().val64] = size; alloc_map_[alloc_addr.value().uint64()] = size;
return alloc_addr; return alloc_addr;
} }
...@@ -75,10 +80,10 @@ class MicroSectionAllocator { ...@@ -75,10 +80,10 @@ class MicroSectionAllocator {
* \param offs offset to allocated memory * \param offs offset to allocated memory
* \note simple allocator scheme, more complex versions will be implemented later * \note simple allocator scheme, more complex versions will be implemented later
*/ */
void Free(DevPtr addr) { void Free(TargetPtr addr) {
CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end()) CHECK(alloc_map_.find(addr.value().uint64()) != alloc_map_.end())
<< "freed pointer was never allocated"; << "freed pointer was never allocated";
alloc_map_.erase(addr.value().val64); alloc_map_.erase(addr.value().uint64());
if (alloc_map_.empty()) { if (alloc_map_.empty()) {
size_ = 0; size_ = 0;
} }
...@@ -87,17 +92,17 @@ class MicroSectionAllocator { ...@@ -87,17 +92,17 @@ class MicroSectionAllocator {
/*! /*!
* \brief start offset of the memory region managed by this allocator * \brief start offset of the memory region managed by this allocator
*/ */
DevPtr start_addr() const { return start_addr_; } TargetPtr start_addr() const { return start_addr_; }
/*! /*!
* \brief current end addr of the space being used in this memory region * \brief current end addr of the space being used in this memory region
*/ */
DevPtr curr_end_addr() const { return start_addr_ + size_; } TargetPtr curr_end_addr() const { return start_addr_ + size_; }
/*! /*!
* \brief end addr of the memory region managed by this allocator * \brief end addr of the memory region managed by this allocator
*/ */
DevPtr max_addr() const { return start_addr_ + capacity_; } TargetPtr max_addr() const { return start_addr_ + capacity_; }
/*! /*!
* \brief size of the section * \brief size of the section
...@@ -110,14 +115,16 @@ class MicroSectionAllocator { ...@@ -110,14 +115,16 @@ class MicroSectionAllocator {
size_t capacity() const { return capacity_; } size_t capacity() const { return capacity_; }
private: private:
/*! \brief name of the section (for debugging) */
std::string section_name_;
/*! \brief start address of the section */ /*! \brief start address of the section */
DevPtr start_addr_; TargetPtr start_addr_;
/*! \brief current size of the section */ /*! \brief current size of the section */
size_t size_; size_t size_;
/*! \brief total storage capacity of the section */ /*! \brief total storage capacity of the section */
size_t capacity_; size_t capacity_;
/*! \brief number of bytes in a word on the target device */ /*! \brief number of bytes in a word on the target device */
size_t word_size_; TargetWordSize word_size_;
/*! \brief allocation map for allocation sizes */ /*! \brief allocation map for allocation sizes */
std::unordered_map<uint64_t, size_t> alloc_map_; std::unordered_map<uint64_t, size_t> alloc_map_;
}; };
......
...@@ -23,7 +23,10 @@ ...@@ -23,7 +23,10 @@
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
#include <tvm/runtime/registry.h> #include <tvm/runtime/registry.h>
#include <tvm/runtime/device_api.h>
#include <chrono>
#include <memory> #include <memory>
#include <locale>
#include <stack> #include <stack>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
...@@ -77,14 +80,16 @@ MicroSession::MicroSession( ...@@ -77,14 +80,16 @@ MicroSession::MicroSession(
size_t workspace_size, size_t workspace_size,
uint64_t stack_start, uint64_t stack_start,
size_t stack_size, size_t stack_size,
size_t word_size, TargetWordSize word_size,
bool thumb_mode, bool thumb_mode,
bool use_device_timer,
const std::string& server_addr, const std::string& server_addr,
int port) int port)
: toolchain_prefix_(toolchain_prefix) : toolchain_prefix_(toolchain_prefix),
, word_size_(word_size) word_size_(word_size),
, thumb_mode_(thumb_mode) { thumb_mode_(thumb_mode),
CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_; use_device_timer_(use_device_timer),
batch_args_encoder_(args_size, word_size) {
if (comms_method == "host") { if (comms_method == "host") {
// TODO(weberlo): move checks to python // TODO(weberlo): move checks to python
CHECK( CHECK(
...@@ -99,106 +104,138 @@ MicroSession::MicroSession( ...@@ -99,106 +104,138 @@ MicroSession::MicroSession(
size_t memory_size = size_t memory_size =
text_size + rodata_size + data_size + bss_size + text_size + rodata_size + data_size + bss_size +
args_size + heap_size + workspace_size + stack_size; args_size + heap_size + workspace_size + stack_size;
void* base_addr; TargetPtr base_addr;
low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr); low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr);
CHECK_EQ(reinterpret_cast<std::uintptr_t>(base_addr) % word_size_, 0) CHECK_EQ(base_addr.value().uint64() % word_size.bytes(), 0)
<< "base address not aligned to " << word_size_ << " bytes"; << "base address not aligned to " << word_size.bytes() << " bytes";
DevPtr curr_addr = DevPtr(reinterpret_cast<std::uintptr_t>(base_addr)); TargetPtr curr_addr = base_addr;
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
"text",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = text_size, .size = text_size,
}, word_size_); }, word_size_);
curr_addr += text_size; curr_addr += text_size;
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
"rodata",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = rodata_size, .size = rodata_size,
}, word_size_); }, word_size_);
curr_addr += rodata_size; curr_addr += rodata_size;
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
"data",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = data_size, .size = data_size,
}, word_size_); }, word_size_);
curr_addr += data_size; curr_addr += data_size;
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
"bss",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = bss_size, .size = bss_size,
}, word_size_); }, word_size_);
curr_addr += bss_size; curr_addr += bss_size;
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
"args",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = args_size, .size = args_size,
}, word_size_); }, word_size_);
curr_addr += args_size; curr_addr += args_size;
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
"heap",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = heap_size, .size = heap_size,
}, word_size_); }, word_size_);
curr_addr += heap_size; curr_addr += heap_size;
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
"workspace",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = workspace_size, .size = workspace_size,
}, word_size_); }, word_size_);
curr_addr += workspace_size; curr_addr += workspace_size;
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
"stack",
DevMemRegion {
.start = curr_addr, .start = curr_addr,
.size = stack_size, .size = stack_size,
}, word_size_); }, word_size_);
curr_addr += stack_size; curr_addr += stack_size;
} else if (comms_method == "openocd") { } else if (comms_method == "openocd") {
low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port); low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port);
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(text_start), "text",
DevMemRegion {
.start = TargetPtr(word_size_, text_start),
.size = text_size, .size = text_size,
}, word_size_); }, word_size_);
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(rodata_start), "rodata",
DevMemRegion {
.start = TargetPtr(word_size_, rodata_start),
.size = rodata_size, .size = rodata_size,
}, word_size_); }, word_size_);
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(data_start), "data",
DevMemRegion {
.start = TargetPtr(word_size_, data_start),
.size = data_size, .size = data_size,
}, word_size_); }, word_size_);
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(bss_start), "bss",
DevMemRegion {
.start = TargetPtr(word_size_, bss_start),
.size = bss_size, .size = bss_size,
}, word_size_); }, word_size_);
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(args_start), "args",
DevMemRegion {
.start = TargetPtr(word_size_, args_start),
.size = args_size, .size = args_size,
}, word_size_); }, word_size_);
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(heap_start), "heap",
DevMemRegion {
.start = TargetPtr(word_size_, heap_start),
.size = heap_size, .size = heap_size,
}, word_size_); }, word_size_);
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(workspace_start), "workspace",
DevMemRegion {
.start = TargetPtr(word_size_, workspace_start),
.size = workspace_size, .size = workspace_size,
}, word_size_); }, word_size_);
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion { section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
.start = DevPtr(stack_start), "stack",
DevMemRegion {
.start = TargetPtr(word_size_, stack_start),
.size = stack_size, .size = stack_size,
}, word_size_); }, word_size_);
} else { } else {
LOG(FATAL) << "unsupported micro low-level device"; LOG(FATAL) << "unsupported micro low-level device";
} }
TargetPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr();
batch_args_encoder_.set_start_addr(args_start_addr);
runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map; runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map;
// Patch pointers to define the bounds of the workspace section and the word // Patch pointers to define the bounds of the workspace section and the word
// size (for allocation alignment). // size (for allocation alignment).
std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace); std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace);
TargetVal ws_start = ws_allocator->start_addr().value(); DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_allocator->start_addr());
TargetVal ws_end = ws_allocator->max_addr().value(); DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_allocator->max_addr());
TargetVal target_word_size { .val64 = word_size_ }; if (word_size.bytes() == 4) {
if (word_size_ == 4) { DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint32_t(word_size.bytes()));
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32); } else if (word_size.bytes() == 8) {
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32); DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint64_t(word_size.bytes()));
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32); } else {
} else if (word_size_ == 8) { CHECK(false) << "Unsupported word size unexpectedly here";
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64);
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64);
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64);
} }
} }
...@@ -209,59 +246,122 @@ MicroSession::~MicroSession() { ...@@ -209,59 +246,122 @@ MicroSession::~MicroSession() {
low_level_device_ = nullptr; low_level_device_ = nullptr;
} }
double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) { void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) {
if (thumb_mode_) { if (thumb_mode_) {
// TODO(areusch): should be |=
func_ptr += 1; func_ptr += 1;
} }
TargetVal func_dev_addr = func_ptr.value();
std::tuple<TargetPtr, TargetPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).value()};
TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).value()};
task_queue_.push_back(
DevTask {
.func = func_dev_addr,
.arg_values = arg_values_dev_addr,
.arg_type_codes = arg_type_codes_dev_addr,
.num_args = args.num_args
});
// Create an allocator stream for the memory region after the most recent if (task_queue_.size() == MicroSession::kTaskQueueCapacity) {
// allocation in the args section. FlushTaskQueue();
DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr(); }
TargetDataLayoutEncoder encoder(args_addr, word_size_); }
std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&encoder, args); void MicroSession::FlushTaskQueue() {
if (task_queue_.size() == 0) {
// Flush `stream` to device memory. // nothing to run
DevPtr stream_dev_addr = return;
GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); }
low_level_device()->Write(stream_dev_addr, if (word_size_.bytes() == 4) {
reinterpret_cast<void*>(encoder.data()), FlushTaskQueuePriv<StructUTVMTask32>();
encoder.buf_size()); } else if (word_size_.bytes() == 8) {
FlushTaskQueuePriv<StructUTVMTask64>();
TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value(); }
TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value(); }
if (word_size_ == 4) {
UTVMTask32 task = { template <typename T>
.func = func_ptr.value().val32, void MicroSession::FlushTaskQueuePriv() {
.arg_values = arg_values_dev_addr.val32, std::vector<T> prepped_tasks;
.arg_type_codes = arg_type_codes_dev_addr.val32, for (const auto& task : task_queue_) {
.num_args = args.num_args, prepped_tasks.push_back(T(task));
};
// Write the task.
DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
} else if (word_size_ == 8) {
UTVMTask64 task = {
.func = func_ptr.value().val64,
.arg_values = arg_values_dev_addr.val64,
.arg_type_codes = arg_type_codes_dev_addr.val64,
.num_args = args.num_args,
};
// Write the task.
DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
} }
DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"]; // Flush `args` to device memory.
DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"]; low_level_device()->Write(
batch_args_encoder_.start_addr(),
reinterpret_cast<void*>(batch_args_encoder_.data()),
batch_args_encoder_.buf_size());
// Flush `tasks` to device memory.
TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
low_level_device()->Write(
dev_tasks_addr,
reinterpret_cast<void*>(prepped_tasks.data()),
prepped_tasks.size() * sizeof(T));
DevSymbolWrite<uint32_t>(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size());
TargetPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
TargetPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
if (thumb_mode_) { if (thumb_mode_) {
// TODO(areusch): should be |=
utvm_init_addr += 1; utvm_init_addr += 1;
} }
std::chrono::time_point<
std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend;
tbegin = std::chrono::high_resolution_clock::now();
// std::string tmp;
// while (tmp[0] != 'd' && tmp[0] != 'e') {
// std::cout << "How to proceed? [Debug / Execute] ";
// getline(std::cin, tmp);
// CHECK(std::cin.good()) << "Stdin closed";
// tmp[0] = std::tolower(tmp[0]);
// }
// if (tmp[0] == 'd') {
// std::cout << "Launch debugger; [Enter] to resume automated execution";
// getline(std::cin, tmp);
// } else {
low_level_device()->Execute(utvm_init_addr, utvm_done_addr); low_level_device()->Execute(utvm_init_addr, utvm_done_addr);
// }
tend = std::chrono::high_resolution_clock::now();
// Check if there was an error during execution. If so, log it. // Check if there was an error during execution. If so, log it.
CheckDeviceError(); CheckDeviceError();
uint32_t task_time = DevSymbolRead<uint32_t>(runtime_symbol_map_, "utvm_task_time");
GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr); if (use_device_timer_) {
return static_cast<double>(task_time); uint64_t sum = 0;
std::vector<uint32_t> times;
times.resize(task_queue_.size());
low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
times.data(),
task_queue_.size() * sizeof(uint32_t));
int i = 0;
for (uint32_t time : times) {
LOG(INFO) << "Time " << i++ << ": " << time;
sum += time;
}
last_batch_time_ += static_cast<double>(sum) / 1e3;
} else {
last_batch_time_ += std::chrono::duration_cast<std::chrono::duration<double> >
(tend - tbegin).count() * 1000;
// TODO(weberlo): Reading internal data structure is hacky.
uint64_t sum = 0;
std::vector<uint32_t> times;
times.resize(task_queue_.size());
low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
times.data(),
task_queue_.size() * sizeof(uint32_t));
for (uint32_t time : times) {
sum += time;
}
last_batch_cycles_ += static_cast<double>(sum);
}
batch_args_encoder_.Clear();
task_queue_.clear();
} }
BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
...@@ -283,9 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d ...@@ -283,9 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
data_section.start = AllocateInSection(SectionKind::kData, data_section.size); data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
data_section.start != nullptr && bss_section.start != nullptr)
<< "not enough space to load module on device";
std::string relocated_bin = RelocateBinarySections( std::string relocated_bin = RelocateBinarySections(
binary_path, binary_path,
...@@ -305,7 +402,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d ...@@ -305,7 +402,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; SymbolMap symbol_map {relocated_bin, toolchain_prefix_, word_size_};
if (patch_dylib_pointers) { if (patch_dylib_pointers) {
// Patch device lib pointers. // Patch device lib pointers.
...@@ -323,7 +420,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d ...@@ -323,7 +420,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
}; };
} }
std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend( std::tuple<TargetPtr, TargetPtr> MicroSession::EncoderAppend(
TargetDataLayoutEncoder* encoder, const TVMArgs& args) { TargetDataLayoutEncoder* encoder, const TVMArgs& args) {
const int* type_codes = args.type_codes; const int* type_codes = args.type_codes;
int num_args = args.num_args; int num_args = args.num_args;
...@@ -341,12 +438,13 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend( ...@@ -341,12 +438,13 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
// order to prevent premature session destruction. // order to prevent premature session destruction.
void* old_data = base_arr_handle->data; void* old_data = base_arr_handle->data;
// Mutate the array to unwrap the `data` field. // Mutate the array to unwrap the `data` field.
base_arr_handle->data = reinterpret_cast<MicroDevSpace*>(old_data)->data; MicroDevSpace* dev_arr_ptr = reinterpret_cast<MicroDevSpace*>(old_data);
base_arr_handle->data = reinterpret_cast<void*>(dev_arr_ptr->data.value().uint64());
// Now, encode the unwrapped version. // Now, encode the unwrapped version.
void* arr_ptr = nullptr; void* arr_ptr = nullptr;
if (word_size_ == 4) { if (word_size_.bytes() == 4) {
arr_ptr = EncoderAppend<TVMArray32>(encoder, *base_arr_handle).cast_to<void*>(); arr_ptr = EncoderAppend<TVMArray32>(encoder, *base_arr_handle).cast_to<void*>();
} else if (word_size_ == 8) { } else if (word_size_.bytes() == 8) {
arr_ptr = EncoderAppend<TVMArray64>(encoder, *base_arr_handle).cast_to<void*>(); arr_ptr = EncoderAppend<TVMArray64>(encoder, *base_arr_handle).cast_to<void*>();
} }
// And restore the original wrapped version. // And restore the original wrapped version.
...@@ -371,7 +469,7 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend( ...@@ -371,7 +469,7 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
} }
template <typename T> template <typename T>
DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) { TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) {
auto tvm_arr_slot = encoder->Alloc<T>(); auto tvm_arr_slot = encoder->Alloc<T>();
auto shape_slot = encoder->Alloc<int64_t>(arr.ndim); auto shape_slot = encoder->Alloc<int64_t>(arr.ndim);
...@@ -379,8 +477,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen ...@@ -379,8 +477,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
// the device first. The `data` field is already allocated on the device and // the device first. The `data` field is already allocated on the device and
// is a device pointer, so we don't need to write it. // is a device pointer, so we don't need to write it.
shape_slot.WriteArray(arr.shape, arr.ndim); shape_slot.WriteArray(arr.shape, arr.ndim);
DevPtr shape_dev_addr = shape_slot.start_addr(); TargetPtr shape_dev_addr = shape_slot.start_addr();
DevPtr strides_dev_addr = DevPtr(nullptr); TargetPtr strides_dev_addr = TargetPtr(word_size_, nullptr);
if (arr.strides != nullptr) { if (arr.strides != nullptr) {
auto stride_slot = encoder->Alloc<int64_t>(arr.ndim); auto stride_slot = encoder->Alloc<int64_t>(arr.ndim);
stride_slot.WriteArray(arr.strides, arr.ndim); stride_slot.WriteArray(arr.strides, arr.ndim);
...@@ -388,13 +486,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen ...@@ -388,13 +486,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
} }
T dev_arr( T dev_arr(
TargetVal { .val64 = reinterpret_cast<uint64_t>(arr.data) }, TargetVal { word_size_.bits(), reinterpret_cast<uint64_t>(arr.data) },
arr.ctx, arr.ctx,
arr.ndim, arr.ndim,
arr.dtype, arr.dtype,
shape_dev_addr.value(), shape_dev_addr.value(),
strides_dev_addr.value(), strides_dev_addr.value(),
TargetVal { .val64 = arr.byte_offset }); TargetVal { word_size_.bits(), arr.byte_offset });
CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev)) CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev))
<< "attempt to write DLTensor with non-micro device type"; << "attempt to write DLTensor with non-micro device type";
// Update the device type to CPU, because from the microcontroller's // Update the device type to CPU, because from the microcontroller's
...@@ -404,39 +502,70 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen ...@@ -404,39 +502,70 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
return tvm_arr_slot.start_addr(); return tvm_arr_slot.start_addr();
} }
// TODO(weberlo): switch over entirely to error codes that expand to error
// messages on the host side.
void MicroSession::CheckDeviceError() { void MicroSession::CheckDeviceError() {
int32_t return_code = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_return_code"); int32_t last_error = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_last_error");
if (return_code) {
std::uintptr_t last_error =
DevSymbolRead<std::uintptr_t>(runtime_symbol_map_, "utvm_last_error");
std::string last_error_str;
if (last_error) { if (last_error) {
DevPtr last_err_addr = DevPtr(last_error); if (!use_device_timer_ &&
last_error_str = ReadString(last_err_addr); (last_error == UTVM_ERR_TIMER_OVERFLOW ||
last_error == UTVM_ERR_TIMER_NOT_IMPLEMENTED)) {
// these errors don't matter if we're not using the on-device timer
return;
}
std::string err_msg;
switch (last_error) {
case UTVM_ERR_NOT_FINISHED:
err_msg = "execution timed out";
break;
case UTVM_ERR_TIMER_NOT_IMPLEMENTED:
err_msg = "timer is not implemented for the target device";
break;
case UTVM_ERR_TIMER_OVERFLOW:
// TODO(weberlo): this should be remedied by using interrupts to accumulate the
// timer into a larger datatype (ARM timers are only 24 bits)
err_msg = "timer overflowed during execution";
break;
case UTVM_ERR_WS_DOUBLE_FREE:
err_msg = "free called with no active workspace allocations";
break;
case UTVM_ERR_WS_OUT_OF_SPACE:
err_msg = "ran out of space in workspace section";
break;
case UTVM_ERR_WS_TOO_MANY_ALLOCS:
err_msg = "exceeded number of allocs the runtime can keep track of";
break;
case UTVM_ERR_WS_ZERO_SIZE_ALLOC:
err_msg = "attempt to allocate scratchpad of size zero";
break;
case UTVM_ERR_WS_UNALIGNED_START:
err_msg = "start of workspace section is not word-aligned";
break;
case UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE:
err_msg = "scratchpad allocation size is not a multiple of the word size";
break;
default:
err_msg = "unknown error code";
break;
} }
LOG(FATAL) << "error during micro function execution:\n" LOG(FATAL) << "error during micro function execution:\n"
<< " return code: " << std::dec << return_code << "\n" << " error ID: " << std::dec << last_error << std::endl
<< " dev str addr: 0x" << std::hex << last_error << "\n" << " error message: " << err_msg;
<< " dev str data: " << last_error_str << std::endl;
} }
} }
void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) { void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) {
DevPtr runtime_impl_addr = runtime_symbol_map_[func_name]; TargetPtr runtime_impl_addr = runtime_symbol_map_[func_name];
if (thumb_mode_) { if (thumb_mode_) {
runtime_impl_addr += 1; runtime_impl_addr += 1;
} }
std::ostringstream func_name_underscore; std::ostringstream func_name_underscore;
func_name_underscore << func_name << "_"; func_name_underscore << func_name << "_";
if (word_size_ == 4) { DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr);
DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32);
} else if (word_size_ == 8) {
DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64);
}
} }
std::string MicroSession::ReadString(DevPtr str_addr) { std::string MicroSession::ReadString(TargetPtr str_addr) {
std::ostringstream result; std::ostringstream result;
const size_t buf_size = 256; const size_t buf_size = 256;
std::vector<char> buf(buf_size, 0); std::vector<char> buf(buf_size, 0);
...@@ -454,27 +583,39 @@ std::string MicroSession::ReadString(DevPtr str_addr) { ...@@ -454,27 +583,39 @@ std::string MicroSession::ReadString(DevPtr str_addr) {
return result.str(); return result.str();
} }
DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) { TargetPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
return GetAllocator(type)->Allocate(size); return GetAllocator(type)->Allocate(size);
} }
void MicroSession::FreeInSection(SectionKind type, DevPtr addr) { void MicroSession::FreeInSection(SectionKind type, TargetPtr addr) {
return GetAllocator(type)->Free(addr); return GetAllocator(type)->Free(addr);
} }
template <typename T> template <typename T>
T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) {
DevPtr sym_addr = symbol_map[symbol]; TargetPtr sym_addr = symbol_map[symbol];
T result; T result;
low_level_device()->Read(sym_addr, &result, sizeof(T)); low_level_device()->Read(sym_addr, &result, sizeof(T));
return result; return result;
} }
void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol,
const TargetPtr& ptr) {
if (word_size_.bytes() == 4) {
DevSymbolWrite(symbol_map, symbol, ptr.value().uint32());
} else if (word_size_.bytes() == 8) {
DevSymbolWrite(symbol_map, symbol, ptr.value().uint64());
} else {
CHECK(false) << "Unsupported word size unexpectedly here";
}
}
template <typename T> template <typename T>
void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol, const std::string& symbol,
const T& value) { const T& value) {
DevPtr sym_addr = symbol_map[symbol]; TargetPtr sym_addr = symbol_map[symbol];
low_level_device()->Write(sym_addr, &value, sizeof(T)); low_level_device()->Write(sym_addr, &value, sizeof(T));
} }
...@@ -489,11 +630,55 @@ PackedFunc MicroSession::GetFunction( ...@@ -489,11 +630,55 @@ PackedFunc MicroSession::GetFunction(
return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) {
MicroSession::ExitWithScope(); MicroSession::ExitWithScope();
}); });
// TODO(weberlo): add a `clear_batch_timer` func
} else if (name == "get_last_batch_time") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetLastBatchTime();
});
// TODO(weberlo): remove this func
} else if (name == "get_last_batch_cycles") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetLastBatchCycles();
});
} else { } else {
return PackedFunc(); return PackedFunc();
} }
} }
TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator")
.set_body([](TVMArgs args, TVMRetValue* rv) {
PackedFunc pf = args[0];
TVMContext ctx = args[1];
uint64_t number = args[2];
uint64_t repeat = args[3];
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
for (unsigned int i = 0; i < repeat; ++i) {
// start timing
CHECK(number < MicroSession::kTaskQueueCapacity)
<< "`number` must be less than uTVM task queue capacity";
for (unsigned int j = 0; j < number; ++j) {
pf.CallPacked(args, &temp);
}
ObjectPtr<MicroSession> session = MicroSession::Current();
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
double time_per_batch = session->GetLastBatchTime() / number;
os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
}
std::string blob = os.str();
TVMByteArray arr;
arr.size = blob.length();
arr.data = blob.data();
// return the time.
*rv = arr;
};
*rv = PackedFunc(ftimer);
});
// create micro session and low-level device from Python frontend // create micro session and low-level device from Python frontend
TVM_REGISTER_GLOBAL("micro._CreateSession") TVM_REGISTER_GLOBAL("micro._CreateSession")
.set_body([](TVMArgs args, TVMRetValue* rv) { .set_body([](TVMArgs args, TVMRetValue* rv) {
...@@ -501,25 +686,26 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") ...@@ -501,25 +686,26 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
const std::string& binary_path = args[1]; const std::string& binary_path = args[1];
const std::string& toolchain_prefix = args[2]; const std::string& toolchain_prefix = args[2];
uint64_t text_start = args[3]; uint64_t text_start = args[3];
size_t text_size = args[4]; size_t text_size = uint64_t(args[4]);
uint64_t rodata_start = args[5]; uint64_t rodata_start = args[5];
size_t rodata_size = args[6]; size_t rodata_size = uint64_t(args[6]);
uint64_t data_start = args[7]; uint64_t data_start = args[7];
size_t data_size = args[8]; size_t data_size = uint64_t(args[8]);
uint64_t bss_start = args[9]; uint64_t bss_start = args[9];
size_t bss_size = args[10]; size_t bss_size = uint64_t(args[10]);
uint64_t args_start = args[11]; uint64_t args_start = args[11];
size_t args_size = args[12]; size_t args_size = uint64_t(args[12]);
uint64_t heap_start = args[13]; uint64_t heap_start = args[13];
size_t heap_size = args[14]; size_t heap_size = uint64_t(args[14]);
uint64_t workspace_start = args[15]; uint64_t workspace_start = args[15];
size_t workspace_size = args[16]; size_t workspace_size = uint64_t(args[16]);
uint64_t stack_start = args[17]; uint64_t stack_start = args[17];
size_t stack_size = args[18]; size_t stack_size = uint64_t(args[18]);
size_t word_size = args[19]; TargetWordSize word_size{uint64_t(args[19])};
bool thumb_mode = args[20]; bool thumb_mode = args[20];
const std::string& server_addr = args[21]; bool use_device_timer = args[21];
int port = args[22]; const std::string& server_addr = args[22];
int port = args[23];
ObjectPtr<MicroSession> session = make_object<MicroSession>( ObjectPtr<MicroSession> session = make_object<MicroSession>(
comms_method, comms_method,
binary_path, binary_path,
...@@ -542,6 +728,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession") ...@@ -542,6 +728,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
stack_size, stack_size,
word_size, word_size,
thumb_mode, thumb_mode,
use_device_timer,
server_addr, server_addr,
port); port);
*rv = Module(session); *rv = Module(session);
......
...@@ -52,6 +52,8 @@ ...@@ -52,6 +52,8 @@
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
struct DevTask;
/*! /*!
* \brief session for facilitating micro device interaction * \brief session for facilitating micro device interaction
*/ */
...@@ -66,6 +68,9 @@ class MicroSession : public ModuleNode { ...@@ -66,6 +68,9 @@ class MicroSession : public ModuleNode {
virtual PackedFunc GetFunction(const std::string& name, virtual PackedFunc GetFunction(const std::string& name,
const ObjectPtr<Object>& sptr_to_self); const ObjectPtr<Object>& sptr_to_self);
// todo having this decoupled from the value in utvm_runtime.c gives me stress dreams
static const size_t kTaskQueueCapacity = 20;
/*! /*!
* \return The type key of the executor. * \return The type key of the executor.
*/ */
...@@ -94,7 +99,7 @@ class MicroSession : public ModuleNode { ...@@ -94,7 +99,7 @@ class MicroSession : public ModuleNode {
* \param workspace_size workspace section size * \param workspace_size workspace section size
* \param stack_start stack section start address * \param stack_start stack section start address
* \param stack_size stack section size * \param stack_size stack section size
* \param word_size number of bytes in a word on the target device * \param word_size_bytes number of bytes in a word on the target device
* \param thumb_mode whether the target device requires a thumb-mode bit on function addresses * \param thumb_mode whether the target device requires a thumb-mode bit on function addresses
* \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`) * \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`)
* \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`) * \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`)
...@@ -119,8 +124,9 @@ class MicroSession : public ModuleNode { ...@@ -119,8 +124,9 @@ class MicroSession : public ModuleNode {
size_t workspace_size, size_t workspace_size,
uint64_t stack_start, uint64_t stack_start,
size_t stack_size, size_t stack_size,
size_t word_size, TargetWordSize word_size,
bool thumb_mode, bool thumb_mode,
bool use_device_timer,
const std::string& server_addr, const std::string& server_addr,
int port); int port);
...@@ -137,7 +143,19 @@ class MicroSession : public ModuleNode { ...@@ -137,7 +143,19 @@ class MicroSession : public ModuleNode {
* \param args args to the packed function * \param args args to the packed function
* \return elapsed time during function execution on the device * \return elapsed time during function execution on the device
*/ */
double PushToExecQueue(DevPtr func, const TVMArgs& args); void PushToTaskQueue(TargetPtr func, const TVMArgs& args);
/*!
* \brief serialize runtime metadata to the device for enqueued tasks and execute
* \return elapsed time during function execution on the device
*/
void FlushTaskQueue();
/*!
* \brief TODO
*/
template <typename T>
void FlushTaskQueuePriv();
/*! /*!
* \brief loads binary onto device * \brief loads binary onto device
...@@ -153,21 +171,21 @@ class MicroSession : public ModuleNode { ...@@ -153,21 +171,21 @@ class MicroSession : public ModuleNode {
* \param size size of allocated memory in bytes * \param size size of allocated memory in bytes
* \return pointer to allocated memory region in section, nullptr if out of space * \return pointer to allocated memory region in section, nullptr if out of space
*/ */
DevPtr AllocateInSection(SectionKind type, size_t size); TargetPtr AllocateInSection(SectionKind type, size_t size);
/*! /*!
* \brief free prior allocation from section * \brief free prior allocation from section
* \param type type of section to allocate in * \param type type of section to allocate in
* \param addr device address of allocated memory * \param addr device address of allocated memory
*/ */
void FreeInSection(SectionKind type, DevPtr addr); void FreeInSection(SectionKind type, TargetPtr addr);
/*! /*!
* \brief read string from device to host * \brief read string from device to host
* \param str_addr device address of first character of string * \param str_addr device address of first character of string
* \return host copy of device string that was read * \return host copy of device string that was read
*/ */
std::string ReadString(DevPtr str_addr); std::string ReadString(TargetPtr str_addr);
/*! /*!
* \brief read value of symbol from device memory * \brief read value of symbol from device memory
...@@ -179,6 +197,16 @@ class MicroSession : public ModuleNode { ...@@ -179,6 +197,16 @@ class MicroSession : public ModuleNode {
T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol); T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol);
/*! /*!
* \brief write pointer value into device memory corresponding to symbol
* \param symbol_map symbol map to read location of symbol from
* \param symbol name of symbol being written to
* \param ptr pointer value to write into symbol
*/
void DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol,
const TargetPtr& ptr);
/*!
* \brief write value into device memory corresponding to symbol * \brief write value into device memory corresponding to symbol
* \param symbol_map symbol map to read location of symbol from * \param symbol_map symbol map to read location of symbol from
* \param symbol name of symbol being written to * \param symbol name of symbol being written to
...@@ -196,6 +224,18 @@ class MicroSession : public ModuleNode { ...@@ -196,6 +224,18 @@ class MicroSession : public ModuleNode {
return low_level_device_; return low_level_device_;
} }
const double GetLastBatchTime() {
double result = last_batch_time_;
last_batch_time_ = 0.0;
return result;
}
const double GetLastBatchCycles() {
double result = last_batch_cycles_;
last_batch_cycles_ = 0.0;
return result;
}
private: private:
/*! \brief low-level device pointer */ /*! \brief low-level device pointer */
std::shared_ptr<LowLevelDevice> low_level_device_; std::shared_ptr<LowLevelDevice> low_level_device_;
...@@ -205,7 +245,7 @@ class MicroSession : public ModuleNode { ...@@ -205,7 +245,7 @@ class MicroSession : public ModuleNode {
std::shared_ptr<MicroSectionAllocator> std::shared_ptr<MicroSectionAllocator>
section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)]; section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
/*! \brief number of bytes in a word on the target device */ /*! \brief number of bytes in a word on the target device */
size_t word_size_; TargetWordSize word_size_;
/*! \brief whether the target device requires a thumb-mode bit on function addresses /*! \brief whether the target device requires a thumb-mode bit on function addresses
* *
* ARM and other manufacturers use the lowest bit of a function address to determine * ARM and other manufacturers use the lowest bit of a function address to determine
...@@ -213,8 +253,20 @@ class MicroSession : public ModuleNode { ...@@ -213,8 +253,20 @@ class MicroSession : public ModuleNode {
* results in more compact binaries. * results in more compact binaries.
*/ */
bool thumb_mode_; bool thumb_mode_;
/*! \brief TODO */
bool use_device_timer_;
/*! \brief symbol map for the device runtime */ /*! \brief symbol map for the device runtime */
SymbolMap runtime_symbol_map_; SymbolMap runtime_symbol_map_;
/*! \brief TODO */
std::vector<DevTask> task_queue_;
// TODO(weberlo): we don't even need an allocator mechanism for the args
// section. there's only ever one allocation.
/*! \brief TODO hack */
TargetDataLayoutEncoder batch_args_encoder_;
/*! \brief TODO hack */
double last_batch_time_;
/*! \brief TODO hack */
double last_batch_cycles_;
/*! /*!
* \brief patches a function pointer in this module to an implementation * \brief patches a function pointer in this module to an implementation
...@@ -228,7 +280,8 @@ class MicroSession : public ModuleNode { ...@@ -228,7 +280,8 @@ class MicroSession : public ModuleNode {
* \param args args to be appended * \param args args to be appended
* \return device address of the allocated args * \return device address of the allocated args
*/ */
std::tuple<DevPtr, DevPtr> EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); std::tuple<TargetPtr, TargetPtr> EncoderAppend(TargetDataLayoutEncoder* encoder,
const TVMArgs& args);
/*! /*!
* \brief appends a `DLTensor` to the host-side buffer of `encoder` * \brief appends a `DLTensor` to the host-side buffer of `encoder`
...@@ -237,7 +290,7 @@ class MicroSession : public ModuleNode { ...@@ -237,7 +290,7 @@ class MicroSession : public ModuleNode {
* \return device address of the allocated `DLTensor` * \return device address of the allocated `DLTensor`
*/ */
template <typename T> template <typename T>
DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr); TargetPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr);
/*! /*!
* \brief checks and logs if there was an error during the device's most recent execution * \brief checks and logs if there was an error during the device's most recent execution
...@@ -274,7 +327,7 @@ class MicroSession : public ModuleNode { ...@@ -274,7 +327,7 @@ class MicroSession : public ModuleNode {
*/ */
struct MicroDevSpace { struct MicroDevSpace {
/*! \brief data being wrapped */ /*! \brief data being wrapped */
void* data; TargetPtr data;
/*! \brief shared ptr to session where this data is valid */ /*! \brief shared ptr to session where this data is valid */
ObjectPtr<MicroSession> session; ObjectPtr<MicroSession> session;
}; };
...@@ -291,18 +344,22 @@ struct TVMArray32 { ...@@ -291,18 +344,22 @@ struct TVMArray32 {
TargetVal shape, TargetVal shape,
TargetVal strides, TargetVal strides,
TargetVal byte_offset) TargetVal byte_offset)
: data(data.val32), : data(data.uint32()),
ctx(ctx), ctx(ctx),
ndim(ndim), ndim(ndim),
pad0(0), pad0(0),
dtype(dtype), dtype(dtype),
shape(shape.val32), shape(shape.uint32()),
strides(strides.val32), strides(strides.uint32()),
pad1(0), pad1(0),
byte_offset(byte_offset.val32), byte_offset(byte_offset.uint32()),
pad2(0) { } pad2(0) { }
/*! \brief opaque pointer to the allocated data */ /*!
* \brief The opaque data pointer points to the allocated data.
* This will be CUDA device pointer or cl_mem handle in OpenCL.
* This pointer is always aligns to 256 bytes as in CUDA.
*/
uint32_t data; uint32_t data;
/*! \brief The device context of the tensor */ /*! \brief The device context of the tensor */
DLContext ctx; DLContext ctx;
...@@ -337,16 +394,19 @@ struct TVMArray64 { ...@@ -337,16 +394,19 @@ struct TVMArray64 {
TargetVal shape, TargetVal shape,
TargetVal strides, TargetVal strides,
TargetVal byte_offset) TargetVal byte_offset)
: data(data.val64), : data(data.uint64()),
ctx(ctx), ctx(ctx),
ndim(ndim), ndim(ndim),
pad0(0), pad0(0),
dtype(dtype), dtype(dtype),
shape(shape.val64), shape(shape.uint64()),
strides(strides.val64), strides(strides.uint64()),
byte_offset(byte_offset.val64) { } byte_offset(byte_offset.uint64()) { }
/*!
/*! \brief opaque pointer to the allocated data */ * \brief The opaque data pointer points to the allocated data.
* This will be CUDA device pointer or cl_mem handle in OpenCL.
* This pointer is always aligns to 256 bytes as in CUDA.
*/
uint64_t data; uint64_t data;
/*! \brief The device context of the tensor */ /*! \brief The device context of the tensor */
DLContext ctx; DLContext ctx;
...@@ -367,8 +427,26 @@ struct TVMArray64 { ...@@ -367,8 +427,26 @@ struct TVMArray64 {
uint64_t byte_offset; uint64_t byte_offset;
}; };
/*! \brief MicroTVM task to store in task queue before specializing to word size */
struct DevTask {
/*! \brief Pointer to function to call for this task */
TargetVal func;
/*! \brief Array of argument values */
TargetVal arg_values;
/*! \brief Array of type codes for each argument value */
TargetVal arg_type_codes;
/*! \brief Number of arguments */
int32_t num_args;
};
/*! \brief MicroTVM task for serialization to 32-bit devices */ /*! \brief MicroTVM task for serialization to 32-bit devices */
typedef struct StructUTVMTask32 { typedef struct StructUTVMTask32 {
StructUTVMTask32(DevTask task)
: func(task.func.uint32()),
arg_values(task.arg_values.uint32()),
arg_type_codes(task.arg_type_codes.uint32()),
num_args(task.num_args) { }
/*! \brief Pointer to function to call for this task */ /*! \brief Pointer to function to call for this task */
uint32_t func; uint32_t func;
/*! \brief Array of argument values */ /*! \brief Array of argument values */
...@@ -377,10 +455,16 @@ typedef struct StructUTVMTask32 { ...@@ -377,10 +455,16 @@ typedef struct StructUTVMTask32 {
uint32_t arg_type_codes; uint32_t arg_type_codes;
/*! \brief Number of arguments */ /*! \brief Number of arguments */
int32_t num_args; int32_t num_args;
} UTVMTask32; } StructUTVMTask32;
/*! \brief MicroTVM task for serialization to 64-bit devices */ /*! \brief MicroTVM task for serialization to 64-bit devices */
typedef struct StructUTVMTask64 { typedef struct StructUTVMTask64 {
StructUTVMTask64(DevTask task)
: func(task.func.uint64()),
arg_values(task.arg_values.uint64()),
arg_type_codes(task.arg_type_codes.uint64()),
num_args(task.num_args) { }
/*! \brief Pointer to function to call for this task */ /*! \brief Pointer to function to call for this task */
uint64_t func; uint64_t func;
/*! \brief Array of argument values */ /*! \brief Array of argument values */
...@@ -389,7 +473,7 @@ typedef struct StructUTVMTask64 { ...@@ -389,7 +473,7 @@ typedef struct StructUTVMTask64 {
uint64_t arg_type_codes; uint64_t arg_type_codes;
/*! \brief Number of arguments */ /*! \brief Number of arguments */
int32_t num_args; int32_t num_args;
} UTVMTask64; } StructUTVMTask64;
} // namespace runtime } // namespace runtime
} // namespace tvm } // namespace tvm
......
...@@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { ...@@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
socket_.SendCommand(); socket_.SendCommand();
} }
void Read(DevPtr addr, void* buf, size_t num_bytes) { void Read(TargetPtr addr, void* buf, size_t num_bytes) override {
if (num_bytes == 0) { if (num_bytes == 0) {
return; return;
} }
...@@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { ...@@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
} }
{ {
socket_.cmd_builder() << "ocd_echo $output"; socket_.cmd_builder() << "return $output";
socket_.SendCommand(); socket_.SendCommand();
const std::string& reply = socket_.last_reply(); const std::string& reply = socket_.last_reply();
...@@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { ...@@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
} }
} }
void Write(DevPtr addr, const void* buf, size_t num_bytes) { void Write(TargetPtr addr, const void* buf, size_t num_bytes) override {
if (num_bytes == 0) { if (num_bytes == 0) {
return; return;
} }
...@@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { ...@@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
} }
} }
void Execute(DevPtr func_addr, DevPtr breakpoint_addr) { void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) override {
socket_.cmd_builder() << "halt 0"; socket_.cmd_builder() << "halt 0";
socket_.SendCommand(); socket_.SendCommand();
...@@ -207,12 +207,12 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { ...@@ -207,12 +207,12 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
/*! \brief number of bytes in a word on the target device (64-bit) */ /*! \brief number of bytes in a word on the target device (64-bit) */
static const constexpr ssize_t kWordSize = 8; static const constexpr ssize_t kWordSize = 8;
// NOTE: OpenOCD will call any request larger than this constant an "absurd // NOTE: The OS pipe buffer must be able to handle a line long enough to
// request". // print this transfer request.
/*! \brief maximum number of bytes allowed in a single memory transfer */ /*! \brief maximum number of bytes allowed in a single memory transfer */
static const constexpr ssize_t kMemTransferLimit = 64000; static const constexpr ssize_t kMemTransferLimit = 8000;
/*! \brief number of milliseconds to wait for function execution to halt */ /*! \brief number of milliseconds to wait for function execution to halt */
static const constexpr int kWaitTime = 10000; static const constexpr int kWaitTime = 30000;
}; };
const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& server_addr, const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& server_addr,
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
// TODO(weberlo): Handle endianness. // TODO(weberlo, areusch): Handle endianness.
/*! /*!
* \brief data encoder for uTVM that builds a host-side buffer * \brief data encoder for uTVM that builds a host-side buffer
...@@ -50,7 +50,7 @@ class TargetDataLayoutEncoder { ...@@ -50,7 +50,7 @@ class TargetDataLayoutEncoder {
* \param size size (in bytes) of the memory region allocated for this slot * \param size size (in bytes) of the memory region allocated for this slot
* \param start_addr start address of the slot in the device's memory * \param start_addr start address of the slot in the device's memory
*/ */
Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr); Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, TargetPtr start_addr);
~Slot(); ~Slot();
...@@ -71,7 +71,7 @@ class TargetDataLayoutEncoder { ...@@ -71,7 +71,7 @@ class TargetDataLayoutEncoder {
* \brief returns start address of the slot in device memory * \brief returns start address of the slot in device memory
* \return device start address * \return device start address
*/ */
DevPtr start_addr(); TargetPtr start_addr();
/*! /*!
* \brief returns number of bytes allocated for this slot * \brief returns number of bytes allocated for this slot
...@@ -89,16 +89,17 @@ class TargetDataLayoutEncoder { ...@@ -89,16 +89,17 @@ class TargetDataLayoutEncoder {
/*! \brief size (in bytes) of the memory region allocated for this slot */ /*! \brief size (in bytes) of the memory region allocated for this slot */
size_t size_; size_t size_;
/*! \brief start address of the slot in the device's memory */ /*! \brief start address of the slot in the device's memory */
DevPtr start_addr_; TargetPtr start_addr_;
}; };
/*! /*!
* \brief constructor * \brief constructor
* \param start_addr start address of the encoder in device memory * \param start_addr start address of the encoder in device memory
*/ */
explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size) explicit TargetDataLayoutEncoder(size_t capacity, TargetWordSize word_size)
: buf_(std::vector<uint8_t>()), curr_offset_(0), word_size_(word_size) { : buf_(std::vector<uint8_t>()), curr_offset_(0),
start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_)); start_addr_(word_size, nullptr),
capacity_(capacity), word_size_(word_size) {
} }
/*! /*!
...@@ -108,14 +109,20 @@ class TargetDataLayoutEncoder { ...@@ -108,14 +109,20 @@ class TargetDataLayoutEncoder {
*/ */
template <typename T> template <typename T>
Slot<T> Alloc(size_t num_elems = 1) { Slot<T> Alloc(size_t num_elems = 1) {
curr_offset_ = UpperAlignValue(curr_offset_, word_size_); curr_offset_ = UpperAlignValue(curr_offset_, word_size_.bytes());
size_t size = sizeof(T) * num_elems; size_t size = sizeof(T) * num_elems;
if (curr_offset_ + size > buf_.size()) { if (curr_offset_ + size > buf_.size()) {
buf_.resize(curr_offset_ + size); buf_.resize(curr_offset_ + size);
} }
CHECK(buf_.size() < capacity_) << "out of space in data encoder";
size_t slot_start_offset = curr_offset_; size_t slot_start_offset = curr_offset_;
curr_offset_ += size; curr_offset_ += size;
return Slot<T>(this, slot_start_offset, size, start_addr_ + slot_start_offset); return Slot<T>(this, slot_start_offset, size, start_addr() + slot_start_offset);
}
void Clear() {
buf_.clear();
curr_offset_ = 0;
} }
/*! /*!
...@@ -130,26 +137,39 @@ class TargetDataLayoutEncoder { ...@@ -130,26 +137,39 @@ class TargetDataLayoutEncoder {
* \brief returns current size of the encoder's buffer * \brief returns current size of the encoder's buffer
* \return buffer size * \return buffer size
*/ */
size_t buf_size() { size_t buf_size() const {
return buf_.size(); return buf_.size();
} }
TargetPtr start_addr() const {
CHECK_NE(start_addr_.value().uint64(), 0) << "start addr uninitialized";
return start_addr_;
}
void set_start_addr(TargetPtr start_addr) {
CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty";
start_addr_ = TargetPtr(word_size_,
UpperAlignValue(start_addr.value().uint64(), word_size_.bytes()));
}
private: private:
/*! \brief in-memory backing buffer */ /*! \brief in-memory backing buffer */
std::vector<uint8_t> buf_; std::vector<uint8_t> buf_;
/*! \brief current offset */ /*! \brief current offset */
size_t curr_offset_; size_t curr_offset_;
/*! \brief start address of the encoder in device memory */ /*! \brief start address of the encoder in device memory */
DevPtr start_addr_; TargetPtr start_addr_;
/*! \brief number of bytes available in device memory */
size_t capacity_;
/*! \brief number of bytes in a word on the target device */ /*! \brief number of bytes in a word on the target device */
size_t word_size_; TargetWordSize word_size_;
}; };
template <typename T> template <typename T>
TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent, TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
size_t start_offset, size_t start_offset,
size_t size, size_t size,
DevPtr start_addr) TargetPtr start_addr)
: parent_(parent), : parent_(parent),
start_offset_(start_offset), start_offset_(start_offset),
curr_offset_(0), curr_offset_(0),
...@@ -158,7 +178,10 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent, ...@@ -158,7 +178,10 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
template <typename T> template <typename T>
TargetDataLayoutEncoder::Slot<T>::~Slot() { TargetDataLayoutEncoder::Slot<T>::~Slot() {
CHECK(curr_offset_ == size_) << "unwritten space in slot"; // TODO(weberlo, areusch): this can mask the exception thrown by slot allocation... even though
// that doesn't make sense.
CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset="
<< curr_offset_ << ", size=" << size_;
} }
template <typename T> template <typename T>
...@@ -177,7 +200,7 @@ void TargetDataLayoutEncoder::Slot<T>::WriteValue(const T& val) { ...@@ -177,7 +200,7 @@ void TargetDataLayoutEncoder::Slot<T>::WriteValue(const T& val) {
} }
template <typename T> template <typename T>
DevPtr TargetDataLayoutEncoder::Slot<T>::start_addr() { TargetPtr TargetDataLayoutEncoder::Slot<T>::start_addr() {
return start_addr_; return start_addr_;
} }
......
...@@ -45,6 +45,7 @@ void TclSocket::SendCommand() { ...@@ -45,6 +45,7 @@ void TclSocket::SendCommand() {
const char terminate_token = kCommandTerminateToken; const char terminate_token = kCommandTerminateToken;
cmd_builder_ << terminate_token; cmd_builder_ << terminate_token;
std::string full_cmd = cmd_builder_.str(); std::string full_cmd = cmd_builder_.str();
CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1) CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1)
<< "failed to send command"; << "failed to send command";
cmd_builder_.str(std::string()); cmd_builder_.str(std::string());
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include "../object_internal.h" #include "../object_internal.h"
#include "../../support/ring_buffer.h" #include "../../support/ring_buffer.h"
#include "../../support/socket.h" #include "../../support/socket.h"
#include "../micro/micro_session.h"
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
...@@ -1246,43 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() { ...@@ -1246,43 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() {
CHECK_EQ(state_, kRecvCode); CHECK_EQ(state_, kRecvCode);
} }
PackedFunc MicroTimeEvaluator(
PackedFunc pf,
TVMContext ctx,
int number,
int repeat) {
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
for (int i = 0; i < repeat; ++i) {
double speed = 0.0;
for (int j = 0; j < number; ++j) {
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
speed += (temp.operator double()) / number;
}
os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
}
std::string blob = os.str();
TVMByteArray arr;
arr.size = blob.length();
arr.data = blob.data();
// return the time.
*rv = arr;
};
return PackedFunc(ftimer);
}
PackedFunc WrapTimeEvaluator(PackedFunc pf, PackedFunc WrapTimeEvaluator(PackedFunc pf,
TVMContext ctx, TVMContext ctx,
int number, int number,
int repeat, int repeat,
int min_repeat_ms) { int min_repeat_ms) {
if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) { if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
return MicroTimeEvaluator(pf, ctx, number, repeat); auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator");
CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled";
return (*get_micro_time_evaluator)(pf, ctx, number, repeat);
} }
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable { auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
......
...@@ -840,6 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) { ...@@ -840,6 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) {
const VarNode* v = op->node.as<VarNode>(); const VarNode* v = op->node.as<VarNode>();
CHECK(v); CHECK(v);
volatile_buf_.insert(v); volatile_buf_.insert(v);
} else if (op->attr_key == tir::attr::pragma_import_c) {
const StringImmNode* value = op->value.as<StringImmNode>();
CHECK(value != nullptr);
decl_stream << value->value;
} }
this->PrintStmt(op->body); this->PrintStmt(op->body);
} }
......
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
#include <tvm/target/codegen.h> #include <tvm/target/codegen.h>
#include <vector> #include <vector>
#include <string> #include <string>
#include "codegen_c_host.h"
#include "../build_common.h" #include "../build_common.h"
#include "codegen_c_host.h"
namespace tvm { namespace tvm {
namespace codegen { namespace codegen {
...@@ -35,9 +35,10 @@ CodeGenCHost::CodeGenCHost() { ...@@ -35,9 +35,10 @@ CodeGenCHost::CodeGenCHost() {
void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) { void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) {
emit_asserts_ = emit_asserts; emit_asserts_ = emit_asserts;
declared_globals_.clear();
decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
decl_stream << "extern void* " << module_name_ << " = NULL;\n"; decl_stream << "void* " << module_name_ << " = NULL;\n";
CodeGenC::Init(output_ssa); CodeGenC::Init(output_ssa);
} }
...@@ -182,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT( ...@@ -182,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT(
int64_t num_args = end - begin; int64_t num_args = end - begin;
CHECK_GE(num_args, 0); CHECK_GE(num_args, 0);
std::string func_name = s->value; std::string func_name = s->value;
std::string packed_func_name = GetUniqueName(func_name + "_packed"); // NOTE: cannot rely on GetUnique for global decl_stream declarations
// because it is reset between AddFunction().
std::string packed_func_name = func_name + "_packed";
if (declared_globals_.insert(packed_func_name).second) {
// Still reserve the name among unique names.
CHECK(GetUniqueName(packed_func_name) == packed_func_name) <<
"Expected name " << packed_func_name << " to not be taken";
decl_stream << "static void* " << packed_func_name << " = NULL;\n"; decl_stream << "static void* " << packed_func_name << " = NULL;\n";
}
this->PrintGetFuncFromBackend(func_name, packed_func_name); this->PrintGetFuncFromBackend(func_name, packed_func_name);
this->PrintFuncCall(packed_func_name, num_args); this->PrintFuncCall(packed_func_name, num_args);
} else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
...@@ -255,6 +263,6 @@ runtime::Module BuildCHost(IRModule mod) { ...@@ -255,6 +263,6 @@ runtime::Module BuildCHost(IRModule mod) {
TVM_REGISTER_GLOBAL("target.build.c") TVM_REGISTER_GLOBAL("target.build.c")
.set_body([](TVMArgs args, TVMRetValue* rv) { .set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = BuildCHost(args[0]); *rv = BuildCHost(args[0]);
}); });
} // namespace codegen } // namespace codegen
} // namespace tvm } // namespace tvm
...@@ -24,9 +24,10 @@ ...@@ -24,9 +24,10 @@
#ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ #ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
#define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_ #define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
#include <tvm/target/codegen.h> #include <set>
#include <tvm/tir/expr.h>
#include <string> #include <string>
#include "tvm/target/codegen.h"
#include "tvm/tir/expr.h"
#include "codegen_c.h" #include "codegen_c.h"
namespace tvm { namespace tvm {
...@@ -53,6 +54,8 @@ class CodeGenCHost final : public CodeGenC { ...@@ -53,6 +54,8 @@ class CodeGenCHost final : public CodeGenC {
private: private:
std::string module_name_; std::string module_name_;
/* \brief tracks declared global variables which live despite GetUniqueName */
std::set<std::string> declared_globals_;
/*! \brief whether to emit asserts in the resulting C code */ /*! \brief whether to emit asserts in the resulting C code */
bool emit_asserts_; bool emit_asserts_;
......
...@@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name, ...@@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name,
t->keys_array.push_back("hexagon"); t->keys_array.push_back("hexagon");
t->device_type = kDLHexagon; t->device_type = kDLHexagon;
} else { } else {
LOG(ERROR) << "Unknown target name " << target_name; LOG(ERROR) << "Unknown target name " << target_name << "; falling back to stackvm";
return target::stackvm(); return target::stackvm();
} }
......
...@@ -181,7 +181,9 @@ def add_header(fname, header): ...@@ -181,7 +181,9 @@ def add_header(fname, header):
skipline = False skipline = False
ext = os.path.splitext(fname)[1][1:] ext = os.path.splitext(fname)[1][1:]
if lines[0][:2] == "#!": if not lines:
skipline = False # File is enpty
elif lines[0][:2] == "#!":
skipline = True skipline = True
elif lines[0][:2] == "<?": elif lines[0][:2] == "<?":
skipline = True skipline = True
......
...@@ -25,8 +25,10 @@ import tvm.micro as micro ...@@ -25,8 +25,10 @@ import tvm.micro as micro
from tvm.micro import create_micro_mod from tvm.micro import create_micro_mod
from tvm.relay.testing import resnet from tvm.relay.testing import resnet
# Use the host emulated micro device. # # Use the host emulated micro device.
DEV_CONFIG = micro.device.host.default_config() DEV_CONFIG_A = micro.device.host.generate_config()
DEV_CONFIG_B = micro.device.host.generate_config()
TARGET = 'c -device=micro_dev'
def relay_micro_build(func, dev_config, params=None): def relay_micro_build(func, dev_config, params=None):
"""Create a graph runtime module with a micro device context from a Relay function. """Create a graph runtime module with a micro device context from a Relay function.
...@@ -47,22 +49,41 @@ def relay_micro_build(func, dev_config, params=None): ...@@ -47,22 +49,41 @@ def relay_micro_build(func, dev_config, params=None):
mod : tvm.runtime.Module mod : tvm.runtime.Module
graph runtime module for the target device graph runtime module for the target device
""" """
with tvm.target.build_config(disable_vectorize=True): disable_vectorize = tvm.target.build_config(disable_vectorize=True)
graph, c_mod, params = relay.build(func, target="c", params=params) disable_fusion = relay.build_config(disabled_pass={'FuseOps'})
micro_mod = create_micro_mod(c_mod, dev_config) with disable_vectorize, disable_fusion:
graph, c_mod, params = relay.build(func, target=TARGET, params=params)
micro_mod = micro.create_micro_mod(c_mod, dev_config)
ctx = tvm.micro_dev(0) ctx = tvm.micro_dev(0)
mod = graph_runtime.create(graph, micro_mod, ctx) mod = graph_runtime.create(graph, micro_mod, ctx)
mod.set_input(**params) mod.set_input(**params)
return mod return mod
GDB_INIT_TEMPLATE = """
layout asm
target remote localhost:{gdb_port}
set $pc = UTVMInit
break UTVMDone
"""
def reset_gdbinit():
if 'server_port' not in DEV_CONFIG_A:
return
gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR']
with open(f'{gdb_init_dir}/.gdbinit', 'w') as f:
gdb_port = DEV_CONFIG_A['server_port'] - 3333
f.write(GDB_INIT_TEMPLATE.format(gdb_port=gdb_port))
def test_alloc(): def test_alloc():
"""Test tensor allocation on the device.""" """Test tensor allocation on the device."""
if not tvm.runtime.enabled("micro_dev"): if not tvm.runtime.enabled("micro_dev"):
return return
shape = (1024,) shape = (1024,)
dtype = "float32" dtype = "float32"
with micro.Session(DEV_CONFIG): with micro.Session(DEV_CONFIG_A):
ctx = tvm.micro_dev(0) ctx = tvm.micro_dev(0)
np_tensor = np.random.uniform(size=shape).astype(dtype) np_tensor = np.random.uniform(size=shape).astype(dtype)
micro_tensor = tvm.nd.array(np_tensor, ctx) micro_tensor = tvm.nd.array(np_tensor, ctx)
...@@ -76,6 +97,8 @@ def test_add(): ...@@ -76,6 +97,8 @@ def test_add():
shape = (1024,) shape = (1024,)
dtype = "float32" dtype = "float32"
reset_gdbinit()
# Construct TVM expression. # Construct TVM expression.
tvm_shape = tvm.runtime.convert(shape) tvm_shape = tvm.runtime.convert(shape)
A = te.placeholder(tvm_shape, name="A", dtype=dtype) A = te.placeholder(tvm_shape, name="A", dtype=dtype)
...@@ -86,14 +109,24 @@ def test_add(): ...@@ -86,14 +109,24 @@ def test_add():
func_name = "fadd" func_name = "fadd"
c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)
with micro.Session(DEV_CONFIG): with micro.Session(DEV_CONFIG_A) as sess:
micro_mod = create_micro_mod(c_mod, DEV_CONFIG) micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
micro_func = micro_mod[func_name] micro_func = micro_mod[func_name]
ctx = tvm.micro_dev(0) ctx = tvm.micro_dev(0)
a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) a_np = np.random.uniform(size=shape).astype(dtype)
a = tvm.nd.array(a_np, ctx)
b_np = np.random.uniform(size=shape).astype(dtype)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
micro_func(a, b, c) micro_func(a, b, c)
# ensure inputs weren't corrupted
tvm.testing.assert_allclose(
a.asnumpy(), a_np)
tvm.testing.assert_allclose(
b.asnumpy(), b_np)
# ensure output is correct
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + b.asnumpy()) c.asnumpy(), a.asnumpy() + b.asnumpy())
...@@ -105,6 +138,8 @@ def test_workspace_add(): ...@@ -105,6 +138,8 @@ def test_workspace_add():
shape = (1024,) shape = (1024,)
dtype = "float32" dtype = "float32"
reset_gdbinit()
# Construct TVM expression. # Construct TVM expression.
tvm_shape = tvm.runtime.convert(shape) tvm_shape = tvm.runtime.convert(shape)
A = te.placeholder(tvm_shape, name="A", dtype=dtype) A = te.placeholder(tvm_shape, name="A", dtype=dtype)
...@@ -116,14 +151,19 @@ def test_workspace_add(): ...@@ -116,14 +151,19 @@ def test_workspace_add():
func_name = "fadd_two_workspace" func_name = "fadd_two_workspace"
c_mod = tvm.build(s, [A, C], target="c", name=func_name) c_mod = tvm.build(s, [A, C], target="c", name=func_name)
with micro.Session(DEV_CONFIG): with micro.Session(DEV_CONFIG_A) as sess:
micro_mod = create_micro_mod(c_mod, DEV_CONFIG) micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
micro_func = micro_mod[func_name] micro_func = micro_mod[func_name]
ctx = tvm.micro_dev(0) ctx = tvm.micro_dev(0)
a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) a_np = np.random.uniform(size=shape).astype(dtype)
a = tvm.nd.array(a_np, ctx)
c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
micro_func(a, c) micro_func(a, c)
# ensure input wasn't corrupted
tvm.testing.assert_allclose(
a.asnumpy(), a_np)
# ensure output is correct
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + 2.0) c.asnumpy(), a.asnumpy() + 2.0)
...@@ -141,47 +181,74 @@ def test_graph_runtime(): ...@@ -141,47 +181,74 @@ def test_graph_runtime():
z = relay.add(xx, relay.const(1.0)) z = relay.add(xx, relay.const(1.0))
func = relay.Function([x], z) func = relay.Function([x], z)
with micro.Session(DEV_CONFIG): with micro.Session(DEV_CONFIG_A):
mod = relay_micro_build(func, DEV_CONFIG) mod = relay_micro_build(func, DEV_CONFIG_A)
x_in = np.random.uniform(size=shape[0]).astype(dtype) x_in = np.random.uniform(size=shape[0]).astype(dtype)
mod.run(x=x_in) mod.run(x=x_in)
result = mod.get_output(0).asnumpy() result = mod.get_output(0).asnumpy()
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
mod.get_input(0).asnumpy(), x_in)
tvm.testing.assert_allclose(
result, x_in * x_in + 1.0) result, x_in * x_in + 1.0)
def test_multiple_modules(): def test_conv2d():
"""Test loading multiple modules on the device simultaneously."""
if not tvm.runtime.enabled("micro_dev"): if not tvm.runtime.enabled("micro_dev"):
return return
shape = (1024,)
dtype = "float32"
# Construct Relay add program. from tvm.relay import create_executor
x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) from tvm.relay import transform
ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret)
# Construct Relay subtract program.
x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
ret = relay.subtract(x, relay.const(1.0))
sub_const_func = relay.Function([x], ret)
with micro.Session(DEV_CONFIG): dshape = (1, 4, 16, 16)
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) dtype = 'int8'
sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG) func_name = 'fused_nn_conv2d'
x_in = np.random.uniform(size=shape[0]).astype(dtype) reset_gdbinit()
add_const_mod.run(x=x_in)
add_result = add_const_mod.get_output(0).asnumpy()
sub_const_mod.run(x=x_in)
sub_result = sub_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose( # Construct Relay program.
add_result, x_in + 1.0) x = relay.var("x", shape=dshape, dtype=dtype)
tvm.testing.assert_allclose( conv_expr = relay.nn.conv2d(
sub_result, x_in - 1.0) x, relay.var("w"),
kernel_size=(3, 3),
padding=(1, 1),
channels=4)
func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr)
mod = tvm.IRModule.from_expr(func)
mod = transform.InferType()(mod)
x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape))
w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape))
out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape))
with tvm.target.build_config(disable_vectorize=True):
graph, c_mod, params = relay.build(mod, target="c")
with micro.Session(DEV_CONFIG_A):
micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
candidate_func_name = func_name
for i in range(100):
try:
micro_func = micro_mod[candidate_func_name]
break
except tvm.TVMError as e:
candidate_func_name = f'{func_name}_{i}'
else:
assert False
ctx = tvm.micro_dev(0)
x_data = tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx)
w_data = tvm.nd.array(np.random.uniform(size=w_shape).astype(dtype), ctx)
result = tvm.nd.array(np.zeros(shape=out_shape, dtype=dtype), ctx)
micro_func(x_data, w_data, result)
out_data = np.zeros(out_shape, dtype=dtype)
params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() }
intrp = create_executor('debug')
expected_result = intrp.evaluate(mod['main'])(x_data, w_data)
tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy())
def test_interleave_sessions(): def test_interleave_sessions():
...@@ -196,8 +263,8 @@ def test_interleave_sessions(): ...@@ -196,8 +263,8 @@ def test_interleave_sessions():
ret = relay.add(x, relay.const(1.0)) ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret) add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG) sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG) sess_b = micro.Session(DEV_CONFIG_B)
with sess_a: with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype) np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
...@@ -205,13 +272,13 @@ def test_interleave_sessions(): ...@@ -205,13 +272,13 @@ def test_interleave_sessions():
np_tensor_b = np.random.uniform(size=shape).astype(dtype) np_tensor_b = np.random.uniform(size=shape).astype(dtype)
micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
with sess_a: with sess_a:
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
add_const_mod.run(x=micro_tensor_a) add_const_mod.run(x=micro_tensor_a)
add_result = add_const_mod.get_output(0).asnumpy() add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
add_result, np_tensor_a + 1.0) add_result, np_tensor_a + 1.0)
with sess_b: with sess_b:
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_B)
add_const_mod.run(x=micro_tensor_b) add_const_mod.run(x=micro_tensor_b)
add_result = add_const_mod.get_output(0).asnumpy() add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
...@@ -230,15 +297,15 @@ def test_nested_sessions(): ...@@ -230,15 +297,15 @@ def test_nested_sessions():
ret = relay.add(x, relay.const(1.0)) ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret) add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG) sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG) sess_b = micro.Session(DEV_CONFIG_B)
with sess_a: with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype) np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
with sess_b: with sess_b:
np_tensor_b = np.random.uniform(size=shape).astype(dtype) np_tensor_b = np.random.uniform(size=shape).astype(dtype)
micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
add_const_mod.run(x=micro_tensor_a) add_const_mod.run(x=micro_tensor_a)
add_result = add_const_mod.get_output(0).asnumpy() add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose( tvm.testing.assert_allclose(
...@@ -257,12 +324,12 @@ def test_inactive_session_use(): ...@@ -257,12 +324,12 @@ def test_inactive_session_use():
ret = relay.add(x, relay.const(1.0)) ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret) add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG) sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG) sess_b = micro.Session(DEV_CONFIG_B)
with sess_a: with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype) np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG) add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
with sess_b: with sess_b:
# These objects belong to `sess_a`. # These objects belong to `sess_a`.
...@@ -272,12 +339,42 @@ def test_inactive_session_use(): ...@@ -272,12 +339,42 @@ def test_inactive_session_use():
add_result, np_tensor_a + 1.0) add_result, np_tensor_a + 1.0)
# TODO add workspace alloc/free stress test
if __name__ == "__main__": if __name__ == "__main__":
test_alloc() test_alloc()
print()
print('finished alloc test')
input('[press enter to continue]')
test_add() test_add()
print()
print('finished add test')
input('[press enter to continue]')
test_workspace_add() test_workspace_add()
print()
print('finished workspace add test')
input('[press enter to continue]')
test_graph_runtime() test_graph_runtime()
print()
print('finished graph runtime test')
input('[press enter to continue]')
test_conv2d()
print()
print('finished conv2d test')
input('[press enter to continue]')
test_multiple_modules() test_multiple_modules()
print()
print('finished multiple modules test')
input('[press enter to continue]')
test_interleave_sessions() test_interleave_sessions()
print()
print('finished interleaved sessions test')
input('[press enter to continue]')
test_nested_sessions() test_nested_sessions()
print()
print('finished nested sessions test')
input('[press enter to continue]')
test_inactive_session_use() test_inactive_session_use()
print()
print('finished use inactive session test')
input('[press enter to continue]')
...@@ -25,3 +25,4 @@ from . import conv2d_alter_op ...@@ -25,3 +25,4 @@ from . import conv2d_alter_op
from .bitserial_conv2d import * from .bitserial_conv2d import *
from .bitserial_dense import * from .bitserial_dense import *
from .injective import * from .injective import *
from . import cortex_m7
...@@ -31,6 +31,7 @@ from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \ ...@@ -31,6 +31,7 @@ from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \
conv2d_spatial_pack_nhwc, \ conv2d_spatial_pack_nhwc, \
schedule_conv2d_spatial_pack_nchw, \ schedule_conv2d_spatial_pack_nchw, \
schedule_conv2d_spatial_pack_nhwc schedule_conv2d_spatial_pack_nhwc
from .cortex_m7.conv2d import direct_simd
@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
...@@ -425,3 +426,15 @@ def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs): ...@@ -425,3 +426,15 @@ def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs):
traverse_inline(s, outs[0].op, _callback) traverse_inline(s, outs[0].op, _callback)
return s return s
@autotvm.register_topi_compute("conv2d_direct_simd.arm_cpu")
def conv2d_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype):
"""Compute conv2d with SIMD (v7e-m)."""
return direct_simd.conv2d_direct_simd_compute(
cfg, data, kernel, strides, padding, dilation, out_dtype)
@autotvm.register_topi_schedule("conv2d_direct_simd.arm_cpu")
def schedule_conv2d_direct_simd(cfg, outs):
"""Create schedule for conv2d_direct_simd"""
return direct_simd.conv2d_direct_simd_nhwc_schedule(cfg, outs)
...@@ -152,13 +152,13 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, ...@@ -152,13 +152,13 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
cfg["ann_reduce"].apply(s, conv, [kh, kw], cfg["ann_reduce"].apply(s, conv, [kh, kw],
axis_lens=[get_const_int(kh.dom.extent), axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)], get_const_int(kw.dom.extent)],
max_unroll=16, max_unroll=None,
cfg=cfg) cfg=cfg)
cfg["ann_spatial"].apply(s, conv, [vh, vw, vc], cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1], axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1], cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]], cfg['tile_co'].size[-1]],
max_unroll=16, max_unroll=None,
cfg=cfg) cfg=cfg)
# schedule fusion # schedule fusion
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Schedules specialized for cortex-m7."""
from . import conv2d
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Conv2d implementations for cortex-m7."""
from . import direct_simd
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
"""Direct implementation of conv2d."""
import tvm
from tvm import autotvm
from tvm.autotvm.task import deserialize_args
from topi.nn.conv2d import conv2d_nchw, conv2d_nhwc
from topi.util import get_const_tuple, get_const_int, traverse_inline
def conv2d_direct(*args, **kwargs):
"""Schedule function for directly-scheduled conv2d."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
layout = args[-2]
cfg = autotvm.get_config()
args = [cfg] + args
conv = conv2d_direct_compute(*args)
if layout == 'NHWC':
sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv])
elif layout == 'NCHW':
sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv])
else:
raise RuntimeError(f'unsupported data layout "{layout}"')
return sched, [data, kernel, conv]
conv2d_direct.template_key = 'direct'
conv2d_direct.default_data_layout = 'NHWC'
conv2d_direct.default_kernel_layout = 'HWIO'
@autotvm.register_topi_compute('conv2d_direct.micro_dev')
def conv2d_direct_compute(*args):
layout = args[-2]
if layout == 'NHWC':
return _conv2d_direct_nhwc_compute(*args)
if layout == 'NCHW':
return _conv2d_direct_nchw_compute(*args)
raise RuntimeError(f'unsupported data layout "{layout}"')
def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
assert layout == 'NHWC'
conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
# Config Space Definition
N, H, W, CI = get_const_tuple(data.shape)
KH, KW, _, CO = get_const_tuple(kernel.shape)
n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO)
kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI)
# TODO should we add a max_factor attr to these splits?
co, vc = cfg.define_split('tile_co', co, num_outputs=2)
oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
cfg.define_reorder('reorder_0',
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
policy='candidate', candidate=[
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
[n, co, oh, ow, ci, kh, kw, vc, vh, vw],
[n, co, oh, ow, ci, vh, vw, vc, kh, kw],
[n, co, oh, ow, ci, vc, vh, vw, kh, kw]])
cfg.define_annotate('ann_reduce', [kh, kw], policy='try_unroll')
cfg.define_annotate('ann_spatial', [vh, vw, vc], policy='try_unroll')
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
assert layout == 'NCHW'
conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
###########################
# Config Space Definition #
###########################
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
@autotvm.register_topi_schedule('conv2d_direct_nhwc.micro_dev')
def conv2d_direct_nhwc_schedule(cfg, outs):
"""Schedule function for directly-scheduled conv2d on NHWC layout."""
sched = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'conv2d_nhwc' not in op.tag:
return
### extract tensors ###
output = op.output(0)
conv = op
data_vec = conv.input_tensors[0]
kernel = conv.input_tensors[1] # pylint: disable=unused-variable
last = outs[0] # pylint: disable=unused-variable
# tile reduction axes
n, oh, ow, co = sched[conv].op.axis
kh, kw, ci = sched[conv].op.reduce_axis
# NOTE we can't inline data padding in the SIMD path, because it
# introduces conditionals in the inner loop.
data_pad = data_vec.op
sched[data_pad].compute_inline()
co, vc = cfg['tile_co'].apply(sched, conv, co)
oh, vh = cfg['tile_oh'].apply(sched, conv, oh)
ow, vw = cfg['tile_ow'].apply(sched, conv, ow)
cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
cfg['ann_reduce'].apply(sched, conv, [kh, kw],
axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)],
max_unroll=8,
cfg=cfg)
cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]],
max_unroll=8,
cfg=cfg)
kernel_scope = n # this is the scope to attach global config inside this kernel
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
traverse_inline(sched, outs[-1].op, _callback)
return sched
@autotvm.register_topi_schedule('conv2d_direct_nchw.micro_dev')
def conv2d_direct_nchw_schedule(cfg, outs):
"""Schedule function for Cortex-M7 direct implementation of conv2d."""
# use default schedule
sched = tvm.create_schedule([x.op for x in outs])
conv = outs[-1].op
output = conv.output(0)
data_vec = conv.input_tensors[0]
data_pad = data_vec.op
sched[data_pad].compute_inline()
# TODO add more schedule opts (similar to the NHWC template)
n, _, _, _ = sched[conv].op.axis
kernel_scope = n # this is the scope to attach global config inside this kernel
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
return sched
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
"""Direct implementation of conv2d."""
from tvm import autotvm
from tvm.autotvm.task import deserialize_args
from tvm import te
from topi.util import simplify, traverse_inline
from topi.nn.pad import pad
from topi.nn.util import get_pad_tuple
from ..micro_kernel.gemm import (
intrin_gemm_MxKxN, gemm_MxKxN_impl,
)
def conv2d_direct_simd(*args, **kwargs):
"""Defines the Cortex-M7 SIMD implementation of conv2d."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
layout = args[-2]
cfg = autotvm.get_config()
args = [cfg] + args
assert layout == 'NHWC'
conv = conv2d_direct_simd_compute(*args)
sched = conv2d_direct_simd_nhwc_schedule(cfg, [data, kernel, conv])
return sched, [data, kernel, conv]
conv2d_direct_simd.template_key = 'direct_simd'
conv2d_direct_simd.default_data_layout = 'NHWC'
conv2d_direct_simd.default_kernel_layout = 'HWOI'
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
"""Compute function for Cortex-M7 SIMD implementation of conv2d."""
assert isinstance(strides, int) or len(strides) == 2
assert isinstance(dilation, int) or len(dilation) == 2
if isinstance(strides, int):
stride_h = stride_w = strides
else:
stride_h, stride_w = strides
if isinstance(dilation, int):
dilation_h = dilation_w = dilation
else:
dilation_h, dilation_w = dilation
batch_size, in_height, in_width, in_channels = data.shape
kernel_h, kernel_w, out_channels, _ = kernel.shape
# compute the output shape
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
padding, (dilated_kernel_h, dilated_kernel_w))
out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
pad_before = [0, pad_top, pad_left, 0]
pad_after = [0, pad_down, pad_right, 0]
padded_data = pad(data, pad_before, pad_after, name='padded_data')
rc = te.reduce_axis((0, in_channels), name='rc')
ry = te.reduce_axis((0, kernel_h), name='ry')
rx = te.reduce_axis((0, kernel_w), name='rx')
conv = te.compute(
(batch_size, out_height, out_width, out_channels),
lambda nn, yy, xx, ff: te.sum(
padded_data[nn, yy * stride_h + ry * dilation_h,
xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]),
name='conv2d', tag='conv2d_nhwc')
###########################
# Config Space Definition #
###########################
n, oh, ow, co = (cfg.axis(batch_size.value),
cfg.axis(out_height.value),
cfg.axis(out_width.value),
cfg.axis(out_channels.value))
kh, kw, ci = (cfg.reduce_axis(kernel_h.value),
cfg.reduce_axis(kernel_w.value),
cfg.reduce_axis(in_channels.value))
assert in_channels.value % 4 == 0
owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2)
cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2,
filter=lambda x: x.size[-1] % 4 == 0)
coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2)
cfg.define_reorder('reorder_0_simd',
[n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
policy='candidate', candidate=[
[n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
[n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
[n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
[n, kh, kw, oh, coo, owo, cio, owi, coi, cii]])
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
def conv2d_direct_simd_nhwc_schedule(cfg, outs):
"""Schedule function for Cortex-M7 SIMD implementation of conv2d."""
sched = te.create_schedule([x.op for x in outs])
def _callback(op):
if 'conv2d_nhwc' not in op.tag:
return
# extract tensors
output = op.output(0)
conv = op
data_vec = conv.input_tensors[0]
kernel = conv.input_tensors[1] # pylint: disable=unused-variable
last = outs[0] # pylint: disable=unused-variable
# tile reduction axes
n, oh, ow, co = sched[conv].op.axis
kh, kw, ci = sched[conv].op.reduce_axis
M = cfg['tile_ow'].size[-1]
K = cfg['tile_ci'].size[-1]
N = cfg['tile_co'].size[-1]
owo, owi = cfg['tile_ow'].apply(sched, conv, ow)
cio, cii = cfg['tile_ci'].apply(sched, conv, ci)
coo, coi = cfg['tile_co'].apply(sched, conv, co)
cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])
gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype)
sched[output].tensorize(owi, gemm)
sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id))
# this is the scope to attach global config inside this kernel
kernel_scope = n
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
traverse_inline(sched, outs[-1].op, _callback)
return sched
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
"""Defines gemm intrinsics for SIMD matrix multiplication."""
import random
import string
import tvm
from tvm import te
##########################
# MxKxN MatMul Intrinsic #
##########################
# NOTE this is transposed matmul (A * B^T)
def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
"""Defines a SIMD-accelerated transposed matmul."""
# we generate a unique ID for every intrinsic definition, to prevent name
# collisions in the generated source (e.g., if there are multiple operators
# in the same module that use the same intrinsic)
#
# TODO(weberlo, areusch): to cut down on memory usage, we should cache each intrinsic
# instantiation and include it only once, eliminating the need for unique
# IDs
UNIQ_ID_LEN = 8
uniq_id = ''.join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
if isinstance(M, tvm.tir.IntImm):
M = M.value
if isinstance(K, tvm.tir.IntImm):
K = K.value
if isinstance(N, tvm.tir.IntImm):
N = N.value
assert K % 4 == 0
# TODO(weberlo, areusch): support more dtypes?
assert in_dtype == 'int8'
assert out_dtype == 'int32'
A = te.placeholder((M, K), name='a', dtype=in_dtype)
B = te.placeholder((N, K), name='b', dtype=in_dtype)
k = te.reduce_axis((0, K), name='k')
C = te.compute(
(M, N),
lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k),
name='c')
A_buf = tvm.tir.decl_buffer(
A.shape, A.dtype,
name="A",
offset_factor=1,
strides=[te.var("A_s"), 1])
B_buf = tvm.tir.decl_buffer(
B.shape, B.dtype,
name="B",
offset_factor=1,
strides=[te.var("B_s"), 1])
C_buf = tvm.tir.decl_buffer(
C.shape, C.dtype,
name="C",
offset_factor=1,
strides=[te.var("C_s"), 1])
def intrin_func(ins, outs):
aa, bb = ins
cc = outs[0]
def _reduce_update():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
aa.strides[0],
bb.strides[0],
cc.strides[0]))
return ib.get()
def _reduce_reset():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset_{uniq_id}",
cc.access_ptr("w"),
cc.strides[0]))
return ib.get()
def _body():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
aa.strides[0],
bb.strides[0],
cc.strides[0]))
return ib.get()
return _body(), _reduce_reset(), _reduce_update()
with tvm.target.build_config(offset_factor=1):
intrin_decl = te.decl_tensor_intrin(
C.op, intrin_func, binds={A: A_buf, B: B_buf, C: C_buf})
return intrin_decl, uniq_id
def gemm_MxKxN_impl(M, K, N, uniq_id):
"""Emit C code for gemm impl."""
# TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly?
aa_pad_size = M * K
bb_pad_size = N * K
# code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
cc_code = f"""
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t aa_pad[{aa_pad_size}];
int16_t bb_pad[{bb_pad_size}];
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {N}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
sum = __SMLAD(
*((int32_t*) &aa_pad[i*{K} + l*2]),
*((int32_t*) &bb_pad[j*{K} + l*2]),
sum);
}}
// NOTE: this is the line where `*_body` differs from `*_update`. here
// we're *setting* the result, instead of accumulating, because we know
// the `i` and `j` itervars span their entire respective axes.
cc[i*C_stride + j] = sum;
}}
}}
return 0;
}}
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t aa_pad[{aa_pad_size}];
int16_t bb_pad[{bb_pad_size}];
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {N}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
sum = __SMLAD(
*((int32_t*) &aa_pad[i*{K} + l*2]),
*((int32_t*) &bb_pad[j*{K} + l*2]),
sum);
}}
cc[i*C_stride + j] += sum;
}}
}}
return 0;
}}
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
cc[i*C_stride + j] = 0;
}}
}}
return 0;
}}
"""
return cc_code
...@@ -24,7 +24,7 @@ def default_schedule(outs, auto_inline): ...@@ -24,7 +24,7 @@ def default_schedule(outs, auto_inline):
"""Default schedule for llvm.""" """Default schedule for llvm."""
target = tvm.target.Target.current(allow_none=False) target = tvm.target.Target.current(allow_none=False)
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
if target.target_name != "llvm": if target.target_name not in ("llvm", "c"):
raise RuntimeError("schedule not registered for '%s'" % target) raise RuntimeError("schedule not registered for '%s'" % target)
s = te.create_schedule([x.op for x in outs]) s = te.create_schedule([x.op for x in outs])
if auto_inline: if auto_inline:
......
...@@ -35,10 +35,8 @@ def _conv2d_nhwc_python(a_np, w_np, stride, padding): ...@@ -35,10 +35,8 @@ def _conv2d_nhwc_python(a_np, w_np, stride, padding):
stride : int or a list/tuple of two ints stride : int or a list/tuple of two ints
Stride size, or [stride_height, stride_width] Stride size, or [stride_height, stride_width]
padding : int or str or a list/tuple of 2 or 4 ints padding : int or str or a list/tuple of two ints
Padding size, or ['VALID', 'SAME'], or Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width]
[pad_height, pad_width] for 2 ints, or
[pad_top, pad_left, pad_bottom, pad_right] for 2 ints
Returns Returns
------- -------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment