Unverified Commit 8d724968 by Andrew Reusch Committed by GitHub

[RUNTIME][uTVM] AutoTVM + uTVM for Cortex-M7 (#5417)

* Prototype for micro TVM.

* Cleanup and sync micro tvm prototype.

* Use /std:c++14 with MSVC.

 * Per tqchen: project has already moved to C++14
 * Presubmit failed for code that built locally on gcc.

* fix ASF lint, and fix add_asf_header too

* Compiles with USE_MICRO=OFF.

* Cleanup TargetPtr and word size representations.

* fix compile warning

* address logan's comments

* address logan and liangfu comments

* address thierry's comments

* address u99127, liangfu, tmoreau89 comments

Co-authored-by: Logan Weber <weberlo@cs.washington.edu>
parent ae89afe0
Subproject commit 981b1c32f91668e669ee376856f92f36cfd2a351
Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28
......@@ -304,12 +304,15 @@ include(cmake/modules/contrib/TFLite.cmake)
include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
include(cmake/modules/contrib/CoreML.cmake)
include(CheckCXXCompilerFlag)
if(NOT MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
message(STATUS "Build with c++14")
set(CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_STANDARD 14)
else()
check_cxx_compiler_flag("/std:c++14" SUPPORT_CXX14)
set(CMAKE_CXX_FLAGS "/std:c++14 ${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_STANDARD 14)
endif()
add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
......
......@@ -73,7 +73,8 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
cpplint:
python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \
python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp \
include src \
examples/extension/src examples/graph_executor/src
pylint:
......
......@@ -930,6 +930,8 @@ constexpr const char* loop_scope = "loop_scope";
constexpr const char* reduce_scope = "reduce_scope";
/*! \brief Mark region is guarded by the pragma extension */
constexpr const char* pragma_scope_prefix = "pragma_";
/*! \brief Import C source or file into the final code gen module */
constexpr const char* pragma_import_c = "pragma_import_c";
/*! \brief Import llvm source or file into the final code gen module */
constexpr const char* pragma_import_llvm = "pragma_import_llvm";
/*! \brief Try to modify the AST to support Tensor Core */
......
......@@ -145,7 +145,7 @@ class LocalExecutor(executor.Executor):
if not self.do_fork:
return LocalFutureNoFork(func(*args, **kwargs))
queue = Queue(2)
queue = Queue(2) # Size of 2 to avoid a race condition with size 1.
process = Process(target=call_with_timeout,
args=(queue, self.timeout, func, args, kwargs))
process.start()
......
......@@ -245,6 +245,8 @@ class RPCRunner(Runner):
if 'cuda' in self.task.target.keys:
kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
if self.task.target.device_name == 'micro_dev':
kwargs.setdefault('build_option', {})['disable_vectorize'] = True
return kwargs
......@@ -273,7 +275,8 @@ class RPCRunner(Runner):
if isinstance(res, Exception): # executor error or timeout
results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
self.timeout, time.time()))
else:
raise Exception(f'encountered exception during measurement: {results}')
results.append(res)
return results
......
......@@ -48,6 +48,7 @@ def _lower(mod,
grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
grc.codegen(mod["main"])
return
# default case
# Try graph codegen first to extract autotvm tasks.
# If failed to compile, then fallback to use VM compiler.
......
......@@ -149,7 +149,7 @@ def progress_bar(total, prefix='', si_prefix='G'):
if res.error_no == 0:
flops = inp.task.flop / np.mean(res.costs)
if logger.level < logging.DEBUG: # only print progress bar in non-debug mode
if not logger.isEnabledFor(logging.DEBUG): # only print progress bar in non-debug mode
ctx.cur_flops = flops
ctx.best_flops = tuner.best_flops
......
......@@ -50,7 +50,11 @@ class GATuner(Tuner):
# space info
self.space = task.config_space
self.dims = [len(x) for x in self.space.space_map.values()]
self.dim_keys = []
self.dims = []
for k, v in self.space.space_map.items():
self.dim_keys.append(k)
self.dims.append(len(v))
self.visited = set([])
......@@ -123,7 +127,7 @@ class GATuner(Tuner):
if len(self.visited) < len(self.space):
while knob2point(tmp_gene, self.dims) in self.visited:
j = np.random.randint(len(self.dims))
tmp_gene[j] = np.random.randint(self.dims[j])
tmp_gene[j] = np.random.randint(self.dims[j]) # pylint: disable=invalid-sequence-index
next_genes.append(tmp_gene)
self.visited.add(knob2point(tmp_gene, self.dims))
else:
......
......@@ -21,7 +21,9 @@ import subprocess
import tvm._ffi
from . import util
# TODO does this file still belong in `contrib`. is it too µTVM-specific?
# TODO shouldn't need so many `ALIGN` directives
RELOCATION_LD_SCRIPT_TEMPLATE = """
/* linker symbol for use in UTVMInit */
_utvm_stack_pointer_init = 0x{stack_pointer_init:x};
......@@ -118,7 +120,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
size of the section in bytes
"""
if not os.path.isfile(binary_path):
raise RuntimeError("no such file \"{}\"".format(binary_path))
raise RuntimeError('no such file "{}"'.format(binary_path))
# We use the "-A" flag here to get the ".rodata" section's size, which is
# not included by default.
size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path])
......@@ -160,6 +162,10 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
# padding for most cases, but symbols can be arbitrarily large, so this
# isn't bulletproof.
return section_size + 32
# NOTE: in the past, section_size has been wrong on x86. it may be
# inconsistent. TODO: maybe stop relying on `*size` to give us the size and
# instead read the section with `*objcopy` and count the bytes.
return section_size
......@@ -206,11 +212,13 @@ def tvm_callback_relocate_binary(
rel_bin : bytearray
the relocated binary
"""
assert text_start < rodata_start < data_start < bss_start < stack_end
stack_pointer_init = stack_end - word_size
ld_script_contents = ""
# TODO(weberlo): There should be a better way to configure this for different archs.
# TODO is this line even necessary?
if "riscv" in toolchain_prefix:
ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n"
ld_script_contents += 'OUTPUT_ARCH( "riscv" )\n\n'
ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format(
word_size=word_size,
text_start=text_start,
......@@ -221,7 +229,7 @@ def tvm_callback_relocate_binary(
tmp_dir = util.tempdir()
rel_obj_path = tmp_dir.relpath("relocated.obj")
rel_ld_script_path = tmp_dir.relpath("relocated.lds")
rel_ld_script_path = tmp_dir.relpath("relocate.lds")
with open(rel_ld_script_path, "w") as f:
f.write(ld_script_contents)
run_cmd([
......@@ -229,8 +237,23 @@ def tvm_callback_relocate_binary(
binary_path,
"-T", rel_ld_script_path,
"-o", rel_obj_path])
with open(rel_obj_path, "rb") as f:
rel_bin = bytearray(f.read())
gdb_init_dir = os.environ.get("MICRO_GDB_INIT_DIR")
if gdb_init_dir is not None:
gdb_init_path = f"{gdb_init_dir}/.gdbinit"
with open(gdb_init_path, "r") as f:
gdbinit_contents = f.read().split("\n")
new_contents = []
for line in gdbinit_contents:
new_contents.append(line)
if line.startswith("target"):
new_contents.append(f"add-symbol-file {rel_obj_path}")
with open(gdb_init_path, "w") as f:
f.write("\n".join(new_contents))
return rel_bin
......
......@@ -20,6 +20,7 @@ from __future__ import absolute_import
import argparse
import ast
import json
import multiprocessing
import sys
import logging
......@@ -41,7 +42,7 @@ def main(args):
tracker_addr = (url, port)
if not args.key:
raise RuntimeError(
"Need key to present type of resource when tracker is available")
'Need key to present type of resource when tracker is available')
else:
tracker_addr = None
......@@ -75,8 +76,8 @@ def init_utvm(args):
dev_config = json.load(dev_conf_file)
else:
dev_config_args = ast.literal_eval(args.utvm_dev_config_args)
default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config']
dev_config = default_config_func(*dev_config_args)
generate_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['generate_config']
dev_config = generate_config_func(*dev_config_args)
if args.utvm_dev_config or args.utvm_dev_id:
# add MicroTVM overrides
......@@ -100,8 +101,8 @@ if __name__ == "__main__":
parser.add_argument('--port-end', type=int, default=9199,
help='The end search port of the RPC')
parser.add_argument('--tracker', type=str,
help="The address of RPC tracker in host:port format. "
"e.g. (10.77.1.234:9190)")
help=("The address of RPC tracker in host:port format. "
"e.g. (10.77.1.234:9190)"))
parser.add_argument('--key', type=str, default="",
help="The key used to identify the device type in tracker.")
parser.add_argument('--silent', action='store_true',
......@@ -115,12 +116,19 @@ if __name__ == "__main__":
parser.add_argument('--custom-addr', type=str,
help="Custom IP Address to Report to RPC Tracker")
parser.add_argument('--utvm-dev-config', type=str,
help='JSON config file for the target device (if using MicroTVM)')
parser.add_argument('--utvm-dev-id', type=str,
help='Unique ID for the target device (if using MicroTVM)')
help=('JSON config file for the target device (if using MicroTVM). '
'This file should contain serialized output similar to that returned '
"from the device module's generate_config. Can't be specified when "
'--utvm-dev-config-args is specified.'))
parser.add_argument('--utvm-dev-config-args', type=str,
help=('Python list of literals required to generate a default'
' MicroTVM config (if --utvm-dev-id is specified)'))
help=("Arguments to the device module's generate_config function. "
'Must be a python literal parseable by literal_eval. If specified, '
"the device configuration is generated using the device module's "
"generate_config. Can't be specified when --utvm-dev-config is "
"specified."))
parser.add_argument('--utvm-dev-id', type=str,
help=('Unique ID for the target device (if using MicroTVM). Should '
'match the name of a module underneath tvm.micro.device).'))
parser.set_defaults(fork=True)
args = parser.parse_args()
......
......@@ -17,6 +17,7 @@
"""MicroTVM module for bare-metal backends"""
from ..contrib import binutil
from .base import Session, create_micro_mod, cross_compiler
from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir
from .base import DEVICE_SECTIONS
from .base import Session, create_micro_mod, cross_compiler, LibType
from .base import get_micro_host_driven_dir, get_micro_device_dir
from . import device
......@@ -19,6 +19,7 @@
from __future__ import absolute_import
import os
import re
import sys
from enum import Enum
......@@ -28,6 +29,18 @@ import tvm._ffi
from tvm.contrib import util as _util
from tvm.contrib import cc as _cc
# all sections that comprise a device's memory layout, in order from lowest
# starting address to highest
DEVICE_SECTIONS = [
"text",
"rodata",
"data",
"bss",
"args",
"heap",
"workspace",
"stack",
]
class LibType(Enum):
"""Enumeration of library types that can be compiled and loaded onto a device"""
......@@ -51,9 +64,9 @@ class Session:
.. code-block:: python
c_mod = ... # some module generated with "c" as the target
dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666)
dev_config = micro.device.arm.stm32f746xx.default_config('127.0.0.1', 6666)
with tvm.micro.Session(dev_config) as sess:
micro_mod = create_micro_mod(c_mod, dev_config)
micro_mod = sess.create_micro_mod(c_mod)
"""
def __init__(self, config):
......@@ -62,19 +75,20 @@ class Session:
# grab a binutil instance from the ID in the config
dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"])
self.create_micro_lib = dev_funcs["create_micro_lib"]
self.toolchain_prefix = config["toolchain_prefix"]
self.mem_layout = config["mem_layout"]
self.word_size = config["word_size"]
self.word_size_bits = config["word_size_bits"]
self.thumb_mode = config["thumb_mode"]
self.use_device_timer = config["use_device_timer"]
self.comms_method = config["comms_method"]
# First, find and compile runtime library.
runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c")
tmp_dir = _util.tempdir()
runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj")
self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME)
#input(f"check {runtime_obj_path}: ")
options = ["-I{}".format(get_micro_host_driven_dir())]
dev_funcs["create_micro_lib"](
runtime_obj_path, runtime_src_path, LibType.RUNTIME, options=options)
comms_method = config["comms_method"]
if comms_method == "openocd":
......@@ -86,6 +100,8 @@ class Session:
else:
raise RuntimeError(f"unknown communication method: f{self.comms_method}")
assert all(map(lambda sec: sec in self.mem_layout, DEVICE_SECTIONS)), \
"not all sections have an assigned memory layout"
self.module = _CreateSession(
comms_method,
runtime_obj_path,
......@@ -106,12 +122,15 @@ class Session:
self.mem_layout["workspace"]["size"],
self.mem_layout["stack"].get("start", 0),
self.mem_layout["stack"]["size"],
self.word_size,
self.word_size_bits,
self.thumb_mode,
self.use_device_timer,
server_addr,
server_port)
self._enter = self.module["enter"]
self._exit = self.module["exit"]
self.get_last_batch_time = self.module["get_last_batch_time"]
self.get_last_batch_cycles = self.module["get_last_batch_cycles"]
def _check_system(self):
"""Check if the user's system is supported by MicroTVM.
......@@ -119,7 +138,7 @@ class Session:
Raises error if not supported.
"""
if not sys.platform.startswith("linux"):
raise RuntimeError("MicroTVM is currently only supported on Linux hosts")
raise RuntimeError("MicroTVM is currently only supported on Linux")
# TODO(weberlo): Add 32-bit support.
# It's primarily the compilation pipeline that isn't compatible.
if sys.maxsize <= 2**32:
......@@ -133,44 +152,91 @@ class Session:
self._exit()
def create_micro_mod(c_mod, dev_config):
def _calc_max_workspace_usage(src):
# TODO factor in alignment to the calculation (alloc sizes will be aligned up to the word size)
alloc_re = re.compile(
r'.*\* ?(.+) = (\(.+\))? TVMBackendAllocWorkspace\(.+, .+, \(uint64_t\)(.+), .+, .+\).*')
free_re = re.compile(r'.*if \(TVMBackendFreeWorkspace\(.+, .+, (\(void\*\))? (.+)\) != 0\) {.*')
max_usage = 0
alloc_map = {}
for line in src.split("\n"):
if line.strip().startswith("//"):
continue
match = alloc_re.match(line)
if match is not None:
alloc_map[match.group(1)] = int(match.group(3))
max_usage = max(max_usage, sum(alloc_map.values()))
else:
match = free_re.match(line)
if match is not None:
print(alloc_map)
del alloc_map[match.group(2)]
return max_usage
def create_micro_mod(c_mod, dev_config, lib_src_paths=None, lib_headers=None,
lib_include_paths=None):
"""Produces a micro module from a given module.
Parameters
----------
c_mod : tvm.runtime.Module
c_mod : tvm.module.Module
module with "c" as its target backend
dev_config : Dict[str, Any]
MicroTVM config dict for the target device
lib_src_paths: TODO
TODO
lib_headers: TODO
TODO
lib_include_paths: TODO
TODO
Return
------
micro_mod : tvm.runtim.Module
micro_mod : tvm.module.Module
micro module for the target device
"""
temp_dir = _util.tempdir()
lib_obj_path = temp_dir.relpath("dev_lib.obj")
# TODO use dev config to dispatch on the type of C codegen to run through
# (e.g., CodeGenCArm, CodeGenCHost, CodeGenCRiscV)
c_mod.export_library(
lib_obj_path,
fcompile=cross_compiler(dev_config, LibType.OPERATOR))
fcompile=cross_compiler(
dev_config,
LibType.OPERATOR,
lib_src_paths=lib_src_paths,
lib_headers=lib_headers,
lib_include_paths=lib_include_paths))
micro_mod = tvm.runtime.load_module(lib_obj_path)
return micro_mod
def cross_compiler(dev_config, lib_type):
"""Create a cross-compile function that wraps `create_lib` for a `Binutil` instance.
def cross_compiler(dev_config, lib_type, lib_src_paths=None, lib_headers=None,
lib_include_paths=None):
"""Create a cross compile function that wraps `create_lib` for a `Binutil` instance.
For use in `tvm.runtime.Module.export_library`.
Parameters
----------
dev_config : Dict[str, Any]
MicroTVM config dict for the target device
create_micro_lib : func
function for creating MicroTVM libraries for a specific device (e.g.,
`tvm.micro.device.get_device_funcs('arm.stm32f746xx')['create_micro_lib']`)
lib_type : micro.LibType
whether to compile a MicroTVM runtime or operator library
lib_src_paths: TODO
TODO
lib_headers: TODO
e.g., `['cmsis_gcc.h', 'arm_math.h']`
lib_include_paths: TODO
TODO
Return
------
func : Callable[[str, str, Optional[str]], None]
......@@ -183,16 +249,49 @@ def cross_compiler(dev_config, lib_type):
c_mod = ... # some module generated with "c" as the target
fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR)
c_mod.export_library("dev_lib.obj", fcompile=fcompile)
c_mod.export_library('dev_lib.obj', fcompile=fcompile)
"""
dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id'])
create_micro_lib = dev_funcs['create_micro_lib']
assert (lib_headers is None) == (lib_include_paths is None), \
"must specify both `lib_headers` and `lib_include_paths` or neither"
if lib_src_paths is None:
lib_src_paths = []
if lib_include_paths is None:
lib_include_paths = []
include_options = []
for include_path in lib_include_paths:
include_options.append("-I")
include_options.append(include_path)
create_micro_lib = tvm.micro.device.get_device_funcs(
dev_config["device_id"])["create_micro_lib"]
mem_layout = dev_config["mem_layout"]
def compile_func(obj_path, src_path, **kwargs):
if isinstance(obj_path, list):
obj_path = obj_path[0]
if isinstance(src_path, list):
src_path = src_path[0]
create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None))
options = kwargs.get("options", [])
options += include_options
# check that workspace allocations don't exceed available workspace memory
with open(src_path) as f:
src_contents = f.read()
max_ws_usage = _calc_max_workspace_usage(src_contents)
available_mem = mem_layout["workspace"]["size"]
if max_ws_usage > available_mem:
raise RuntimeError(f"workspace allocations in library ({max_ws_usage}) "
f"exceed available memory ({available_mem})")
# inject headers into new source path, if requested
if lib_headers:
headers_to_inject = "\n".join(map(lambda s: f"#include <{s}>", lib_headers)) + "\n"
new_src_contents = headers_to_inject + src_contents
tmp_dir = _util.tempdir()
src_path = tmp_dir.relpath(os.path.basename(src_path))
with open(src_path, "w") as f:
f.write(new_src_contents)
create_micro_lib(obj_path, src_path, lib_type, options, lib_src_paths=lib_src_paths)
return _cc.cross_compiler(compile_func, output_format="obj")
......
......@@ -16,7 +16,8 @@
# under the License.
"""Device-specific configuration for MicroTVM"""
from .base import register_device, get_device_funcs, create_micro_lib_base
from .base import create_micro_lib_base, gen_mem_layout
from .base import MemConstraint, register_device, get_device_funcs
from . import host
from . import arm
from . import riscv_spike
......@@ -14,13 +14,32 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Compilation and config definitions for ARM STM32F746XX devices"""
from .. import create_micro_lib_base, register_device
"""Compilation and config definitions for Arm STM32F746XX devices"""
import os
from .. import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "arm.stm32f746xx"
TOOLCHAIN_PREFIX = "arm-none-eabi-"
WORD_SIZE_BITS = 32
#
# [Device Memory Layout]
# RAM (rwx) : START = 0x20000000, LENGTH = 320K
# Flash (rx) : START = 0x8000000, LENGTH = 1024K
#
BASE_ADDR = 0x20000000
AVAILABLE_MEM = 320000
DEFAULT_SECTION_CONSTRAINTS = {
"text": (18000, MemConstraint.ABSOLUTE_BYTES),
"rodata": (100, MemConstraint.ABSOLUTE_BYTES),
"data": (100, MemConstraint.ABSOLUTE_BYTES),
"bss": (600, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (100.0, MemConstraint.WEIGHT),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (32, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None):
def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options
Parameters
......@@ -36,23 +55,40 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]]
additional options to pass to GCC
lib_src_paths : Optional[List[str]]
TODO
"""
if options is None:
options = []
else:
options = list(options)
options += [
# TODO(weberlo): make a debug flag
"-O2",
"-mcpu=cortex-m7",
"-mlittle-endian",
"-mfloat-abi=hard",
"-mfpu=fpv5-sp-d16",
"-mthumb",
"-ffast-math",
"-gdwarf-5",
"-DARM_MATH_CM7",
"-D__FPU_PRESENT=1U",
"-DARM_MATH_DSP",
"-Wno-unused-variable",
"-Wno-unused-parameter",
"-I{}".format(os.environ["CMSIS_ST_PATH"]),
"-I{}/Core/Include".format(os.environ["CMSIS_ST_PATH"])
]
create_micro_lib_base(
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
lib_src_paths=lib_src_paths)
def default_config(server_addr, server_port):
"""Generates a default configuration for ARM STM32F746XX devices
def generate_config(server_addr, server_port, section_constraints=None):
"""Generates a configuration for Arm STM32F746XX devices
Parameters
----------
......@@ -62,55 +98,23 @@ def default_config(server_addr, server_port):
server_port : int
port of OpenOCD server to connect to
section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
maps section name to the quantity of available memory
Return
------
config : Dict[str, Any]
MicroTVM config dict for this device
"""
if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
return {
"device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX,
#
# [Device Memory Layout]
# RAM (rwx) : START = 0x20000000, LENGTH = 320K
# FLASH (rx) : START = 0x8000000, LENGTH = 1024K
#
"mem_layout": {
"text": {
"start": 0x20000180,
"size": 20480,
},
"rodata": {
"start": 0x20005180,
"size": 20480,
},
"data": {
"start": 0x2000a180,
"size": 768,
},
"bss": {
"start": 0x2000a480,
"size": 768,
},
"args": {
"start": 0x2000a780,
"size": 1280,
},
"heap": {
"start": 0x2000ac80,
"size": 262144,
},
"workspace": {
"start": 0x2004ac80,
"size": 20480,
},
"stack": {
"start": 0x2004fc80,
"size": 80,
},
},
"word_size": 4,
"mem_layout": gen_mem_layout(BASE_ADDR, AVAILABLE_MEM, WORD_SIZE_BITS, section_constraints),
"word_size_bits": WORD_SIZE_BITS,
"thumb_mode": True,
"use_device_timer": True,
"comms_method": "openocd",
"server_addr": server_addr,
"server_port": server_port,
......@@ -119,5 +123,5 @@ def default_config(server_addr, server_port):
register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib,
"default_config": default_config,
"generate_config": generate_config,
})
......@@ -17,12 +17,13 @@
"""Base definitions for MicroTVM config"""
import glob
import os
from pathlib import Path
import enum
import pathlib
from tvm.contrib import util as _util
from tvm.contrib.binutil import run_cmd
from tvm._ffi.libinfo import find_include_path
from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir
from tvm.micro import DEVICE_SECTIONS, LibType, get_micro_host_driven_dir, get_micro_device_dir
_DEVICE_REGISTRY = {}
......@@ -38,7 +39,7 @@ def register_device(device_id, device_funcs):
dictionary with compilation and config generation functions as values
"""
if device_id in _DEVICE_REGISTRY:
raise RuntimeError(f"\"{device_id}\" already exists in the device registry")
raise RuntimeError(f'"{device_id}" already exists in the device registry')
_DEVICE_REGISTRY[device_id] = device_funcs
......@@ -56,7 +57,7 @@ def get_device_funcs(device_id):
dictionary with compilation and config generation functions as values
"""
if device_id not in _DEVICE_REGISTRY:
raise RuntimeError(f"\"{device_id}\" does not exist in the binutil registry")
raise RuntimeError(f'"{device_id}" does not exist in the binutil registry')
device_funcs = _DEVICE_REGISTRY[device_id]
return device_funcs
......@@ -67,7 +68,9 @@ def create_micro_lib_base(
toolchain_prefix,
device_id,
lib_type,
options=None):
options=None,
lib_src_paths=None,
):
"""Compiles code into a binary for the target micro device.
Parameters
......@@ -92,7 +95,12 @@ def create_micro_lib_base(
options : List[str]
additional options to pass to GCC
lib_src_paths : Optional[List[str]]
paths to additional source files to be compiled into the library
"""
# look at these (specifically `strip`):
# https://stackoverflow.com/questions/15314581/g-compiler-flag-to-minimize-binary-size
base_compile_cmd = [
f"{toolchain_prefix}gcc",
"-std=c11",
......@@ -100,7 +108,6 @@ def create_micro_lib_base(
"-Wextra",
"--pedantic",
"-c",
"-O0",
"-g",
"-nostartfiles",
"-nodefaultlibs",
......@@ -114,40 +121,48 @@ def create_micro_lib_base(
src_paths = []
include_paths = find_include_path() + [get_micro_host_driven_dir()]
tmp_dir = _util.tempdir()
# we might transform the src path in one of the branches below
# we need to create a new src file in the operator branch
new_in_src_path = in_src_path
if lib_type == LibType.RUNTIME:
dev_dir = _get_device_source_dir(device_id)
dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
# there needs to at least be a utvm_timer.c file
assert dev_src_paths
assert "utvm_timer.c" in map(os.path.basename, dev_src_paths)
src_paths += dev_src_paths
elif lib_type == LibType.OPERATOR:
# create a temporary copy of the source, so we can inject the dev lib
# create a temporary copy of the operator source, so we can inject the dev lib
# header without modifying the original.
temp_src_path = tmp_dir.relpath("temp.c")
with open(in_src_path, "r") as f:
src_lines = f.read().splitlines()
src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"")
src_lines.insert(0, '#include "utvm_device_dylib_redirect.c"')
with open(temp_src_path, "w") as f:
f.write("\n".join(src_lines))
new_in_src_path = temp_src_path
base_compile_cmd += ["-c"]
else:
raise RuntimeError("unknown lib type")
src_paths += [new_in_src_path]
# add any src paths required by the operator
if lib_src_paths is not None:
src_paths += lib_src_paths
# print(f"include paths: {include_paths}")
for path in include_paths:
base_compile_cmd += ["-I", path]
prereq_obj_paths = []
# print(src_paths)
for src_path in src_paths:
curr_obj_path = Path(src_path).with_suffix(".o").name
curr_obj_path = tmp_dir.relpath(pathlib.Path(src_path).with_suffix(".o").name)
assert curr_obj_path not in prereq_obj_paths
prereq_obj_paths.append(curr_obj_path)
curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
# TODO(weberlo): make compilation fail if there are any warnings
run_cmd(curr_compile_cmd)
ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
......@@ -156,6 +171,65 @@ def create_micro_lib_base(
run_cmd(ld_cmd)
# TODO we shouldn't need an enum for this. too much bureaucracy.
class MemConstraint(enum.Enum):
"""Represents a constraint on the device's memory layout"""
ABSOLUTE_BYTES = 0
WEIGHT = 1
def gen_mem_layout(base_addr, available_mem, word_size_bits, section_constraints):
"""Template function to generate memory layout for devices.
Parameters
----------
base_addr: Number
The address where usable memory begins on this device.
available_mem: Number
Available memory at base_addr, given in bytes.
word_size_bits: Number
Number of bits in one word on this device.
section_constraints: Optional[Dict[str, [Number, MemConstraint]]]
maps section name to the quantity of available memory
"""
assert word_size_bits in (32, 64), "only 32- or 64-bit devices are supported now"
word_size_bytes = word_size_bits // 8
byte_sum = sum(x[0]
for x in section_constraints.values()
if x[1] == MemConstraint.ABSOLUTE_BYTES)
weight_sum = sum(x[0]
for x in section_constraints.values()
if x[1] == MemConstraint.WEIGHT)
assert byte_sum <= available_mem
available_weight_mem = available_mem - byte_sum
res = {}
curr_addr = base_addr
for section in DEVICE_SECTIONS:
(val, cons_type) = section_constraints[section]
if cons_type == MemConstraint.ABSOLUTE_BYTES:
assert val % word_size_bytes == 0, \
f"constraint {val} for {section} section is not word-aligned"
size = val
res[section] = {
"start": curr_addr,
"size": size,
}
else:
size = int((val / weight_sum) * available_weight_mem)
size = (size // word_size_bytes) * word_size_bytes
res[section] = {
"start": curr_addr,
"size": size,
}
curr_addr += size
return res
def _get_device_source_dir(device_id):
"""Grabs the source directory for device-specific uTVM files"""
dev_subdir = "/".join(device_id.split("."))
......
......@@ -17,12 +17,26 @@
"""Compilation and config definitions for the host emulated device"""
import sys
from . import create_micro_lib_base, register_device
from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "host"
TOOLCHAIN_PREFIX = ""
WORD_SIZE_BITS = 64 if sys.maxsize > 2**32 else 32
def create_micro_lib(obj_path, src_path, lib_type, options=None):
# we pretend we only have 320kb in the default case, so we can use `gen_mem_layout`
DEFAULT_AVAILABLE_MEM = 3200000
DEFAULT_SECTION_CONSTRAINTS = {
"text": (20480, MemConstraint.ABSOLUTE_BYTES),
"rodata": (20480, MemConstraint.ABSOLUTE_BYTES),
"data": (768, MemConstraint.ABSOLUTE_BYTES),
"bss": (4096, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (262144, MemConstraint.ABSOLUTE_BYTES),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (80, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options
Parameters
......@@ -38,59 +52,65 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]]
additional options to pass to GCC
lib_src_paths : Optional[List[str]]
paths to additional source files to be compiled into the library
"""
if options is None:
options = []
else:
options = list(options)
# Cannot increase optimization level on host due to code loading method.
options.append("-O0")
if sys.maxsize > 2**32 and sys.platform.startswith("linux"):
options += ["-mcmodel=large"]
create_micro_lib_base(
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options,
lib_src_paths=lib_src_paths)
def default_config():
"""Generates a default configuration for the host emulated device
def generate_config(available_mem=None, section_constraints=None):
"""Generates a configuration for the host emulated device
Parameters
----------
available_mem: int
number of RW bytes available for use on device
section_constraints: Optional[Dict[str, Dict[Number, MemConstraint]]]
maps section name to the quantity of available memory
Return
------
config : Dict[str, Any]
MicroTVM config dict for this device
"""
if available_mem is None:
available_mem = DEFAULT_AVAILABLE_MEM
if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
mem_layout = gen_mem_layout(0, available_mem, WORD_SIZE_BITS, section_constraints)
# TODO the host emulated device is an outlier, since we don't know how what
# its base address will be until we've created it in the C++. is there any
# way to change the infrastructure around this so it's not so much of an
# outlier?
# need to zero out all start addresses, because they don't make sense for a
# host device (the memory region is allocated in the backend)
for section in mem_layout:
mem_layout[section]["start"] = 0
return {
"device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX,
"mem_layout": {
"text": {
"size": 20480,
},
"rodata": {
"size": 20480,
},
"data": {
"size": 768,
},
"bss": {
"size": 768,
},
"args": {
"size": 1280,
},
"heap": {
"size": 262144,
},
"workspace": {
"size": 20480,
},
"stack": {
"size": 80,
},
},
"word_size": 8 if sys.maxsize > 2**32 else 4,
"mem_layout": mem_layout,
"word_size_bits": WORD_SIZE_BITS,
"thumb_mode": False,
"use_device_timer": False,
"comms_method": "host",
}
register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib,
"default_config": default_config,
"generate_config": generate_config,
})
......@@ -15,14 +15,25 @@
# specific language governing permissions and limitations
# under the License.
"""Compilation and config definitions for Spike, a RISC-V functional ISA simulator"""
from collections import OrderedDict
from . import create_micro_lib_base, register_device
from . import create_micro_lib_base, register_device, gen_mem_layout, MemConstraint
DEVICE_ID = "riscv_spike"
TOOLCHAIN_PREFIX = "riscv64-unknown-elf-"
WORD_SIZE_BITS = 64
def create_micro_lib(obj_path, src_path, lib_type, options=None):
DEFAULT_SECTION_CONSTRAINTS = {
"text": (18000, MemConstraint.ABSOLUTE_BYTES),
"rodata": (128, MemConstraint.ABSOLUTE_BYTES),
"data": (128, MemConstraint.ABSOLUTE_BYTES),
"bss": (2048, MemConstraint.ABSOLUTE_BYTES),
"args": (4096, MemConstraint.ABSOLUTE_BYTES),
"heap": (100.0, MemConstraint.WEIGHT),
"workspace": (64000, MemConstraint.ABSOLUTE_BYTES),
"stack": (32, MemConstraint.ABSOLUTE_BYTES),
}
def create_micro_lib(obj_path, src_path, lib_type, options=None, lib_src_paths=None):
"""Wrapper over `create_micro_lib_base` to add device-specific options
Parameters
......@@ -38,6 +49,9 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
options : Optional[List[str]]
additional options to pass to GCC
lib_src_paths : Optional[List[str]]
TODO
"""
create_micro_lib_base(
obj_path,
......@@ -45,11 +59,13 @@ def create_micro_lib(obj_path, src_path, lib_type, options=None):
TOOLCHAIN_PREFIX,
DEVICE_ID,
lib_type,
options=options)
options=options,
lib_src_paths=lib_src_paths
)
def default_config(base_addr, server_addr, server_port):
"""Generates a default configuration for Spike
def generate_config(base_addr, available_mem, server_addr, server_port, section_constraints=None):
"""Generates a configuration for Spike
Parameters
----------
......@@ -62,56 +78,31 @@ def default_config(base_addr, server_addr, server_port):
server_port : int
port of OpenOCD server to connect to
TODO correct type annotation?
section_constraints: Optional[Dict[str, Tuple[Number, MemConstraint]]]
TODO
Return
------
config : Dict[str, Any]
MicroTVM config dict for this device
"""
res = {
if section_constraints is None:
section_constraints = DEFAULT_SECTION_CONSTRAINTS
return {
"device_id": DEVICE_ID,
"toolchain_prefix": TOOLCHAIN_PREFIX,
"mem_layout": OrderedDict([
("text", {
"size": 20480,
}),
("rodata", {
"size": 20480,
}),
("data", {
"size": 768,
}),
("bss", {
"size": 768,
}),
("args", {
"size": 1280,
}),
("heap", {
"size": 262144,
}),
("workspace", {
"size": 20480,
}),
("stack", {
"size": 80,
}),
]),
"word_size": 4,
"thumb_mode": True,
"mem_layout": gen_mem_layout(base_addr, available_mem, WORD_SIZE_BITS, section_constraints),
"word_size_bits": WORD_SIZE_BITS,
"thumb_mode": False,
"use_device_timer": False,
"comms_method": "openocd",
"server_addr": server_addr,
"server_port": server_port,
}
# generate section start addresses from the given `base_addr`
curr_offset = 0
mem_layout = res["mem_layout"]
for region_dict in mem_layout.values():
region_dict["start"] = base_addr + curr_offset
curr_offset += region_dict["size"]
return res
register_device(DEVICE_ID, {
"create_micro_lib": create_micro_lib,
"default_config": default_config,
"generate_config": generate_config,
})
......@@ -151,7 +151,9 @@ FUNC_OPS = {
"nn.dropout": op.nn.dropout_raw,
"zeros": op.zeros,
"split": op.split,
"cast": op.cast
"cast": op.cast,
"clip": op.clip,
"right_shift": op.right_shift,
}
TYPE_PREFIXES = [
......
......@@ -20,24 +20,25 @@ import re
import logging
import topi
from ....target import arm_isa
from .generic import *
from .. import op as _op
logger = logging.getLogger('strategy')
@schedule_injective.register("arm_cpu")
@schedule_injective.register(["arm_cpu", "micro_dev"])
def schedule_injective_arm_cpu(_, outs, target):
"""schedule injective ops for arm cpu"""
with target:
return topi.arm_cpu.schedule_injective(outs)
@schedule_concatenate.register("arm_cpu")
@schedule_concatenate.register(["arm_cpu", "micro_dev"])
def schedule_concatenate_arm_cpu(_, outs, target):
"""schedule concatenate for arm cpu"""
with target:
return topi.arm_cpu.schedule_concatenate(outs)
@conv2d_strategy.register("arm_cpu")
@conv2d_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d arm cpu strategy"""
strategy = _op.OpStrategy()
......@@ -51,6 +52,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
if dilation_h < 1 or dilation_w < 1:
raise ValueError("dilation should be positive value")
isa = arm_isa.IsaAnalyzer(target)
if groups == 1:
if layout == "NCHW":
if kernel_layout == "OIHW":
......@@ -102,12 +105,23 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
name="conv2d_hwcn.generic")
elif layout == "NHWC":
assert kernel_layout == "HWIO"
channels = data.shape[3]
if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI":
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd),
name='conv2d_direct_simd.micro_dev')
elif kernel_layout == "HWIO":
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
name="conv2d_nhwc_spatial_pack.arm_cpu")
else:
raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC".
format(kernel_layout))
else:
raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout))
elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
if layout == "NCHW":
......@@ -232,7 +246,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out
format(layout))
return strategy
@conv2d_transpose_strategy.register("arm_cpu")
@conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d_transpose arm cpu strategy"""
layout = attrs.data_layout
......
......@@ -325,7 +325,10 @@ class Server(object):
key="",
load_library=None,
custom_addr=None,
silent=False):
silent=False,
utvm_dev_id=None,
utvm_dev_config_args=None,
):
try:
if base._ServerLoop is None:
raise RuntimeError("Please compile with USE_RPC=1")
......@@ -355,6 +358,10 @@ class Server(object):
cmd += ["--custom-addr", custom_addr]
if silent:
cmd += ["--silent"]
if utvm_dev_id is not None:
assert utvm_dev_config_args is not None
cmd += [f"--utvm-dev-id={utvm_dev_id}"]
cmd += [f"--utvm-dev-config-args={utvm_dev_config_args}"]
# prexec_fn is not thread safe and may result in deadlock.
# python 3.2 introduced the start_new_session parameter as
......
......@@ -109,7 +109,6 @@ class Module(object):
# pylint: disable=not-callable
return self.entry_func(*args)
def __repr__(self):
return "Module(%s, %x)" % (self.type_key, self.handle.value)
......
......@@ -219,7 +219,7 @@ def context(dev_type, dev_id=0):
"""
if isinstance(dev_type, string_types):
if '-device=micro_dev' in dev_type:
dev_type = 'micro_dev'
dev_type = TVMContext.STR2MASK['micro_dev']
else:
dev_type = dev_type.split()[0]
if dev_type not in TVMContext.STR2MASK:
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Defines functions to analyze available opcodes in the ARM ISA."""
ARM_ISA_MAP = {
'armv7e-m': ['SMLAD'],
}
class IsaAnalyzer(object):
def __init__(self, target):
self.target = target
# TODO: actually parse -mcpu
arch = 'armv7e-m'
self._isa_map = ARM_ISA_MAP[arch]
def __contains__(self, instruction):
return instruction in self._isa_map
......@@ -272,7 +272,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs,
Target target_host_val = target_host;
if (!target_host.defined()) {
for (const auto& it : inputs) {
if (it.first->device_type == kDLCPU) {
if (it.first->device_type == kDLCPU || it.first->device_type == kDLMicroDev) {
target_host_val = it.first;
break;
}
......
......@@ -17,11 +17,6 @@
* under the License.
*/
/*!
* \file utvm_init.s
* \brief uTVM init definition for STM32F746XX-series boards
*/
.syntax unified
.cpu cortex-m7
.fpu softvfp
......
......@@ -29,100 +29,51 @@ extern "C" {
#include <stdint.h>
#include "utvm_runtime.h"
// NOTE: This expects ST CMSIS to be in your include path.
// Download STM32CubeF7 here:
// https://www.st.com/content/st_com/en/products/embedded-software/mcu-mpu-embedded-software/stm32-embedded-software/stm32cube-mcu-mpu-packages/stm32cubef7.html
// and add Drivers/CMSIS to your C include path.
#include "Device/ST/STM32F7xx/Include/stm32f746xx.h"
// There are two implementations of cycle counters on the STM32F7X: SysTick and
// CYCCNT. SysTick is preferred, as it gives better error handling, but the
// counter is only 24 bits wide. If a larger timer is needed, use the CYCCNT
// implementation, which has a 32-bit counter.
#define USE_SYSTICK
#ifdef USE_SYSTICK
#define SYST_CSR (*((volatile uint32_t *) 0xE000E010))
#define SYST_RVR (*((volatile uint32_t *) 0xE000E014))
#define SYST_CVR (*((volatile uint32_t *) 0xE000E018))
#define SYST_CALIB (*((volatile uint32_t *) 0xE000E01C))
#define SYST_CSR_ENABLE 0
#define SYST_CSR_TICKINT 1
#define SYST_CSR_CLKSOURCE 2
#define SYST_COUNTFLAG 16
#define SYST_CALIB_NOREF 31
#define SYST_CALIB_SKEW 30
uint32_t start_time = 0;
uint32_t stop_time = 0;
#define utvm_SystemCoreClock 216000000UL
int32_t UTVMTimerStart() {
SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE);
// wait until timer starts
while (SYST_CVR == 0) {}
start_time = SYST_CVR;
return 0;
}
void UTVMTimerStop() {
SYST_CSR = 0;
stop_time = SYST_CVR;
}
void UTVMTimerReset() {
SYST_CSR = 0;
// maximum reload value (24-bit)
SYST_RVR = (~((uint32_t) 0)) >> 8;
SYST_CVR = 0;
UTVMTimerReset();
TIM2->CR1 =
TIM_CR1_CEN; // Start counter
return UTVM_ERR_OK;
}
uint32_t UTVMTimerRead() {
if (SYST_CSR & SYST_COUNTFLAG) {
TVMAPISetLastError("timer overflowed");
return -1;
} else {
return start_time - stop_time;
uint32_t UTVMTimerStop(int32_t* err) {
TIM2->CR1 &= TIM_CR1_CEN;
if (TIM2->SR & TIM_SR_UIF_Msk) {
*err = UTVM_ERR_TIMER_OVERFLOW;
return 0;
}
*err = UTVM_ERR_OK;
uint32_t tim_cnt = TIM2->CNT;
uint32_t millis = tim_cnt / (utvm_SystemCoreClock / 1000);
uint32_t micros =
(tim_cnt - (millis * (utvm_SystemCoreClock / 1000))) /
(utvm_SystemCoreClock / 1000000);
return millis * 1000 + micros;
}
#else // !USE_SYSTICK
#define DWT_CTRL (*((volatile uint32_t *) 0xE0001000))
#define DWT_CYCCNT (*((volatile uint32_t *) 0xE0001004))
#define DWT_CTRL_NOCYCCNT 25
#define DWT_CTRL_CYCCNTENA 0
uint32_t start_time = 0;
uint32_t stop_time = 0;
void UTVMTimerReset() {
DWT_CYCCNT = 0;
}
int32_t UTVMTimerStart() {
if (DWT_CTRL & DWT_CTRL_NOCYCCNT) {
TVMAPISetLastError("cycle counter not implemented on device");
return -1;
RCC->APB1RSTR |= RCC_APB1RSTR_TIM2RST; // Hold TIM2 in reset
RCC->DCKCFGR1 = (RCC->DCKCFGR1 & ~RCC_DCKCFGR1_TIMPRE_Msk); // disable 2x clock boost to TIM2
RCC->CFGR = (RCC->CFGR & ~RCC_CFGR_PPRE1_Msk); // No AHB clock division to APB1 (1:1).
RCC->APB1ENR |= RCC_APB1ENR_TIM2EN; // Enable TIM2 clock.
RCC->APB1RSTR &= ~RCC_APB1RSTR_TIM2RST; // Exit TIM2 reset.
DBGMCU->APB1FZ |= DBGMCU_APB1_FZ_DBG_TIM2_STOP; // stop TIM2 clock during debug halt.
TIM2->ARR = 0xffffffff;
if (TIM2->SR & TIM_SR_UIF_Msk) {
for (;;) ;
}
start_time = DWT_CYCCNT;
DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA);
}
void UTVMTimerStop() {
stop_time = DWT_CYCCNT;
DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
}
int32_t UTVMTimerRead() {
if (stop_time > stop_time) {
return stop_time - start_time;
} else {
uint32_t largest = ~0;
return (largest - start_time) + stop_time;
}
}
#endif // USE_SYSTICK
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
......@@ -22,26 +22,16 @@
* \brief uTVM timer API stubs for the host emulated device
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "utvm_runtime.h"
// TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c
int32_t UTVMTimerStart() {
return 0;
return UTVM_ERR_OK;
}
void UTVMTimerStop() { }
void UTVMTimerReset() { }
uint32_t UTVMTimerRead() {
return 1;
uint32_t UTVMTimerStop(int32_t* err) {
*err = UTVM_ERR_OK;
return 0;
}
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
UTVMInit:
/* set stack pointer */
la sp, _utvm_stack_pointer_init
call UTVMMain
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file utvm_timer.c
* \brief uTVM timer API stubs for Spike
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "utvm_runtime.h"
int32_t UTVMTimerStart() {
return UTVM_ERR_OK;
}
uint32_t UTVMTimerStop(int32_t* err) {
*err = UTVM_ERR_OK;
return 0;
}
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
......@@ -32,10 +32,11 @@ extern "C" {
#include <stdint.h>
#include <stddef.h>
void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) =
(void *(*)(int, int, uint64_t, int, int)) NULL;
int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL;
void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL;
// TODO(weberlo, areusch): compiler errors say volatile qualifier is discarded.
// should we just get rid of em?
void* (* volatile TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = NULL;
int (* volatile TVMBackendFreeWorkspace_)(int, int, void*) = NULL;
void (* volatile TVMAPISetLastError_)(const char*) = NULL;
void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
int dtype_code_hint, int dtype_bits_hint) {
......@@ -51,6 +52,41 @@ void TVMAPISetLastError(const char* msg) {
(*TVMAPISetLastError_)(msg);
}
void *memset(void *s, int c, size_t n) {
char *p = (char*) s; // NOLINT(readability/casting): linter is configured for c++
while (n > 0) {
*p = (char) c; // NOLINT(readability/casting): linter is configured for c++
p++;
n--;
}
return s;
}
void *memmove(void *to, const void *from, size_t n) {
// TODO(weberlo, areusch): will need to factor memmove calls into workspace size calculation
// NOLINTNEXTLINE(readability/casting): linter is configured for c++
char *temp = (char*) TVMBackendAllocWorkspace(1, 1, (uint64_t) n, 2, 8);
if (temp == NULL) {
return NULL;
}
const char *from_pp = (char*) from; // NOLINT(readability/casting): linter is configured for c++
for (size_t i = 0; i < n; i++) {
temp[i] = from_pp[i];
}
char *to_pp = (char*) to; // NOLINT(readability/casting): linter is configured for c++
for (size_t i = 0; i < n; i++) {
to_pp[i] = temp[i];
}
// NOLINTNEXTLINE(readability/casting): linter is configured for c++
if (TVMBackendFreeWorkspace(1, (uint64_t) 1, (void*) temp) != 0) {
return NULL;
}
return to;
}
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
......@@ -34,89 +34,148 @@ extern "C" {
#include "utvm_runtime.h"
// Task pointers must be patched before calling a function.
UTVMTask utvm_task = {
.func = NULL,
.arg_values = NULL,
.arg_type_codes = NULL,
.num_args = 0,
};
size_t utvm_word_size = 0; // NOLINT(*)
// TODO(weberlo, areusch): move defines into header
// TODO(weberlo, areusch): unify TASK_QUEUE_SIZE and MicroSession::kTaskQueueCapacity.
#define TASK_QUEUE_SIZE 20
volatile UTVMTask utvm_tasks[TASK_QUEUE_SIZE] = { };
volatile uint32_t utvm_num_tasks = 0;
volatile uint32_t utvm_task_times[TASK_QUEUE_SIZE] = { };
// These pointers are patched at load time to point to the workspace section.
char* utvm_workspace_start = NULL; // NOLINT(*)
char* utvm_workspace_end = NULL; // NOLINT(*)
char* utvm_workspace_curr = NULL; // NOLINT(*)
volatile char* utvm_workspace_start = NULL; // NOLINT(*)
volatile char* utvm_workspace_end = NULL; // NOLINT(*)
volatile char* utvm_workspace_curr = NULL; // NOLINT(*)
#define MAX_WS_ALLOCS 10
volatile char* utvm_alloc_ends[MAX_WS_ALLOCS] = {}; // NOLINT(*)
volatile uint32_t utvm_alloc_idx = 0;
// Keep track of how many active allocations there are on the workspace.
size_t utvm_num_active_allocs = 0;
volatile uint32_t utvm_num_active_allocs = 0;
volatile uint32_t utvm_word_size = 0;
const char* utvm_last_error = NULL; // NOLINT(*)
int32_t utvm_return_code = 0; // NOLINT(*)
volatile int32_t utvm_last_error = 0; // NOLINT(*)
uint32_t utvm_task_time = 0;
volatile uint32_t utvm_done = 0;
// Gets called by UTVMInit, after device-specific initialization is finished.
void UTVMMain() {
utvm_done = 0;
// loss of precision should be fine here, since we only care about the lower bits
if (((uint32_t) utvm_workspace_start) % utvm_word_size) {
utvm_last_error = UTVM_ERR_WS_UNALIGNED_START;
UTVMDone();
return;
}
utvm_workspace_curr = utvm_workspace_start;
utvm_num_active_allocs = 0;
utvm_last_error = NULL; // NOLINT(*)
utvm_return_code = 0;
utvm_task_time = 0;
UTVMTimerReset();
int32_t err = UTVMTimerStart();
utvm_alloc_idx = 0;
utvm_last_error = UTVM_ERR_NOT_FINISHED;
for (uint32_t i = 0; i < utvm_num_tasks; i++) {
int32_t err = UTVM_ERR_OK;
utvm_task_times[i] = 0;
err = UTVMTimerStart();
if (err < 0) {
utvm_return_code = err;
utvm_last_error = err;
UTVMDone();
return;
}
err = utvm_tasks[i].func(
(void*) utvm_tasks[i].arg_values, // NOLINT(*)
(void*) utvm_tasks[i].arg_type_codes, // NOLINT(*)
utvm_tasks[i].num_args);
if (err < 0) {
UTVMDone();
return;
}
utvm_task_times[i] = UTVMTimerStop(&err);
if (err < 0) {
utvm_last_error = err;
UTVMDone();
return;
}
}
if (utvm_last_error == UTVM_ERR_NOT_FINISHED) {
utvm_last_error = UTVM_ERR_OK;
}
utvm_return_code = utvm_task.func(
(void*) utvm_task.arg_values, // NOLINT(*)
(void*) utvm_task.arg_type_codes, // NOLINT(*)
utvm_task.num_args);
UTVMTimerStop();
utvm_task_time = UTVMTimerRead();
UTVMDone();
}
// We use a dummy function to signal execution is finished for device
// backends which require breakpoints.
void UTVMDone() { }
void __attribute__((noinline)) UTVMDone() {
utvm_done = 1;
}
#define ALIGNED_UP(x, word_size) \
((((word_size) - (((uintptr_t) (x)) % (word_size))) % (word_size)) + (x))
void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
int dtype_code_hint, int dtype_bits_hint) {
// Align up to 8 bytes.
utvm_workspace_curr +=
(utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size; // NOLINT(*)
if (utvm_workspace_curr + size > utvm_workspace_end) {
if (size == 0) {
utvm_last_error = UTVM_ERR_WS_ZERO_SIZE_ALLOC;
return NULL;
}
size_t alloc_requested_bytes = size;
size_t alloc_size_words = (alloc_requested_bytes + utvm_word_size - 1) / utvm_word_size;
size_t alloc_size_bytes = alloc_size_words * utvm_word_size;
// Align up to the target word size.
if (utvm_workspace_curr + alloc_size_bytes > utvm_workspace_end) {
// Out of space in workspace.
utvm_last_error = UTVM_ERR_WS_OUT_OF_SPACE;
return NULL;
}
if (utvm_alloc_idx == MAX_WS_ALLOCS - 1) {
// Exceeded number of allocs we can keep track of.
utvm_last_error = UTVM_ERR_WS_TOO_MANY_ALLOCS;
return NULL;
}
void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*)
utvm_workspace_curr += size;
utvm_workspace_curr = utvm_workspace_curr + alloc_size_bytes;
// store the *end* of the alloc, so we can restore the WS pointer when freeing
utvm_alloc_ends[utvm_alloc_idx] = utvm_workspace_curr;
utvm_alloc_idx++;
utvm_num_active_allocs++;
return ret_ptr;
}
int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
utvm_num_active_allocs--;
if (utvm_num_active_allocs < 0) {
// TODO(weberlo, areusch): add dev type check
if (utvm_num_active_allocs == 0) {
TVMAPISetLastError("free called with no active workspace allocations");
// Reset allocations and workspace (for future task executions).
utvm_num_active_allocs = 0;
utvm_workspace_curr = utvm_workspace_start;
utvm_last_error = UTVM_ERR_WS_DOUBLE_FREE;
return -1;
} else if (utvm_num_active_allocs == 0) {
// No more allocations. Reset workspace.
} else {
utvm_num_active_allocs--;
if (ptr == utvm_workspace_start) {
// it's the first allocation
utvm_alloc_ends[0] = NULL;
} else {
for (uint32_t i = utvm_alloc_idx - 1; i >= 0; i--) {
if (utvm_alloc_ends[i] == ptr) {
utvm_alloc_ends[i + 1] = NULL;
break;
}
}
}
while (utvm_alloc_idx > 0 && utvm_alloc_ends[utvm_alloc_idx - 1] == NULL) {
utvm_alloc_idx--;
}
if (utvm_alloc_idx == 0) {
utvm_workspace_curr = utvm_workspace_start;
return 0;
} else {
// TODO(weberlo, areusch): could you possibly have utvm_alloc_idx pointing to a NULL entry in
// this branch?
utvm_workspace_curr = utvm_alloc_ends[utvm_alloc_idx - 1];
}
return 0;
}
}
void TVMAPISetLastError(const char* msg) {
utvm_last_error = msg;
}
void TVMAPISetLastError(const char* msg) { }
#ifdef __cplusplus
} // TVM_EXTERN_C
......
......@@ -33,6 +33,22 @@ extern "C" {
#include <tvm/runtime/c_backend_api.h>
/*!
* \brief TODO
*/
enum UTVMReturnCode {
UTVM_ERR_OK = 0,
UTVM_ERR_NOT_FINISHED = -1,
UTVM_ERR_TIMER_NOT_IMPLEMENTED = -2,
UTVM_ERR_TIMER_OVERFLOW = -3,
UTVM_ERR_WS_DOUBLE_FREE = -4,
UTVM_ERR_WS_OUT_OF_SPACE = -5,
UTVM_ERR_WS_TOO_MANY_ALLOCS = -6,
UTVM_ERR_WS_ZERO_SIZE_ALLOC = -7,
UTVM_ERR_WS_UNALIGNED_START = -8,
UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE = -9,
};
/*!
* \brief Task structure for uTVM
*/
typedef struct {
......@@ -46,20 +62,46 @@ typedef struct {
int32_t num_args;
} UTVMTask;
/*!
* \brief microTVM processor startup.
* Expected to reset the stack pointer, configure any hardware required to support the CRT
* (i.e. FPU), and then jump to UTVMMain.
*/
extern void UTVMInit();
extern void UTVMTimerReset();
/*!
* \brief Start the on-device timer.
* \return UTVMReturnCode indicating the outcome of the operation.
*/
extern int32_t UTVMTimerStart();
extern void UTVMTimerStop();
extern uint32_t UTVMTimerRead();
/*!
* \brief Stop the on-device timer.
* TODO(areusch): Use an SI specification of timer units here.
* \param err Receives a UTVMReturnCode indicating the outcome of the operation.
* \return elapsed time since UTVMTimerStart returned, in device timer ticks.
*/
extern uint32_t UTVMTimerStop(int32_t* err);
/*!
* \brief Main entry point for UTVM runtime.
* Waits for "go" signal, then executes tasks and reports result. Should never return.
*/
void UTVMMain();
/*!
* \brief Function entered when UTVMMain is complete.
* Should never return. The host sets a breakpoint here to detect end of computation.
*/
void UTVMDone();
// GCC -O3 begins to inject memset and memmove calls, so we provide impls in
// the runtime for this case and for general usage.
void *memset(void *s, int c, size_t n);
void *memmove(void *to, const void *from, size_t n);
#ifdef __cplusplus
} // TVM_EXTERN_C
#endif
......
......@@ -43,14 +43,15 @@ class HostLowLevelDevice final : public LowLevelDevice {
* \brief constructor to initialize on-host memory region to act as device
* \param num_bytes size of the emulated on-device memory region
*/
explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) {
explicit HostLowLevelDevice(size_t num_bytes, TargetPtr* base_addr) : size_(num_bytes) {
size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize;
// TODO(weberlo): Set permissions per section (e.g., read-write perms for
// the heap, execute perms for text, etc.).
int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE;
base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0);
*base_addr = base_addr_;
*base_addr = TargetPtr(TargetWordSize(sizeof(size_t) * 8),
reinterpret_cast<uint64_t>(base_addr_));
}
/*!
......@@ -60,16 +61,16 @@ class HostLowLevelDevice final : public LowLevelDevice {
munmap(base_addr_, size_);
}
void Read(DevPtr addr, void* buf, size_t num_bytes) {
void Read(TargetPtr addr, void* buf, size_t num_bytes) {
std::memcpy(buf, addr.cast_to<void*>(), num_bytes);
}
void Write(DevPtr addr, const void* buf, size_t num_bytes) {
void Write(TargetPtr addr, const void* buf, size_t num_bytes) {
std::memcpy(addr.cast_to<void*>(), buf, num_bytes);
}
void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
reinterpret_cast<void (*)(void)>(func_addr.value().val64)();
void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) {
reinterpret_cast<void (*)(void)>(func_addr.value().uint64())();
}
const char* device_type() const final {
......@@ -83,9 +84,9 @@ class HostLowLevelDevice final : public LowLevelDevice {
size_t size_;
};
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) {
std::shared_ptr<LowLevelDevice> lld =
std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
TargetPtr* base_addr) {
std::shared_ptr<LowLevelDevice> lld = std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
return lld;
}
......
......@@ -45,7 +45,7 @@ class LowLevelDevice {
* \param buffer on-host buffer to be read into
* \param num_bytes number of bytes to read
*/
virtual void Read(DevPtr addr,
virtual void Read(TargetPtr addr,
void* buffer,
size_t num_bytes) = 0;
......@@ -55,7 +55,7 @@ class LowLevelDevice {
* \param buffer host buffer to write from
* \param num_bytes number of bytes to write
*/
virtual void Write(DevPtr addr,
virtual void Write(TargetPtr addr,
const void* buffer,
size_t num_bytes) = 0;
......@@ -64,7 +64,7 @@ class LowLevelDevice {
* \param func_addr offset of the init stub function
* \param breakpoint_addr address at which to stop function execution
*/
virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0;
virtual void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) = 0;
/*!
* \brief getter function for low-level device type
......@@ -78,7 +78,8 @@ class LowLevelDevice {
* \param num_bytes size of the memory region
* \param base_addr pointer to write the host device's resulting base address into
*/
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr);
const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes,
TargetPtr* base_addr);
/*!
* \brief connect to OpenOCD and create an OpenOCD low-level device
......
......@@ -51,18 +51,18 @@ const char* SectionToString(SectionKind section) {
std::string RelocateBinarySections(
const std::string& binary_path,
size_t word_size,
DevPtr text_start,
DevPtr rodata_start,
DevPtr data_start,
DevPtr bss_start,
DevPtr stack_end,
TargetWordSize word_size,
TargetPtr text_start,
TargetPtr rodata_start,
TargetPtr data_start,
TargetPtr bss_start,
TargetPtr stack_end,
const std::string& toolchain_prefix) {
const auto* f = Registry::Get("tvm_callback_relocate_binary");
CHECK(f != nullptr)
<< "Require tvm_callback_relocate_binary to exist in registry";
std::string relocated_bin = (*f)(binary_path,
word_size,
word_size.bytes(),
text_start.cast_to<uint64_t>(),
rodata_start.cast_to<uint64_t>(),
data_start.cast_to<uint64_t>(),
......@@ -91,7 +91,7 @@ std::string ReadSection(const std::string& binary,
size_t GetSectionSize(const std::string& binary_path,
SectionKind section,
const std::string& toolchain_prefix,
size_t align) {
TargetWordSize word_size) {
CHECK(section == SectionKind::kText || section == SectionKind::kRodata ||
section == SectionKind::kData || section == SectionKind::kBss)
<< "GetSectionSize requires section to be one of text, rodata, data, or bss.";
......@@ -99,7 +99,7 @@ size_t GetSectionSize(const std::string& binary_path,
CHECK(f != nullptr)
<< "Require tvm_callback_get_section_size to exist in registry";
int size = (*f)(binary_path, SectionToString(section), toolchain_prefix);
return UpperAlignValue(size, align);
return UpperAlignValue(size, word_size.bytes());
}
} // namespace runtime
......
......@@ -30,6 +30,7 @@
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
namespace tvm {
namespace runtime {
......@@ -52,28 +53,115 @@ enum class SectionKind : size_t {
kNumKinds,
};
/*! \brief union for storing values on varying target word sizes */
union TargetVal {
/*! \brief 32-bit pointer */
uint32_t val32;
/*! \brief 64-bit pointer */
uint64_t val64;
/*! \brief data type for word sizes */
class TargetWordSize {
public:
explicit TargetWordSize(size_t word_size_bits) : word_size_bits_{word_size_bits} {
CHECK(word_size_bits == 32 || word_size_bits == 64)
<< "only 32-bit and 64-bit are supported now";
}
size_t bytes() const {
return word_size_bits_ / 8;
}
size_t bits() const {
return word_size_bits_;
}
private:
size_t word_size_bits_;
};
/*! \brief absolute device address */
class DevPtr {
/*! \brief class for storing values on varying target word sizes */
class TargetVal {
private:
size_t width_bits_;
uint64_t value_;
public:
/*! \brief construct a device address with value `value` */
explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {}
/*! \brief construct a TargetVal matching the size of the given integral argument */
template<typename T, typename U = typename std::enable_if<std::is_integral<T>::value, T>::type>
explicit constexpr TargetVal(T value) : TargetVal(sizeof(T) * 8, value) {}
/*! \brief construct an uninitialized value */
TargetVal() : width_bits_{0}, value_{0} {}
/*! \brief construct a TargetVal with explicit size and value */
TargetVal(size_t width_bits, uint64_t value) : width_bits_{width_bits} {
CHECK(width_bits >= 8 &&
width_bits <= 64 &&
(width_bits & (width_bits - 1)) == 0)
<< "width_bits must be a power of 2 in [8, 64], got " << width_bits;
value_ = value & Bitmask();
}
bool IsInitialized() const { return width_bits_ != 0; }
size_t width_bits() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
return width_bits_;
}
uint64_t Bitmask() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
if (width_bits_ == 64) {
return ~0UL;
} else {
return (1UL << width_bits_) - 1;
}
}
/*! \brief default constructor */
DevPtr() : value_(TargetVal { .val64 = 0 }) {}
uint32_t uint32() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
CHECK(width_bits_ <= 32) << "TargetVal: requested 32-bit value, actual width is "
<< width_bits_;
return uint32_t(value_ & Bitmask());
}
uint64_t uint64() const {
CHECK(IsInitialized()) << "TargetVal is not initialized";
return value_;
}
TargetVal& operator=(const TargetVal& other) {
CHECK(other.IsInitialized()) << "Cannot assign an uninitialized TargetVal";
if (!IsInitialized()) {
width_bits_ = other.width_bits_;
}
CHECK(width_bits_ >= other.width_bits_)
<< "Cannot assign TargetVal with width " << other.width_bits_
<< "bits to TargetVal with width " << width_bits_ << "bits";
value_ = other.value_ & Bitmask();
return *this;
}
};
// TODO(weberlo, areusch): just get rid of `TargetPtr`.
/*! \brief absolute device address */
class TargetPtr {
public:
/*! \brief construct a device address with variable-length value `value` */
TargetPtr(TargetWordSize word_size, std::uint64_t value) :
value_(TargetVal(word_size.bits(), value)) {}
/*! \brief construct a null address */
explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {}
TargetPtr(TargetWordSize word_size, std::nullptr_t value) :
value_{TargetVal(word_size.bits(), 0)} {}
/*! \brief construct an uninitialized pointer whose word_size can be changed once */
TargetPtr() = default;
/*! \brief construct a device address using the given TargetVal */
explicit TargetPtr(const TargetVal& value) : value_{value} {}
/*! \brief destructor */
~DevPtr() {}
~TargetPtr() {}
/*!
* \brief get value of pointer
......@@ -86,33 +174,33 @@ class DevPtr {
* \return casted result
*/
template <typename T>
T cast_to() const { return reinterpret_cast<T>(value_.val64); }
T cast_to() const { return reinterpret_cast<T>(value_.uint64()); }
/*! \brief check if location is null */
bool operator==(std::nullptr_t) const { return value_.val64 == 0; }
bool operator==(std::nullptr_t) const { return value_.uint64() == 0; }
/*! \brief check if location is not null */
bool operator!=(std::nullptr_t) const { return value_.val64 != 0; }
bool operator!=(std::nullptr_t) const { return value_.uint64() != 0; }
/*! \brief add an integer to this absolute address to get a larger absolute address */
DevPtr operator+(size_t n) const {
return DevPtr(value_.val64 + n);
TargetPtr operator+(size_t n) const {
return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() + n);
}
/*! \brief mutably add an integer to this absolute address */
DevPtr& operator+=(size_t n) {
value_.val64 += n;
TargetPtr& operator+=(size_t n) {
value_ = TargetVal(value_.width_bits(), value_.uint64() + n);
return *this;
}
/*! \brief subtract an integer from this absolute address to get a smaller absolute address */
DevPtr operator-(size_t n) const {
return DevPtr(value_.val64 - n);
TargetPtr operator-(size_t n) const {
return TargetPtr(TargetWordSize(value_.width_bits()), value_.uint64() - n);
}
/*! \brief mutably subtract an integer from this absolute address */
DevPtr& operator-=(size_t n) {
value_.val64 -= n;
TargetPtr& operator-=(size_t n) {
value_ = TargetVal(value_.width_bits(), value_.uint64() - n);
return *this;
}
......@@ -137,7 +225,8 @@ class SymbolMap {
* \param toolchain_prefix prefix of compiler toolchain to use
*/
SymbolMap(const std::string& binary,
const std::string& toolchain_prefix) {
const std::string& toolchain_prefix,
TargetWordSize word_size) {
const auto* f = Registry::Get("tvm_callback_get_symbol_map");
CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry";
TVMByteArray arr;
......@@ -152,7 +241,7 @@ class SymbolMap {
stream >> name;
stream >> std::hex >> addr;
while (stream) {
map_[name] = DevPtr(addr);
map_.emplace(std::make_pair(name, TargetPtr(word_size, addr)));
stream >> name;
stream >> std::hex >> addr;
}
......@@ -163,7 +252,7 @@ class SymbolMap {
* \param name name of the symbol
* \return on-device offset of the symbol
*/
DevPtr operator[](const std::string& name) const {
TargetPtr operator[](const std::string& name) const {
auto result = map_.find(name);
CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map";
return result->second;
......@@ -173,15 +262,21 @@ class SymbolMap {
return map_.find(name) != map_.end();
}
void Dump(std::ostream& stream) const {
for (auto e : map_) {
stream << "Entry:" << e.first << std::endl;
}
}
private:
/*! \brief backing map */
std::unordered_map<std::string, DevPtr> map_;
std::unordered_map<std::string, TargetPtr> map_;
};
/*! \brief struct containing start and size of a device memory region */
struct DevMemRegion {
/*! \brief section start offset */
DevPtr start;
TargetPtr start;
/*! \brief size of section */
size_t size;
};
......@@ -239,12 +334,12 @@ const char* SectionToString(SectionKind section);
*/
std::string RelocateBinarySections(
const std::string& binary_path,
size_t word_size,
DevPtr text_start,
DevPtr rodata_start,
DevPtr data_start,
DevPtr bss_start,
DevPtr stack_end,
TargetWordSize word_size,
TargetPtr text_start,
TargetPtr rodata_start,
TargetPtr data_start,
TargetPtr bss_start,
TargetPtr stack_end,
const std::string& toolchain_prefix);
/*!
......@@ -263,13 +358,13 @@ std::string ReadSection(const std::string& binary,
* \param binary input binary contents
* \param section section type
* \param toolchain_prefix prefix of compiler toolchain to use
* \param align alignment of the returned size (default: 8)
* \param word_size word size of the target, for alignment
* \return size of the section if it exists, 0 otherwise
*/
size_t GetSectionSize(const std::string& binary_name,
SectionKind section,
const std::string& toolchain_prefix,
size_t align);
TargetWordSize word_size);
} // namespace runtime
} // namespace tvm
......
......@@ -50,18 +50,14 @@ class MicroDeviceAPI final : public DeviceAPI {
size_t alignment,
DLDataType type_hint) final {
ObjectPtr<MicroSession>& session = MicroSession::Current();
void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to<void*>();
TargetPtr data = session->AllocateInSection(SectionKind::kHeap, nbytes);
CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap";
MicroDevSpace* dev_space = new MicroDevSpace();
dev_space->data = data;
dev_space->session = session;
return static_cast<void*>(dev_space);
return reinterpret_cast<void*>(new MicroDevSpace{data, session});
}
void FreeDataSpace(TVMContext ctx, void* ptr) final {
MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr);
dev_space->session->FreeInSection(
SectionKind::kHeap, DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
dev_space->session->FreeInSection(SectionKind::kHeap, dev_space->data);
delete dev_space;
}
......@@ -77,7 +73,6 @@ class MicroDeviceAPI final : public DeviceAPI {
std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type);
if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) {
// Copying from the device to the device.
MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
CHECK(from_space->session == to_space->session)
......@@ -87,58 +82,65 @@ class MicroDeviceAPI final : public DeviceAPI {
CHECK(ctx_from.device_id == ctx_to.device_id)
<< "can only copy between the same micro device";
ObjectPtr<MicroSession>& session = from_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);
TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
std::vector<uint8_t> buffer(size);
lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size);
lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size);
} else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) {
// Reading from the device.
MicroDevSpace* from_space = static_cast<MicroDevSpace*>(const_cast<void*>(from));
ObjectPtr<MicroSession>& session = from_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
TargetPtr from_dev_addr = GetDevLoc(from_space, from_offset);
void* to_host_ptr = GetHostLoc(to, to_offset);
lld->Read(from_dev_addr, to_host_ptr, size);
} else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) {
// Writing to the device.
MicroDevSpace* to_space = static_cast<MicroDevSpace*>(const_cast<void*>(to));
ObjectPtr<MicroSession>& session = to_space->session;
// flush all pending tasks to ensure data is consistent
session->FlushTaskQueue();
const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();
void* from_host_ptr = GetHostLoc(from, from_offset);
DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);
TargetPtr to_dev_addr = GetDevLoc(to_space, to_offset);
lld->Write(to_dev_addr, from_host_ptr, size);
} else {
LOG(FATAL) << "Expect copy from/to micro device or between micro device\n";
}
}
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
MicroSession::Current()->FlushTaskQueue();
}
void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final {
CHECK(false) << "the on-device workspace allocator isn't aware of this function";
ObjectPtr<MicroSession>& session = MicroSession::Current();
void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to<void*>();
CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace";
MicroDevSpace* dev_space = new MicroDevSpace();
dev_space->data = data;
dev_space->session = session;
return static_cast<void*>(dev_space);
TargetPtr data = session->AllocateInSection(SectionKind::kWorkspace, size);
CHECK(data.value().uint64() != 0)
<< "unable to allocate " << size << " bytes on device workspace";
return static_cast<void*>(new MicroDevSpace{data, session});
}
void FreeWorkspace(TVMContext ctx, void* data) final {
CHECK(false) << "the on-device workspace allocator isn't aware of this function";
MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
ObjectPtr<MicroSession>& session = dev_space->session;
session->FreeInSection(SectionKind::kWorkspace,
DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
session->FreeInSection(SectionKind::kWorkspace, dev_space->data);
delete dev_space;
}
......@@ -152,8 +154,8 @@ class MicroDeviceAPI final : public DeviceAPI {
}
private:
DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
return DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
TargetPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
return dev_space->data + offset;
}
void* GetHostLoc(const void* ptr, size_t offset) {
......
......@@ -54,6 +54,8 @@ class MicroModuleNode final : public ModuleNode {
* \param binary_path path of the binary to be loaded
*/
void InitMicroModule(const std::string& binary_path) {
// std::cout << "[MicroModuleNode::InitMicroModule]" << std::endl;
// std::cout << " start" << std::endl;
session_ = MicroSession::Current();
symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map;
}
......@@ -67,26 +69,26 @@ class MicroModuleNode final : public ModuleNode {
class MicroWrappedFunc {
public:
MicroWrappedFunc(ObjectPtr<MicroSession> session,
DevPtr func_ptr) {
TargetPtr func_ptr) {
session_ = session;
func_ptr_ = func_ptr;
}
void operator()(TVMArgs args, TVMRetValue* rv) const {
*rv = session_->PushToExecQueue(func_ptr_, args);
session_->PushToTaskQueue(func_ptr_, args);
}
private:
/*! \brief reference to the session for this function (to keep the session alive) */
ObjectPtr<MicroSession> session_;
/*! \brief offset of the function to be called */
DevPtr func_ptr_;
TargetPtr func_ptr_;
};
PackedFunc MicroModuleNode::GetFunction(
const std::string& name,
const ObjectPtr<Object>& sptr_to_self) {
DevPtr func_ptr;
TargetPtr func_ptr;
if (name == tvm::runtime::symbol::tvm_module_main) {
if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) {
func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main];
......
......@@ -23,6 +23,7 @@
#ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
#define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_
#include <string>
#include <unordered_map>
#include "micro_common.h"
......@@ -38,15 +39,18 @@ class MicroSectionAllocator {
* \brief constructor that specifies section boundaries
* \param region location and size of the section on the device
*/
explicit MicroSectionAllocator(DevMemRegion region, size_t word_size)
: start_addr_(region.start),
explicit MicroSectionAllocator(std::string section_name,
DevMemRegion region,
TargetWordSize word_size)
: section_name_(section_name),
start_addr_(region.start),
size_(0),
capacity_(region.size),
word_size_(word_size) {
CHECK_EQ(start_addr_.value().val64 % word_size, 0)
<< "micro section start not aligned to " << word_size << " bytes";
CHECK_EQ(capacity_ % word_size, 0)
<< "micro section end not aligned to " << word_size << " bytes";
CHECK_EQ(start_addr_.value().uint64() % word_size.bytes(), 0)
<< "micro section start not aligned to " << word_size.bytes() << " bytes";
CHECK_EQ(capacity_ % word_size.bytes(), 0)
<< "micro section end not aligned to " << word_size.bytes() << " bytes";
}
/*!
......@@ -56,17 +60,18 @@ class MicroSectionAllocator {
/*!
* \brief memory allocator
* \param size size of allocated memory in bytes
* \param alloc_size size of allocated memory in bytes
* \return pointer to allocated memory region in section, nullptr if out of space
*/
DevPtr Allocate(size_t size) {
size_ = UpperAlignValue(size_, word_size_);
TargetPtr Allocate(size_t size) {
size_ = UpperAlignValue(size_, word_size_.bytes());
CHECK(size_ + size < capacity_)
<< "cannot alloc " << size << " bytes in section with start_addr " <<
start_addr_.cast_to<void*>();
DevPtr alloc_addr = start_addr_ + size_;
<< "cannot alloc " << size << " bytes in section \""
<< section_name_ << "\" (start_addr=" << start_addr_.cast_to<void*>()
<< ", used=" << size_ << ", capacity=" << capacity_ << ")";
TargetPtr alloc_addr = start_addr_ + size_;
size_ += size;
alloc_map_[alloc_addr.value().val64] = size;
alloc_map_[alloc_addr.value().uint64()] = size;
return alloc_addr;
}
......@@ -75,10 +80,10 @@ class MicroSectionAllocator {
* \param offs offset to allocated memory
* \note simple allocator scheme, more complex versions will be implemented later
*/
void Free(DevPtr addr) {
CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end())
void Free(TargetPtr addr) {
CHECK(alloc_map_.find(addr.value().uint64()) != alloc_map_.end())
<< "freed pointer was never allocated";
alloc_map_.erase(addr.value().val64);
alloc_map_.erase(addr.value().uint64());
if (alloc_map_.empty()) {
size_ = 0;
}
......@@ -87,17 +92,17 @@ class MicroSectionAllocator {
/*!
* \brief start offset of the memory region managed by this allocator
*/
DevPtr start_addr() const { return start_addr_; }
TargetPtr start_addr() const { return start_addr_; }
/*!
* \brief current end addr of the space being used in this memory region
*/
DevPtr curr_end_addr() const { return start_addr_ + size_; }
TargetPtr curr_end_addr() const { return start_addr_ + size_; }
/*!
* \brief end addr of the memory region managed by this allocator
*/
DevPtr max_addr() const { return start_addr_ + capacity_; }
TargetPtr max_addr() const { return start_addr_ + capacity_; }
/*!
* \brief size of the section
......@@ -110,14 +115,16 @@ class MicroSectionAllocator {
size_t capacity() const { return capacity_; }
private:
/*! \brief name of the section (for debugging) */
std::string section_name_;
/*! \brief start address of the section */
DevPtr start_addr_;
TargetPtr start_addr_;
/*! \brief current size of the section */
size_t size_;
/*! \brief total storage capacity of the section */
size_t capacity_;
/*! \brief number of bytes in a word on the target device */
size_t word_size_;
TargetWordSize word_size_;
/*! \brief allocation map for allocation sizes */
std::unordered_map<uint64_t, size_t> alloc_map_;
};
......
......@@ -23,7 +23,10 @@
#include <dmlc/thread_local.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/device_api.h>
#include <chrono>
#include <memory>
#include <locale>
#include <stack>
#include <tuple>
#include <vector>
......@@ -77,14 +80,16 @@ MicroSession::MicroSession(
size_t workspace_size,
uint64_t stack_start,
size_t stack_size,
size_t word_size,
TargetWordSize word_size,
bool thumb_mode,
bool use_device_timer,
const std::string& server_addr,
int port)
: toolchain_prefix_(toolchain_prefix)
, word_size_(word_size)
, thumb_mode_(thumb_mode) {
CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_;
: toolchain_prefix_(toolchain_prefix),
word_size_(word_size),
thumb_mode_(thumb_mode),
use_device_timer_(use_device_timer),
batch_args_encoder_(args_size, word_size) {
if (comms_method == "host") {
// TODO(weberlo): move checks to python
CHECK(
......@@ -99,106 +104,138 @@ MicroSession::MicroSession(
size_t memory_size =
text_size + rodata_size + data_size + bss_size +
args_size + heap_size + workspace_size + stack_size;
void* base_addr;
TargetPtr base_addr;
low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr);
CHECK_EQ(reinterpret_cast<std::uintptr_t>(base_addr) % word_size_, 0)
<< "base address not aligned to " << word_size_ << " bytes";
DevPtr curr_addr = DevPtr(reinterpret_cast<std::uintptr_t>(base_addr));
CHECK_EQ(base_addr.value().uint64() % word_size.bytes(), 0)
<< "base address not aligned to " << word_size.bytes() << " bytes";
TargetPtr curr_addr = base_addr;
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
"text",
DevMemRegion {
.start = curr_addr,
.size = text_size,
}, word_size_);
curr_addr += text_size;
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
"rodata",
DevMemRegion {
.start = curr_addr,
.size = rodata_size,
}, word_size_);
curr_addr += rodata_size;
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
"data",
DevMemRegion {
.start = curr_addr,
.size = data_size,
}, word_size_);
curr_addr += data_size;
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
"bss",
DevMemRegion {
.start = curr_addr,
.size = bss_size,
}, word_size_);
curr_addr += bss_size;
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
"args",
DevMemRegion {
.start = curr_addr,
.size = args_size,
}, word_size_);
curr_addr += args_size;
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
"heap",
DevMemRegion {
.start = curr_addr,
.size = heap_size,
}, word_size_);
curr_addr += heap_size;
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
"workspace",
DevMemRegion {
.start = curr_addr,
.size = workspace_size,
}, word_size_);
curr_addr += workspace_size;
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
"stack",
DevMemRegion {
.start = curr_addr,
.size = stack_size,
}, word_size_);
curr_addr += stack_size;
} else if (comms_method == "openocd") {
low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port);
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(text_start),
section_allocators_[0] = std::make_shared<MicroSectionAllocator>(
"text",
DevMemRegion {
.start = TargetPtr(word_size_, text_start),
.size = text_size,
}, word_size_);
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(rodata_start),
section_allocators_[1] = std::make_shared<MicroSectionAllocator>(
"rodata",
DevMemRegion {
.start = TargetPtr(word_size_, rodata_start),
.size = rodata_size,
}, word_size_);
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(data_start),
section_allocators_[2] = std::make_shared<MicroSectionAllocator>(
"data",
DevMemRegion {
.start = TargetPtr(word_size_, data_start),
.size = data_size,
}, word_size_);
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(bss_start),
section_allocators_[3] = std::make_shared<MicroSectionAllocator>(
"bss",
DevMemRegion {
.start = TargetPtr(word_size_, bss_start),
.size = bss_size,
}, word_size_);
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(args_start),
section_allocators_[4] = std::make_shared<MicroSectionAllocator>(
"args",
DevMemRegion {
.start = TargetPtr(word_size_, args_start),
.size = args_size,
}, word_size_);
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(heap_start),
section_allocators_[5] = std::make_shared<MicroSectionAllocator>(
"heap",
DevMemRegion {
.start = TargetPtr(word_size_, heap_start),
.size = heap_size,
}, word_size_);
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(workspace_start),
section_allocators_[6] = std::make_shared<MicroSectionAllocator>(
"workspace",
DevMemRegion {
.start = TargetPtr(word_size_, workspace_start),
.size = workspace_size,
}, word_size_);
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
.start = DevPtr(stack_start),
section_allocators_[7] = std::make_shared<MicroSectionAllocator>(
"stack",
DevMemRegion {
.start = TargetPtr(word_size_, stack_start),
.size = stack_size,
}, word_size_);
} else {
LOG(FATAL) << "unsupported micro low-level device";
}
TargetPtr args_start_addr = GetAllocator(SectionKind::kArgs)->start_addr();
batch_args_encoder_.set_start_addr(args_start_addr);
runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map;
// Patch pointers to define the bounds of the workspace section and the word
// size (for allocation alignment).
std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace);
TargetVal ws_start = ws_allocator->start_addr().value();
TargetVal ws_end = ws_allocator->max_addr().value();
TargetVal target_word_size { .val64 = word_size_ };
if (word_size_ == 4) {
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32);
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32);
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32);
} else if (word_size_ == 8) {
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64);
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64);
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64);
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_allocator->start_addr());
DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_allocator->max_addr());
if (word_size.bytes() == 4) {
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint32_t(word_size.bytes()));
} else if (word_size.bytes() == 8) {
DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", uint64_t(word_size.bytes()));
} else {
CHECK(false) << "Unsupported word size unexpectedly here";
}
}
......@@ -209,59 +246,122 @@ MicroSession::~MicroSession() {
low_level_device_ = nullptr;
}
double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) {
void MicroSession::PushToTaskQueue(TargetPtr func_ptr, const TVMArgs& args) {
if (thumb_mode_) {
// TODO(areusch): should be |=
func_ptr += 1;
}
TargetVal func_dev_addr = func_ptr.value();
std::tuple<TargetPtr, TargetPtr> arg_field_addrs = EncoderAppend(&batch_args_encoder_, args);
TargetVal arg_values_dev_addr{std::get<0>(arg_field_addrs).value()};
TargetVal arg_type_codes_dev_addr{std::get<1>(arg_field_addrs).value()};
task_queue_.push_back(
DevTask {
.func = func_dev_addr,
.arg_values = arg_values_dev_addr,
.arg_type_codes = arg_type_codes_dev_addr,
.num_args = args.num_args
});
// Create an allocator stream for the memory region after the most recent
// allocation in the args section.
DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr();
TargetDataLayoutEncoder encoder(args_addr, word_size_);
std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&encoder, args);
// Flush `stream` to device memory.
DevPtr stream_dev_addr =
GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size());
low_level_device()->Write(stream_dev_addr,
reinterpret_cast<void*>(encoder.data()),
encoder.buf_size());
TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value();
TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value();
if (word_size_ == 4) {
UTVMTask32 task = {
.func = func_ptr.value().val32,
.arg_values = arg_values_dev_addr.val32,
.arg_type_codes = arg_type_codes_dev_addr.val32,
.num_args = args.num_args,
};
// Write the task.
DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
} else if (word_size_ == 8) {
UTVMTask64 task = {
.func = func_ptr.value().val64,
.arg_values = arg_values_dev_addr.val64,
.arg_type_codes = arg_type_codes_dev_addr.val64,
.num_args = args.num_args,
};
// Write the task.
DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
if (task_queue_.size() == MicroSession::kTaskQueueCapacity) {
FlushTaskQueue();
}
}
void MicroSession::FlushTaskQueue() {
if (task_queue_.size() == 0) {
// nothing to run
return;
}
if (word_size_.bytes() == 4) {
FlushTaskQueuePriv<StructUTVMTask32>();
} else if (word_size_.bytes() == 8) {
FlushTaskQueuePriv<StructUTVMTask64>();
}
}
template <typename T>
void MicroSession::FlushTaskQueuePriv() {
std::vector<T> prepped_tasks;
for (const auto& task : task_queue_) {
prepped_tasks.push_back(T(task));
}
DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
// Flush `args` to device memory.
low_level_device()->Write(
batch_args_encoder_.start_addr(),
reinterpret_cast<void*>(batch_args_encoder_.data()),
batch_args_encoder_.buf_size());
// Flush `tasks` to device memory.
TargetPtr dev_tasks_addr = runtime_symbol_map_["utvm_tasks"];
low_level_device()->Write(
dev_tasks_addr,
reinterpret_cast<void*>(prepped_tasks.data()),
prepped_tasks.size() * sizeof(T));
DevSymbolWrite<uint32_t>(runtime_symbol_map_, "utvm_num_tasks", prepped_tasks.size());
TargetPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
TargetPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
if (thumb_mode_) {
// TODO(areusch): should be |=
utvm_init_addr += 1;
}
std::chrono::time_point<
std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend;
tbegin = std::chrono::high_resolution_clock::now();
// std::string tmp;
// while (tmp[0] != 'd' && tmp[0] != 'e') {
// std::cout << "How to proceed? [Debug / Execute] ";
// getline(std::cin, tmp);
// CHECK(std::cin.good()) << "Stdin closed";
// tmp[0] = std::tolower(tmp[0]);
// }
// if (tmp[0] == 'd') {
// std::cout << "Launch debugger; [Enter] to resume automated execution";
// getline(std::cin, tmp);
// } else {
low_level_device()->Execute(utvm_init_addr, utvm_done_addr);
// }
tend = std::chrono::high_resolution_clock::now();
// Check if there was an error during execution. If so, log it.
CheckDeviceError();
uint32_t task_time = DevSymbolRead<uint32_t>(runtime_symbol_map_, "utvm_task_time");
GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr);
return static_cast<double>(task_time);
if (use_device_timer_) {
uint64_t sum = 0;
std::vector<uint32_t> times;
times.resize(task_queue_.size());
low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
times.data(),
task_queue_.size() * sizeof(uint32_t));
int i = 0;
for (uint32_t time : times) {
LOG(INFO) << "Time " << i++ << ": " << time;
sum += time;
}
last_batch_time_ += static_cast<double>(sum) / 1e3;
} else {
last_batch_time_ += std::chrono::duration_cast<std::chrono::duration<double> >
(tend - tbegin).count() * 1000;
// TODO(weberlo): Reading internal data structure is hacky.
uint64_t sum = 0;
std::vector<uint32_t> times;
times.resize(task_queue_.size());
low_level_device()->Read(runtime_symbol_map_["utvm_task_times"],
times.data(),
task_queue_.size() * sizeof(uint32_t));
for (uint32_t time : times) {
sum += time;
}
last_batch_cycles_ += static_cast<double>(sum);
}
batch_args_encoder_.Clear();
task_queue_.clear();
}
BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
......@@ -283,9 +383,6 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
data_section.start != nullptr && bss_section.start != nullptr)
<< "not enough space to load module on device";
std::string relocated_bin = RelocateBinarySections(
binary_path,
......@@ -305,7 +402,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
SymbolMap symbol_map {relocated_bin, toolchain_prefix_, word_size_};
if (patch_dylib_pointers) {
// Patch device lib pointers.
......@@ -323,7 +420,7 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d
};
}
std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
std::tuple<TargetPtr, TargetPtr> MicroSession::EncoderAppend(
TargetDataLayoutEncoder* encoder, const TVMArgs& args) {
const int* type_codes = args.type_codes;
int num_args = args.num_args;
......@@ -341,12 +438,13 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
// order to prevent premature session destruction.
void* old_data = base_arr_handle->data;
// Mutate the array to unwrap the `data` field.
base_arr_handle->data = reinterpret_cast<MicroDevSpace*>(old_data)->data;
MicroDevSpace* dev_arr_ptr = reinterpret_cast<MicroDevSpace*>(old_data);
base_arr_handle->data = reinterpret_cast<void*>(dev_arr_ptr->data.value().uint64());
// Now, encode the unwrapped version.
void* arr_ptr = nullptr;
if (word_size_ == 4) {
if (word_size_.bytes() == 4) {
arr_ptr = EncoderAppend<TVMArray32>(encoder, *base_arr_handle).cast_to<void*>();
} else if (word_size_ == 8) {
} else if (word_size_.bytes() == 8) {
arr_ptr = EncoderAppend<TVMArray64>(encoder, *base_arr_handle).cast_to<void*>();
}
// And restore the original wrapped version.
......@@ -371,7 +469,7 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
}
template <typename T>
DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) {
TargetPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr) {
auto tvm_arr_slot = encoder->Alloc<T>();
auto shape_slot = encoder->Alloc<int64_t>(arr.ndim);
......@@ -379,8 +477,8 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
// the device first. The `data` field is already allocated on the device and
// is a device pointer, so we don't need to write it.
shape_slot.WriteArray(arr.shape, arr.ndim);
DevPtr shape_dev_addr = shape_slot.start_addr();
DevPtr strides_dev_addr = DevPtr(nullptr);
TargetPtr shape_dev_addr = shape_slot.start_addr();
TargetPtr strides_dev_addr = TargetPtr(word_size_, nullptr);
if (arr.strides != nullptr) {
auto stride_slot = encoder->Alloc<int64_t>(arr.ndim);
stride_slot.WriteArray(arr.strides, arr.ndim);
......@@ -388,13 +486,13 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
}
T dev_arr(
TargetVal { .val64 = reinterpret_cast<uint64_t>(arr.data) },
TargetVal { word_size_.bits(), reinterpret_cast<uint64_t>(arr.data) },
arr.ctx,
arr.ndim,
arr.dtype,
shape_dev_addr.value(),
strides_dev_addr.value(),
TargetVal { .val64 = arr.byte_offset });
TargetVal { word_size_.bits(), arr.byte_offset });
CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev))
<< "attempt to write DLTensor with non-micro device type";
// Update the device type to CPU, because from the microcontroller's
......@@ -404,39 +502,70 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTen
return tvm_arr_slot.start_addr();
}
// TODO(weberlo): switch over entirely to error codes that expand to error
// messages on the host side.
void MicroSession::CheckDeviceError() {
int32_t return_code = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_return_code");
int32_t last_error = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_last_error");
if (return_code) {
std::uintptr_t last_error =
DevSymbolRead<std::uintptr_t>(runtime_symbol_map_, "utvm_last_error");
std::string last_error_str;
if (last_error) {
DevPtr last_err_addr = DevPtr(last_error);
last_error_str = ReadString(last_err_addr);
if (!use_device_timer_ &&
(last_error == UTVM_ERR_TIMER_OVERFLOW ||
last_error == UTVM_ERR_TIMER_NOT_IMPLEMENTED)) {
// these errors don't matter if we're not using the on-device timer
return;
}
std::string err_msg;
switch (last_error) {
case UTVM_ERR_NOT_FINISHED:
err_msg = "execution timed out";
break;
case UTVM_ERR_TIMER_NOT_IMPLEMENTED:
err_msg = "timer is not implemented for the target device";
break;
case UTVM_ERR_TIMER_OVERFLOW:
// TODO(weberlo): this should be remedied by using interrupts to accumulate the
// timer into a larger datatype (ARM timers are only 24 bits)
err_msg = "timer overflowed during execution";
break;
case UTVM_ERR_WS_DOUBLE_FREE:
err_msg = "free called with no active workspace allocations";
break;
case UTVM_ERR_WS_OUT_OF_SPACE:
err_msg = "ran out of space in workspace section";
break;
case UTVM_ERR_WS_TOO_MANY_ALLOCS:
err_msg = "exceeded number of allocs the runtime can keep track of";
break;
case UTVM_ERR_WS_ZERO_SIZE_ALLOC:
err_msg = "attempt to allocate scratchpad of size zero";
break;
case UTVM_ERR_WS_UNALIGNED_START:
err_msg = "start of workspace section is not word-aligned";
break;
case UTVM_ERR_WS_UNALIGNED_ALLOC_SIZE:
err_msg = "scratchpad allocation size is not a multiple of the word size";
break;
default:
err_msg = "unknown error code";
break;
}
LOG(FATAL) << "error during micro function execution:\n"
<< " return code: " << std::dec << return_code << "\n"
<< " dev str addr: 0x" << std::hex << last_error << "\n"
<< " dev str data: " << last_error_str << std::endl;
<< " error ID: " << std::dec << last_error << std::endl
<< " error message: " << err_msg;
}
}
void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) {
DevPtr runtime_impl_addr = runtime_symbol_map_[func_name];
TargetPtr runtime_impl_addr = runtime_symbol_map_[func_name];
if (thumb_mode_) {
runtime_impl_addr += 1;
}
std::ostringstream func_name_underscore;
func_name_underscore << func_name << "_";
if (word_size_ == 4) {
DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32);
} else if (word_size_ == 8) {
DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64);
}
DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr);
}
std::string MicroSession::ReadString(DevPtr str_addr) {
std::string MicroSession::ReadString(TargetPtr str_addr) {
std::ostringstream result;
const size_t buf_size = 256;
std::vector<char> buf(buf_size, 0);
......@@ -454,27 +583,39 @@ std::string MicroSession::ReadString(DevPtr str_addr) {
return result.str();
}
DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
TargetPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
return GetAllocator(type)->Allocate(size);
}
void MicroSession::FreeInSection(SectionKind type, DevPtr addr) {
void MicroSession::FreeInSection(SectionKind type, TargetPtr addr) {
return GetAllocator(type)->Free(addr);
}
template <typename T>
T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) {
DevPtr sym_addr = symbol_map[symbol];
TargetPtr sym_addr = symbol_map[symbol];
T result;
low_level_device()->Read(sym_addr, &result, sizeof(T));
return result;
}
void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol,
const TargetPtr& ptr) {
if (word_size_.bytes() == 4) {
DevSymbolWrite(symbol_map, symbol, ptr.value().uint32());
} else if (word_size_.bytes() == 8) {
DevSymbolWrite(symbol_map, symbol, ptr.value().uint64());
} else {
CHECK(false) << "Unsupported word size unexpectedly here";
}
}
template <typename T>
void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol,
const T& value) {
DevPtr sym_addr = symbol_map[symbol];
TargetPtr sym_addr = symbol_map[symbol];
low_level_device()->Write(sym_addr, &value, sizeof(T));
}
......@@ -489,11 +630,55 @@ PackedFunc MicroSession::GetFunction(
return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) {
MicroSession::ExitWithScope();
});
// TODO(weberlo): add a `clear_batch_timer` func
} else if (name == "get_last_batch_time") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetLastBatchTime();
});
// TODO(weberlo): remove this func
} else if (name == "get_last_batch_cycles") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetLastBatchCycles();
});
} else {
return PackedFunc();
}
}
TVM_REGISTER_GLOBAL("micro._GetMicroTimeEvaluator")
.set_body([](TVMArgs args, TVMRetValue* rv) {
PackedFunc pf = args[0];
TVMContext ctx = args[1];
uint64_t number = args[2];
uint64_t repeat = args[3];
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
for (unsigned int i = 0; i < repeat; ++i) {
// start timing
CHECK(number < MicroSession::kTaskQueueCapacity)
<< "`number` must be less than uTVM task queue capacity";
for (unsigned int j = 0; j < number; ++j) {
pf.CallPacked(args, &temp);
}
ObjectPtr<MicroSession> session = MicroSession::Current();
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
double time_per_batch = session->GetLastBatchTime() / number;
os.write(reinterpret_cast<char*>(&time_per_batch), sizeof(time_per_batch));
}
std::string blob = os.str();
TVMByteArray arr;
arr.size = blob.length();
arr.data = blob.data();
// return the time.
*rv = arr;
};
*rv = PackedFunc(ftimer);
});
// create micro session and low-level device from Python frontend
TVM_REGISTER_GLOBAL("micro._CreateSession")
.set_body([](TVMArgs args, TVMRetValue* rv) {
......@@ -501,25 +686,26 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
const std::string& binary_path = args[1];
const std::string& toolchain_prefix = args[2];
uint64_t text_start = args[3];
size_t text_size = args[4];
size_t text_size = uint64_t(args[4]);
uint64_t rodata_start = args[5];
size_t rodata_size = args[6];
size_t rodata_size = uint64_t(args[6]);
uint64_t data_start = args[7];
size_t data_size = args[8];
size_t data_size = uint64_t(args[8]);
uint64_t bss_start = args[9];
size_t bss_size = args[10];
size_t bss_size = uint64_t(args[10]);
uint64_t args_start = args[11];
size_t args_size = args[12];
size_t args_size = uint64_t(args[12]);
uint64_t heap_start = args[13];
size_t heap_size = args[14];
size_t heap_size = uint64_t(args[14]);
uint64_t workspace_start = args[15];
size_t workspace_size = args[16];
size_t workspace_size = uint64_t(args[16]);
uint64_t stack_start = args[17];
size_t stack_size = args[18];
size_t word_size = args[19];
size_t stack_size = uint64_t(args[18]);
TargetWordSize word_size{uint64_t(args[19])};
bool thumb_mode = args[20];
const std::string& server_addr = args[21];
int port = args[22];
bool use_device_timer = args[21];
const std::string& server_addr = args[22];
int port = args[23];
ObjectPtr<MicroSession> session = make_object<MicroSession>(
comms_method,
binary_path,
......@@ -542,6 +728,7 @@ TVM_REGISTER_GLOBAL("micro._CreateSession")
stack_size,
word_size,
thumb_mode,
use_device_timer,
server_addr,
port);
*rv = Module(session);
......
......@@ -52,6 +52,8 @@
namespace tvm {
namespace runtime {
struct DevTask;
/*!
* \brief session for facilitating micro device interaction
*/
......@@ -66,6 +68,9 @@ class MicroSession : public ModuleNode {
virtual PackedFunc GetFunction(const std::string& name,
const ObjectPtr<Object>& sptr_to_self);
// todo having this decoupled from the value in utvm_runtime.c gives me stress dreams
static const size_t kTaskQueueCapacity = 20;
/*!
* \return The type key of the executor.
*/
......@@ -94,7 +99,7 @@ class MicroSession : public ModuleNode {
* \param workspace_size workspace section size
* \param stack_start stack section start address
* \param stack_size stack section size
* \param word_size number of bytes in a word on the target device
* \param word_size_bytes number of bytes in a word on the target device
* \param thumb_mode whether the target device requires a thumb-mode bit on function addresses
* \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`)
* \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`)
......@@ -119,8 +124,9 @@ class MicroSession : public ModuleNode {
size_t workspace_size,
uint64_t stack_start,
size_t stack_size,
size_t word_size,
TargetWordSize word_size,
bool thumb_mode,
bool use_device_timer,
const std::string& server_addr,
int port);
......@@ -137,7 +143,19 @@ class MicroSession : public ModuleNode {
* \param args args to the packed function
* \return elapsed time during function execution on the device
*/
double PushToExecQueue(DevPtr func, const TVMArgs& args);
void PushToTaskQueue(TargetPtr func, const TVMArgs& args);
/*!
* \brief serialize runtime metadata to the device for enqueued tasks and execute
* \return elapsed time during function execution on the device
*/
void FlushTaskQueue();
/*!
* \brief TODO
*/
template <typename T>
void FlushTaskQueuePriv();
/*!
* \brief loads binary onto device
......@@ -153,21 +171,21 @@ class MicroSession : public ModuleNode {
* \param size size of allocated memory in bytes
* \return pointer to allocated memory region in section, nullptr if out of space
*/
DevPtr AllocateInSection(SectionKind type, size_t size);
TargetPtr AllocateInSection(SectionKind type, size_t size);
/*!
* \brief free prior allocation from section
* \param type type of section to allocate in
* \param addr device address of allocated memory
*/
void FreeInSection(SectionKind type, DevPtr addr);
void FreeInSection(SectionKind type, TargetPtr addr);
/*!
* \brief read string from device to host
* \param str_addr device address of first character of string
* \return host copy of device string that was read
*/
std::string ReadString(DevPtr str_addr);
std::string ReadString(TargetPtr str_addr);
/*!
* \brief read value of symbol from device memory
......@@ -179,6 +197,16 @@ class MicroSession : public ModuleNode {
T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol);
/*!
* \brief write pointer value into device memory corresponding to symbol
* \param symbol_map symbol map to read location of symbol from
* \param symbol name of symbol being written to
* \param ptr pointer value to write into symbol
*/
void DevSymbolWrite(const SymbolMap& symbol_map,
const std::string& symbol,
const TargetPtr& ptr);
/*!
* \brief write value into device memory corresponding to symbol
* \param symbol_map symbol map to read location of symbol from
* \param symbol name of symbol being written to
......@@ -196,6 +224,18 @@ class MicroSession : public ModuleNode {
return low_level_device_;
}
const double GetLastBatchTime() {
double result = last_batch_time_;
last_batch_time_ = 0.0;
return result;
}
const double GetLastBatchCycles() {
double result = last_batch_cycles_;
last_batch_cycles_ = 0.0;
return result;
}
private:
/*! \brief low-level device pointer */
std::shared_ptr<LowLevelDevice> low_level_device_;
......@@ -205,7 +245,7 @@ class MicroSession : public ModuleNode {
std::shared_ptr<MicroSectionAllocator>
section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
/*! \brief number of bytes in a word on the target device */
size_t word_size_;
TargetWordSize word_size_;
/*! \brief whether the target device requires a thumb-mode bit on function addresses
*
* ARM and other manufacturers use the lowest bit of a function address to determine
......@@ -213,8 +253,20 @@ class MicroSession : public ModuleNode {
* results in more compact binaries.
*/
bool thumb_mode_;
/*! \brief TODO */
bool use_device_timer_;
/*! \brief symbol map for the device runtime */
SymbolMap runtime_symbol_map_;
/*! \brief TODO */
std::vector<DevTask> task_queue_;
// TODO(weberlo): we don't even need an allocator mechanism for the args
// section. there's only ever one allocation.
/*! \brief TODO hack */
TargetDataLayoutEncoder batch_args_encoder_;
/*! \brief TODO hack */
double last_batch_time_;
/*! \brief TODO hack */
double last_batch_cycles_;
/*!
* \brief patches a function pointer in this module to an implementation
......@@ -228,7 +280,8 @@ class MicroSession : public ModuleNode {
* \param args args to be appended
* \return device address of the allocated args
*/
std::tuple<DevPtr, DevPtr> EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args);
std::tuple<TargetPtr, TargetPtr> EncoderAppend(TargetDataLayoutEncoder* encoder,
const TVMArgs& args);
/*!
* \brief appends a `DLTensor` to the host-side buffer of `encoder`
......@@ -237,7 +290,7 @@ class MicroSession : public ModuleNode {
* \return device address of the allocated `DLTensor`
*/
template <typename T>
DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr);
TargetPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const DLTensor& arr);
/*!
* \brief checks and logs if there was an error during the device's most recent execution
......@@ -274,7 +327,7 @@ class MicroSession : public ModuleNode {
*/
struct MicroDevSpace {
/*! \brief data being wrapped */
void* data;
TargetPtr data;
/*! \brief shared ptr to session where this data is valid */
ObjectPtr<MicroSession> session;
};
......@@ -291,18 +344,22 @@ struct TVMArray32 {
TargetVal shape,
TargetVal strides,
TargetVal byte_offset)
: data(data.val32),
: data(data.uint32()),
ctx(ctx),
ndim(ndim),
pad0(0),
dtype(dtype),
shape(shape.val32),
strides(strides.val32),
shape(shape.uint32()),
strides(strides.uint32()),
pad1(0),
byte_offset(byte_offset.val32),
byte_offset(byte_offset.uint32()),
pad2(0) { }
/*! \brief opaque pointer to the allocated data */
/*!
* \brief The opaque data pointer points to the allocated data.
* This will be CUDA device pointer or cl_mem handle in OpenCL.
* This pointer is always aligns to 256 bytes as in CUDA.
*/
uint32_t data;
/*! \brief The device context of the tensor */
DLContext ctx;
......@@ -337,16 +394,19 @@ struct TVMArray64 {
TargetVal shape,
TargetVal strides,
TargetVal byte_offset)
: data(data.val64),
: data(data.uint64()),
ctx(ctx),
ndim(ndim),
pad0(0),
dtype(dtype),
shape(shape.val64),
strides(strides.val64),
byte_offset(byte_offset.val64) { }
/*! \brief opaque pointer to the allocated data */
shape(shape.uint64()),
strides(strides.uint64()),
byte_offset(byte_offset.uint64()) { }
/*!
* \brief The opaque data pointer points to the allocated data.
* This will be CUDA device pointer or cl_mem handle in OpenCL.
* This pointer is always aligns to 256 bytes as in CUDA.
*/
uint64_t data;
/*! \brief The device context of the tensor */
DLContext ctx;
......@@ -367,8 +427,26 @@ struct TVMArray64 {
uint64_t byte_offset;
};
/*! \brief MicroTVM task to store in task queue before specializing to word size */
struct DevTask {
/*! \brief Pointer to function to call for this task */
TargetVal func;
/*! \brief Array of argument values */
TargetVal arg_values;
/*! \brief Array of type codes for each argument value */
TargetVal arg_type_codes;
/*! \brief Number of arguments */
int32_t num_args;
};
/*! \brief MicroTVM task for serialization to 32-bit devices */
typedef struct StructUTVMTask32 {
StructUTVMTask32(DevTask task)
: func(task.func.uint32()),
arg_values(task.arg_values.uint32()),
arg_type_codes(task.arg_type_codes.uint32()),
num_args(task.num_args) { }
/*! \brief Pointer to function to call for this task */
uint32_t func;
/*! \brief Array of argument values */
......@@ -377,10 +455,16 @@ typedef struct StructUTVMTask32 {
uint32_t arg_type_codes;
/*! \brief Number of arguments */
int32_t num_args;
} UTVMTask32;
} StructUTVMTask32;
/*! \brief MicroTVM task for serialization to 64-bit devices */
typedef struct StructUTVMTask64 {
StructUTVMTask64(DevTask task)
: func(task.func.uint64()),
arg_values(task.arg_values.uint64()),
arg_type_codes(task.arg_type_codes.uint64()),
num_args(task.num_args) { }
/*! \brief Pointer to function to call for this task */
uint64_t func;
/*! \brief Array of argument values */
......@@ -389,7 +473,7 @@ typedef struct StructUTVMTask64 {
uint64_t arg_type_codes;
/*! \brief Number of arguments */
int32_t num_args;
} UTVMTask64;
} StructUTVMTask64;
} // namespace runtime
} // namespace tvm
......
......@@ -50,7 +50,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
socket_.SendCommand();
}
void Read(DevPtr addr, void* buf, size_t num_bytes) {
void Read(TargetPtr addr, void* buf, size_t num_bytes) override {
if (num_bytes == 0) {
return;
}
......@@ -88,7 +88,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
}
{
socket_.cmd_builder() << "ocd_echo $output";
socket_.cmd_builder() << "return $output";
socket_.SendCommand();
const std::string& reply = socket_.last_reply();
......@@ -119,7 +119,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
}
}
void Write(DevPtr addr, const void* buf, size_t num_bytes) {
void Write(TargetPtr addr, const void* buf, size_t num_bytes) override {
if (num_bytes == 0) {
return;
}
......@@ -171,7 +171,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
}
}
void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
void Execute(TargetPtr func_addr, TargetPtr breakpoint_addr) override {
socket_.cmd_builder() << "halt 0";
socket_.SendCommand();
......@@ -207,12 +207,12 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
/*! \brief number of bytes in a word on the target device (64-bit) */
static const constexpr ssize_t kWordSize = 8;
// NOTE: OpenOCD will call any request larger than this constant an "absurd
// request".
// NOTE: The OS pipe buffer must be able to handle a line long enough to
// print this transfer request.
/*! \brief maximum number of bytes allowed in a single memory transfer */
static const constexpr ssize_t kMemTransferLimit = 64000;
static const constexpr ssize_t kMemTransferLimit = 8000;
/*! \brief number of milliseconds to wait for function execution to halt */
static const constexpr int kWaitTime = 10000;
static const constexpr int kWaitTime = 30000;
};
const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& server_addr,
......
......@@ -30,7 +30,7 @@
namespace tvm {
namespace runtime {
// TODO(weberlo): Handle endianness.
// TODO(weberlo, areusch): Handle endianness.
/*!
* \brief data encoder for uTVM that builds a host-side buffer
......@@ -50,7 +50,7 @@ class TargetDataLayoutEncoder {
* \param size size (in bytes) of the memory region allocated for this slot
* \param start_addr start address of the slot in the device's memory
*/
Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr);
Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, TargetPtr start_addr);
~Slot();
......@@ -71,7 +71,7 @@ class TargetDataLayoutEncoder {
* \brief returns start address of the slot in device memory
* \return device start address
*/
DevPtr start_addr();
TargetPtr start_addr();
/*!
* \brief returns number of bytes allocated for this slot
......@@ -89,16 +89,17 @@ class TargetDataLayoutEncoder {
/*! \brief size (in bytes) of the memory region allocated for this slot */
size_t size_;
/*! \brief start address of the slot in the device's memory */
DevPtr start_addr_;
TargetPtr start_addr_;
};
/*!
* \brief constructor
* \param start_addr start address of the encoder in device memory
*/
explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size)
: buf_(std::vector<uint8_t>()), curr_offset_(0), word_size_(word_size) {
start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_));
explicit TargetDataLayoutEncoder(size_t capacity, TargetWordSize word_size)
: buf_(std::vector<uint8_t>()), curr_offset_(0),
start_addr_(word_size, nullptr),
capacity_(capacity), word_size_(word_size) {
}
/*!
......@@ -108,14 +109,20 @@ class TargetDataLayoutEncoder {
*/
template <typename T>
Slot<T> Alloc(size_t num_elems = 1) {
curr_offset_ = UpperAlignValue(curr_offset_, word_size_);
curr_offset_ = UpperAlignValue(curr_offset_, word_size_.bytes());
size_t size = sizeof(T) * num_elems;
if (curr_offset_ + size > buf_.size()) {
buf_.resize(curr_offset_ + size);
}
CHECK(buf_.size() < capacity_) << "out of space in data encoder";
size_t slot_start_offset = curr_offset_;
curr_offset_ += size;
return Slot<T>(this, slot_start_offset, size, start_addr_ + slot_start_offset);
return Slot<T>(this, slot_start_offset, size, start_addr() + slot_start_offset);
}
void Clear() {
buf_.clear();
curr_offset_ = 0;
}
/*!
......@@ -130,26 +137,39 @@ class TargetDataLayoutEncoder {
* \brief returns current size of the encoder's buffer
* \return buffer size
*/
size_t buf_size() {
size_t buf_size() const {
return buf_.size();
}
TargetPtr start_addr() const {
CHECK_NE(start_addr_.value().uint64(), 0) << "start addr uninitialized";
return start_addr_;
}
void set_start_addr(TargetPtr start_addr) {
CHECK_EQ(buf_.size(), 0) << "cannot change encoder start addr unless empty";
start_addr_ = TargetPtr(word_size_,
UpperAlignValue(start_addr.value().uint64(), word_size_.bytes()));
}
private:
/*! \brief in-memory backing buffer */
std::vector<uint8_t> buf_;
/*! \brief current offset */
size_t curr_offset_;
/*! \brief start address of the encoder in device memory */
DevPtr start_addr_;
TargetPtr start_addr_;
/*! \brief number of bytes available in device memory */
size_t capacity_;
/*! \brief number of bytes in a word on the target device */
size_t word_size_;
TargetWordSize word_size_;
};
template <typename T>
TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
size_t start_offset,
size_t size,
DevPtr start_addr)
TargetPtr start_addr)
: parent_(parent),
start_offset_(start_offset),
curr_offset_(0),
......@@ -158,7 +178,10 @@ TargetDataLayoutEncoder::Slot<T>::Slot(TargetDataLayoutEncoder* parent,
template <typename T>
TargetDataLayoutEncoder::Slot<T>::~Slot() {
CHECK(curr_offset_ == size_) << "unwritten space in slot";
// TODO(weberlo, areusch): this can mask the exception thrown by slot allocation... even though
// that doesn't make sense.
CHECK(curr_offset_ == size_) << "unwritten space in slot; curr_offset="
<< curr_offset_ << ", size=" << size_;
}
template <typename T>
......@@ -177,7 +200,7 @@ void TargetDataLayoutEncoder::Slot<T>::WriteValue(const T& val) {
}
template <typename T>
DevPtr TargetDataLayoutEncoder::Slot<T>::start_addr() {
TargetPtr TargetDataLayoutEncoder::Slot<T>::start_addr() {
return start_addr_;
}
......
......@@ -45,6 +45,7 @@ void TclSocket::SendCommand() {
const char terminate_token = kCommandTerminateToken;
cmd_builder_ << terminate_token;
std::string full_cmd = cmd_builder_.str();
CHECK(tcp_socket_.Send(full_cmd.data(), full_cmd.length()) != -1)
<< "failed to send command";
cmd_builder_.str(std::string());
......
......@@ -38,6 +38,7 @@
#include "../object_internal.h"
#include "../../support/ring_buffer.h"
#include "../../support/socket.h"
#include "../micro/micro_session.h"
namespace tvm {
namespace runtime {
......@@ -1246,43 +1247,15 @@ void RPCSession::EventHandler::HandlePackedCall() {
CHECK_EQ(state_, kRecvCode);
}
PackedFunc MicroTimeEvaluator(
PackedFunc pf,
TVMContext ctx,
int number,
int repeat) {
auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
for (int i = 0; i < repeat; ++i) {
double speed = 0.0;
for (int j = 0; j < number; ++j) {
pf.CallPacked(args, &temp);
DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
speed += (temp.operator double()) / number;
}
os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
}
std::string blob = os.str();
TVMByteArray arr;
arr.size = blob.length();
arr.data = blob.data();
// return the time.
*rv = arr;
};
return PackedFunc(ftimer);
}
PackedFunc WrapTimeEvaluator(PackedFunc pf,
TVMContext ctx,
int number,
int repeat,
int min_repeat_ms) {
if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
return MicroTimeEvaluator(pf, ctx, number, repeat);
auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator");
CHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled";
return (*get_micro_time_evaluator)(pf, ctx, number, repeat);
}
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
......
......@@ -840,6 +840,10 @@ void CodeGenC::VisitStmt_(const AttrStmtNode* op) {
const VarNode* v = op->node.as<VarNode>();
CHECK(v);
volatile_buf_.insert(v);
} else if (op->attr_key == tir::attr::pragma_import_c) {
const StringImmNode* value = op->value.as<StringImmNode>();
CHECK(value != nullptr);
decl_stream << value->value;
}
this->PrintStmt(op->body);
}
......
......@@ -23,8 +23,8 @@
#include <tvm/target/codegen.h>
#include <vector>
#include <string>
#include "codegen_c_host.h"
#include "../build_common.h"
#include "codegen_c_host.h"
namespace tvm {
namespace codegen {
......@@ -35,9 +35,10 @@ CodeGenCHost::CodeGenCHost() {
void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) {
emit_asserts_ = emit_asserts;
declared_globals_.clear();
decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
decl_stream << "extern void* " << module_name_ << " = NULL;\n";
decl_stream << "void* " << module_name_ << " = NULL;\n";
CodeGenC::Init(output_ssa);
}
......@@ -182,8 +183,15 @@ void CodeGenCHost::VisitExpr_(const CallNode *op, std::ostream& os) { // NOLINT(
int64_t num_args = end - begin;
CHECK_GE(num_args, 0);
std::string func_name = s->value;
std::string packed_func_name = GetUniqueName(func_name + "_packed");
// NOTE: cannot rely on GetUnique for global decl_stream declarations
// because it is reset between AddFunction().
std::string packed_func_name = func_name + "_packed";
if (declared_globals_.insert(packed_func_name).second) {
// Still reserve the name among unique names.
CHECK(GetUniqueName(packed_func_name) == packed_func_name) <<
"Expected name " << packed_func_name << " to not be taken";
decl_stream << "static void* " << packed_func_name << " = NULL;\n";
}
this->PrintGetFuncFromBackend(func_name, packed_func_name);
this->PrintFuncCall(packed_func_name, num_args);
} else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
......@@ -255,6 +263,6 @@ runtime::Module BuildCHost(IRModule mod) {
TVM_REGISTER_GLOBAL("target.build.c")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = BuildCHost(args[0]);
});
});
} // namespace codegen
} // namespace tvm
......@@ -24,9 +24,10 @@
#ifndef TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
#define TVM_TARGET_SOURCE_CODEGEN_C_HOST_H_
#include <tvm/target/codegen.h>
#include <tvm/tir/expr.h>
#include <set>
#include <string>
#include "tvm/target/codegen.h"
#include "tvm/tir/expr.h"
#include "codegen_c.h"
namespace tvm {
......@@ -53,6 +54,8 @@ class CodeGenCHost final : public CodeGenC {
private:
std::string module_name_;
/* \brief tracks declared global variables which live despite GetUniqueName */
std::set<std::string> declared_globals_;
/*! \brief whether to emit asserts in the resulting C code */
bool emit_asserts_;
......
......@@ -140,7 +140,7 @@ Target CreateTarget(const std::string& target_name,
t->keys_array.push_back("hexagon");
t->device_type = kDLHexagon;
} else {
LOG(ERROR) << "Unknown target name " << target_name;
LOG(ERROR) << "Unknown target name " << target_name << "; falling back to stackvm";
return target::stackvm();
}
......
......@@ -181,7 +181,9 @@ def add_header(fname, header):
skipline = False
ext = os.path.splitext(fname)[1][1:]
if lines[0][:2] == "#!":
if not lines:
skipline = False # File is enpty
elif lines[0][:2] == "#!":
skipline = True
elif lines[0][:2] == "<?":
skipline = True
......
......@@ -25,8 +25,10 @@ import tvm.micro as micro
from tvm.micro import create_micro_mod
from tvm.relay.testing import resnet
# Use the host emulated micro device.
DEV_CONFIG = micro.device.host.default_config()
# # Use the host emulated micro device.
DEV_CONFIG_A = micro.device.host.generate_config()
DEV_CONFIG_B = micro.device.host.generate_config()
TARGET = 'c -device=micro_dev'
def relay_micro_build(func, dev_config, params=None):
"""Create a graph runtime module with a micro device context from a Relay function.
......@@ -47,22 +49,41 @@ def relay_micro_build(func, dev_config, params=None):
mod : tvm.runtime.Module
graph runtime module for the target device
"""
with tvm.target.build_config(disable_vectorize=True):
graph, c_mod, params = relay.build(func, target="c", params=params)
micro_mod = create_micro_mod(c_mod, dev_config)
disable_vectorize = tvm.target.build_config(disable_vectorize=True)
disable_fusion = relay.build_config(disabled_pass={'FuseOps'})
with disable_vectorize, disable_fusion:
graph, c_mod, params = relay.build(func, target=TARGET, params=params)
micro_mod = micro.create_micro_mod(c_mod, dev_config)
ctx = tvm.micro_dev(0)
mod = graph_runtime.create(graph, micro_mod, ctx)
mod.set_input(**params)
return mod
GDB_INIT_TEMPLATE = """
layout asm
target remote localhost:{gdb_port}
set $pc = UTVMInit
break UTVMDone
"""
def reset_gdbinit():
if 'server_port' not in DEV_CONFIG_A:
return
gdb_init_dir = os.environ['MICRO_GDB_INIT_DIR']
with open(f'{gdb_init_dir}/.gdbinit', 'w') as f:
gdb_port = DEV_CONFIG_A['server_port'] - 3333
f.write(GDB_INIT_TEMPLATE.format(gdb_port=gdb_port))
def test_alloc():
"""Test tensor allocation on the device."""
if not tvm.runtime.enabled("micro_dev"):
return
shape = (1024,)
dtype = "float32"
with micro.Session(DEV_CONFIG):
with micro.Session(DEV_CONFIG_A):
ctx = tvm.micro_dev(0)
np_tensor = np.random.uniform(size=shape).astype(dtype)
micro_tensor = tvm.nd.array(np_tensor, ctx)
......@@ -76,6 +97,8 @@ def test_add():
shape = (1024,)
dtype = "float32"
reset_gdbinit()
# Construct TVM expression.
tvm_shape = tvm.runtime.convert(shape)
A = te.placeholder(tvm_shape, name="A", dtype=dtype)
......@@ -86,14 +109,24 @@ def test_add():
func_name = "fadd"
c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)
with micro.Session(DEV_CONFIG):
micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
with micro.Session(DEV_CONFIG_A) as sess:
micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
micro_func = micro_mod[func_name]
ctx = tvm.micro_dev(0)
a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
a_np = np.random.uniform(size=shape).astype(dtype)
a = tvm.nd.array(a_np, ctx)
b_np = np.random.uniform(size=shape).astype(dtype)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
micro_func(a, b, c)
# ensure inputs weren't corrupted
tvm.testing.assert_allclose(
a.asnumpy(), a_np)
tvm.testing.assert_allclose(
b.asnumpy(), b_np)
# ensure output is correct
tvm.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + b.asnumpy())
......@@ -105,6 +138,8 @@ def test_workspace_add():
shape = (1024,)
dtype = "float32"
reset_gdbinit()
# Construct TVM expression.
tvm_shape = tvm.runtime.convert(shape)
A = te.placeholder(tvm_shape, name="A", dtype=dtype)
......@@ -116,14 +151,19 @@ def test_workspace_add():
func_name = "fadd_two_workspace"
c_mod = tvm.build(s, [A, C], target="c", name=func_name)
with micro.Session(DEV_CONFIG):
micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
with micro.Session(DEV_CONFIG_A) as sess:
micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
micro_func = micro_mod[func_name]
ctx = tvm.micro_dev(0)
a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
a_np = np.random.uniform(size=shape).astype(dtype)
a = tvm.nd.array(a_np, ctx)
c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
micro_func(a, c)
# ensure input wasn't corrupted
tvm.testing.assert_allclose(
a.asnumpy(), a_np)
# ensure output is correct
tvm.testing.assert_allclose(
c.asnumpy(), a.asnumpy() + 2.0)
......@@ -141,47 +181,74 @@ def test_graph_runtime():
z = relay.add(xx, relay.const(1.0))
func = relay.Function([x], z)
with micro.Session(DEV_CONFIG):
mod = relay_micro_build(func, DEV_CONFIG)
with micro.Session(DEV_CONFIG_A):
mod = relay_micro_build(func, DEV_CONFIG_A)
x_in = np.random.uniform(size=shape[0]).astype(dtype)
mod.run(x=x_in)
result = mod.get_output(0).asnumpy()
tvm.testing.assert_allclose(
mod.get_input(0).asnumpy(), x_in)
tvm.testing.assert_allclose(
result, x_in * x_in + 1.0)
def test_multiple_modules():
"""Test loading multiple modules on the device simultaneously."""
def test_conv2d():
if not tvm.runtime.enabled("micro_dev"):
return
shape = (1024,)
dtype = "float32"
# Construct Relay add program.
x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret)
# Construct Relay subtract program.
x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype))
ret = relay.subtract(x, relay.const(1.0))
sub_const_func = relay.Function([x], ret)
from tvm.relay import create_executor
from tvm.relay import transform
with micro.Session(DEV_CONFIG):
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG)
dshape = (1, 4, 16, 16)
dtype = 'int8'
func_name = 'fused_nn_conv2d'
x_in = np.random.uniform(size=shape[0]).astype(dtype)
add_const_mod.run(x=x_in)
add_result = add_const_mod.get_output(0).asnumpy()
sub_const_mod.run(x=x_in)
sub_result = sub_const_mod.get_output(0).asnumpy()
reset_gdbinit()
tvm.testing.assert_allclose(
add_result, x_in + 1.0)
tvm.testing.assert_allclose(
sub_result, x_in - 1.0)
# Construct Relay program.
x = relay.var("x", shape=dshape, dtype=dtype)
conv_expr = relay.nn.conv2d(
x, relay.var("w"),
kernel_size=(3, 3),
padding=(1, 1),
channels=4)
func = relay.Function(relay.analysis.free_vars(conv_expr), conv_expr)
mod = tvm.IRModule.from_expr(func)
mod = transform.InferType()(mod)
x_shape = list(map(lambda x: x.value, mod['main'].params[0].checked_type.shape))
w_shape = list(map(lambda x: x.value, mod['main'].params[1].checked_type.shape))
out_shape = list(map(lambda x: x.value, mod['main'].ret_type.shape))
with tvm.target.build_config(disable_vectorize=True):
graph, c_mod, params = relay.build(mod, target="c")
with micro.Session(DEV_CONFIG_A):
micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
candidate_func_name = func_name
for i in range(100):
try:
micro_func = micro_mod[candidate_func_name]
break
except tvm.TVMError as e:
candidate_func_name = f'{func_name}_{i}'
else:
assert False
ctx = tvm.micro_dev(0)
x_data = tvm.nd.array(np.random.uniform(size=x_shape).astype(dtype), ctx)
w_data = tvm.nd.array(np.random.uniform(size=w_shape).astype(dtype), ctx)
result = tvm.nd.array(np.zeros(shape=out_shape, dtype=dtype), ctx)
micro_func(x_data, w_data, result)
out_data = np.zeros(out_shape, dtype=dtype)
params = { 'x': x_data.asnumpy(), 'w': w_data.asnumpy() }
intrp = create_executor('debug')
expected_result = intrp.evaluate(mod['main'])(x_data, w_data)
tvm.testing.assert_allclose(result.asnumpy(), expected_result.asnumpy())
def test_interleave_sessions():
......@@ -196,8 +263,8 @@ def test_interleave_sessions():
ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG)
sess_b = micro.Session(DEV_CONFIG)
sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG_B)
with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
......@@ -205,13 +272,13 @@ def test_interleave_sessions():
np_tensor_b = np.random.uniform(size=shape).astype(dtype)
micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
with sess_a:
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
add_const_mod.run(x=micro_tensor_a)
add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose(
add_result, np_tensor_a + 1.0)
with sess_b:
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_B)
add_const_mod.run(x=micro_tensor_b)
add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose(
......@@ -230,15 +297,15 @@ def test_nested_sessions():
ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG)
sess_b = micro.Session(DEV_CONFIG)
sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG_B)
with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
with sess_b:
np_tensor_b = np.random.uniform(size=shape).astype(dtype)
micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
add_const_mod.run(x=micro_tensor_a)
add_result = add_const_mod.get_output(0).asnumpy()
tvm.testing.assert_allclose(
......@@ -257,12 +324,12 @@ def test_inactive_session_use():
ret = relay.add(x, relay.const(1.0))
add_const_func = relay.Function([x], ret)
sess_a = micro.Session(DEV_CONFIG)
sess_b = micro.Session(DEV_CONFIG)
sess_a = micro.Session(DEV_CONFIG_A)
sess_b = micro.Session(DEV_CONFIG_B)
with sess_a:
np_tensor_a = np.random.uniform(size=shape).astype(dtype)
micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG_A)
with sess_b:
# These objects belong to `sess_a`.
......@@ -272,12 +339,42 @@ def test_inactive_session_use():
add_result, np_tensor_a + 1.0)
# TODO add workspace alloc/free stress test
if __name__ == "__main__":
test_alloc()
print()
print('finished alloc test')
input('[press enter to continue]')
test_add()
print()
print('finished add test')
input('[press enter to continue]')
test_workspace_add()
print()
print('finished workspace add test')
input('[press enter to continue]')
test_graph_runtime()
print()
print('finished graph runtime test')
input('[press enter to continue]')
test_conv2d()
print()
print('finished conv2d test')
input('[press enter to continue]')
test_multiple_modules()
print()
print('finished multiple modules test')
input('[press enter to continue]')
test_interleave_sessions()
print()
print('finished interleaved sessions test')
input('[press enter to continue]')
test_nested_sessions()
print()
print('finished nested sessions test')
input('[press enter to continue]')
test_inactive_session_use()
print()
print('finished use inactive session test')
input('[press enter to continue]')
......@@ -25,3 +25,4 @@ from . import conv2d_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
from . import cortex_m7
......@@ -31,6 +31,7 @@ from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \
conv2d_spatial_pack_nhwc, \
schedule_conv2d_spatial_pack_nchw, \
schedule_conv2d_spatial_pack_nhwc
from .cortex_m7.conv2d import direct_simd
@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
......@@ -425,3 +426,15 @@ def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs):
traverse_inline(s, outs[0].op, _callback)
return s
@autotvm.register_topi_compute("conv2d_direct_simd.arm_cpu")
def conv2d_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype):
"""Compute conv2d with SIMD (v7e-m)."""
return direct_simd.conv2d_direct_simd_compute(
cfg, data, kernel, strides, padding, dilation, out_dtype)
@autotvm.register_topi_schedule("conv2d_direct_simd.arm_cpu")
def schedule_conv2d_direct_simd(cfg, outs):
"""Create schedule for conv2d_direct_simd"""
return direct_simd.conv2d_direct_simd_nhwc_schedule(cfg, outs)
......@@ -152,13 +152,13 @@ def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec,
cfg["ann_reduce"].apply(s, conv, [kh, kw],
axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)],
max_unroll=16,
max_unroll=None,
cfg=cfg)
cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]],
max_unroll=16,
max_unroll=None,
cfg=cfg)
# schedule fusion
......
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Schedules specialized for cortex-m7."""
from . import conv2d
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Conv2d implementations for cortex-m7."""
from . import direct_simd
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
"""Direct implementation of conv2d."""
import tvm
from tvm import autotvm
from tvm.autotvm.task import deserialize_args
from topi.nn.conv2d import conv2d_nchw, conv2d_nhwc
from topi.util import get_const_tuple, get_const_int, traverse_inline
def conv2d_direct(*args, **kwargs):
"""Schedule function for directly-scheduled conv2d."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
layout = args[-2]
cfg = autotvm.get_config()
args = [cfg] + args
conv = conv2d_direct_compute(*args)
if layout == 'NHWC':
sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv])
elif layout == 'NCHW':
sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv])
else:
raise RuntimeError(f'unsupported data layout "{layout}"')
return sched, [data, kernel, conv]
conv2d_direct.template_key = 'direct'
conv2d_direct.default_data_layout = 'NHWC'
conv2d_direct.default_kernel_layout = 'HWIO'
@autotvm.register_topi_compute('conv2d_direct.micro_dev')
def conv2d_direct_compute(*args):
layout = args[-2]
if layout == 'NHWC':
return _conv2d_direct_nhwc_compute(*args)
if layout == 'NCHW':
return _conv2d_direct_nchw_compute(*args)
raise RuntimeError(f'unsupported data layout "{layout}"')
def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
assert layout == 'NHWC'
conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
# Config Space Definition
N, H, W, CI = get_const_tuple(data.shape)
KH, KW, _, CO = get_const_tuple(kernel.shape)
n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO)
kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI)
# TODO should we add a max_factor attr to these splits?
co, vc = cfg.define_split('tile_co', co, num_outputs=2)
oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
cfg.define_reorder('reorder_0',
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
policy='candidate', candidate=[
[n, co, oh, ow, ci, kh, kw, vh, vw, vc],
[n, co, oh, ow, ci, kh, kw, vc, vh, vw],
[n, co, oh, ow, ci, vh, vw, vc, kh, kw],
[n, co, oh, ow, ci, vc, vh, vw, kh, kw]])
cfg.define_annotate('ann_reduce', [kh, kw], policy='try_unroll')
cfg.define_annotate('ann_spatial', [vh, vw, vc], policy='try_unroll')
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
assert layout == 'NCHW'
conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
###########################
# Config Space Definition #
###########################
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
@autotvm.register_topi_schedule('conv2d_direct_nhwc.micro_dev')
def conv2d_direct_nhwc_schedule(cfg, outs):
"""Schedule function for directly-scheduled conv2d on NHWC layout."""
sched = tvm.create_schedule([x.op for x in outs])
def _callback(op):
if 'conv2d_nhwc' not in op.tag:
return
### extract tensors ###
output = op.output(0)
conv = op
data_vec = conv.input_tensors[0]
kernel = conv.input_tensors[1] # pylint: disable=unused-variable
last = outs[0] # pylint: disable=unused-variable
# tile reduction axes
n, oh, ow, co = sched[conv].op.axis
kh, kw, ci = sched[conv].op.reduce_axis
# NOTE we can't inline data padding in the SIMD path, because it
# introduces conditionals in the inner loop.
data_pad = data_vec.op
sched[data_pad].compute_inline()
co, vc = cfg['tile_co'].apply(sched, conv, co)
oh, vh = cfg['tile_oh'].apply(sched, conv, oh)
ow, vw = cfg['tile_ow'].apply(sched, conv, ow)
cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
cfg['ann_reduce'].apply(sched, conv, [kh, kw],
axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)],
max_unroll=8,
cfg=cfg)
cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc],
axis_lens=[cfg['tile_oh'].size[-1],
cfg['tile_ow'].size[-1],
cfg['tile_co'].size[-1]],
max_unroll=8,
cfg=cfg)
kernel_scope = n # this is the scope to attach global config inside this kernel
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
traverse_inline(sched, outs[-1].op, _callback)
return sched
@autotvm.register_topi_schedule('conv2d_direct_nchw.micro_dev')
def conv2d_direct_nchw_schedule(cfg, outs):
"""Schedule function for Cortex-M7 direct implementation of conv2d."""
# use default schedule
sched = tvm.create_schedule([x.op for x in outs])
conv = outs[-1].op
output = conv.output(0)
data_vec = conv.input_tensors[0]
data_pad = data_vec.op
sched[data_pad].compute_inline()
# TODO add more schedule opts (similar to the NHWC template)
n, _, _, _ = sched[conv].op.axis
kernel_scope = n # this is the scope to attach global config inside this kernel
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
return sched
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
"""Direct implementation of conv2d."""
from tvm import autotvm
from tvm.autotvm.task import deserialize_args
from tvm import te
from topi.util import simplify, traverse_inline
from topi.nn.pad import pad
from topi.nn.util import get_pad_tuple
from ..micro_kernel.gemm import (
intrin_gemm_MxKxN, gemm_MxKxN_impl,
)
def conv2d_direct_simd(*args, **kwargs):
"""Defines the Cortex-M7 SIMD implementation of conv2d."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
layout = args[-2]
cfg = autotvm.get_config()
args = [cfg] + args
assert layout == 'NHWC'
conv = conv2d_direct_simd_compute(*args)
sched = conv2d_direct_simd_nhwc_schedule(cfg, [data, kernel, conv])
return sched, [data, kernel, conv]
conv2d_direct_simd.template_key = 'direct_simd'
conv2d_direct_simd.default_data_layout = 'NHWC'
conv2d_direct_simd.default_kernel_layout = 'HWOI'
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
"""Compute function for Cortex-M7 SIMD implementation of conv2d."""
assert isinstance(strides, int) or len(strides) == 2
assert isinstance(dilation, int) or len(dilation) == 2
if isinstance(strides, int):
stride_h = stride_w = strides
else:
stride_h, stride_w = strides
if isinstance(dilation, int):
dilation_h = dilation_w = dilation
else:
dilation_h, dilation_w = dilation
batch_size, in_height, in_width, in_channels = data.shape
kernel_h, kernel_w, out_channels, _ = kernel.shape
# compute the output shape
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
padding, (dilated_kernel_h, dilated_kernel_w))
out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
pad_before = [0, pad_top, pad_left, 0]
pad_after = [0, pad_down, pad_right, 0]
padded_data = pad(data, pad_before, pad_after, name='padded_data')
rc = te.reduce_axis((0, in_channels), name='rc')
ry = te.reduce_axis((0, kernel_h), name='ry')
rx = te.reduce_axis((0, kernel_w), name='rx')
conv = te.compute(
(batch_size, out_height, out_width, out_channels),
lambda nn, yy, xx, ff: te.sum(
padded_data[nn, yy * stride_h + ry * dilation_h,
xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]),
name='conv2d', tag='conv2d_nhwc')
###########################
# Config Space Definition #
###########################
n, oh, ow, co = (cfg.axis(batch_size.value),
cfg.axis(out_height.value),
cfg.axis(out_width.value),
cfg.axis(out_channels.value))
kh, kw, ci = (cfg.reduce_axis(kernel_h.value),
cfg.reduce_axis(kernel_w.value),
cfg.reduce_axis(in_channels.value))
assert in_channels.value % 4 == 0
owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2)
cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2,
filter=lambda x: x.size[-1] % 4 == 0)
coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2)
cfg.define_reorder('reorder_0_simd',
[n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
policy='candidate', candidate=[
[n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
[n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
[n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
[n, kh, kw, oh, coo, owo, cio, owi, coi, cii]])
cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
cfg.define_knob('unroll_explicit', [0, 1])
return conv
def conv2d_direct_simd_nhwc_schedule(cfg, outs):
"""Schedule function for Cortex-M7 SIMD implementation of conv2d."""
sched = te.create_schedule([x.op for x in outs])
def _callback(op):
if 'conv2d_nhwc' not in op.tag:
return
# extract tensors
output = op.output(0)
conv = op
data_vec = conv.input_tensors[0]
kernel = conv.input_tensors[1] # pylint: disable=unused-variable
last = outs[0] # pylint: disable=unused-variable
# tile reduction axes
n, oh, ow, co = sched[conv].op.axis
kh, kw, ci = sched[conv].op.reduce_axis
M = cfg['tile_ow'].size[-1]
K = cfg['tile_ci'].size[-1]
N = cfg['tile_co'].size[-1]
owo, owi = cfg['tile_ow'].apply(sched, conv, ow)
cio, cii = cfg['tile_ci'].apply(sched, conv, ci)
coo, coi = cfg['tile_co'].apply(sched, conv, co)
cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])
gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype)
sched[output].tensorize(owi, gemm)
sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id))
# this is the scope to attach global config inside this kernel
kernel_scope = n
# tune unroll
sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
traverse_inline(sched, outs[-1].op, _callback)
return sched
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
"""Defines gemm intrinsics for SIMD matrix multiplication."""
import random
import string
import tvm
from tvm import te
##########################
# MxKxN MatMul Intrinsic #
##########################
# NOTE this is transposed matmul (A * B^T)
def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
"""Defines a SIMD-accelerated transposed matmul."""
# we generate a unique ID for every intrinsic definition, to prevent name
# collisions in the generated source (e.g., if there are multiple operators
# in the same module that use the same intrinsic)
#
# TODO(weberlo, areusch): to cut down on memory usage, we should cache each intrinsic
# instantiation and include it only once, eliminating the need for unique
# IDs
UNIQ_ID_LEN = 8
uniq_id = ''.join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
if isinstance(M, tvm.tir.IntImm):
M = M.value
if isinstance(K, tvm.tir.IntImm):
K = K.value
if isinstance(N, tvm.tir.IntImm):
N = N.value
assert K % 4 == 0
# TODO(weberlo, areusch): support more dtypes?
assert in_dtype == 'int8'
assert out_dtype == 'int32'
A = te.placeholder((M, K), name='a', dtype=in_dtype)
B = te.placeholder((N, K), name='b', dtype=in_dtype)
k = te.reduce_axis((0, K), name='k')
C = te.compute(
(M, N),
lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k),
name='c')
A_buf = tvm.tir.decl_buffer(
A.shape, A.dtype,
name="A",
offset_factor=1,
strides=[te.var("A_s"), 1])
B_buf = tvm.tir.decl_buffer(
B.shape, B.dtype,
name="B",
offset_factor=1,
strides=[te.var("B_s"), 1])
C_buf = tvm.tir.decl_buffer(
C.shape, C.dtype,
name="C",
offset_factor=1,
strides=[te.var("C_s"), 1])
def intrin_func(ins, outs):
aa, bb = ins
cc = outs[0]
def _reduce_update():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_update_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
aa.strides[0],
bb.strides[0],
cc.strides[0]))
return ib.get()
def _reduce_reset():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_reset_{uniq_id}",
cc.access_ptr("w"),
cc.strides[0]))
return ib.get()
def _body():
ib = tvm.tir.ir_builder.create()
ib.emit(tvm.tir.call_extern("int32", f"gemm_{M}x{K}x{N}_body_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
aa.strides[0],
bb.strides[0],
cc.strides[0]))
return ib.get()
return _body(), _reduce_reset(), _reduce_update()
with tvm.target.build_config(offset_factor=1):
intrin_decl = te.decl_tensor_intrin(
C.op, intrin_func, binds={A: A_buf, B: B_buf, C: C_buf})
return intrin_decl, uniq_id
def gemm_MxKxN_impl(M, K, N, uniq_id):
"""Emit C code for gemm impl."""
# TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly?
aa_pad_size = M * K
bb_pad_size = N * K
# code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
cc_code = f"""
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t aa_pad[{aa_pad_size}];
int16_t bb_pad[{bb_pad_size}];
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {N}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
sum = __SMLAD(
*((int32_t*) &aa_pad[i*{K} + l*2]),
*((int32_t*) &bb_pad[j*{K} + l*2]),
sum);
}}
// NOTE: this is the line where `*_body` differs from `*_update`. here
// we're *setting* the result, instead of accumulating, because we know
// the `i` and `j` itervars span their entire respective axes.
cc[i*C_stride + j] = sum;
}}
}}
return 0;
}}
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t aa_pad[{aa_pad_size}];
int16_t bb_pad[{bb_pad_size}];
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {N}; i++) {{
for (int j = 0; j < {K} / 4; j++) {{
read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]);
}}
}}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
sum = __SMLAD(
*((int32_t*) &aa_pad[i*{K} + l*2]),
*((int32_t*) &bb_pad[j*{K} + l*2]),
sum);
}}
cc[i*C_stride + j] += sum;
}}
}}
return 0;
}}
#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
cc[i*C_stride + j] = 0;
}}
}}
return 0;
}}
"""
return cc_code
......@@ -24,7 +24,7 @@ def default_schedule(outs, auto_inline):
"""Default schedule for llvm."""
target = tvm.target.Target.current(allow_none=False)
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
if target.target_name != "llvm":
if target.target_name not in ("llvm", "c"):
raise RuntimeError("schedule not registered for '%s'" % target)
s = te.create_schedule([x.op for x in outs])
if auto_inline:
......
......@@ -35,10 +35,8 @@ def _conv2d_nhwc_python(a_np, w_np, stride, padding):
stride : int or a list/tuple of two ints
Stride size, or [stride_height, stride_width]
padding : int or str or a list/tuple of 2 or 4 ints
Padding size, or ['VALID', 'SAME'], or
[pad_height, pad_width] for 2 ints, or
[pad_top, pad_left, pad_bottom, pad_right] for 2 ints
padding : int or str or a list/tuple of two ints
Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width]
Returns
-------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment