[µTVM] Enable AutoTVM for ARM STM32F746XX Boards (#4274)

47c870a9 · Logan Weber · Tianqi Chen · 11af82c0 · 47c870a9 · 47c870a9
Commit 47c870a9 authored Dec 02, 2019 by Logan Weber Committed by Tianqi Chen Dec 02, 2019
37 changed files
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -75,6 +75,9 @@ def context(dev_type, dev_id=0):
      assert tvm.context("cuda", 0) == tvm.gpu(0)
    """
    if isinstance(dev_type, string_types):
+        if '-device=micro_dev' in dev_type:
+            dev_type = 'micro_dev'
+        else:
            dev_type = dev_type.split()[0]
            if dev_type not in TVMContext.STR2MASK:
                raise ValueError("Unknown device type %s" % dev_type)

--- a/python/tvm/contrib/binutil.py
+++ b/python/tvm/contrib/binutil.py
@@ -19,9 +19,81 @@
 import os
 import subprocess
 from . import util
-from .._ffi.base import py_str
 from ..api import register_func

+# Linker script template used when relocating a binary's sections.  It is
+# filled in with `str.format`, so literal braces of the linker script are
+# doubled (`{{` / `}}`) and the `:x` format spec renders the addresses in hex.
+# Placeholders: stack_pointer_init, text_start, rodata_start, data_start,
+# bss_start and word_size (used as the ALIGN argument).
+RELOCATION_LD_SCRIPT_TEMPLATE = """
+/* linker symbol for use in UTVMInit */
+_utvm_stack_pointer_init = 0x{stack_pointer_init:x};
+
+SECTIONS
+{{
+  . = 0x{text_start:x};
+  . = ALIGN({word_size});
+  .text :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.text))
+    KEEP(*(.text*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{rodata_start:x};
+  . = ALIGN({word_size});
+  .rodata :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.rodata))
+    KEEP(*(.rodata*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{data_start:x};
+  . = ALIGN({word_size});
+  .data :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.data))
+    KEEP(*(.data*))
+    . = ALIGN({word_size});
+  }}
+
+  . = 0x{bss_start:x};
+  . = ALIGN({word_size});
+  .bss :
+  {{
+    . = ALIGN({word_size});
+    KEEP(*(.bss))
+    KEEP(*(.bss*))
+    . = ALIGN({word_size});
+  }}
+}}
+"""
+
+def run_cmd(cmd):
+    """Execute `cmd` as a child process and block until it exits.
+
+    stderr is merged into stdout, so the captured text (and any error
+    message) contains both streams.
+
+    Parameters
+    ----------
+    cmd : List[str]
+        program name followed by its command-line arguments
+
+    Returns
+    -------
+    output : str
+        UTF-8 decoded stdout/stderr capture from the subprocess
+
+    Raises
+    ------
+    RuntimeError
+        if the subprocess exits with a nonzero return code
+    """
+    child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    raw_output, _ = child.communicate()
+    output = raw_output.decode("utf-8")
+    if child.returncode == 0:
+        return output
+    cmd_str = " ".join(cmd)
+    raise RuntimeError(f"error while running command \"{cmd_str}\":\n{output}")
+

 @register_func("tvm_callback_get_section_size")
 def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
@@ -48,14 +120,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
        raise RuntimeError("no such file \"{}\"".format(binary_path))
    # We use the "-A" flag here to get the ".rodata" section's size, which is
    # not included by default.
-    size_proc = subprocess.Popen(
-        ["{}size".format(toolchain_prefix), "-A", binary_path], stdout=subprocess.PIPE)
-    (size_output, _) = size_proc.communicate()
-    size_output = size_output.decode("utf-8")
-    if size_proc.returncode != 0:
-        msg = "error in finding section size:\n"
-        msg += py_str(size_output)
-        raise RuntimeError(msg)
+    size_output = run_cmd(["{}size".format(toolchain_prefix), "-A", binary_path])

    # TODO(weberlo): Refactor this method and `*relocate_binary` so they are
    # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss".
@@ -74,13 +139,15 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):
            continue
        entry_name = tokens[0]
        entry_size = int(tokens[1])
-        if entry_name in sections_to_sum:
+        for section in sections_to_sum:
+            if entry_name.startswith(section):
                section_size += entry_size
+                break

    # NOTE: For some reason, the size of the BSS section on the RISC-V
    # GCC is sometimes reported to be smaller than it is, so we need to adjust
    # for this.
-    if "riscv" in toolchain_prefix and section_name == 'bss':
+    if "riscv" in toolchain_prefix and section_name == "bss":
        # TODO(weberlo): Figure out why 32 is the minimum constant that works.
        #
        # The current hypothesis is that the last symbols in the ".bss" and
@@ -97,7 +164,14 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix):

 @register_func("tvm_callback_relocate_binary")
 def tvm_callback_relocate_binary(
-        binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix):
+        binary_path,
+        word_size,
+        text_start,
+        rodata_start,
+        data_start,
+        bss_start,
+        stack_end,
+        toolchain_prefix):
    """Relocates sections in the binary to new addresses

    Parameters
@@ -105,17 +179,23 @@ def tvm_callback_relocate_binary(
    binary_path : str
        path of the binary file

-    text_addr : str
-        text section absolute address
+    word_size : int
+        word size on the target machine
+
+    text_start : int
+        text section address

-    rodata_addr : str
-        rodata section absolute address
+    rodata_start : int
+        rodata section address

-    data_addr : str
-        data section absolute address
+    data_start : int
+        data section address

-    bss_addr : str
-        bss section absolute address
+    bss_start : int
+        bss section address
+
+    stack_end : int
+        stack section end address

    toolchain_prefix : str
        prefix for binary names in target compiler toolchain
@@ -125,68 +205,29 @@ def tvm_callback_relocate_binary(
    rel_bin : bytearray
        the relocated binary
    """
-    tmp_dir = util.tempdir()
-    rel_obj_path = tmp_dir.relpath("relocated.o")
+    stack_pointer_init = stack_end - word_size
    ld_script_contents = ""
    # TODO(weberlo): There should be a better way to configure this for different archs.
    if "riscv" in toolchain_prefix:
        ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n"
-    # TODO(weberlo): Generate the script in a more procedural manner.
-    ld_script_contents += """
-SECTIONS
-{
-  . = %s;
-  . = ALIGN(8);
-  .text :
-  {
-    *(.text)
-    . = ALIGN(8);
-    *(.text*)
-  }
-  . = %s;
-  . = ALIGN(8);
-  .rodata :
-  {
-    *(.rodata)
-    . = ALIGN(8);
-    *(.rodata*)
-  }
-  . = %s;
-  . = ALIGN(8);
-  .data :
-  {
-    *(.data)
-    . = ALIGN(8);
-    *(.data*)
-    . = ALIGN(8);
-    *(.sdata)
-  }
-  . = %s;
-  . = ALIGN(8);
-  .bss :
-  {
-    *(.bss)
-    . = ALIGN(8);
-    *(.bss*)
-    . = ALIGN(8);
-    *(.sbss)
-  }
-}
-    """ % (text_addr, rodata_addr, data_addr, bss_addr)
+    ld_script_contents += RELOCATION_LD_SCRIPT_TEMPLATE.format(
+        word_size=word_size,
+        text_start=text_start,
+        rodata_start=rodata_start,
+        data_start=data_start,
+        bss_start=bss_start,
+        stack_pointer_init=stack_pointer_init)
+
+    tmp_dir = util.tempdir()
+    rel_obj_path = tmp_dir.relpath("relocated.obj")
    rel_ld_script_path = tmp_dir.relpath("relocated.lds")
    with open(rel_ld_script_path, "w") as f:
        f.write(ld_script_contents)
-    ld_proc = subprocess.Popen(["{}ld".format(toolchain_prefix), binary_path,
+    run_cmd([
+        "{}ld".format(toolchain_prefix),
+        binary_path,
        "-T", rel_ld_script_path,
-                                "-o", rel_obj_path],
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.STDOUT)
-    (out, _) = ld_proc.communicate()
-    if ld_proc.returncode != 0:
-        msg = "linking error using ld:\n"
-        msg += py_str(out)
-        raise RuntimeError(msg)
-
+        "-o", rel_obj_path])
    with open(rel_obj_path, "rb") as f:
        rel_bin = bytearray(f.read())
    return rel_bin
@@ -217,16 +258,11 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix):
    tmp_section = tmp_dir.relpath("tmp_section.bin")
    with open(tmp_bin, "wb") as out_file:
        out_file.write(bytes(binary))
-    objcopy_proc = subprocess.Popen(["{}objcopy".format(toolchain_prefix), "--dump-section",
+    run_cmd([
+        "{}objcopy".format(toolchain_prefix),
+        "--dump-section",
        ".{}={}".format(section, tmp_section),
-                                     tmp_bin],
-                                    stdout=subprocess.PIPE,
-                                    stderr=subprocess.STDOUT)
-    (out, _) = objcopy_proc.communicate()
-    if objcopy_proc.returncode != 0:
-        msg = "error in using objcopy:\n"
-        msg += py_str(out)
-        raise RuntimeError(msg)
+        tmp_bin])
    if os.path.isfile(tmp_section):
        # Get section content if it exists.
        with open(tmp_section, "rb") as f:
@@ -259,15 +295,12 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix):
    tmp_obj = tmp_dir.relpath("tmp_obj.bin")
    with open(tmp_obj, "wb") as out_file:
        out_file.write(bytes(binary))
-    nm_proc = subprocess.Popen(["{}nm".format(toolchain_prefix), "-C", "--defined-only", tmp_obj],
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.STDOUT)
-    (nm_output, _) = nm_proc.communicate()
-    if nm_proc.returncode != 0:
-        msg = "error in using nm:\n"
-        msg += py_str(nm_output)
-        raise RuntimeError(msg)
-    nm_output = nm_output.decode("utf8").splitlines()
+    nm_output = run_cmd([
+        "{}nm".format(toolchain_prefix),
+        "-C",
+        "--defined-only",
+        tmp_obj])
+    nm_output = nm_output.splitlines()
    map_str = ""
    for line in nm_output:
        line = line.split()

--- a/python/tvm/exec/rpc_server.py
+++ b/python/tvm/exec/rpc_server.py
@@ -19,14 +19,22 @@
 from __future__ import absolute_import

 import argparse
+import ast
 import multiprocessing
 import sys
 import logging
+import tvm
+from tvm import micro
 from .. import rpc

 def main(args):
-    """Main function"""
+    """Main function

+    Parameters
+    ----------
+    args : argparse.Namespace
+        parsed args from command-line invocation
+    """
    if args.tracker:
        url, port = args.tracker.rsplit(":", 1)
        port = int(port)
@@ -37,6 +45,9 @@ def main(args):
    else:
        tracker_addr = None

+    if args.utvm_dev_config or args.utvm_dev_id:
+        init_utvm(args)
+
    server = rpc.Server(args.host,
                        args.port,
                        args.port_end,
@@ -48,6 +59,38 @@ def main(args):
    server.proc.join()


+def init_utvm(args):
+    """MicroTVM-specific RPC initialization
+
+    Builds the device config (from a JSON file, or from the `default_config`
+    function registered for the given device ID) and overrides the
+    'tvm.rpc.server.start' hook so that a `micro.Session` is entered when the
+    server starts; that hook in turn registers a 'tvm.rpc.server.shutdown'
+    override which exits the session.
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        parsed args from command-line invocation
+
+    Raises
+    ------
+    RuntimeError
+        if both --utvm-dev-config and --utvm-dev-id are given
+    """
+    # `json` is only needed here and is not among this module's top-level
+    # imports, so import it locally; without it the config-file path fails
+    # with a NameError.
+    import json
+
+    if args.utvm_dev_config and args.utvm_dev_id:
+        raise RuntimeError('only one of --utvm-dev-config and --utvm-dev-id allowed')
+
+    if args.utvm_dev_config:
+        with open(args.utvm_dev_config, 'r') as dev_conf_file:
+            dev_config = json.load(dev_conf_file)
+    else:
+        # --utvm-dev-config-args is optional; treat its absence as "no
+        # arguments" instead of letting `ast.literal_eval(None)` fail.
+        if args.utvm_dev_config_args is None:
+            dev_config_args = []
+        else:
+            dev_config_args = ast.literal_eval(args.utvm_dev_config_args)
+        default_config_func = micro.device.get_device_funcs(args.utvm_dev_id)['default_config']
+        dev_config = default_config_func(*dev_config_args)
+
+    if args.utvm_dev_config or args.utvm_dev_id:
+        # add MicroTVM overrides
+        @tvm.register_func('tvm.rpc.server.start', override=True)
+        def server_start():
+            # pylint: disable=unused-variable
+            session = micro.Session(dev_config)
+            session._enter()
+
+            # registered from inside the start hook so that it closes over
+            # the session created above
+            @tvm.register_func('tvm.rpc.server.shutdown', override=True)
+            def server_shutdown():
+                session._exit()
+
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default="0.0.0.0",
@@ -71,6 +114,13 @@ if __name__ == "__main__":
                         and ROCM compilers.")
    parser.add_argument('--custom-addr', type=str,
                        help="Custom IP Address to Report to RPC Tracker")
+    parser.add_argument('--utvm-dev-config', type=str,
+                        help='JSON config file for the target device (if using MicroTVM)')
+    parser.add_argument('--utvm-dev-id', type=str,
+                        help='Unique ID for the target device (if using MicroTVM)')
+    parser.add_argument('--utvm-dev-config-args', type=str,
+                        help=('Python list of literals required to generate a default'
+                              ' MicroTVM config (if --utvm-dev-id is specified)'))

    parser.set_defaults(fork=True)
    args = parser.parse_args()

--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -14,13 +14,9 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-"""uTVM module for bare-metal backends.
-
-uTVM (or the micro backend) enables provides support for bare-metal devices.
-Its targets currently include a host-emulated device which is used for testing,
-and JTAG-based openocd device which allows actual interfacing with microdevices.
-"""
+"""MicroTVM module for bare-metal backends"""

 from ..contrib import binutil
-from .base import Session, cross_compiler, create_micro_lib
+from .base import Session, create_micro_mod, cross_compiler
+from .base import LibType, get_micro_host_driven_dir, get_micro_device_dir
+from . import device
--- a/python/tvm/micro/base.py
+++ b/python/tvm/micro/base.py
@@ -14,71 +14,100 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-"""Base definitions for micro."""
+"""Base definitions for MicroTVM"""

 from __future__ import absolute_import

-import logging
 import os
 import sys
+from enum import Enum

+import tvm
 from tvm.contrib import util as _util
 from tvm.contrib import cc as _cc
-
 from .._ffi.function import _init_api
-from .._ffi.libinfo import find_include_path

-SUPPORTED_DEVICE_TYPES = ["host", "openocd"]
+class LibType(Enum):
+    """Kinds of library that can be compiled for and loaded onto a device"""
+    # the MicroTVM runtime itself
+    RUNTIME = 0
+    # an operator library
+    OPERATOR = 1
+

 class Session:
    """MicroTVM Device Session

    Parameters
    ----------
-    device_type : str
-        type of low-level device
-
-    toolchain_prefix : str
-        toolchain prefix to be used. For example, a prefix of
-        "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as
-        the compiler and "riscv64-unknown-elf-ld" is used as the linker,
-        etc.
+    config : dict
+        configuration for this session (as generated by
+        `tvm.micro.device.host.default_config()`, for example)

    Example
    --------
    .. code-block:: python

      c_mod = ...  # some module generated with "c" as the target
-      device_type = "openocd"
-      toolchain_prefix = "riscv64-unknown-elf-"
-      with tvm.micro.Session(device_type,
-                             toolchain_prefix,
-                             base_addr=0x10010000,
-                             server_addr="127.0.0.1",
-                             port=6666):
-          c_mod.export_library(lib_obj_path, fcompile=tvm.micro.cross_compiler(toolchain_prefix))
-          micro_mod = tvm.module.load(lib_obj_path, "micro_dev")
+      dev_config = micro.device.arm.stm32f746xx.default_config("127.0.0.1", 6666)
+      with tvm.micro.Session(dev_config) as sess:
+          micro_mod = create_micro_mod(c_mod, dev_config)
    """

-    def __init__(self, device_type, toolchain_prefix, **kwargs):
-        if device_type not in SUPPORTED_DEVICE_TYPES:
-            raise RuntimeError("unknown micro device type \"{}\"".format(device_type))
+    def __init__(self, config):
+        """Compile the runtime for the configured device and create the session module.
+
+        Parameters
+        ----------
+        config : dict
+            configuration for this session; must provide "device_id",
+            "toolchain_prefix", "mem_layout", "word_size", "thumb_mode" and
+            "comms_method" (plus "server_addr" and "server_port" when
+            "comms_method" is "openocd")
+
+        Raises
+        ------
+        RuntimeError
+            if "comms_method" is neither "openocd" nor "host"
+        """
        self._check_system()
-        self._check_args(device_type, kwargs)
+        # TODO(weberlo): add config validation
+
+        # look up the device-specific compilation function from the ID in the config
+        dev_funcs = tvm.micro.device.get_device_funcs(config["device_id"])
+        self.create_micro_lib = dev_funcs["create_micro_lib"]
+        self.toolchain_prefix = config["toolchain_prefix"]
+        self.mem_layout = config["mem_layout"]
+        self.word_size = config["word_size"]
+        self.thumb_mode = config["thumb_mode"]
+        self.comms_method = config["comms_method"]

        # First, find and compile runtime library.
-        runtime_src_path = os.path.join(_get_micro_device_dir(), "utvm_runtime.c")
+        runtime_src_path = os.path.join(get_micro_host_driven_dir(), "utvm_runtime.c")
        tmp_dir = _util.tempdir()
        runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj")
-        create_micro_lib(
-            runtime_obj_path, runtime_src_path, toolchain_prefix, include_dev_lib_header=False)
+        self.create_micro_lib(runtime_obj_path, runtime_src_path, LibType.RUNTIME)
+
+        comms_method = self.comms_method
+        if comms_method == "openocd":
+            server_addr = config["server_addr"]
+            server_port = config["server_port"]
+        elif comms_method == "host":
+            # the host device needs no server; pass placeholder values
+            server_addr = ""
+            server_port = 0
+        else:
+            raise RuntimeError(f"unknown communication method: {comms_method}")

-        base_addr = kwargs.get("base_addr", 0)
-        server_addr = kwargs.get("server_addr", "")
-        port = kwargs.get("port", 0)
        self.module = _CreateSession(
-            device_type, runtime_obj_path, toolchain_prefix, base_addr, server_addr, port)
+            comms_method,
+            runtime_obj_path,
+            self.toolchain_prefix,
+            self.mem_layout["text"].get("start", 0),
+            self.mem_layout["text"]["size"],
+            self.mem_layout["rodata"].get("start", 0),
+            self.mem_layout["rodata"]["size"],
+            self.mem_layout["data"].get("start", 0),
+            self.mem_layout["data"]["size"],
+            self.mem_layout["bss"].get("start", 0),
+            self.mem_layout["bss"]["size"],
+            self.mem_layout["args"].get("start", 0),
+            self.mem_layout["args"]["size"],
+            self.mem_layout["heap"].get("start", 0),
+            self.mem_layout["heap"]["size"],
+            self.mem_layout["workspace"].get("start", 0),
+            self.mem_layout["workspace"]["size"],
+            self.mem_layout["stack"].get("start", 0),
+            self.mem_layout["stack"]["size"],
+            self.word_size,
+            self.thumb_mode,
+            server_addr,
+            server_port)
        self._enter = self.module["enter"]
        self._exit = self.module["exit"]

@@ -88,55 +117,57 @@ class Session:
        Raises error if not supported.
        """
        if not sys.platform.startswith("linux"):
-            raise RuntimeError("microTVM is currently only supported on Linux")
+            raise RuntimeError("MicroTVM is currently only supported on Linux hosts")
        # TODO(weberlo): Add 32-bit support.
        # It's primarily the compilation pipeline that isn't compatible.
        if sys.maxsize <= 2**32:
-            raise RuntimeError("microTVM is currently only supported on 64-bit platforms")
-
-    def _check_args(self, device_type, args):
-        """Check if the given configuration is valid."""
-        if device_type == "host":
-            pass
-        elif device_type == "openocd":
-            assert "base_addr" in args
-            assert "server_addr" in args
-            assert "port" in args
+            raise RuntimeError("MicroTVM is currently only supported on 64-bit host platforms")

    def __enter__(self):
+        """Enter the underlying session and return this object for `with ... as`."""
        self._enter()
+        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Exit the underlying session; the exception arguments are not inspected."""
        self._exit()


-def _get_micro_device_dir():
-    """Get directory path for uTVM runtime source files.
+def create_micro_mod(c_mod, dev_config):
+    """Produces a micro module from a given module.
+
+    The module is exported to a temporary object file through the device's
+    cross compiler as an operator library (`LibType.OPERATOR`), and that
+    object file is then loaded with `tvm.module.load`.
+
+    Parameters
+    ----------
+    c_mod : tvm.module.Module
+        module with "c" as its target backend
+
+    dev_config : Dict[str, Any]
+        MicroTVM config dict for the target device

    Return
    ------
-    micro_device_dir : str
-        directory path
+    micro_mod : tvm.module.Module
+        micro module for the target device
    """
-    micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
-    micro_device_dir = os.path.join(micro_dir, "..", "..", "..",
-                                    "src", "runtime", "micro", "device")
-    return micro_device_dir
+    temp_dir = _util.tempdir()
+    # NOTE(review): the ".obj" suffix looks deliberate -- presumably an ".o"
+    # suffix would make `tvm.module.load` treat the file as a shared object;
+    # confirm before renaming.
+    lib_obj_path = temp_dir.relpath("dev_lib.obj")
+    c_mod.export_library(
+        lib_obj_path,
+        fcompile=cross_compiler(dev_config, LibType.OPERATOR))
+    micro_mod = tvm.module.load(lib_obj_path)
+    return micro_mod


-def cross_compiler(toolchain_prefix, include_dev_lib_header=True):
-    """Creates a cross compile function that wraps `create_micro_lib`.
+def cross_compiler(dev_config, lib_type):
+    """Create a cross-compile function that wraps the target device's `create_micro_lib`.

    For use in `tvm.module.Module.export_library`.

    Parameters
    ----------
-    toolchain_prefix : str
-        toolchain prefix to be used
+    dev_config : Dict[str, Any]
+        MicroTVM config dict for the target device

-    include_dev_lib_header : Optional[bool]
-        whether to include the device library header containing definitions of
-        library functions.
+    lib_type : micro.LibType
+        whether to compile a MicroTVM runtime or operator library

    Return
    ------
@@ -149,78 +180,46 @@ def cross_compiler(toolchain_prefix, include_dev_lib_header=True):
    .. code-block:: python

      c_mod = ...  # some module generated with "c" as the target
-      fcompile = tvm.micro.cross_compiler(toolchain_prefix="")
+      fcompile = tvm.micro.cross_compiler(dev_config, LibType.OPERATOR)
      c_mod.export_library("dev_lib.obj", fcompile=fcompile)
    """
+    dev_funcs = tvm.micro.device.get_device_funcs(dev_config['device_id'])
+    create_micro_lib = dev_funcs['create_micro_lib']
    def compile_func(obj_path, src_path, **kwargs):
        if isinstance(obj_path, list):
            obj_path = obj_path[0]
        if isinstance(src_path, list):
            src_path = src_path[0]
-        create_micro_lib(obj_path, src_path, toolchain_prefix,
-                         kwargs.get("options", None), include_dev_lib_header)
-    return _cc.cross_compiler(compile_func)
+        create_micro_lib(obj_path, src_path, lib_type, kwargs.get("options", None))
+    return _cc.cross_compiler(compile_func, output_format="obj")


-def create_micro_lib(
-        obj_path, src_path, toolchain_prefix, options=None, include_dev_lib_header=True):
-    """Compiles code into a binary for the target micro device.
+def get_micro_host_driven_dir():
+    """Get directory path for uTVM host-driven runtime source files.

-    Parameters
-    ----------
-    obj_path : Optional[str]
-        path to generated object file (defaults to same directory as `src_path`)
+    Return
+    ------
+    micro_host_driven_dir : str
+        path to "src/runtime/micro/host_driven", resolved relative to this file
+    """
+    micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
+    micro_host_driven_dir = os.path.join(micro_dir, "..", "..", "..",
+                                         "src", "runtime", "micro", "host_driven")
+    return micro_host_driven_dir

-    src_path : str
-        path to source file

-    toolchain_prefix : str
-        toolchain prefix to be used
+def get_micro_device_dir():
+    """Get directory path for parent directory of device-specific source files

-    include_dev_lib_header : bool
-        whether to include the device library header containing definitions of
-        library functions.
+    Return
+    ------
+    micro_device_dir : str
+        path to "src/runtime/micro/device", resolved relative to this file
    """
-    def replace_suffix(s, new_suffix):
-        if "." in os.path.basename(s):
-            # There already exists an extension.
-            return os.path.join(
-                os.path.dirname(s),
-                ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix]))
-        # No existing extension; we can just append.
-        return s + "." + new_suffix
-
-    # uTVM object files cannot have an ".o" suffix, because it triggers the
-    # code path for creating shared objects in `tvm.module.load`.  So we replace
-    # ".o" suffixes with ".obj".
-    if obj_path.endswith(".o"):
-        logging.warning(
-            "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path)
-        obj_path = replace_suffix(obj_path, "obj")
-
-    options = ["-I" + path for path in find_include_path()]
-    options += ["-I{}".format(_get_micro_device_dir())]
-    options += ["-fno-stack-protector"]
-    # TODO(weberlo): Don't rely on the toolchain prefix to identify if this is the host
-    # device.
-    if toolchain_prefix == "" and sys.maxsize > 2**32 and sys.platform.startswith("linux"):
-        # Only add this option if the host is a 64-bit Linux.
-        options += ["-mcmodel=large"]
-    compile_cmd = "{}gcc".format(toolchain_prefix)
-
-    if include_dev_lib_header:
-        # Create a temporary copy of the source, so we can inject the dev lib
-        # header without modifying the original.
-        tmp_dir = _util.tempdir()
-        temp_src_path = tmp_dir.relpath("temp.c")
-        with open(src_path, "r") as f:
-            src_lines = f.read().splitlines()
-        src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"")
-        with open(temp_src_path, "w") as f:
-            f.write("\n".join(src_lines))
-        src_path = temp_src_path
-
-    _cc.create_shared(obj_path, src_path, options, compile_cmd)
+    micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
+    micro_device_dir = os.path.join(micro_dir, "..", "..", "..",
+                                    "src", "runtime", "micro", "device")
+    return micro_device_dir


 _init_api("tvm.micro", "tvm.micro.base")
--- a/python/tvm/micro/device/__init__.py
+++ b/python/tvm/micro/device/__init__.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Device-specific configuration for MicroTVM"""
+
+from .base import register_device, get_device_funcs, create_micro_lib_base
+from . import host
+from . import arm
+from . import riscv_spike
--- a/python/tvm/micro/device/arm/__init__.py
+++ b/python/tvm/micro/device/arm/__init__.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Base module for ARM device configurations"""
+
+from . import stm32f746xx
--- a/python/tvm/micro/device/arm/stm32f746xx.py
+++ b/python/tvm/micro/device/arm/stm32f746xx.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Compilation and config definitions for ARM STM32F746XX devices"""
+from .. import create_micro_lib_base, register_device
+
+DEVICE_ID = "arm.stm32f746xx"
+TOOLCHAIN_PREFIX = "arm-none-eabi-"
+
+def create_micro_lib(obj_path, src_path, lib_type, options=None):
+    """Wrapper over `create_micro_lib_base` to add device-specific options
+
+    Parameters
+    ----------
+    obj_path : str
+        path to generated object file
+
+    src_path : str
+        path to source file
+
+    lib_type : micro.LibType
+        whether to compile a MicroTVM runtime or operator library
+
+    options : Optional[List[str]]
+        additional options to pass to GCC; the device-specific flags are
+        appended after them, and the caller's list is left unmodified
+    """
+    # Build a fresh list instead of using `+=` on `options`, which would
+    # extend the caller's list in place (and grow it on every call).
+    all_options = [] if options is None else list(options)
+    all_options += [
+        "-mcpu=cortex-m7",
+        "-mlittle-endian",
+        "-mfloat-abi=hard",
+        "-mfpu=fpv5-sp-d16",
+        "-mthumb",
+        "-gdwarf-5",
+        ]
+    create_micro_lib_base(
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=all_options)
+
+
+def default_config(server_addr, server_port):
+    """Build the default MicroTVM configuration for ARM STM32F746XX devices
+
+    Parameters
+    ----------
+    server_addr : str
+        address of the OpenOCD server to connect to
+
+    server_port : int
+        port of the OpenOCD server to connect to
+
+    Return
+    ------
+    config : Dict[str, Any]
+        MicroTVM config dict for this device
+    """
+    #
+    # [Device Memory Layout]
+    #   RAM   (rwx) : START = 0x20000000, LENGTH = 320K
+    #   FLASH (rx)  : START = 0x8000000,  LENGTH = 1024K
+    #
+    # (section name, start address, size); every section starts where the
+    # previous one ends
+    sections = [
+        ("text", 0x20000180, 20480),
+        ("rodata", 0x20005180, 20480),
+        ("data", 0x2000a180, 768),
+        ("bss", 0x2000a480, 768),
+        ("args", 0x2000a780, 1280),
+        ("heap", 0x2000ac80, 262144),
+        ("workspace", 0x2004ac80, 20480),
+        ("stack", 0x2004fc80, 80),
+    ]
+    mem_layout = {}
+    for name, start, size in sections:
+        mem_layout[name] = {"start": start, "size": size}
+
+    config = {
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": mem_layout,
+        "word_size": 4,
+        "thumb_mode": True,
+        "comms_method": "openocd",
+        "server_addr": server_addr,
+        "server_port": server_port,
+    }
+    return config
+
+
+# Expose this device's build and config entry points under its device id.
+register_device(DEVICE_ID, dict(
+    create_micro_lib=create_micro_lib,
+    default_config=default_config,
+))
--- a/python/tvm/micro/device/base.py
+++ b/python/tvm/micro/device/base.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Base definitions for MicroTVM config"""
+import glob
+import os
+from pathlib import Path
+
+from tvm.contrib import util as _util
+from tvm.contrib.binutil import run_cmd
+from tvm._ffi.libinfo import find_include_path
+from tvm.micro import LibType, get_micro_host_driven_dir, get_micro_device_dir
+
+_DEVICE_REGISTRY = {}
+
+def register_device(device_id, device_funcs):
+    """Add a device and its compilation/config functions to the registry
+
+    Parameters
+    ----------
+    device_id : str
+        unique identifier for the device
+
+    device_funcs : Dict[str, func]
+        dictionary with compilation and config generation functions as values
+
+    Raises
+    ------
+    RuntimeError
+        if `device_id` has already been registered
+    """
+    if device_id not in _DEVICE_REGISTRY:
+        _DEVICE_REGISTRY[device_id] = device_funcs
+        return
+    # a second registration under the same id is treated as an error
+    raise RuntimeError(f"\"{device_id}\" already exists in the device registry")
+
+
+def get_device_funcs(device_id):
+    """Get compilation and config generation functions for device
+
+    Parameters
+    ----------
+    device_id : str
+        unique identifier for the device
+
+    Return
+    ------
+    device_funcs : Dict[str, func]
+        dictionary with compilation and config generation functions as values
+
+    Raises
+    ------
+    RuntimeError
+        if no device has been registered under `device_id`
+    """
+    if device_id not in _DEVICE_REGISTRY:
+        # the lookup is against the device registry (the same one that
+        # `register_device` fills), so name it as such in the message
+        raise RuntimeError(f"\"{device_id}\" does not exist in the device registry")
+    return _DEVICE_REGISTRY[device_id]
+
+
+def create_micro_lib_base(
+        out_obj_path,
+        in_src_path,
+        toolchain_prefix,
+        device_id,
+        lib_type,
+        options=None):
+    """Compiles code into a relocatable object file for the target micro device.
+
+    Every source file is compiled to its own object file inside a temporary
+    directory, and the objects are then combined with `ld -relocatable` into
+    `out_obj_path`.
+
+    Parameters
+    ----------
+    out_obj_path : str
+        path to generated object file
+
+    in_src_path : str
+        path to source file
+
+    toolchain_prefix : str
+        toolchain prefix to be used. For example, a prefix of
+        "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as
+        the compiler and "riscv64-unknown-elf-ld" is used as the linker,
+        etc.
+
+    device_id : str
+        unique identifier for the target device
+
+    lib_type : micro.LibType
+        whether to compile a MicroTVM runtime or operator library
+
+    options : Optional[List[str]]
+        additional options to pass to GCC
+
+    Raises
+    ------
+    RuntimeError
+        if `lib_type` is neither `LibType.RUNTIME` nor `LibType.OPERATOR`
+    """
+    base_compile_cmd = [
+        f"{toolchain_prefix}gcc",
+        "-std=c11",
+        "-Wall",
+        "-Wextra",
+        "--pedantic",
+        "-c",
+        "-O0",
+        "-g",
+        "-nostartfiles",
+        "-nodefaultlibs",
+        "-nostdlib",
+        "-fdata-sections",
+        "-ffunction-sections",
+        ]
+    if options is not None:
+        base_compile_cmd += options
+
+    src_paths = []
+    include_paths = find_include_path() + [get_micro_host_driven_dir()]
+    tmp_dir = _util.tempdir()
+    # we might transform the src path in one of the branches below
+    new_in_src_path = in_src_path
+    if lib_type == LibType.RUNTIME:
+        # the runtime is built together with every device-specific source file
+        dev_dir = _get_device_source_dir(device_id)
+        dev_src_paths = glob.glob(f"{dev_dir}/*.[csS]")
+        # there needs to at least be a utvm_timer.c file
+        assert dev_src_paths, f"no device sources found in \"{dev_dir}\""
+        assert "utvm_timer.c" in map(os.path.basename, dev_src_paths), \
+            f"\"{dev_dir}\" does not contain a utvm_timer.c file"
+        src_paths += dev_src_paths
+    elif lib_type == LibType.OPERATOR:
+        # create a temporary copy of the source, so we can inject the dev lib
+        # header without modifying the original.
+        temp_src_path = tmp_dir.relpath("temp.c")
+        with open(in_src_path, "r") as f:
+            src_lines = f.read().splitlines()
+        src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"")
+        with open(temp_src_path, "w") as f:
+            f.write("\n".join(src_lines))
+        new_in_src_path = temp_src_path
+        # NOTE: "-c" is already part of `base_compile_cmd`, so it is not
+        # appended a second time here.
+    else:
+        raise RuntimeError("unknown lib type")
+
+    src_paths += [new_in_src_path]
+
+    for path in include_paths:
+        base_compile_cmd += ["-I", path]
+
+    prereq_obj_paths = []
+    for src_path in src_paths:
+        # Keep the intermediate objects in the temp dir instead of the current
+        # working directory, so concurrent builds cannot overwrite each
+        # other's objects and nothing is left behind in the cwd.
+        curr_obj_path = tmp_dir.relpath(Path(src_path).with_suffix(".o").name)
+        # two sources with the same basename would map to the same object file
+        assert curr_obj_path not in prereq_obj_paths, \
+            f"multiple sources map to the object file \"{curr_obj_path}\""
+        prereq_obj_paths.append(curr_obj_path)
+        curr_compile_cmd = base_compile_cmd + [src_path, "-o", curr_obj_path]
+        run_cmd(curr_compile_cmd)
+
+    # combine all of the objects into a single relocatable object
+    ld_cmd = [f"{toolchain_prefix}ld", "-relocatable"]
+    ld_cmd += prereq_obj_paths
+    ld_cmd += ["-o", out_obj_path]
+    run_cmd(ld_cmd)
+
+
+def _get_device_source_dir(device_id):
+    """Returns the directory holding the device-specific uTVM sources
+
+    Each "." in `device_id` becomes a "/" so that a dotted id names a nested
+    subdirectory of the micro device directory.
+    """
+    dev_subdir = device_id.replace(".", "/")
+    return "/".join((get_micro_device_dir(), dev_subdir))
--- a/python/tvm/micro/device/host.py
+++ b/python/tvm/micro/device/host.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Compilation and config definitions for the host emulated device"""
+import sys
+
+from . import create_micro_lib_base, register_device
+
+DEVICE_ID = "host"
+TOOLCHAIN_PREFIX = ""
+
+def create_micro_lib(obj_path, src_path, lib_type, options=None):
+    """Wrapper over `create_micro_lib_base` to add device-specific options
+
+    Parameters
+    ----------
+    obj_path : str
+        path to generated object file
+
+    src_path : str
+        path to source file
+
+    lib_type : micro.LibType
+        whether to compile a MicroTVM runtime or operator library
+
+    options : Optional[List[str]]
+        additional options to pass to GCC.  The given list is not modified.
+    """
+    # work on a copy, so the extra flag below never leaks into the caller's
+    # list (which would otherwise grow by one entry on every call)
+    options = [] if options is None else list(options)
+    # 64-bit Linux hosts additionally get the large code model
+    if sys.maxsize > 2**32 and sys.platform.startswith("linux"):
+        options.append("-mcmodel=large")
+    create_micro_lib_base(
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
+
+
+def default_config():
+    """Builds the default MicroTVM configuration for the host emulated device
+
+    Return
+    ------
+    config : Dict[str, Any]
+        MicroTVM config dict for this device
+    """
+    # (section name, size in bytes); no start addresses are specified here
+    section_sizes = (
+        ("text", 20480),
+        ("rodata", 20480),
+        ("data", 768),
+        ("bss", 768),
+        ("args", 1280),
+        ("heap", 262144),
+        ("workspace", 20480),
+        ("stack", 80),
+    )
+    mem_layout = {name: {"size": size} for (name, size) in section_sizes}
+    # the word size follows the width of the running interpreter
+    if sys.maxsize > 2**32:
+        word_size = 8
+    else:
+        word_size = 4
+    return {
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": mem_layout,
+        "word_size": word_size,
+        "thumb_mode": False,
+        "comms_method": "host",
+    }
+
+
+# Expose this device's build and config entry points under its device id.
+register_device(DEVICE_ID, dict(
+    create_micro_lib=create_micro_lib,
+    default_config=default_config,
+))
--- a/python/tvm/micro/device/riscv_spike.py
+++ b/python/tvm/micro/device/riscv_spike.py
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Compilation and config definitions for Spike, a RISC-V functional ISA simulator"""
+from collections import OrderedDict
+
+from . import create_micro_lib_base, register_device
+
+DEVICE_ID = "riscv_spike"
+TOOLCHAIN_PREFIX = "riscv64-unknown-elf-"
+
+def create_micro_lib(obj_path, src_path, lib_type, options=None):
+    """Builds a micro library for Spike by delegating to `create_micro_lib_base`
+
+    The only things this wrapper contributes are the Spike toolchain prefix
+    and device id; `options` is forwarded as given.
+
+    Parameters
+    ----------
+    obj_path : str
+        path to generated object file
+
+    src_path : str
+        path to source file
+
+    lib_type : micro.LibType
+        whether to compile a MicroTVM runtime or operator library
+
+    options : Optional[List[str]]
+        additional options to pass to GCC
+    """
+    create_micro_lib_base(
+        obj_path, src_path, TOOLCHAIN_PREFIX, DEVICE_ID, lib_type, options=options)
+
+
+def default_config(base_addr, server_addr, server_port):
+    """Builds the default MicroTVM configuration for Spike
+
+    Parameters
+    ----------
+    base_addr : int
+        base address of the simulator (for calculating the memory layout)
+
+    server_addr : str
+        address of the OpenOCD server to connect to
+
+    server_port : int
+        port of the OpenOCD server to connect to
+
+    Return
+    ------
+    config : Dict[str, Any]
+        MicroTVM config dict for this device
+    """
+    # (section name, size in bytes), in the order the sections are laid out
+    section_sizes = (
+        ("text", 20480),
+        ("rodata", 20480),
+        ("data", 768),
+        ("bss", 768),
+        ("args", 1280),
+        ("heap", 262144),
+        ("workspace", 20480),
+        ("stack", 80),
+    )
+    # place the sections back to back, starting at the given `base_addr`
+    mem_layout = OrderedDict()
+    offset = 0
+    for (name, size) in section_sizes:
+        region = {"size": size}
+        region["start"] = base_addr + offset
+        mem_layout[name] = region
+        offset += size
+    # NOTE(review): "thumb_mode" is True although this is a RISC-V target;
+    # confirm how the consumer of this config interprets the flag.
+    return {
+        "device_id": DEVICE_ID,
+        "toolchain_prefix": TOOLCHAIN_PREFIX,
+        "mem_layout": mem_layout,
+        "word_size": 4,
+        "thumb_mode": True,
+        "comms_method": "openocd",
+        "server_addr": server_addr,
+        "server_port": server_port,
+    }
+
+
+# Expose this device's build and config entry points under its device id.
+register_device(DEVICE_ID, dict(
+    create_micro_lib=create_micro_lib,
+    default_config=default_config,
+))
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -265,6 +265,9 @@ def load(path, fmt=""):
        files = [tar_temp.relpath(x) for x in tar_temp.listdir()]
        _cc.create_shared(path + ".so", files)
        path += ".so"
+    # TODO(weberlo): we should probably use a more distinctive suffix for uTVM object files
+    elif path.endswith(".obj"):
+        fmt = "micro_dev"
    # Redirect to the load API
    return _LoadFromFile(path, fmt)


--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -85,7 +85,9 @@ Target CreateTarget(const std::string& target_name,
  }
  t->device_type = kDLCPU;
  t->thread_warp_size = 1;
-  if (target_name == "c" || target_name == "llvm") {
+  if (target_name == "c" && t->device_name == "micro_dev") {
+    t->device_type = kDLMicroDev;
+  } else if (target_name == "c" || target_name == "llvm") {
    t->keys_array.push_back(ir::StringImm::make("cpu"));
  } else if (target_name == "cuda" || target_name == "nvptx") {
    t->device_type = kDLGPU;

--- a/src/codegen/codegen_c_host.cc
+++ b/src/codegen/codegen_c_host.cc
@@ -33,7 +33,8 @@ CodeGenCHost::CodeGenCHost() {
  module_name_ = GetUniqueName("__tvm_module_ctx");
 }

-void CodeGenCHost::Init(bool output_ssa) {
+void CodeGenCHost::Init(bool output_ssa, bool emit_asserts) {
+  emit_asserts_ = emit_asserts;
  decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
  decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
  decl_stream << "extern void* " << module_name_ << " = NULL;\n";
@@ -237,6 +238,7 @@ void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*)
 }

 void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*)
+  if (emit_asserts_) {
    std::string cond = PrintExpr(op->condition);
    PrintIndent();
    stream << "if (!(" << cond << ")) {\n";
@@ -248,6 +250,7 @@ void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*)
    this->EndScope(assert_if_scope);
    PrintIndent();
    stream << "}\n";
+  }
  this->PrintStmt(op->body);
 }

@@ -277,8 +280,9 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op,
 runtime::Module BuildCHost(Array<LoweredFunc> funcs) {
  using tvm::runtime::Registry;
  bool output_ssa = false;
+  bool emit_asserts = false;
  CodeGenCHost cg;
-  cg.Init(output_ssa);
+  cg.Init(output_ssa, emit_asserts);
  for (LoweredFunc f : funcs) {
    cg.AddFunction(f);
  }

--- a/src/codegen/codegen_c_host.h
+++ b/src/codegen/codegen_c_host.h
@@ -35,7 +35,7 @@ namespace codegen {
 class CodeGenCHost final : public CodeGenC {
 public:
  CodeGenCHost();
-  void Init(bool output_ssa);
+  void Init(bool output_ssa, bool emit_asserts);
  void AddFunction(LoweredFunc f);
  std::string Finish();

@@ -53,6 +53,8 @@ class CodeGenCHost final : public CodeGenC {

 private:
  std::string module_name_;
+  /*! \brief whether to emit asserts in the resulting C code */
+  bool emit_asserts_;

  void PrintGetFuncFromBackend(const std::string& func_name, const std::string& packed_func_name);
  void PrintFuncCall(const std::string& packed_func_name, int num_args);

--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_init.s
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utvm_init.s
+ * \brief uTVM init definition for STM32F746XX-series boards
+ */
+
+.syntax unified
+.cpu cortex-m7
+.fpu softvfp
+.thumb
+
+/*
+ * UTVMInit: device entry stub.  It sets the FPU access bits, loads the stack
+ * pointer from the `_utvm_stack_pointer_init` linker symbol, and then
+ * branches (with link) to UTVMMain.
+ */
+.section .text.UTVMInit
+.type UTVMInit, %function
+UTVMInit:
+  /*
+   * enable fpu: read-modify-write of the word at 0xE000ED88, OR-ing in
+   * 0xF00000 (bits 20-23).  That address should be the coprocessor access
+   * control register (CPACR) of ARMv7-M -- confirm against the reference
+   * manual.
+   */
+  ldr r0, =0xE000ED88
+  ldr r1, [r0]
+  ldr r2, =0xF00000
+  orr r1, r2
+  str r1, [r0]
+  /* barriers after the register write, before any later instruction runs */
+  dsb
+  isb
+  /* set stack pointer (the symbol is defined by the generated linker script) */
+  ldr sp, =_utvm_stack_pointer_init
+  /* hand control to the device-independent runtime entry point */
+  bl UTVMMain
+.size UTVMInit, .-UTVMInit
--- a/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
+++ b/src/runtime/micro/device/arm/stm32f746xx/utvm_timer.c
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utvm_timer.c
+ * \brief uTVM timer API definitions for STM32F746XX-series boards
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "utvm_runtime.h"
+
+// There are two implementations of cycle counters on the STM32F7X: SysTick and
+// CYCCNT.  SysTick is preferred, as it gives better error handling, but the
+// counter is only 24 bits wide.  If a larger timer is needed, use the CYCCNT
+// implementation, which has a 32-bit counter.
+#define USE_SYSTICK
+
+#ifdef USE_SYSTICK
+
+#define SYST_CSR    (*((volatile uint32_t *) 0xE000E010))
+#define SYST_RVR    (*((volatile uint32_t *) 0xE000E014))
+#define SYST_CVR    (*((volatile uint32_t *) 0xE000E018))
+#define SYST_CALIB  (*((volatile uint32_t *) 0xE000E01C))
+
+#define SYST_CSR_ENABLE     0
+#define SYST_CSR_TICKINT    1
+#define SYST_CSR_CLKSOURCE  2
+#define SYST_COUNTFLAG      16
+
+#define SYST_CALIB_NOREF  31
+#define SYST_CALIB_SKEW   30
+
+uint32_t start_time = 0;
+uint32_t stop_time = 0;
+
+/*!
+ * \brief start the SysTick counter and record its value as the start time
+ * \return always 0 (this implementation has no failure path)
+ */
+int32_t UTVMTimerStart() {
+  // set only the ENABLE and CLKSOURCE bits of the control register
+  SYST_CSR = (1 << SYST_CSR_ENABLE) | (1 << SYST_CSR_CLKSOURCE);
+  // wait until timer starts
+  while (SYST_CVR == 0) {}
+  start_time = SYST_CVR;
+  return 0;
+}
+
+/*!
+ * \brief stop the SysTick counter and record its value as the stop time
+ */
+void UTVMTimerStop() {
+  // writing 0 clears every control bit, including ENABLE
+  SYST_CSR = 0;
+  stop_time = SYST_CVR;
+}
+
+/*!
+ * \brief disable the SysTick counter and restore its reload/current values
+ */
+void UTVMTimerReset() {
+  // clear every control bit, including ENABLE
+  SYST_CSR = 0;
+  // maximum reload value (24-bit)
+  SYST_RVR = (~((uint32_t) 0)) >> 8;
+  // write 0 to the current-value register
+  SYST_CVR = 0;
+}
+
+/*!
+ * \brief get the number of ticks between the last timer start and stop
+ * \return elapsed tick count, or (uint32_t) -1 with the last error set if the
+ *         counter overflowed
+ */
+uint32_t UTVMTimerRead() {
+  // SYST_COUNTFLAG is a bit position (like the SYST_CSR_* constants), so it
+  // has to be shifted into a mask before it is tested against the register.
+  if (SYST_CSR & (((uint32_t) 1) << SYST_COUNTFLAG)) {
+    TVMAPISetLastError("timer overflowed");
+    return (uint32_t) -1;
+  } else {
+    // elapsed ticks are start minus stop, i.e. the counter value decreases
+    // while the timer runs
+    return start_time - stop_time;
+  }
+}
+
+#else  // !USE_SYSTICK
+
+#define DWT_CTRL    (*((volatile uint32_t *) 0xE0001000))
+#define DWT_CYCCNT  (*((volatile uint32_t *) 0xE0001004))
+
+#define DWT_CTRL_NOCYCCNT   25
+#define DWT_CTRL_CYCCNTENA  0
+
+uint32_t start_time = 0;
+uint32_t stop_time = 0;
+
+/*!
+ * \brief reset the cycle counter by writing 0 to DWT_CYCCNT
+ */
+void UTVMTimerReset() {
+  DWT_CYCCNT = 0;
+}
+
+/*!
+ * \brief record the current cycle count and enable the cycle counter
+ * \return 0 on success, -1 (with the last error set) if the NOCYCCNT bit of
+ *         DWT_CTRL is set
+ */
+int32_t UTVMTimerStart() {
+  // DWT_CTRL_NOCYCCNT is a bit position, so shift it into a mask before
+  // testing the register (same convention as DWT_CTRL_CYCCNTENA below)
+  if (DWT_CTRL & (((uint32_t) 1) << DWT_CTRL_NOCYCCNT)) {
+    TVMAPISetLastError("cycle counter not implemented on device");
+    return -1;
+  }
+  start_time = DWT_CYCCNT;
+  DWT_CTRL |= (1 << DWT_CTRL_CYCCNTENA);
+  // report success explicitly; callers treat a negative value as failure
+  return 0;
+}
+
+/*!
+ * \brief record the current cycle count as the stop time, then clear the
+ *        CYCCNTENA bit of DWT_CTRL
+ */
+void UTVMTimerStop() {
+  stop_time = DWT_CYCCNT;
+  DWT_CTRL &= ~(1 << DWT_CTRL_CYCCNTENA);
+}
+
+/*!
+ * \brief get the number of cycles between the last timer start and stop
+ * \return elapsed cycle count (valid if the 32-bit counter wrapped at most
+ *         once in between)
+ */
+uint32_t UTVMTimerRead() {
+  // Both values are uint32_t, so the subtraction is performed modulo 2^32
+  // and yields the right distance whether or not the counter wrapped between
+  // start and stop.  The return type matches the `uint32_t UTVMTimerRead()`
+  // declaration in utvm_runtime.h.
+  return stop_time - start_time;
+}
+
+#endif  // USE_SYSTICK
+
+#ifdef __cplusplus
+}  // TVM_EXTERN_C
+#endif
--- a/src/runtime/micro/device/host/utvm_init.c
+++ b/src/runtime/micro/device/host/utvm_init.c
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utvm_init.c
+ * \brief uTVM init definition for the host emulated device
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "utvm_runtime.h"
+
+// Entry point for the host emulated device: control goes straight to
+// UTVMMain, because no device-specific setup is performed here.
+void UTVMInit() {
+  // no init required for the host
+  UTVMMain();
+}
+
+#ifdef __cplusplus
+}  // TVM_EXTERN_C
+#endif
--- a/src/runtime/micro/device/host/utvm_timer.c
+++ b/src/runtime/micro/device/host/utvm_timer.c
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file utvm_timer.c
+ * \brief uTVM timer API stubs for the host emulated device
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "utvm_runtime.h"
+
+// TODO(weberlo): use this? https://stackoverflow.com/questions/5141960/get-the-current-time-in-c
+
+// Stub: nothing is started on the host; always returns 0.
+int32_t UTVMTimerStart() {
+  return 0;
+}
+
+// Stub: there is no timer to stop on the host.
+void UTVMTimerStop() { }
+
+// Stub: there is no timer state to reset on the host.
+void UTVMTimerReset() { }
+
+// Stub: no time is measured on the host; a constant 1 is returned.
+uint32_t UTVMTimerRead() {
+  return 1;
+}
+
+#ifdef __cplusplus
+}  // TVM_EXTERN_C
+#endif
--- a/src/runtime/micro/device/utvm_device_dylib_redirect.c
+++ b/src/runtime/micro/device/utvm_device_dylib_redirect.c
--- a/src/runtime/micro/device/utvm_runtime.c
+++ b/src/runtime/micro/device/utvm_runtime.c
@@ -21,9 +21,9 @@
 * \file utvm_runtime.cc
 * \brief uTVM runtime
 *
- * All function calls go through `UTVMMain`, which reads from the current
- * `UTVMTask` and calls the appropriate function with the arguments from the
- * task.
+ * All function calls go through the externally defined `UTVMInit`, which
+ * performs device-specific setup, then calls `UTVMMain`.  `UTVMMain` then
+ * calls the function in `utvm_task` with the arguments from the task.
 *
 * Additionally included in this file are definitions for some of the most
 * common functions used in the C runtime API.
@@ -35,10 +35,17 @@ extern "C" {
 #include "utvm_runtime.h"

 // Task pointers must be patched before calling a function.
-UTVMTask task;
+UTVMTask utvm_task = {
+    .func = NULL,
+    .arg_values = NULL,
+    .arg_type_codes = NULL,
+    .num_args = 0,
+};
+
+size_t utvm_word_size = 0;  // NOLINT(*)

 // These pointers are patched at load time to point to the workspace section.
-char* utvm_workspace_begin = NULL;  // NOLINT(*)
+char* utvm_workspace_start = NULL;  // NOLINT(*)
 char* utvm_workspace_end = NULL;    // NOLINT(*)
 char* utvm_workspace_curr = NULL;   // NOLINT(*)
 // Keep track of how many active allocations there are on the workspace.
@@ -47,24 +54,39 @@ size_t utvm_num_active_allocs = 0;
 const char* utvm_last_error = NULL;  // NOLINT(*)
 int32_t utvm_return_code = 0;        // NOLINT(*)

-// We use a dummy function to signal execution is finished for device
-// backends which require breakpoints.
-void UTVMDone() { }
+uint32_t utvm_task_time = 0;

+// Gets called by UTVMInit, after device-specific initialization is finished.
 void UTVMMain() {
-  utvm_workspace_curr = utvm_workspace_begin;
+  utvm_workspace_curr = utvm_workspace_start;
  utvm_num_active_allocs = 0;
  utvm_last_error = NULL;  // NOLINT(*)
  utvm_return_code = 0;
-  utvm_return_code = task.func((void*) task.arg_values, (void*) task.arg_type_codes,  // NOLINT(*)
-                               task.num_args);
+  utvm_task_time = 0;
+  UTVMTimerReset();
+  utvm_return_code = UTVMTimerStart();  // a negative value means the timer failed to start
+  if (utvm_return_code < 0) {
+    UTVMDone();  // signal completion, with the error left in utvm_return_code
+    return;  // do not fall through and run the task after a failed start
+  }
+  utvm_return_code = utvm_task.func(
+          (void*) utvm_task.arg_values,      // NOLINT(*)
+          (void*) utvm_task.arg_type_codes,  // NOLINT(*)
+          utvm_task.num_args);
+  UTVMTimerStop();
+  utvm_task_time = UTVMTimerRead();
  UTVMDone();
 }

+// We use a dummy function to signal execution is finished for device
+// backends which require breakpoints.
+void UTVMDone() { }
+
 void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size,
                               int dtype_code_hint, int dtype_bits_hint) {
  // Align up to 8 bytes.
-  utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8;  // NOLINT(*)
+  utvm_workspace_curr +=
+    (utvm_word_size - ((uintptr_t) utvm_workspace_curr % utvm_word_size)) % utvm_word_size;  // NOLINT(*)
  if (utvm_workspace_curr + size > utvm_workspace_end) {
    // Out of space in workspace.
    return NULL;
@@ -81,11 +103,11 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
    TVMAPISetLastError("free called with no active workspace allocations");
    // Reset allocations and workspace (for future task executions).
    utvm_num_active_allocs = 0;
-    utvm_workspace_curr = utvm_workspace_begin;
+    utvm_workspace_curr = utvm_workspace_start;
    return -1;
  } else if (utvm_num_active_allocs == 0) {
    // No more allocations.  Reset workspace.
-    utvm_workspace_curr = utvm_workspace_begin;
+    utvm_workspace_curr = utvm_workspace_start;
    return 0;
  } else {
    return 0;

--- a/src/runtime/micro/device/utvm_runtime.h
+++ b/src/runtime/micro/device/utvm_runtime.h
@@ -21,8 +21,8 @@
 * \file utvm_runtime.h
 * \brief uTVM runtime headers
 */
-#ifndef TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_
-#define TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_
+#ifndef TVM_RUNTIME_MICRO_HOST_DRIVEN_UTVM_RUNTIME_H_
+#define TVM_RUNTIME_MICRO_HOST_DRIVEN_UTVM_RUNTIME_H_

 #ifdef __cplusplus
 extern "C" {
@@ -30,6 +30,7 @@ extern "C" {

 #include <stdint.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/c_backend_api.h>

 /*!
 * \brief Task structure for uTVM
@@ -45,8 +46,22 @@ typedef struct {
  int32_t num_args;
 } UTVMTask;

+extern void UTVMInit();
+
+extern void UTVMTimerReset();
+
+extern int32_t UTVMTimerStart();
+
+extern void UTVMTimerStop();
+
+extern uint32_t UTVMTimerRead();
+
+void UTVMMain();
+
+void UTVMDone();
+
 #ifdef __cplusplus
 }  // TVM_EXTERN_C
 #endif

-#endif  // TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_
+#endif  // TVM_RUNTIME_MICRO_HOST_DRIVEN_UTVM_RUNTIME_H_
--- a/src/runtime/micro/host_low_level_device.cc
+++ b/src/runtime/micro/host_low_level_device.cc
@@ -31,6 +31,9 @@
 namespace tvm {
 namespace runtime {

+/*! \brief number of bytes in each page */
+constexpr int kPageSize = 4096;
+
 /*!
 * \brief emulated low-level device on host machine
 */
@@ -40,40 +43,33 @@ class HostLowLevelDevice final : public LowLevelDevice {
   * \brief constructor to initialize on-host memory region to act as device
   * \param num_bytes size of the emulated on-device memory region
   */
-  explicit HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) {
+  explicit HostLowLevelDevice(size_t num_bytes, void** base_addr) : size_(num_bytes) {
    size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize;
    // TODO(weberlo): Set permissions per section (e.g., read-write perms for
    // the heap, execute perms for text, etc.).
    int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
    int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE;
-    base_addr_ = reinterpret_cast<std::uintptr_t>(
-        mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0));
+    base_addr_ = mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0);
+    *base_addr = base_addr_;
  }

  /*!
   * \brief destructor to deallocate on-host device region
   */
  virtual ~HostLowLevelDevice() {
-    munmap(reinterpret_cast<void*>(base_addr_), size_);
-  }
-
-  void Read(DevBaseOffset offset, void* buf, size_t num_bytes) {
-    void* addr = ToDevPtr(offset).cast_to<void*>();
-    std::memcpy(buf, addr, num_bytes);
+    munmap(base_addr_, size_);
  }

-  void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) {
-    void* addr = ToDevPtr(offset).cast_to<void*>();
-    std::memcpy(addr, buf, num_bytes);
+  void Read(DevPtr addr, void* buf, size_t num_bytes) {
+    std::memcpy(buf, addr.cast_to<void*>(), num_bytes);
  }

-  void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) {
-    DevPtr func_addr = ToDevPtr(func_offset);
-    reinterpret_cast<void (*)(void)>(func_addr.value())();
+  void Write(DevPtr addr, const void* buf, size_t num_bytes) {
+    std::memcpy(addr.cast_to<void*>(), buf, num_bytes);
  }

-  std::uintptr_t base_addr() const final {
-    return base_addr_;
+  void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
+    reinterpret_cast<void (*)(void)>(func_addr.value().val64)();
  }

  const char* device_type() const final {
@@ -82,14 +78,14 @@ class HostLowLevelDevice final : public LowLevelDevice {

 private:
  /*! \brief base address of the micro device memory region */
-  std::uintptr_t base_addr_;
+  void* base_addr_;
  /*! \brief size of memory region */
  size_t size_;
 };

-const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes) {
+const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr) {
  std::shared_ptr<LowLevelDevice> lld =
-      std::make_shared<HostLowLevelDevice>(num_bytes);
+      std::make_shared<HostLowLevelDevice>(num_bytes, base_addr);
  return lld;
 }


--- a/src/runtime/micro/low_level_device.h
+++ b/src/runtime/micro/low_level_device.h
@@ -40,87 +40,52 @@ class LowLevelDevice {
  virtual ~LowLevelDevice() {}

  /*!
-   * \brief reads num_bytes from device memory at base_addr + offset into buffer
-   * \param offset on-device memory offset pointer to be read from
+   * \brief reads num_bytes from device memory at addr into buffer
+   * \param addr on-device memory address to read from
   * \param buffer on-host buffer to be read into
-   * \param num_bytes number of bytes to be read
+   * \param num_bytes number of bytes to read
   */
-  virtual void Read(DevBaseOffset offset,
+  virtual void Read(DevPtr addr,
                    void* buffer,
                    size_t num_bytes) = 0;

  /*!
-   * \brief writes num_bytes from buffer to device memory at base_addr + offset
-   * \param offset on-device memory offset pointer to be written to
-   * \param buffer on-host buffer to be written
-   * \param num_bytes number of bytes to be written
+   * \brief writes num_bytes from buffer to device memory at addr
+   * \param addr on-device memory address to write into
+   * \param buffer host buffer to write from
+   * \param num_bytes number of bytes to write
   */
-  virtual void Write(DevBaseOffset offset,
+  virtual void Write(DevPtr addr,
                     const void* buffer,
                     size_t num_bytes) = 0;

  /*!
-   * \brief starts execution of device at offset
+   * \brief starts execution of device at func_addr
   * \param func_addr offset of the init stub function
-   * \param breakpoint breakpoint at which to stop function execution
+   * \param breakpoint_addr address at which to stop function execution
   */
-  virtual void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) = 0;
-
-  // TODO(weberlo): Should we just give the device the *entire* memory layout
-  // decided by the session?
-
-  /*!
-   * \brief sets the offset of the top of the stack section
-   * \param stack_top offset of the stack top
-   */
-  virtual void SetStackTop(DevBaseOffset stack_top) {
-    LOG(FATAL) << "unimplemented";
-  }
-
-  /*!
-   * \brief convert from base offset to absolute address
-   * \param offset base offset
-   * \return absolute address
-   */
-  DevPtr ToDevPtr(DevBaseOffset offset) {
-    return DevPtr(base_addr() + offset.value());
-  }
-
-  /*!
-   * \brief convert from absolute address to base offset
-   * \param ptr absolute address
-   * \return base offset
-   */
-  DevBaseOffset ToDevOffset(DevPtr ptr) {
-    return DevBaseOffset(ptr.value() - base_addr());
-  }
+  virtual void Execute(DevPtr func_addr, DevPtr breakpoint_addr) = 0;

  /*!
   * \brief getter function for low-level device type
   * \return string containing device type
   */
  virtual const char* device_type() const = 0;
-
- protected:
-  /*!
-   * \brief getter function for base_addr
-   * \return the base address of the device memory region
-   */
-  virtual std::uintptr_t base_addr() const = 0;
 };

 /*!
 * \brief create a host low-level device
 * \param num_bytes size of the memory region
+ * \param base_addr pointer to write the host device's resulting base address into
 */
-const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes);
+const std::shared_ptr<LowLevelDevice> HostLowLevelDeviceCreate(size_t num_bytes, void** base_addr);

 /*!
 * \brief connect to OpenOCD and create an OpenOCD low-level device
+ * \param addr address of the OpenOCD server to connect to
 * \param port port of the OpenOCD server to connect to
 */
-const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(std::uintptr_t base_addr,
-                                                                  const std::string& addr,
+const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& addr,
                                                                  int port);

 }  // namespace runtime

--- a/src/runtime/micro/micro_common.cc
+++ b/src/runtime/micro/micro_common.cc
@@ -35,30 +35,6 @@
 namespace tvm {
 namespace runtime {

-size_t GetDefaultSectionSize(SectionKind kind) {
-  switch (kind) {
-    case SectionKind::kText:
-      return 0xF000;
-    case SectionKind::kRodata:
-      return 0xF000;
-    case SectionKind::kData:
-      return 0xF00;
-    case SectionKind::kBss:
-      return 0xF00;
-    case SectionKind::kArgs:
-      return 0xF0000;
-    case SectionKind::kStack:
-      return 0xF000;
-    case SectionKind::kHeap:
-      return 0xF00000;
-    case SectionKind::kWorkspace:
-      return 0xF0000;
-    default:
-      LOG(FATAL) << "invalid section " << static_cast<size_t>(kind);
-      return 0;
-  }
-}
-
 const char* SectionToString(SectionKind section) {
  switch (section) {
    case SectionKind::kText: return "text";
@@ -66,37 +42,32 @@ const char* SectionToString(SectionKind section) {
    case SectionKind::kData: return "data";
    case SectionKind::kBss: return "bss";
    case SectionKind::kArgs: return "args";
-    case SectionKind::kStack: return "stack";
    case SectionKind::kHeap: return "heap";
    case SectionKind::kWorkspace: return "workspace";
+    case SectionKind::kStack: return "stack";
    default: return "";
  }
 }

-static std::string AddrToString(void* addr) {
-  std::stringstream stream;
-  if (addr != nullptr)
-    stream << addr;
-  else
-    stream << "0x0";
-  std::string string_addr = stream.str();
-  return string_addr;
-}
-
-std::string RelocateBinarySections(const std::string& binary_path,
-                                   DevPtr text,
-                                   DevPtr rodata,
-                                   DevPtr data,
-                                   DevPtr bss,
+std::string RelocateBinarySections(
+    const std::string& binary_path,
+    size_t word_size,
+    DevPtr text_start,
+    DevPtr rodata_start,
+    DevPtr data_start,
+    DevPtr bss_start,
+    DevPtr stack_end,
    const std::string& toolchain_prefix) {
  const auto* f = Registry::Get("tvm_callback_relocate_binary");
  CHECK(f != nullptr)
    << "Require tvm_callback_relocate_binary to exist in registry";
  std::string relocated_bin = (*f)(binary_path,
-                                   AddrToString(text.cast_to<void*>()),
-                                   AddrToString(rodata.cast_to<void*>()),
-                                   AddrToString(data.cast_to<void*>()),
-                                   AddrToString(bss.cast_to<void*>()),
+                                   word_size,
+                                   text_start.cast_to<uint64_t>(),
+                                   rodata_start.cast_to<uint64_t>(),
+                                   data_start.cast_to<uint64_t>(),
+                                   bss_start.cast_to<uint64_t>(),
+                                   stack_end.cast_to<uint64_t>(),
                                   toolchain_prefix);
  return relocated_bin;
 }

--- a/src/runtime/micro/micro_common.h
+++ b/src/runtime/micro/micro_common.h
@@ -46,122 +46,79 @@ enum class SectionKind : size_t {
  kData,
  kBss,
  kArgs,
-  kStack,
  kHeap,
  kWorkspace,
+  kStack,
  kNumKinds,
 };

-/*! \brief default size alignment */
-constexpr int kDefaultSizeAlignment = 8;
+/*! \brief union for storing values on varying target word sizes */
+union TargetVal {
+  /*! \brief value as seen by a target with 32-bit (4-byte) words */
+  uint32_t val32;
+  /*! \brief value as seen by a target with 64-bit (8-byte) words */
+  uint64_t val64;
+};

-/*! \brief Base class for interfacing with device locations (pointers/offsets) */
-class DeviceLocation {
+/*! \brief absolute device address */
+class DevPtr {
 public:
-  /*! \brief construct a location with value `value` */
-  explicit DeviceLocation(std::uintptr_t value) : value_(value) {}
+  /*! \brief construct a device address with value `value` */
+  explicit DevPtr(std::uintptr_t value) : value_(TargetVal { .val64 = value }) {}

  /*! \brief default constructor */
-  DeviceLocation() : value_(0) {}
+  DevPtr() : value_(TargetVal { .val64 = 0 }) {}

-  /*! \brief construct a null location */
-  explicit DeviceLocation(std::nullptr_t value) : value_(0) {}
+  /*! \brief construct a null address */
+  explicit DevPtr(std::nullptr_t value) : value_(TargetVal { .val64 = 0 }) {}

  /*! \brief destructor */
-  virtual ~DeviceLocation() {}
+  ~DevPtr() {}

  /*!
-   * \brief get value of location
-   * \return value of location
+   * \brief get value of pointer
+   * \return value of pointer
   */
-  std::uintptr_t value() const { return value_; }
+  TargetVal value() const { return value_; }

  /*!
   * \brief cast location to type `T`
   * \return casted result
   */
  template <typename T>
-  T cast_to() const { return reinterpret_cast<T>(value_); }
+  T cast_to() const { return reinterpret_cast<T>(value_.val64); }

  /*! \brief check if location is null */
-  bool operator==(std::nullptr_t) const { return value_ == 0; }
+  bool operator==(std::nullptr_t) const { return value_.val64 == 0; }

  /*! \brief check if location is not null */
-  bool operator!=(std::nullptr_t) const { return value_ != 0; }
-
- protected:
-  /*! \brief raw value storing the location */
-  std::uintptr_t value_;
-};
-
-/*! \brief absolute device address */
-class DevPtr : public DeviceLocation {
- public:
-  /*! \brief construct an absolute address with value `value` */
-  explicit DevPtr(std::uintptr_t val) : DeviceLocation(val) {}
-
-  /*! \brief default constructor */
-  DevPtr() : DeviceLocation() {}
-
-  /*! \brief construct a null absolute address */
-  explicit DevPtr(std::nullptr_t val) : DeviceLocation(val) {}
+  bool operator!=(std::nullptr_t) const { return value_.val64 != 0; }

  /*! \brief add an integer to this absolute address to get a larger absolute address */
  DevPtr operator+(size_t n) const {
-    return DevPtr(value_ + n);
+    return DevPtr(value_.val64 + n);
  }

  /*! \brief mutably add an integer to this absolute address */
  DevPtr& operator+=(size_t n) {
-    value_ += n;
+    value_.val64 += n;
    return *this;
  }

  /*! \brief subtract an integer from this absolute address to get a smaller absolute address */
  DevPtr operator-(size_t n) const {
-    return DevPtr(value_ - n);
+    return DevPtr(value_.val64 - n);
  }

  /*! \brief mutably subtract an integer from this absolute address */
  DevPtr& operator-=(size_t n) {
-    value_ -= n;
+    value_.val64 -= n;
    return *this;
  }
-};
-
-/*! \brief offset from device base address */
-class DevBaseOffset : public DeviceLocation {
- public:
-  /*! \brief construct a base offset with value `value` */
-  explicit DevBaseOffset(std::uintptr_t value) : DeviceLocation(value) {}
-
-  /*! \brief default constructor */
-  DevBaseOffset() : DeviceLocation() {}
-
-  /*! \brief construct a null base offset */
-  explicit DevBaseOffset(std::nullptr_t value) : DeviceLocation(value) {}

-  /*! \brief add an integer to this base offset to get a larger base offset */
-  DevBaseOffset operator+(size_t n) const {
-    return DevBaseOffset(value_ + n);
-  }
-
-  /*! \brief mutably add an integer to this base offset */
-  DevBaseOffset& operator+=(size_t n) {
-    value_ += n;
-    return *this;
-  }
-
-  /*! \brief subtract an integer from this base offset to get a smaller base offset */
-  DevBaseOffset operator-(size_t n) const {
-    return DevBaseOffset(value_ - n);
-  }
-
-  /*! \brief mutably subtract an integer from this base offset */
-  DevBaseOffset& operator-=(size_t n) {
-    value_ -= n;
-    return *this;
-  }
+ private:
+  /*! \brief raw value storing the pointer */
+  TargetVal value_;
 };

 /*!
@@ -212,6 +169,10 @@ class SymbolMap {
    return result->second;
  }

+  bool HasSymbol(const std::string& name) const {
+    return map_.find(name) != map_.end();
+  }
+
 private:
  /*! \brief backing map */
  std::unordered_map<std::string, DevPtr> map_;
@@ -220,7 +181,7 @@ class SymbolMap {
 /*! \brief struct containing start and size of a device memory region */
 struct DevMemRegion {
  /*! \brief section start offset */
-  DevBaseOffset start;
+  DevPtr start;
  /*! \brief size of section */
  size_t size;
 };
@@ -239,16 +200,13 @@ struct BinaryInfo {
  SymbolMap symbol_map;
 };

-// TODO(weberlo): should this be here?
-/*! \brief number of bytes in each page */
-constexpr int kPageSize = 4096;
-
-const DevBaseOffset kDeviceStart = DevBaseOffset(64);
-
-/*!
- * \brief return default size of given section kind in bytes
- */
-size_t GetDefaultSectionSize(SectionKind kind);
+struct BinaryContents {
+  BinaryInfo binary_info;
+  std::string text_contents;
+  std::string rodata_contents;
+  std::string data_contents;
+  std::string bss_contents;
+};

 /*!
 * \brief upper-aligns value according to specified alignment
@@ -270,18 +228,23 @@ const char* SectionToString(SectionKind section);
 /*!
 * \brief links binary by repositioning section addresses
 * \param binary_name input binary filename
- * \param text new text section address
- * \param rodata new rodata section address
- * \param data new data section address
- * \param bss new bss section address
+ * \param word_size word size on the target machine, in bytes
+ * \param text_start start address of the text section
+ * \param rodata_start start address of the rodata section
+ * \param data_start start address of the data section
+ * \param bss_start start address of the bss section
+ * \param stack_end end address of the stack section
 * \param toolchain_prefix prefix of compiler toolchain to use
 * \return relocated binary file contents
 */
-std::string RelocateBinarySections(const std::string& binary_name,
-                                   DevPtr text,
-                                   DevPtr rodata,
-                                   DevPtr data,
-                                   DevPtr bss,
+std::string RelocateBinarySections(
+    const std::string& binary_path,
+    size_t word_size,
+    DevPtr text_start,
+    DevPtr rodata_start,
+    DevPtr data_start,
+    DevPtr bss_start,
+    DevPtr stack_end,
    const std::string& toolchain_prefix);

 /*!
@@ -306,7 +269,7 @@ std::string ReadSection(const std::string& binary,
 size_t GetSectionSize(const std::string& binary_name,
                      SectionKind section,
                      const std::string& toolchain_prefix,
-                      size_t align = kDefaultSizeAlignment);
+                      size_t align);

 }  // namespace runtime
 }  // namespace tvm

--- a/src/runtime/micro/micro_device_api.cc
+++ b/src/runtime/micro/micro_device_api.cc
@@ -61,7 +61,7 @@ class MicroDeviceAPI final : public DeviceAPI {
  void FreeDataSpace(TVMContext ctx, void* ptr) final {
    MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(ptr);
    dev_space->session->FreeInSection(
-      SectionKind::kHeap, DevBaseOffset(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+      SectionKind::kHeap, DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
    delete dev_space;
  }

@@ -89,12 +89,12 @@ class MicroDeviceAPI final : public DeviceAPI {
      ObjectPtr<MicroSession>& session = from_space->session;
      const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();

-      DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset);
-      DevBaseOffset to_dev_offset = GetDevLoc(to_space, to_offset);
+      DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
+      DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);

      std::vector<uint8_t> buffer(size);
-      lld->Read(from_dev_offset, static_cast<void*>(buffer.data()), size);
-      lld->Write(to_dev_offset, static_cast<void*>(buffer.data()), size);
+      lld->Read(from_dev_addr, static_cast<void*>(buffer.data()), size);
+      lld->Write(to_dev_addr, static_cast<void*>(buffer.data()), size);
    } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) {
      // Reading from the device.

@@ -102,9 +102,9 @@ class MicroDeviceAPI final : public DeviceAPI {
      ObjectPtr<MicroSession>& session = from_space->session;
      const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();

-      DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset);
+      DevPtr from_dev_addr = GetDevLoc(from_space, from_offset);
      void* to_host_ptr = GetHostLoc(to, to_offset);
-      lld->Read(from_dev_offset, to_host_ptr, size);
+      lld->Read(from_dev_addr, to_host_ptr, size);
    } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) {
      // Writing to the device.

@@ -113,8 +113,8 @@ class MicroDeviceAPI final : public DeviceAPI {
      const std::shared_ptr<LowLevelDevice>& lld = session->low_level_device();

      void* from_host_ptr = GetHostLoc(from, from_offset);
-      DevBaseOffset to_dev_offset = GetDevLoc(to_space, to_offset);
-      lld->Write(to_dev_offset, from_host_ptr, size);
+      DevPtr to_dev_addr = GetDevLoc(to_space, to_offset);
+      lld->Write(to_dev_addr, from_host_ptr, size);
    } else {
      LOG(FATAL) << "Expect copy from/to micro device or between micro device\n";
    }
@@ -138,7 +138,7 @@ class MicroDeviceAPI final : public DeviceAPI {
    MicroDevSpace* dev_space = static_cast<MicroDevSpace*>(data);
    ObjectPtr<MicroSession>& session = dev_space->session;
    session->FreeInSection(SectionKind::kWorkspace,
-                           DevBaseOffset(reinterpret_cast<std::uintptr_t>(dev_space->data)));
+                           DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data)));
    delete dev_space;
  }

@@ -152,10 +152,8 @@ class MicroDeviceAPI final : public DeviceAPI {
  }

 private:
-  DevBaseOffset GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
-    DevBaseOffset dev_offset =
-        DevBaseOffset(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
-    return dev_offset;
+  DevPtr GetDevLoc(MicroDevSpace* dev_space, size_t offset) {
+    return DevPtr(reinterpret_cast<std::uintptr_t>(dev_space->data) + offset);
  }

  void* GetHostLoc(const void* ptr, size_t offset) {

--- a/src/runtime/micro/micro_module.cc
+++ b/src/runtime/micro/micro_module.cc
@@ -55,62 +55,48 @@ class MicroModuleNode final : public ModuleNode {
   */
  void InitMicroModule(const std::string& binary_path) {
    session_ = MicroSession::Current();
-    binary_path_ = binary_path;
-    binary_info_ = session_->LoadBinary(binary_path_);
-  }
-
-  /*!
-   * \brief runs selected function on the micro device
-   * \param func_name name of the function to be run
-   * \param func_offset offset of the function to be run
-   * \param args type-erased arguments passed to the function
-   */
-  void RunFunction(const std::string& func_name, DevBaseOffset func_offset, const TVMArgs& args) {
-    session_->PushToExecQueue(func_offset, args);
+    symbol_map_ = session_->LoadBinary(binary_path, true).symbol_map;
  }

 private:
-  /*! \brief module binary info */
-  BinaryInfo binary_info_;
-  /*! \brief path to module binary */
-  std::string binary_path_;
+  SymbolMap symbol_map_;
  /*! \brief global session pointer */
  ObjectPtr<MicroSession> session_;
 };

 class MicroWrappedFunc {
 public:
-  MicroWrappedFunc(MicroModuleNode* m,
-                   ObjectPtr<MicroSession> session,
-                   const std::string& func_name,
-                   DevBaseOffset func_offset) {
-    m_ = m;
+  MicroWrappedFunc(ObjectPtr<MicroSession> session,
+                   DevPtr func_ptr) {
    session_ = session;
-    func_name_ = func_name;
-    func_offset_ = func_offset;
+    func_ptr_ = func_ptr;
  }

  void operator()(TVMArgs args, TVMRetValue* rv) const {
-    m_->RunFunction(func_name_, func_offset_, args);
+    *rv = session_->PushToExecQueue(func_ptr_, args);
  }

 private:
-  /*! \brief internal module */
-  MicroModuleNode* m_;
  /*! \brief reference to the session for this function (to keep the session alive) */
  ObjectPtr<MicroSession> session_;
-  /*! \brief name of the function */
-  std::string func_name_;
  /*! \brief offset of the function to be called */
-  DevBaseOffset func_offset_;
+  DevPtr func_ptr_;
 };

 PackedFunc MicroModuleNode::GetFunction(
    const std::string& name,
    const ObjectPtr<Object>& sptr_to_self) {
-  DevBaseOffset func_offset =
-      session_->low_level_device()->ToDevOffset(binary_info_.symbol_map[name]);
-  MicroWrappedFunc f(this, session_, name, func_offset);
+  DevPtr func_ptr;
+  if (name == tvm::runtime::symbol::tvm_module_main) {
+    if (symbol_map_.HasSymbol(tvm::runtime::symbol::tvm_module_main)) {
+      func_ptr = symbol_map_[tvm::runtime::symbol::tvm_module_main];
+    } else {
+      func_ptr = symbol_map_["default_function"];
+    }
+  } else {
+    func_ptr = symbol_map_[name];
+  }
+  MicroWrappedFunc f(session_, func_ptr);
  return PackedFunc(f);
 }


--- a/src/runtime/micro/micro_section_allocator.h
+++ b/src/runtime/micro/micro_section_allocator.h
@@ -38,11 +38,15 @@ class MicroSectionAllocator {
   * \brief constructor that specifies section boundaries
   * \param region location and size of the section on the device
   */
-  explicit MicroSectionAllocator(DevMemRegion region)
-    : start_offset_(region.start),
+  explicit MicroSectionAllocator(DevMemRegion region, size_t word_size)
+    : start_addr_(region.start),
      size_(0),
-      capacity_(region.size) {
-      CHECK_EQ(start_offset_.value() % 8, 0) << "micro section not aligned to 8 bytes";
+      capacity_(region.size),
+      word_size_(word_size) {
+      CHECK_EQ(start_addr_.value().val64 % word_size, 0)
+        << "micro section start not aligned to " << word_size << " bytes";
+      CHECK_EQ(capacity_ % word_size, 0)
+        << "micro section end not aligned to " << word_size << " bytes";
    }

  /*!
@@ -55,15 +59,15 @@ class MicroSectionAllocator {
   * \param size size of allocated memory in bytes
   * \return pointer to allocated memory region in section, nullptr if out of space
   */
-  DevBaseOffset Allocate(size_t size) {
-    size_ = UpperAlignValue(size_, 8);
+  DevPtr Allocate(size_t size) {
+    size_ = UpperAlignValue(size_, word_size_);
    CHECK(size_ + size < capacity_)
        << "cannot alloc " << size << " bytes in section with start_addr " <<
-        start_offset_.value();
-    DevBaseOffset alloc_ptr = start_offset_ + size_;
+        start_addr_.cast_to<void*>();
+    DevPtr alloc_addr = start_addr_ + size_;
    size_ += size;
-    alloc_map_[alloc_ptr.value()] = size;
-    return alloc_ptr;
+    alloc_map_[alloc_addr.value().val64] = size;
+    return alloc_addr;
  }

  /*!
@@ -71,10 +75,10 @@ class MicroSectionAllocator {
   * \param offs offset to allocated memory
   * \note simple allocator scheme, more complex versions will be implemented later
   */
-  void Free(DevBaseOffset offs) {
-    std::uintptr_t ptr = offs.value();
-    CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated";
-    alloc_map_.erase(ptr);
+  void Free(DevPtr addr) {
+    CHECK(alloc_map_.find(addr.value().val64) != alloc_map_.end())
+      << "freed pointer was never allocated";
+    alloc_map_.erase(addr.value().val64);
    if (alloc_map_.empty()) {
      size_ = 0;
    }
@@ -83,17 +87,17 @@ class MicroSectionAllocator {
  /*!
   * \brief start offset of the memory region managed by this allocator
   */
-  DevBaseOffset start_offset() const { return start_offset_; }
+  DevPtr start_addr() const { return start_addr_; }

  /*!
-   * \brief current end offset of the space being used in this memory region
+   * \brief current end addr of the space being used in this memory region
   */
-  DevBaseOffset curr_end_offset() const { return start_offset_ + size_; }
+  DevPtr curr_end_addr() const { return start_addr_ + size_; }

  /*!
-   * \brief end offset of the memory region managed by this allocator
+   * \brief end addr of the memory region managed by this allocator
   */
-  DevBaseOffset max_end_offset() const { return start_offset_ + capacity_; }
+  DevPtr max_addr() const { return start_addr_ + capacity_; }

  /*!
   * \brief size of the section
@@ -107,13 +111,15 @@ class MicroSectionAllocator {

 private:
  /*! \brief start address of the section */
-  DevBaseOffset start_offset_;
+  DevPtr start_addr_;
  /*! \brief current size of the section */
  size_t size_;
  /*! \brief total storage capacity of the section */
  size_t capacity_;
+  /*! \brief number of bytes in a word on the target device */
+  size_t word_size_;
  /*! \brief allocation map for allocation sizes */
-  std::unordered_map<std::uintptr_t, size_t> alloc_map_;
+  std::unordered_map<uint64_t, size_t> alloc_map_;
 };

 }  // namespace runtime

--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -23,6 +23,7 @@

 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
+#include <memory>
 #include <stack>
 #include <tuple>
 #include <vector>
@@ -56,99 +57,270 @@ void MicroSession::ExitWithScope() {
  entry->session_stack.pop();
 }

-MicroSession::MicroSession() {
-  DevBaseOffset curr_start_offset = kDeviceStart;
-  for (size_t i = 0; i < static_cast<size_t>(SectionKind::kNumKinds); i++) {
-    size_t section_size = GetDefaultSectionSize(static_cast<SectionKind>(i));
-    section_allocators_[i] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
-      .start = curr_start_offset,
-      .size = section_size,
-    });
-    curr_start_offset += section_size;
-  }
-  memory_size_ = curr_start_offset.cast_to<size_t>();
-}
-
-MicroSession::~MicroSession() {
-  for (size_t i = 0; i < static_cast<size_t>(SectionKind::kNumKinds); i++) {
-    section_allocators_[i] = nullptr;
-  }
-  low_level_device_ = nullptr;
-}
-
-void MicroSession::CreateSession(const std::string& device_type,
+MicroSession::MicroSession(
+    const std::string& comms_method,
    const std::string& binary_path,
    const std::string& toolchain_prefix,
-                                 std::uintptr_t base_addr,
+    uint64_t text_start,
+    size_t text_size,
+    uint64_t rodata_start,
+    size_t rodata_size,
+    uint64_t data_start,
+    size_t data_size,
+    uint64_t bss_start,
+    size_t bss_size,
+    uint64_t args_start,
+    size_t args_size,
+    uint64_t heap_start,
+    size_t heap_size,
+    uint64_t workspace_start,
+    size_t workspace_size,
+    uint64_t stack_start,
+    size_t stack_size,
+    size_t word_size,
+    bool thumb_mode,
    const std::string& server_addr,
-                                 int port) {
-  // TODO(weberlo): make device type enum
-  toolchain_prefix_ = toolchain_prefix;
-  if (device_type == "host") {
-    low_level_device_ = HostLowLevelDeviceCreate(memory_size_);
-  } else if (device_type == "openocd") {
-    // TODO(weberlo): We need a better way of configuring devices.
-    low_level_device_ = OpenOCDLowLevelDeviceCreate(base_addr, server_addr, port);
+    int port)
+    : toolchain_prefix_(toolchain_prefix)
+    , word_size_(word_size)
+    , thumb_mode_(thumb_mode) {
+  CHECK(word_size_ == 4 || word_size_ == 8) << "unsupported word size " << word_size_;
+  if (comms_method == "host") {
+    // TODO(weberlo): move checks to python
+    CHECK(
+        text_start == 0 &&
+        rodata_start == 0 &&
+        data_start == 0 &&
+        bss_start == 0 &&
+        args_start == 0 &&
+        heap_start == 0 &&
+        workspace_start == 0 &&
+        stack_start == 0) << "unable to specify section addresses for host device";
+    size_t memory_size =
+      text_size + rodata_size + data_size + bss_size +
+      args_size + heap_size + workspace_size + stack_size;
+    void* base_addr;
+    low_level_device_ = HostLowLevelDeviceCreate(memory_size, &base_addr);
+    CHECK_EQ(reinterpret_cast<std::uintptr_t>(base_addr) % word_size_, 0)
+      << "base address not aligned to " << word_size_ << " bytes";
+    DevPtr curr_addr = DevPtr(reinterpret_cast<std::uintptr_t>(base_addr));
+
+    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = text_size,
+    }, word_size_);
+    curr_addr += text_size;
+    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = rodata_size,
+    }, word_size_);
+    curr_addr += rodata_size;
+    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = data_size,
+    }, word_size_);
+    curr_addr += data_size;
+    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = bss_size,
+    }, word_size_);
+    curr_addr += bss_size;
+    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = args_size,
+    }, word_size_);
+    curr_addr += args_size;
+    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = heap_size,
+    }, word_size_);
+    curr_addr += heap_size;
+    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = workspace_size,
+    }, word_size_);
+    curr_addr += workspace_size;
+    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = curr_addr,
+      .size = stack_size,
+    }, word_size_);
+    curr_addr += stack_size;
+  } else if (comms_method == "openocd") {
+    low_level_device_ = OpenOCDLowLevelDeviceCreate(server_addr, port);
+    section_allocators_[0] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(text_start),
+      .size = text_size,
+    }, word_size_);
+    section_allocators_[1] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(rodata_start),
+      .size = rodata_size,
+    }, word_size_);
+    section_allocators_[2] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(data_start),
+      .size = data_size,
+    }, word_size_);
+    section_allocators_[3] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(bss_start),
+      .size = bss_size,
+    }, word_size_);
+    section_allocators_[4] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(args_start),
+      .size = args_size,
+    }, word_size_);
+    section_allocators_[5] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(heap_start),
+      .size = heap_size,
+    }, word_size_);
+    section_allocators_[6] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(workspace_start),
+      .size = workspace_size,
+    }, word_size_);
+    section_allocators_[7] = std::make_shared<MicroSectionAllocator>(DevMemRegion {
+      .start = DevPtr(stack_start),
+      .size = stack_size,
+    }, word_size_);
  } else {
    LOG(FATAL) << "unsupported micro low-level device";
  }

-  SetRuntimeBinaryPath(binary_path);
-  CHECK(!runtime_binary_path_.empty()) << "uTVM runtime not initialized";
-  runtime_bin_info_ = LoadBinary(runtime_binary_path_, /* patch_dylib_pointers */ false);
-  utvm_main_symbol_ = low_level_device()->ToDevOffset(runtime_symbol_map()["UTVMMain"]);
-  utvm_done_symbol_ = low_level_device()->ToDevOffset(runtime_symbol_map()["UTVMDone"]);
-
-  if (device_type == "openocd") {
-    // Set OpenOCD device's stack pointer.
-    auto stack_section = GetAllocator(SectionKind::kStack);
-    low_level_device_->SetStackTop(stack_section->max_end_offset());
+  runtime_symbol_map_ = LoadBinary(binary_path, false).symbol_map;
+
+  // Patch pointers to define the bounds of the workspace section and the word
+  // size (for allocation alignment).
+  std::shared_ptr<MicroSectionAllocator> ws_allocator = GetAllocator(SectionKind::kWorkspace);
+  TargetVal ws_start = ws_allocator->start_addr().value();
+  TargetVal ws_end = ws_allocator->max_addr().value();
+  TargetVal target_word_size { .val64 = word_size_ };
+  if (word_size_ == 4) {
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val32);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val32);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val32);
+  } else if (word_size_ == 8) {
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_start", ws_start.val64);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_workspace_end", ws_end.val64);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_word_size", target_word_size.val64);
  }
+}

-  // Patch workspace pointers to the start of the workspace section.
-  DevBaseOffset workspace_start_offset = GetAllocator(SectionKind::kWorkspace)->start_offset();
-  DevBaseOffset workspace_end_offset = GetAllocator(SectionKind::kWorkspace)->max_end_offset();
-  void* workspace_start_addr =
-      low_level_device_->ToDevPtr(workspace_start_offset).cast_to<void*>();
-  void* workspace_end_addr =
-      low_level_device_->ToDevPtr(workspace_end_offset).cast_to<void*>();
-  DevSymbolWrite(runtime_symbol_map(), "utvm_workspace_begin", workspace_start_addr);
-  DevSymbolWrite(runtime_symbol_map(), "utvm_workspace_end", workspace_end_addr);
+MicroSession::~MicroSession() {
+  for (size_t i = 0; i < static_cast<size_t>(SectionKind::kNumKinds); i++) {
+    section_allocators_[i] = nullptr;
+  }
+  low_level_device_ = nullptr;
 }

-void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) {
-  int32_t (*func_dev_addr)(void*, void*, int32_t) =
-      reinterpret_cast<int32_t (*)(void*, void*, int32_t)>(
-          low_level_device()->ToDevPtr(func).value());
+double MicroSession::PushToExecQueue(DevPtr func_ptr, const TVMArgs& args) {
+  if (thumb_mode_) {
+    func_ptr += 1;
+  }

  // Create an allocator stream for the memory region after the most recent
  // allocation in the args section.
-  DevPtr args_addr =
-      low_level_device()->ToDevPtr(GetAllocator(SectionKind::kArgs)->curr_end_offset());
-  TargetDataLayoutEncoder encoder(args_addr);
+  DevPtr args_addr = GetAllocator(SectionKind::kArgs)->curr_end_addr();
+  TargetDataLayoutEncoder encoder(args_addr, word_size_);

  std::tuple<DevPtr, DevPtr> arg_field_addrs = EncoderAppend(&encoder, args);
+
  // Flush `stream` to device memory.
-  DevBaseOffset stream_dev_offset =
+  DevPtr stream_dev_addr =
      GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size());
-  low_level_device()->Write(stream_dev_offset,
+  low_level_device()->Write(stream_dev_addr,
                            reinterpret_cast<void*>(encoder.data()),
                            encoder.buf_size());

-  UTVMTask task = {
-      .func = func_dev_addr,
-      .arg_values = std::get<0>(arg_field_addrs).cast_to<TVMValue*>(),
-      .arg_type_codes = std::get<1>(arg_field_addrs).cast_to<int*>(),
+  TargetVal arg_values_dev_addr = std::get<0>(arg_field_addrs).value();
+  TargetVal arg_type_codes_dev_addr = std::get<1>(arg_field_addrs).value();
+  if (word_size_ == 4) {
+    UTVMTask32 task = {
+      .func = func_ptr.value().val32,
+      .arg_values = arg_values_dev_addr.val32,
+      .arg_type_codes = arg_type_codes_dev_addr.val32,
+      .num_args = args.num_args,
+    };
+    // Write the task.
+    DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
+  } else if (word_size_ == 8) {
+    UTVMTask64 task = {
+      .func = func_ptr.value().val64,
+      .arg_values = arg_values_dev_addr.val64,
+      .arg_type_codes = arg_type_codes_dev_addr.val64,
      .num_args = args.num_args,
    };
    // Write the task.
-  DevSymbolWrite(runtime_symbol_map(), "task", task);
+    DevSymbolWrite(runtime_symbol_map_, "utvm_task", task);
+  }
+
+  DevPtr utvm_init_addr = runtime_symbol_map_["UTVMInit"];
+  DevPtr utvm_done_addr = runtime_symbol_map_["UTVMDone"];
+  if (thumb_mode_) {
+    utvm_init_addr += 1;
+  }

-  low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_);
+  low_level_device()->Execute(utvm_init_addr, utvm_done_addr);
  // Check if there was an error during execution.  If so, log it.
  CheckDeviceError();
+  uint32_t task_time = DevSymbolRead<uint32_t>(runtime_symbol_map_, "utvm_task_time");
+  GetAllocator(SectionKind::kArgs)->Free(stream_dev_addr);
+  return static_cast<double>(task_time);
+}
+
+BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
+  DevMemRegion text_section;
+  DevMemRegion rodata_section;
+  DevMemRegion data_section;
+  DevMemRegion bss_section;
+
+  text_section.size = GetSectionSize(
+      binary_path, SectionKind::kText, toolchain_prefix_, word_size_);
+  rodata_section.size = GetSectionSize(
+      binary_path, SectionKind::kRodata, toolchain_prefix_, word_size_);
+  data_section.size = GetSectionSize(
+      binary_path, SectionKind::kData, toolchain_prefix_, word_size_);
+  bss_section.size = GetSectionSize(
+      binary_path, SectionKind::kBss, toolchain_prefix_, word_size_);
+
+  text_section.start = AllocateInSection(SectionKind::kText, text_section.size);
+  rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
+  data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
+  bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
+  CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
+        data_section.start != nullptr && bss_section.start != nullptr)
+      << "not enough space to load module on device";

-  GetAllocator(SectionKind::kArgs)->Free(stream_dev_offset);
+  std::string relocated_bin = RelocateBinarySections(
+      binary_path,
+      word_size_,
+      text_section.start,
+      rodata_section.start,
+      data_section.start,
+      bss_section.start,
+      GetAllocator(SectionKind::kStack)->max_addr(),
+      toolchain_prefix_);
+  std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_);
+  std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_);
+  std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_);
+  std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_);
+
+  low_level_device_->Write(text_section.start, &text_contents[0], text_section.size);
+  low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
+  low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
+  low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
+  SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
+
+  if (patch_dylib_pointers) {
+    // Patch device lib pointers.
+    PatchImplHole(symbol_map, "TVMBackendAllocWorkspace");
+    PatchImplHole(symbol_map, "TVMBackendFreeWorkspace");
+    PatchImplHole(symbol_map, "TVMAPISetLastError");
+  }
+
+  return BinaryInfo {
+      .text_section = text_section,
+      .rodata_section = rodata_section,
+      .data_section = data_section,
+      .bss_section = bss_section,
+      .symbol_map = symbol_map,
+  };
 }

 std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
@@ -171,7 +343,12 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
        // Mutate the array to unwrap the `data` field.
        base_arr_handle->data = reinterpret_cast<MicroDevSpace*>(old_data)->data;
        // Now, encode the unwrapped version.
-        void* arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to<void*>();
+        void* arr_ptr = nullptr;
+        if (word_size_ == 4) {
+          arr_ptr = EncoderAppend<TVMArray32>(encoder, *base_arr_handle).cast_to<void*>();
+        } else if (word_size_ == 8) {
+          arr_ptr = EncoderAppend<TVMArray64>(encoder, *base_arr_handle).cast_to<void*>();
+        }
        // And restore the original wrapped version.
        base_arr_handle->data = old_data;

@@ -190,54 +367,53 @@ std::tuple<DevPtr, DevPtr> MicroSession::EncoderAppend(
    }
  }
  type_codes_slot.WriteArray(type_codes, num_args);
-
  return std::make_tuple(tvm_vals_slot.start_addr(), type_codes_slot.start_addr());
 }

+template <typename T>
 DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr) {
-  auto tvm_arr_slot = encoder->Alloc<TVMArray>();
+  auto tvm_arr_slot = encoder->Alloc<T>();
  auto shape_slot = encoder->Alloc<int64_t>(arr.ndim);

  // `shape` and `strides` are stored on the host, so we need to write them to
  // the device first. The `data` field is already allocated on the device and
  // is a device pointer, so we don't need to write it.
  shape_slot.WriteArray(arr.shape, arr.ndim);
-  DevPtr shape_addr = shape_slot.start_addr();
-  DevPtr strides_addr = DevPtr(nullptr);
+  DevPtr shape_dev_addr = shape_slot.start_addr();
+  DevPtr strides_dev_addr = DevPtr(nullptr);
  if (arr.strides != nullptr) {
    auto stride_slot = encoder->Alloc<int64_t>(arr.ndim);
    stride_slot.WriteArray(arr.strides, arr.ndim);
-    strides_addr = stride_slot.start_addr();
+    strides_dev_addr = stride_slot.start_addr();
  }

-  // Copy `arr`, update the copy's pointers to be device pointers, then
-  // write the copy to `tvm_arr_slot`.
-  TVMArray dev_arr = arr;
-  // Update the device type to look like a host, because codegen generates
-  // checks that it is a host array.
+  T dev_arr(
+      TargetVal { .val64 = reinterpret_cast<uint64_t>(arr.data) },
+      arr.ctx,
+      arr.ndim,
+      arr.dtype,
+      shape_dev_addr.value(),
+      strides_dev_addr.value(),
+      TargetVal { .val64 = arr.byte_offset });
  CHECK(dev_arr.ctx.device_type == static_cast<DLDeviceType>(kDLMicroDev))
    << "attempt to write TVMArray with non-micro device type";
+  // Update the device type to CPU, because from the microcontroller's
+  // perspective, it is.
  dev_arr.ctx.device_type = DLDeviceType::kDLCPU;
-  // Add the base address of the device to the array's data's device offset to
-  // get a device address.
-  DevBaseOffset arr_offset(reinterpret_cast<std::uintptr_t>(arr.data));
-  dev_arr.data = low_level_device()->ToDevPtr(arr_offset).cast_to<void*>();
-  dev_arr.shape = shape_addr.cast_to<int64_t*>();
-  dev_arr.strides = strides_addr.cast_to<int64_t*>();
  tvm_arr_slot.WriteValue(dev_arr);
  return tvm_arr_slot.start_addr();
 }

 void MicroSession::CheckDeviceError() {
-  int32_t return_code = DevSymbolRead<int32_t>(runtime_symbol_map(), "utvm_return_code");
+  int32_t return_code = DevSymbolRead<int32_t>(runtime_symbol_map_, "utvm_return_code");

  if (return_code) {
    std::uintptr_t last_error =
-        DevSymbolRead<std::uintptr_t>(runtime_symbol_map(), "utvm_last_error");
+        DevSymbolRead<std::uintptr_t>(runtime_symbol_map_, "utvm_last_error");
    std::string last_error_str;
    if (last_error) {
-      DevBaseOffset last_err_offset = low_level_device()->ToDevOffset(DevPtr(last_error));
-      last_error_str = ReadString(last_err_offset);
+      DevPtr last_err_addr = DevPtr(last_error);
+      last_error_str = ReadString(last_err_addr);
    }
    LOG(FATAL) << "error during micro function execution:\n"
               << "  return code: " << std::dec << return_code << "\n"
@@ -246,100 +422,51 @@ void MicroSession::CheckDeviceError() {
  }
 }

-BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) {
-  DevMemRegion text_section;
-  DevMemRegion rodata_section;
-  DevMemRegion data_section;
-  DevMemRegion bss_section;
-
-  text_section.size = GetSectionSize(binary_path, SectionKind::kText, toolchain_prefix_);
-  rodata_section.size = GetSectionSize(binary_path, SectionKind::kRodata, toolchain_prefix_);
-  data_section.size = GetSectionSize(binary_path, SectionKind::kData, toolchain_prefix_);
-  bss_section.size = GetSectionSize(binary_path, SectionKind::kBss, toolchain_prefix_);
-
-  text_section.start = AllocateInSection(SectionKind::kText, text_section.size);
-  rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size);
-  data_section.start = AllocateInSection(SectionKind::kData, data_section.size);
-  bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size);
-  CHECK(text_section.start != nullptr && rodata_section.start != nullptr &&
-        data_section.start != nullptr && bss_section.start != nullptr)
-      << "not enough space to load module on device";
-
-  std::string relocated_bin = RelocateBinarySections(
-      binary_path,
-      low_level_device_->ToDevPtr(text_section.start),
-      low_level_device_->ToDevPtr(rodata_section.start),
-      low_level_device_->ToDevPtr(data_section.start),
-      low_level_device_->ToDevPtr(bss_section.start),
-      toolchain_prefix_);
-  std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_);
-  std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_);
-  std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_);
-  std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_);
-  low_level_device_->Write(text_section.start, &text_contents[0], text_section.size);
-  low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size);
-  low_level_device_->Write(data_section.start, &data_contents[0], data_section.size);
-  low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size);
-  SymbolMap symbol_map {relocated_bin, toolchain_prefix_};
-
-  if (patch_dylib_pointers) {
-    // Patch device lib pointers.
-    PatchImplHole(symbol_map, "TVMBackendAllocWorkspace");
-    PatchImplHole(symbol_map, "TVMBackendFreeWorkspace");
-    PatchImplHole(symbol_map, "TVMAPISetLastError");
-  }
-
-  return BinaryInfo {
-      .text_section = text_section,
-      .rodata_section = rodata_section,
-      .data_section = data_section,
-      .bss_section = bss_section,
-      .symbol_map = symbol_map,
-  };
-}
-
 void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) {
-  void* runtime_impl_addr = runtime_symbol_map()[func_name].cast_to<void*>();
+  DevPtr runtime_impl_addr = runtime_symbol_map_[func_name];
+  if (thumb_mode_) {
+    runtime_impl_addr += 1;
+  }
  std::ostringstream func_name_underscore;
  func_name_underscore << func_name << "_";
-  DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr);
-}
-
-void MicroSession::SetRuntimeBinaryPath(std::string path) {
-  runtime_binary_path_ = path;
+  if (word_size_ == 4) {
+    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val32);
+  } else if (word_size_ == 8) {
+    DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr.value().val64);
+  }
 }

-std::string MicroSession::ReadString(DevBaseOffset str_offset) {
+std::string MicroSession::ReadString(DevPtr str_addr) {
  std::ostringstream result;
  const size_t buf_size = 256;
  std::vector<char> buf(buf_size, 0);
  size_t i = buf_size;
  while (i == buf_size) {
-    low_level_device()->Read(str_offset, buf.data(), buf_size);
+    low_level_device()->Read(str_addr, buf.data(), buf_size);
    i = 0;
    while (i < buf_size) {
      if (buf[i] == 0) break;
      result << buf[i];
      i++;
    }
-    str_offset = str_offset + i;
+    str_addr = str_addr + i;
  }
  return result.str();
 }

-DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) {
+DevPtr MicroSession::AllocateInSection(SectionKind type, size_t size) {
  return GetAllocator(type)->Allocate(size);
 }

-void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) {
-  return GetAllocator(type)->Free(ptr);
+void MicroSession::FreeInSection(SectionKind type, DevPtr addr) {
+  return GetAllocator(type)->Free(addr);
 }

 template <typename T>
 T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) {
-  DevBaseOffset sym_offset = low_level_device()->ToDevOffset(symbol_map[symbol]);
+  DevPtr sym_addr = symbol_map[symbol];
  T result;
-  low_level_device()->Read(sym_offset, &result, sizeof(T));
+  low_level_device()->Read(sym_addr, &result, sizeof(T));
  return result;
 }

@@ -347,8 +474,8 @@ template <typename T>
 void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map,
                                  const std::string& symbol,
                                  const T& value) {
-  DevBaseOffset sym_offset = low_level_device()->ToDevOffset(symbol_map[symbol]);
-  low_level_device()->Write(sym_offset, &value, sizeof(T));
+  DevPtr sym_addr = symbol_map[symbol];
+  low_level_device()->Write(sym_addr, &value, sizeof(T));
 }

 PackedFunc MicroSession::GetFunction(
@@ -370,15 +497,53 @@ PackedFunc MicroSession::GetFunction(
 // create micro session and low-level device from Python frontend
 TVM_REGISTER_GLOBAL("micro._CreateSession")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-    const std::string& device_type = args[0];
+    const std::string& comms_method = args[0];
    const std::string& binary_path = args[1];
    const std::string& toolchain_prefix = args[2];
-    uint64_t base_addr = args[3];
-    const std::string& server_addr = args[4];
-    int port = args[5];
-    ObjectPtr<MicroSession> session = make_object<MicroSession>();
-    session->CreateSession(
-        device_type, binary_path, toolchain_prefix, base_addr, server_addr, port);
+    uint64_t text_start = args[3];
+    size_t text_size = args[4];
+    uint64_t rodata_start = args[5];
+    size_t rodata_size = args[6];
+    uint64_t data_start = args[7];
+    size_t data_size = args[8];
+    uint64_t bss_start = args[9];
+    size_t bss_size = args[10];
+    uint64_t args_start = args[11];
+    size_t args_size = args[12];
+    uint64_t heap_start = args[13];
+    size_t heap_size = args[14];
+    uint64_t workspace_start = args[15];
+    size_t workspace_size = args[16];
+    uint64_t stack_start = args[17];
+    size_t stack_size = args[18];
+    size_t word_size = args[19];
+    bool thumb_mode = args[20];
+    const std::string& server_addr = args[21];
+    int port = args[22];
+    ObjectPtr<MicroSession> session = make_object<MicroSession>(
+        comms_method,
+        binary_path,
+        toolchain_prefix,
+        text_start,
+        text_size,
+        rodata_start,
+        rodata_size,
+        data_start,
+        data_size,
+        bss_start,
+        bss_size,
+        args_start,
+        args_size,
+        heap_start,
+        heap_size,
+        workspace_start,
+        workspace_size,
+        stack_start,
+        stack_size,
+        word_size,
+        thumb_mode,
+        server_addr,
+        port);
    *rv = Module(session);
    });


--- a/src/runtime/micro/micro_session.h
+++ b/src/runtime/micro/micro_session.h
@@ -47,7 +47,6 @@
 #include <tuple>

 #include "low_level_device.h"
-#include "device/utvm_runtime.h"
 #include "target_data_layout_encoder.h"

 namespace tvm {
@@ -75,9 +74,55 @@ class MicroSession : public ModuleNode {
  }

  /*!
-   * \brief constructor
+   * \brief creates session by setting up a low-level device and initializing allocators for it
+   * \param comms_method method of communication with the device (e.g., "openocd")
+   * \param binary_path file system path to the runtime binary
+   * \param toolchain_prefix GCC toolchain prefix
+   * \param text_start text section start address
+   * \param text_size text section size
+   * \param rodata_start rodata section start address
+   * \param rodata_size rodata section size
+   * \param data_start data section start address
+   * \param data_size data section size
+   * \param bss_start bss section start address
+   * \param bss_size bss section size
+   * \param args_start args section start address
+   * \param args_size args section size
+   * \param heap_start heap section start address
+   * \param heap_size heap section size
+   * \param workspace_start workspace section start address
+   * \param workspace_size workspace section size
+   * \param stack_start stack section start address
+   * \param stack_size stack section size
+   * \param word_size number of bytes in a word on the target device
+   * \param thumb_mode whether the target device requires a thumb-mode bit on function addresses
+   * \param server_addr address of the OpenOCD server to connect to (if `comms_method == "openocd"`)
+   * \param port port of the OpenOCD server to connect to (if `comms_method == "openocd"`)
   */
-  MicroSession();
+  MicroSession(
+      const std::string& comms_method,
+      const std::string& binary_path,
+      const std::string& toolchain_prefix,
+      uint64_t text_start,
+      size_t text_size,
+      uint64_t rodata_start,
+      size_t rodata_size,
+      uint64_t data_start,
+      size_t data_size,
+      uint64_t bss_start,
+      size_t bss_size,
+      uint64_t args_start,
+      size_t args_size,
+      uint64_t heap_start,
+      size_t heap_size,
+      uint64_t workspace_start,
+      size_t workspace_size,
+      uint64_t stack_start,
+      size_t stack_size,
+      size_t word_size,
+      bool thumb_mode,
+      const std::string& server_addr,
+      int port);

  /*!
   * \brief destructor
@@ -87,20 +132,20 @@ class MicroSession : public ModuleNode {
  static ObjectPtr<MicroSession>& Current();

  /*!
-   * \brief creates session by setting up a low-level device and initting allocators for it
-   * \param args TVMArgs passed into the micro.init packedfunc
+   * \brief sets up runtime metadata for `func` and copies arguments for on-device execution
+   * \param func address of the function to be executed
+   * \param args args to the packed function
+   * \return elapsed time during function execution on the device
   */
-  void CreateSession(const std::string& device_type,
-                     const std::string& binary_path,
-                     const std::string& toolchain_prefix,
-                     std::uintptr_t base_addr,
-                     const std::string& server_addr,
-                     int port);
+  double PushToExecQueue(DevPtr func, const TVMArgs& args);

  /*!
-   * \brief ends the session by destructing the low-level device and its allocators
+   * \brief loads binary onto device
+   * \param binary_path path to binary object file
+   * \param patch_dylib_pointers whether to patch runtime API function pointers
+   * \return info about loaded binary
   */
-  void EndSession();
+  BinaryInfo LoadBinary(const std::string& binary_path, bool patch_dylib_pointers);

  /*!
   * \brief allocate memory in section
@@ -108,36 +153,21 @@ class MicroSession : public ModuleNode {
   * \param size size of allocated memory in bytes
   * \return pointer to allocated memory region in section, nullptr if out of space
   */
-  DevBaseOffset AllocateInSection(SectionKind type, size_t size);
+  DevPtr AllocateInSection(SectionKind type, size_t size);

  /*!
   * \brief free prior allocation from section
   * \param type type of section to allocate in
-   * \param ptr pointer to allocated memory
+   * \param addr device address of allocated memory
   */
-  void FreeInSection(SectionKind type, DevBaseOffset ptr);
+  void FreeInSection(SectionKind type, DevPtr addr);

  /*!
   * \brief read string from device to host
-   * \param str_offset device offset of first character of string
+   * \param str_addr device address of first character of string
   * \return host copy of device string that was read
   */
-  std::string ReadString(DevBaseOffset str_offset);
-
-  /*!
-   * \brief sets up runtime metadata for `func` and copies arguments for on-device execution
-   * \param func address of the function to be executed
-   * \param args args to the packed function
-   */
-  void PushToExecQueue(DevBaseOffset func, const TVMArgs& args);
-
-  /*!
-   * \brief loads binary onto device
-   * \param binary_path path to binary object file
-   * \param patch_dylib_pointers whether runtime API function pointer patching is needed
-   * \return info about loaded binary
-   */
-  BinaryInfo LoadBinary(const std::string& binary_path, bool patch_dylib_pointers = true);
+  std::string ReadString(DevPtr str_addr);

  /*!
  * \brief read value of symbol from device memory
@@ -174,16 +204,17 @@ class MicroSession : public ModuleNode {
  /*! \brief array of memory allocators for each on-device section */
  std::shared_ptr<MicroSectionAllocator>
      section_allocators_[static_cast<size_t>(SectionKind::kNumKinds)];
-  /*! \brief total number of bytes of usable device memory for this session */
-  size_t memory_size_;
-  /*! \brief uTVM runtime binary info */
-  BinaryInfo runtime_bin_info_;
-  /*! \brief path to uTVM runtime source code */
-  std::string runtime_binary_path_;
-  /*! \brief offset of the runtime entry function */
-  DevBaseOffset utvm_main_symbol_;
-  /*! \brief offset of the runtime exit breakpoint */
-  DevBaseOffset utvm_done_symbol_;
+  /*! \brief number of bytes in a word on the target device */
+  size_t word_size_;
+  /*! \brief whether the target device requires a thumb-mode bit on function addresses
+   *
+   * ARM and other manufacturers use the lowest bit of a function address to determine
+   * whether it's a "thumb mode" function.  The Thumb ISA is more restricted, but
+   * results in more compact binaries.
+   */
+  bool thumb_mode_;
+  /*! \brief symbol map for the device runtime */
+  SymbolMap runtime_symbol_map_;

  /*!
   * \brief patches a function pointer in this module to an implementation
@@ -192,12 +223,6 @@ class MicroSession : public ModuleNode {
  void PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name);

  /*!
-   * \brief sets the runtime binary path
-   * \param path to runtime binary
-   */
-  void SetRuntimeBinaryPath(std::string path);
-
-  /*!
   * \brief appends arguments to the host-side buffer of `encoder`
   * \param encoder encoder being used to append `args`
   * \param args args to be appended
@@ -211,6 +236,7 @@ class MicroSession : public ModuleNode {
   * \param arr TVMArray to be appended
   * \return device address of the allocated `TVMArray`
   */
+  template <typename T>
  DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr);

  /*!
@@ -228,18 +254,11 @@ class MicroSession : public ModuleNode {
  }

  /*!
-   * \brief returns the symbol map for the uTVM runtime
-   * \return reference to symbol map
-   */
-  const SymbolMap& runtime_symbol_map() {
-    return runtime_bin_info_.symbol_map;
-  }
-
-  /*!
    * \brief Push a new session context onto the thread-local stack.
    *  The session on top of the stack is used as the current global session.
    */
  static void EnterWithScope(ObjectPtr<MicroSession> session);
+
  /*!
    * \brief Pop a session off the thread-local context stack,
    *  restoring the previous session as the current context.
@@ -260,6 +279,118 @@ struct MicroDevSpace {
  ObjectPtr<MicroSession> session;
 };

+// TODO(weberlo): maybe templatize serialization to reduce redundancy
+
+/*! \brief TVM array for serialization to 32-bit devices */
+struct TVMArray32 {
+  TVMArray32(
+      TargetVal data,
+      DLContext ctx,
+      int32_t ndim,
+      DLDataType dtype,
+      TargetVal shape,
+      TargetVal strides,
+      TargetVal byte_offset)
+    : data(data.val32),
+      ctx(ctx),
+      ndim(ndim),
+      pad0(0),
+      dtype(dtype),
+      shape(shape.val32),
+      strides(strides.val32),
+      pad1(0),
+      byte_offset(byte_offset.val32),
+      pad2(0) { }
+
+  /*! \brief opaque pointer to the allocated data */
+  uint32_t data;
+  /*! \brief The device context of the tensor */
+  DLContext ctx;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief Padding to enforce struct alignment */
+  uint32_t pad0;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  uint32_t shape;
+  /*!
+   * \brief strides of the tensor,
+   *  can be NULL, indicating tensor is compact.
+   */
+  uint32_t strides;
+  /*! \brief Padding to enforce struct alignment */
+  uint32_t pad1;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint32_t byte_offset;
+  /*! \brief Padding to enforce struct alignment */
+  uint32_t pad2;
+};
+
+/*! \brief TVM array for serialization to 64-bit devices */
+struct TVMArray64 {
+  TVMArray64(
+      TargetVal data,
+      DLContext ctx,
+      int32_t ndim,
+      DLDataType dtype,
+      TargetVal shape,
+      TargetVal strides,
+      TargetVal byte_offset)
+    : data(data.val64),
+      ctx(ctx),
+      ndim(ndim),
+      pad0(0),
+      dtype(dtype),
+      shape(shape.val64),
+      strides(strides.val64),
+      byte_offset(byte_offset.val64) { }
+
+  /*! \brief opaque pointer to the allocated data */
+  uint64_t data;
+  /*! \brief The device context of the tensor */
+  DLContext ctx;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief Padding to enforce struct alignment */
+  uint32_t pad0;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  uint64_t shape;
+  /*!
+   * \brief strides of the tensor,
+   *  can be NULL, indicating tensor is compact.
+   */
+  uint64_t strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+};
+
+/*! \brief MicroTVM task for serialization to 32-bit devices */
+typedef struct StructUTVMTask32 {
+  /*! \brief Pointer to function to call for this task */
+  uint32_t func;
+  /*! \brief Array of argument values */
+  uint32_t arg_values;
+  /*! \brief Array of type codes for each argument value */
+  uint32_t arg_type_codes;
+  /*! \brief Number of arguments */
+  int32_t num_args;
+} UTVMTask32;
+
+/*! \brief MicroTVM task for serialization to 64-bit devices */
+typedef struct StructUTVMTask64 {
+  /*! \brief Pointer to function to call for this task */
+  uint64_t func;
+  /*! \brief Array of argument values */
+  uint64_t arg_values;
+  /*! \brief Array of type codes for each argument value */
+  uint64_t arg_type_codes;
+  /*! \brief Number of arguments */
+  int32_t num_args;
+} UTVMTask64;
+
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_MICRO_MICRO_SESSION_H_
--- a/src/runtime/micro/openocd_low_level_device.cc
+++ b/src/runtime/micro/openocd_low_level_device.cc
@@ -37,21 +37,20 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
 public:
  /*!
   * \brief constructor to initialize connection to openocd device
-   * \param base_addr base address of the device
   * \param server_addr address of the OpenOCD server to connect to
   * \param port port of the OpenOCD server to connect to
   */
-  explicit OpenOCDLowLevelDevice(std::uintptr_t base_addr,
-                                 const std::string& server_addr,
+  explicit OpenOCDLowLevelDevice(const std::string& server_addr,
                                 int port) : socket_() {
-      socket_.Connect(tvm::common::SockAddr(server_addr.c_str(), port));
-      socket_.cmd_builder() << "reset halt";
+    server_addr_ = server_addr;
+    port_ = port;
+
+    socket_.Connect(tvm::common::SockAddr(server_addr_.c_str(), port_));
+    socket_.cmd_builder() << "halt 0";
    socket_.SendCommand();
-      base_addr_ = base_addr;
-      CHECK(base_addr_ % 8 == 0) << "base address not aligned to 8 bytes";
  }

-  void Read(DevBaseOffset offset, void* buf, size_t num_bytes) {
+  void Read(DevPtr addr, void* buf, size_t num_bytes) {
    if (num_bytes == 0) {
      return;
    }
@@ -59,7 +58,6 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
    // TODO(weberlo): Refactor between read and write.
    // Check if we need to chunk this write request.
    if (num_bytes > kMemTransferLimit) {
-      DevBaseOffset curr_offset = offset;
      char* curr_buf_ptr = reinterpret_cast<char*>(buf);
      while (num_bytes != 0) {
        size_t amount_to_read;
@@ -68,8 +66,8 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
        } else {
          amount_to_read = num_bytes;
        }
-        Read(offset, reinterpret_cast<void*>(curr_buf_ptr), amount_to_read);
-        offset += amount_to_read;
+        Read(addr, reinterpret_cast<void*>(curr_buf_ptr), amount_to_read);
+        addr += amount_to_read;
        curr_buf_ptr += amount_to_read;
        num_bytes -= amount_to_read;
      }
@@ -79,7 +77,6 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
      socket_.cmd_builder() << "array unset output";
      socket_.SendCommand();

-      DevPtr addr = DevPtr(base_addr_ + offset.value());
      socket_.cmd_builder()
        << "mem2array output"
        << " " << std::dec << kWordSize
@@ -122,14 +119,13 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
    }
  }

-  void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) {
+  void Write(DevPtr addr, const void* buf, size_t num_bytes) {
    if (num_bytes == 0) {
      return;
    }

    // Check if we need to chunk this write request.
    if (num_bytes > kMemTransferLimit) {
-      DevBaseOffset curr_offset = offset;
      const char* curr_buf_ptr = reinterpret_cast<const char*>(buf);
      while (num_bytes != 0) {
        size_t amount_to_write;
@@ -138,8 +134,8 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
        } else {
          amount_to_write = num_bytes;
        }
-        Write(offset, reinterpret_cast<const void*>(curr_buf_ptr), amount_to_write);
-        offset += amount_to_write;
+        Write(addr, reinterpret_cast<const void*>(curr_buf_ptr), amount_to_write);
+        addr += amount_to_write;
        curr_buf_ptr += amount_to_write;
        num_bytes -= amount_to_write;
      }
@@ -166,7 +162,6 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
      socket_.SendCommand();
    }
    {
-      DevPtr addr = DevPtr(base_addr_ + offset.value());
      socket_.cmd_builder()
        << "array2mem input"
        << " " << std::dec << kWordSize
@@ -176,20 +171,14 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
    }
  }

-  void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) {
+  void Execute(DevPtr func_addr, DevPtr breakpoint_addr) {
    socket_.cmd_builder() << "halt 0";
    socket_.SendCommand();

-    // Set up the stack pointer.
-    DevPtr stack_end = stack_top() - 8;
-    socket_.cmd_builder() << "reg sp " << stack_end.cast_to<void*>();
-    socket_.SendCommand();
-
    // Set a breakpoint at the beginning of `UTVMDone`.
-    socket_.cmd_builder() << "bp " << ToDevPtr(breakpoint).cast_to<void*>() << " 2";
+    socket_.cmd_builder() << "bp " << breakpoint_addr.cast_to<void*>() << " 2";
    socket_.SendCommand();

-    DevPtr func_addr = DevPtr(base_addr_ + func_offset.value());
    socket_.cmd_builder() << "resume " << func_addr.cast_to<void*>();
    socket_.SendCommand();

@@ -200,34 +189,21 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
    socket_.SendCommand();

    // Remove the breakpoint.
-    socket_.cmd_builder() << "rbp " << ToDevPtr(breakpoint).cast_to<void*>();
+    socket_.cmd_builder() << "rbp " << breakpoint_addr.cast_to<void*>();
    socket_.SendCommand();
  }

-  void SetStackTop(DevBaseOffset stack_top) {
-    stack_top_ = DevPtr(base_addr_ + stack_top.value());
-  }
-
-  std::uintptr_t base_addr() const final {
-    return base_addr_;
-  }
-
-  DevPtr stack_top() const {
-    CHECK(stack_top_ != nullptr) << "stack top was never initialized";
-    return stack_top_;
-  }
-
  const char* device_type() const final {
    return "openocd";
  }

 private:
-  /*! \brief base address of the micro device memory region */
-  std::uintptr_t base_addr_;
-  /*! \brief top of the stack section */
-  DevPtr stack_top_;
  /*! \brief socket used to communicate with the device through Tcl */
  TclSocket socket_;
+  /*! \brief address of OpenOCD server */
+  std::string server_addr_;
+  /*! \brief port of OpenOCD server */
+  int port_;

  /*! \brief number of bytes in a word on the target device (64-bit) */
  static const constexpr ssize_t kWordSize = 8;
@@ -239,11 +215,10 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice {
  static const constexpr int kWaitTime = 10000;
 };

-const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(std::uintptr_t base_addr,
-                                                                  const std::string& server_addr,
+const std::shared_ptr<LowLevelDevice> OpenOCDLowLevelDeviceCreate(const std::string& server_addr,
                                                                  int port) {
  std::shared_ptr<LowLevelDevice> lld =
-      std::make_shared<OpenOCDLowLevelDevice>(base_addr, server_addr, port);
+      std::make_shared<OpenOCDLowLevelDevice>(server_addr, port);
  return lld;
 }


--- a/src/runtime/micro/target_data_layout_encoder.h
+++ b/src/runtime/micro/target_data_layout_encoder.h
@@ -25,7 +25,7 @@
 #define TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_

 #include <vector>
-#include "device/utvm_runtime.h"
+#include "host_driven/utvm_runtime.h"

 namespace tvm {
 namespace runtime {
@@ -96,9 +96,9 @@ class TargetDataLayoutEncoder {
   * \brief constructor
   * \param start_addr start address of the encoder in device memory
   */
-  explicit TargetDataLayoutEncoder(DevPtr start_addr)
-      : buf_(std::vector<uint8_t>()), curr_offset_(0) {
-    start_addr_ = DevPtr(UpperAlignValue(start_addr.value(), 8));
+  explicit TargetDataLayoutEncoder(DevPtr start_addr, size_t word_size)
+      : buf_(std::vector<uint8_t>()), curr_offset_(0), word_size_(word_size) {
+    start_addr_ = DevPtr(UpperAlignValue(start_addr.value().val64, word_size_));
  }

  /*!
@@ -108,7 +108,7 @@ class TargetDataLayoutEncoder {
   */
  template <typename T>
  Slot<T> Alloc(size_t num_elems = 1) {
-    curr_offset_ = UpperAlignValue(curr_offset_, 8);
+    curr_offset_ = UpperAlignValue(curr_offset_, word_size_);
    size_t size = sizeof(T) * num_elems;
    if (curr_offset_ + size > buf_.size()) {
      buf_.resize(curr_offset_ + size);
@@ -141,6 +141,8 @@ class TargetDataLayoutEncoder {
  size_t curr_offset_;
  /*! \brief start address of the encoder in device memory */
  DevPtr start_addr_;
+  /*! \brief number of bytes in a word on the target device */
+  size_t word_size_;
 };

 template <typename T>

--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -21,6 +21,7 @@
 * \file rpc_session.cc
 * \brief RPC session for remote function call.
 */
+#include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
@@ -40,6 +41,7 @@

 namespace tvm {
 namespace runtime {
+
 // Temp buffer for data array
 struct RPCByteArrayBuffer {
  TVMByteArray arr;
@@ -1215,11 +1217,45 @@ void RPCSession::EventHandler::HandlePackedCall() {
  CHECK_EQ(state_, kRecvCode);
 }

+/*!
+ * \brief Wrap `pf` in a timer that reports the mean of the doubles `pf` returns.
+ * \return function yielding `repeat` doubles (one mean per `number` calls) as raw bytes
+ */
+PackedFunc MicroTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat) {
+  auto ftimer = [pf, ctx, number, repeat](TVMArgs args, TVMRetValue *rv) mutable {
+    TVMRetValue call_result;
+    std::ostringstream samples;
+    // Warm-up call, excluded from the samples, to activate lazy compilation components.
+    pf.CallPacked(args, &call_result);
+    DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+    for (int rep = 0; rep < repeat; ++rep) {
+      double mean = 0.0;
+      for (int run = 0; run < number; ++run) {
+        pf.CallPacked(args, &call_result);
+        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
+        mean += (call_result.operator double()) / number;
+      }
+      samples.write(reinterpret_cast<char*>(&mean), sizeof(mean));
+    }
+    // Return the samples packed back to back in a byte array.
+    std::string packed = samples.str();
+    TVMByteArray bytes;
+    bytes.size = packed.size();
+    bytes.data = packed.data();
+    *rv = bytes;
+  };
+  return PackedFunc(ftimer);
+}
+
 PackedFunc WrapTimeEvaluator(PackedFunc pf,
                             TVMContext ctx,
                             int number,
                             int repeat,
                             int min_repeat_ms) {
+  if (static_cast<int>(ctx.device_type) == static_cast<int>(kDLMicroDev)) {
+    return MicroTimeEvaluator(pf, ctx, number, repeat);
+  }
+
  auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue *rv) mutable {
    TVMRetValue temp;
    std::ostringstream os;

--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -25,6 +25,7 @@ ALLOW_EXTENSION = {
    "cc",
    "c",
    "h",
+    "s",
    "rs",
    "m",
    "mm",

--- a/tests/python/contrib/test_binutil.py
+++ b/tests/python/contrib/test_binutil.py
@@ -73,12 +73,21 @@ def test_tvm_callback_relocate_binary():
    with open(tmp_bin, "wb") as f:
        f.write(binary)
    def verify():
-        text_loc_str = "0x0"
-        rodata_loc_str = "0x10000"
-        data_loc_str = "0x20000"
-        bss_loc_str = "0x30000"
+        word_size = 8
+        text_loc = 0x0
+        rodata_loc = 0x10000
+        data_loc = 0x20000
+        bss_loc = 0x30000
+        stack_end = 0x50000
        rel_bin = tvm_callback_relocate_binary(
-            tmp_bin, text_loc_str, rodata_loc_str, data_loc_str, bss_loc_str, TOOLCHAIN_PREFIX)
+            tmp_bin,
+            word_size,
+            text_loc,
+            rodata_loc,
+            data_loc,
+            bss_loc,
+            stack_end,
+            TOOLCHAIN_PREFIX)
        print("Relocated binary section sizes")
        test_tvm_callback_get_section_size(binary=rel_bin)
        relf = tmp_dir.relpath("rel.bin")
@@ -88,10 +97,6 @@ def test_tvm_callback_relocate_binary():
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        (out, _) = nm_proc.communicate()
-        # Ensure the relocated symbols are within the ranges we specified.
-        text_loc = int(text_loc_str, 16)
-        data_loc = int(data_loc_str, 16)
-        bss_loc = int(bss_loc_str, 16)
        symbol_entries = out.decode("utf-8").split("\n")
        for entry in symbol_entries:
            if len(entry) == 0:
@@ -127,12 +132,21 @@ def test_tvm_callback_get_symbol_map():
    with open(tmp_bin, "wb") as f:
        f.write(binary)
    def verify():
-        text_loc_str = "0x0"
-        rodata_loc_str = "0x10000"
-        data_loc_str = "0x20000"
-        bss_loc_str = "0x30000"
+        word_size = 8
+        text_loc = 0x0
+        rodata_loc = 0x10000
+        data_loc = 0x20000
+        bss_loc = 0x30000
+        stack_end = 0x50000
        rel_bin = tvm_callback_relocate_binary(
-            tmp_bin, text_loc_str, rodata_loc_str, data_loc_str, bss_loc_str, TOOLCHAIN_PREFIX)
+            tmp_bin,
+            word_size,
+            text_loc,
+            rodata_loc,
+            data_loc,
+            bss_loc,
+            stack_end,
+            TOOLCHAIN_PREFIX)
        symbol_map = tvm_callback_get_symbol_map(rel_bin, TOOLCHAIN_PREFIX)
        symbols = set()
        for i, line in enumerate(symbol_map.split('\n')):

--- a/tests/python/unittest/test_runtime_micro.py
+++ b/tests/python/unittest/test_runtime_micro.py
@@ -14,7 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
 import os

 import numpy as np
@@ -22,38 +21,13 @@ import tvm
 from tvm.contrib import graph_runtime, util
 from tvm import relay
 import tvm.micro as micro
+from tvm.micro import create_micro_mod
 from tvm.relay.testing import resnet

 # Use the host emulated micro device.
-DEVICE_TYPE = "host"
-TOOLCHAIN_PREFIX = ""
-
-def create_micro_mod(c_mod, toolchain_prefix):
-    """Produces a micro module from a given module.
-
-    Parameters
-    ----------
-    c_mod : tvm.module.Module
-        module with "c" as its target backend
-
-    toolchain_prefix : str
-        toolchain prefix to be used (see `tvm.micro.Session` docs)
-
-    Return
-    ------
-    micro_mod : tvm.module.Module
-        micro module for the target device
-    """
-    temp_dir = util.tempdir()
-    lib_obj_path = temp_dir.relpath("dev_lib.obj")
-    c_mod.export_library(
-            lib_obj_path,
-            fcompile=tvm.micro.cross_compiler(toolchain_prefix=toolchain_prefix))
-    micro_mod = tvm.module.load(lib_obj_path, "micro_dev")
-    return micro_mod
-
+DEV_CONFIG = micro.device.host.default_config()

-def relay_micro_build(func, toolchain_prefix, params=None):
+def relay_micro_build(func, dev_config, params=None):
    """Create a graph runtime module with a micro device context from a Relay function.

    Parameters
@@ -61,6 +35,9 @@ def relay_micro_build(func, toolchain_prefix, params=None):
    func : relay.Function
        function to compile

+    dev_config : Dict[str, Any]
+        MicroTVM config dict for the target device
+
    params : dict
        input parameters that do not change during inference

@@ -71,24 +48,20 @@ def relay_micro_build(func, toolchain_prefix, params=None):
    """
    with tvm.build_config(disable_vectorize=True):
        graph, c_mod, params = relay.build(func, target="c", params=params)
-    micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX)
+    micro_mod = create_micro_mod(c_mod, dev_config)
    ctx = tvm.micro_dev(0)
    mod = graph_runtime.create(graph, micro_mod, ctx)
    mod.set_input(**params)
    return mod


-# TODO(weberlo): Add example program to test scalar double/int TVMValue serialization.
-# TODO(weberlo): How can we test the OpenOCD device?  The CI would need to have OpenOCD
-# and Spike installed.
-
 def test_alloc():
    """Test tensor allocation on the device."""
    if not tvm.module.enabled("micro_dev"):
        return
    shape = (1024,)
    dtype = "float32"
-    with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX):
+    with micro.Session(DEV_CONFIG):
        ctx = tvm.micro_dev(0)
        np_tensor = np.random.uniform(size=shape).astype(dtype)
        micro_tensor = tvm.nd.array(np_tensor, ctx)
@@ -112,15 +85,14 @@ def test_add():
    func_name = "fadd"
    c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)

-    with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX):
-        micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX)
+    with micro.Session(DEV_CONFIG):
+        micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
        micro_func = micro_mod[func_name]
        ctx = tvm.micro_dev(0)
        a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
        c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
        micro_func(a, b, c)
-
        tvm.testing.assert_allclose(
                c.asnumpy(), a.asnumpy() + b.asnumpy())

@@ -143,8 +115,8 @@ def test_workspace_add():
    func_name = "fadd_two_workspace"
    c_mod = tvm.build(s, [A, C], target="c", name=func_name)

-    with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX):
-        micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX)
+    with micro.Session(DEV_CONFIG):
+        micro_mod = create_micro_mod(c_mod, DEV_CONFIG)
        micro_func = micro_mod[func_name]
        ctx = tvm.micro_dev(0)
        a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx)
@@ -168,8 +140,8 @@ def test_graph_runtime():
    z = relay.add(xx, relay.const(1.0))
    func = relay.Function([x], z)

-    with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX):
-        mod = relay_micro_build(func, TOOLCHAIN_PREFIX)
+    with micro.Session(DEV_CONFIG):
+        mod = relay_micro_build(func, DEV_CONFIG)

        x_in = np.random.uniform(size=shape[0]).astype(dtype)
        mod.run(x=x_in)
@@ -195,9 +167,9 @@ def test_multiple_modules():
    ret = relay.subtract(x, relay.const(1.0))
    sub_const_func = relay.Function([x], ret)

-    with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX):
-        add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX)
-        sub_const_mod = relay_micro_build(sub_const_func, TOOLCHAIN_PREFIX)
+    with micro.Session(DEV_CONFIG):
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
+        sub_const_mod = relay_micro_build(sub_const_func, DEV_CONFIG)

        x_in = np.random.uniform(size=shape[0]).astype(dtype)
        add_const_mod.run(x=x_in)
@@ -223,8 +195,8 @@ def test_interleave_sessions():
    ret = relay.add(x, relay.const(1.0))
    add_const_func = relay.Function([x], ret)

-    sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
-    sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
+    sess_a = micro.Session(DEV_CONFIG)
+    sess_b = micro.Session(DEV_CONFIG)
    with sess_a:
        np_tensor_a = np.random.uniform(size=shape).astype(dtype)
        micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
@@ -232,13 +204,13 @@ def test_interleave_sessions():
        np_tensor_b = np.random.uniform(size=shape).astype(dtype)
        micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
    with sess_a:
-        add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
        add_const_mod.run(x=micro_tensor_a)
        add_result = add_const_mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(
                add_result, np_tensor_a + 1.0)
    with sess_b:
-        add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
        add_const_mod.run(x=micro_tensor_b)
        add_result = add_const_mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(
@@ -257,15 +229,15 @@ def test_nested_sessions():
    ret = relay.add(x, relay.const(1.0))
    add_const_func = relay.Function([x], ret)

-    sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
-    sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
+    sess_a = micro.Session(DEV_CONFIG)
+    sess_b = micro.Session(DEV_CONFIG)
    with sess_a:
        np_tensor_a = np.random.uniform(size=shape).astype(dtype)
        micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
        with sess_b:
            np_tensor_b = np.random.uniform(size=shape).astype(dtype)
            micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0))
-        add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)
        add_const_mod.run(x=micro_tensor_a)
        add_result = add_const_mod.get_output(0).asnumpy()
        tvm.testing.assert_allclose(
@@ -284,12 +256,12 @@ def test_inactive_session_use():
    ret = relay.add(x, relay.const(1.0))
    add_const_func = relay.Function([x], ret)

-    sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
-    sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX)
+    sess_a = micro.Session(DEV_CONFIG)
+    sess_b = micro.Session(DEV_CONFIG)
    with sess_a:
        np_tensor_a = np.random.uniform(size=shape).astype(dtype)
        micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0))
-        add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX)
+        add_const_mod = relay_micro_build(add_const_func, DEV_CONFIG)

    with sess_b:
        # These objects belong to `sess_a`.