Commit 5563b72b by nhynes, committed by Tianqi Chen

Add rust runtime (#1597)

parent 6330797d
@@ -91,10 +91,8 @@ ENV/
*~
*.pyc
*~
build
config.mk
config.cmake
build_*
Win32
*.dir
perf
@@ -187,7 +185,6 @@ tvm_u.*
tvm_t.*
# Mac OS X
.DS_Store
build*
# Jetbrain
.idea
Cargo.lock
target/
**/*.rs.bk
max_width = 100
hard_tabs = false
tab_spaces = 2
newline_style = "Auto"
use_small_heuristics = "Default"
indent_style = "Block"
wrap_comments = false
comment_width = 80
normalize_comments = false
format_strings = false
format_macro_matchers = false
format_macro_bodies = true
empty_item_single_line = true
struct_lit_single_line = true
fn_single_line = false
where_single_line = false
imports_indent = "Block"
imports_layout = "Mixed"
merge_imports = true
reorder_imports = true
reorder_modules = true
reorder_impl_items = false
type_punctuation_density = "Wide"
space_before_colon = false
space_after_colon = true
spaces_around_ranges = false
binop_separator = "Front"
remove_nested_parens = true
combine_control_expr = true
struct_field_align_threshold = 0
match_arm_blocks = true
force_multiline_blocks = false
fn_args_density = "Tall"
brace_style = "SameLineWhere"
control_brace_style = "AlwaysSameLine"
trailing_semicolon = true
trailing_comma = "Vertical"
match_block_trailing_comma = false
blank_lines_upper_bound = 1
blank_lines_lower_bound = 0
edition = "Edition2015"
merge_derives = true
use_try_shorthand = true
use_field_init_shorthand = false
force_explicit_abi = true
condense_wildcard_suffixes = false
color = "Auto"
required_version = "0.99.4"
unstable_features = false
disable_all_formatting = false
skip_children = false
hide_parse_errors = false
error_on_line_overflow = false
error_on_unformatted = false
report_todo = "Never"
report_fixme = "Never"
ignore = []
emit_mode = "Files"
make_backup = false
language: rust
rust:
- nightly
matrix:
fast_finish: true
[package]
name = "tvm"
version = "0.1.0"
license = "Apache-2.0"
description = "TVM Rust runtime"
repository = "https://github.com/dmlc/tvm"
readme = "README.md"
keywords = ["tvm", "nnvm"]
categories = ["api-bindings", "science"]
authors = ["Nick Hynes <nhynes@berkeley.edu>"]
[features]
default = ["nom/std"]
sgx = ["nom/alloc"]
[dependencies]
bounded-spsc-queue = "0.4.0"
error-chain = { version = "0.12.0", default-features = false }
itertools = "0.7.8"
lazy_static = "1.1.0"
ndarray = "0.11.2"
nom = { version = "4.0.0", default-features = false }
serde = "1.0.59"
serde_derive = "1.0.79"
serde_json = "1.0.17"
[target.'cfg(not(target_env = "sgx"))'.dependencies]
num_cpus = "1.8.0"
#[cfg(target_env = "sgx")]
use alloc::alloc;
#[cfg(not(target_env = "sgx"))]
use std::alloc;
use std::num;
use ndarray;
use serde_json;
error_chain! {
errors {
TryFromTVMRetValueError(expected: String, actual: i64) {
description("mismatched types while downcasting TVMRetValue")
display("invalid downcast: expected `{}` but was `{}`", expected, actual)
}
GraphFormatError(msg: String) {
description("unable to load graph")
display("could not load graph json: {}", msg)
}
LoadGraphParamsError(msg: String) {
description("unable to load graph params")
display("could not load graph params: {}", msg)
}
}
foreign_links {
Alloc(alloc::AllocErr);
GraphDeserialize(serde_json::Error);
ParseInt(num::ParseIntError);
ShapeError(ndarray::ShapeError);
}
}
impl From<alloc::LayoutErr> for Error {
fn from(_err: alloc::LayoutErr) -> Error {
Error::from_kind(ErrorKind::Msg("Layout error".to_string()))
}
}
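For orientation, here is a minimal sketch of how these error kinds get raised elsewhere in the runtime; the `check_op` helper is hypothetical and not part of this change, while `bail!`, `ErrorKind`, and `Result` come from the `error_chain!` block above.

// Hypothetical sketch only: raising one of the error kinds defined above.
fn check_op(op: &str) -> Result<()> {
  if op != "tvm_op" && op != "null" {
    bail!(ErrorKind::GraphFormatError(format!("unknown op type: {}", op)));
  }
  Ok(())
}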
//! This crate is an implementation of the TVM runtime for modules compiled with `--system-lib`.
//! It's mainly useful for compiling to WebAssembly and SGX,
//! but also native if you prefer Rust to C++.
//!
//! For TVM graphs, the entrypoint to this crate is `runtime::GraphExecutor`.
//! Single-function modules are used via the `call_packed!` macro after obtaining
//! the function from `runtime::SystemLibModule`.
//!
//! For examples of use, please refer to the multi-file tests in the `tests` directory.
#![feature(
alloc,
allocator_api,
box_syntax,
extern_prelude,
fn_traits,
try_from,
unboxed_closures,
vec_remove_item
)]
#[cfg(target_env = "sgx")]
extern crate alloc;
extern crate bounded_spsc_queue;
#[cfg(target_env = "sgx")]
extern crate core;
#[macro_use]
extern crate error_chain;
#[macro_use]
extern crate itertools;
#[macro_use]
extern crate lazy_static;
extern crate ndarray;
#[macro_use]
extern crate nom;
#[cfg(not(target_env = "sgx"))]
extern crate num_cpus;
extern crate serde;
#[macro_use]
extern crate serde_derive;
extern crate serde_json;
pub mod ffi {
#![allow(
non_camel_case_types,
non_snake_case,
non_upper_case_globals,
unused
)]
pub mod runtime {
use std::os::raw::{c_char, c_int, c_void};
include!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/src/runtime/c_runtime_api.rs"
));
pub type BackendPackedCFunc =
extern "C" fn(args: *const TVMValue, type_codes: *const c_int, num_args: c_int) -> c_int;
}
}
pub mod errors;
pub mod runtime;
pub use errors::*;
#[cfg(target_env = "sgx")]
use alloc::alloc::{self, Layout};
#[cfg(not(target_env = "sgx"))]
use std::alloc::{self, Layout};
use errors::*;
const DEFAULT_ALIGN_BYTES: usize = 4;
#[derive(PartialEq, Eq)]
pub struct Allocation {
layout: Layout,
ptr: *mut u8,
}
impl Allocation {
/// Allocates a chunk of memory of `size` bytes with optional alignment.
pub fn new(size: usize, align: Option<usize>) -> Result<Self> {
let alignment = align.unwrap_or(DEFAULT_ALIGN_BYTES);
let layout = Layout::from_size_align(size, alignment)?;
let ptr = unsafe { alloc::alloc(layout.clone()) };
if ptr.is_null() {
alloc::handle_alloc_error(layout);
}
Ok(Self {
ptr: ptr,
layout: layout,
})
}
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
/// Returns the size of the Allocation in bytes.
pub fn size(&self) -> usize {
self.layout.size()
}
/// Returns the byte alignment of the Allocation.
pub fn align(&self) -> usize {
self.layout.align()
}
}
impl Drop for Allocation {
fn drop(&mut self) {
unsafe {
alloc::dealloc(self.ptr, self.layout.clone());
}
}
}
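A quick sketch of the `Allocation` API defined above (illustrative only; the function name is hypothetical):

// Sketch: allocate a 1 KiB buffer aligned to 64 bytes; it is freed on drop.
fn allocation_example() -> Result<()> {
  let a = Allocation::new(1024, Some(64))?;
  assert_eq!(a.size(), 1024);
  assert_eq!(a.align(), 64);
  let _raw: *mut u8 = a.as_mut_ptr(); // pointer handed to generated TVM code
  Ok(())
} // memory released here by `Drop`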
mod allocator;
mod array;
mod module;
#[macro_use]
mod packed_func;
mod graph;
#[cfg(target_env = "sgx")]
#[macro_use]
pub mod sgx;
mod threading;
mod workspace;
use std::os::raw::c_char;
pub use self::{array::*, graph::*, module::*, packed_func::*, threading::*, workspace::*};
#[no_mangle]
pub extern "C" fn TVMAPISetLastError(cmsg: *const c_char) {
#[cfg(not(target_env = "sgx"))]
unsafe {
panic!(std::ffi::CStr::from_ptr(cmsg).to_str().unwrap());
}
#[cfg(target_env = "sgx")]
ocall_packed!("__sgx_set_last_error__", cmsg);
}
use std::{
collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::Mutex,
};
use ffi::runtime::BackendPackedCFunc;
use runtime::packed_func::{wrap_backend_packed_func, PackedFunc};
pub trait Module {
fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc>;
}
pub struct SystemLibModule;
lazy_static! {
static ref SYSTEM_LIB_FUNCTIONS: Mutex<HashMap<String, BackendPackedCFunc>> =
Mutex::new(HashMap::new());
}
impl Module for SystemLibModule {
fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc> {
SYSTEM_LIB_FUNCTIONS
.lock()
.unwrap()
.get(name.as_ref())
.map(|func| wrap_backend_packed_func(func.to_owned()))
}
}
impl Default for SystemLibModule {
fn default() -> Self {
SystemLibModule {}
}
}
#[no_mangle]
pub extern "C" fn TVMBackendRegisterSystemLibSymbol(
cname: *const c_char,
func: BackendPackedCFunc,
) -> i32 {
let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() };
SYSTEM_LIB_FUNCTIONS
.lock()
.unwrap()
.insert(name.to_string(), func);
return 0;
}
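Registration above is driven by the `--system-lib` object itself at load time; user code only calls `Module::get_function`. A minimal sketch of the lookup side follows; the exported name `default_function` is an assumption, `call_packed!` is defined in packed_func.rs below, and the complete working version is the `test-tvm-basic` test at the end of this change.

// Sketch: looking up and invoking a function registered by the system lib.
fn lookup_example() {
  let syslib = SystemLibModule::default();
  let func = syslib
    .get_function("default_function") // name depends on the compiled module
    .expect("function not registered by the system lib");
  // Two integer arguments purely to show the calling convention; real TVM
  // kernels usually take `&mut DLTensor` arguments.
  let _ret = call_packed!(func, 1i64, 2i64);
}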
use std::{any::Any, convert::TryFrom, marker::PhantomData, os::raw::c_void};
use ffi::runtime::{
BackendPackedCFunc, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLTensor,
TVMTypeCode_kArrayHandle, TVMTypeCode_kHandle, TVMValue,
};
use errors::*;
pub type PackedFunc = Box<Fn(&[TVMArgValue]) -> TVMRetValue + Send + Sync>;
/// Calls a packed function and returns a `TVMRetValue`.
///
/// # Example
///
/// `call_packed!(my_tvm_func, &mut arg1, &mut arg2)`
#[macro_export]
macro_rules! call_packed {
($fn:expr, $($args:expr),+) => {
$fn(&[$($args.into(),)+])
};
($fn:expr) => {
$fn(&Vec::new())
};
}
/// A borrowed TVMPODValue. Can be constructed using `into()` but the preferred way
/// to obtain a `TVMArgValue` is automatically via `call_packed!`.
#[derive(Clone, Copy)]
pub struct TVMArgValue<'a> {
_lifetime: PhantomData<&'a ()>,
pub(crate) value: TVMValue,
pub(crate) type_code: i64,
}
impl<'a> TVMArgValue<'a> {
pub fn new(value: TVMValue, type_code: i64) -> Self {
TVMArgValue {
_lifetime: PhantomData,
value: value,
type_code: type_code,
}
}
}
/// Creates a conversion to a `TVMArgValue` for a primitive type and DLDataTypeCode.
macro_rules! impl_prim_tvm_arg {
($type:ty, $field:ident, $code:expr, $as:ty) => {
impl<'a> From<$type> for TVMArgValue<'a> {
fn from(val: $type) -> Self {
TVMArgValue {
value: TVMValue { $field: val as $as },
type_code: $code as i64,
_lifetime: PhantomData,
}
}
}
};
($type:ty, $field:ident, $code:expr) => {
impl_prim_tvm_arg!($type, $field, $code, $type);
};
($type:ty,v_int64) => {
impl_prim_tvm_arg!($type, v_int64, DLDataTypeCode_kDLInt, i64);
};
($type:ty,v_float64) => {
impl_prim_tvm_arg!($type, v_float64, DLDataTypeCode_kDLFloat, f64);
};
}
impl_prim_tvm_arg!(f32, v_float64);
impl_prim_tvm_arg!(f64, v_float64);
impl_prim_tvm_arg!(i8, v_int64);
impl_prim_tvm_arg!(u8, v_int64);
impl_prim_tvm_arg!(i32, v_int64);
impl_prim_tvm_arg!(u32, v_int64);
impl_prim_tvm_arg!(i64, v_int64);
impl_prim_tvm_arg!(u64, v_int64);
impl_prim_tvm_arg!(bool, v_int64);
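These generated `From` impls are what allow primitives to be passed straight to `call_packed!`; for instance (sketch only, the function name is hypothetical):

// Sketch: primitives convert into TVMArgValue via the impls generated above.
fn prim_arg_examples() {
  let f: TVMArgValue = 3.14f64.into(); // stored in TVMValue.v_float64, kDLFloat
  let b: TVMArgValue = true.into();    // stored in TVMValue.v_int64, kDLInt
  let _args = [f, b];                  // the slice a PackedFunc receives
}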
/// Creates a conversion to a `TVMArgValue` for an object handle.
impl<'a, T> From<*const T> for TVMArgValue<'a> {
fn from(ptr: *const T) -> Self {
TVMArgValue {
value: TVMValue {
v_handle: ptr as *mut T as *mut c_void,
},
type_code: TVMTypeCode_kArrayHandle as i64,
_lifetime: PhantomData,
}
}
}
/// Creates a conversion to a `TVMArgValue` for a mutable object handle.
impl<'a, T> From<*mut T> for TVMArgValue<'a> {
fn from(ptr: *mut T) -> Self {
TVMArgValue {
value: TVMValue {
v_handle: ptr as *mut c_void,
},
type_code: TVMTypeCode_kHandle as i64,
_lifetime: PhantomData,
}
}
}
impl<'a> From<&'a mut DLTensor> for TVMArgValue<'a> {
fn from(arr: &'a mut DLTensor) -> Self {
TVMArgValue {
value: TVMValue {
v_handle: arr as *mut _ as *mut c_void,
},
type_code: TVMTypeCode_kArrayHandle as i64,
_lifetime: PhantomData,
}
}
}
impl<'a> From<&'a DLTensor> for TVMArgValue<'a> {
fn from(arr: &'a DLTensor) -> Self {
TVMArgValue {
value: TVMValue {
v_handle: arr as *const _ as *mut DLTensor as *mut c_void,
},
type_code: TVMTypeCode_kArrayHandle as i64,
_lifetime: PhantomData,
}
}
}
/// An owned TVMPODValue. Can be converted from a variety of primitive and object types.
/// Can be downcasted using `try_from` if it contains the desired type.
///
/// # Example
///
/// ```
/// let a = 42u32;
/// let b: u64 = TVMRetValue::from(a).try_into().unwrap();
///
/// let s = "hello, world!".to_string();
/// let t: TVMRetValue = s.clone().into();
/// assert_eq!(String::try_from(t).unwrap(), s);
/// ```
pub struct TVMRetValue {
/// A primitive return value, if any.
prim_value: u64,
/// An object return value, if any.
box_value: Box<Any>,
/// The DLDataTypeCode which determines whether `prim_value` or `box_value` is in use.
type_code: i64,
}
#[cfg(target_env = "sgx")]
impl TVMRetValue {
pub(crate) fn from_tvm_value(value: TVMValue, type_code: i64) -> Self {
unsafe {
Self {
prim_value: match type_code {
0 | 1 => value.v_int64 as u64,
2 => value.v_float64 as u64,
3 | 7 | 8 | 9 | 10 => value.v_handle as u64,
11 | 12 => value.v_str as u64,
_ => 0,
} as u64,
box_value: box (),
type_code: type_code,
}
}
}
pub fn into_tvm_value(self) -> (TVMValue, i64) {
let val = match self.type_code {
0 | 1 => TVMValue {
v_int64: self.prim_value.clone() as i64,
},
2 => TVMValue {
v_float64: self.prim_value.clone() as f64,
},
3 | 7 | 8 | 9 | 10 => TVMValue {
v_handle: Box::into_raw(self.box_value) as *mut c_void,
},
11 | 12 => TVMValue {
v_str: Box::into_raw(self.box_value) as *const _,
},
_ => unreachable!(),
};
(val, self.type_code)
}
}
impl Default for TVMRetValue {
fn default() -> Self {
TVMRetValue {
prim_value: 0,
box_value: box (),
type_code: 0,
}
}
}
macro_rules! impl_prim_ret_value {
($type:ty, $code:expr) => {
impl From<$type> for TVMRetValue {
fn from(val: $type) -> Self {
TVMRetValue {
prim_value: val as u64,
box_value: box (),
type_code: $code,
}
}
}
impl TryFrom<TVMRetValue> for $type {
type Error = Error;
fn try_from(ret: TVMRetValue) -> Result<$type> {
if ret.type_code == $code {
Ok(ret.prim_value as $type)
} else {
bail!(ErrorKind::TryFromTVMRetValueError(
stringify!($type).to_string(),
ret.type_code
))
}
}
}
};
}
macro_rules! impl_boxed_ret_value {
($type:ty, $code:expr) => {
impl From<$type> for TVMRetValue {
fn from(val: $type) -> Self {
TVMRetValue {
prim_value: 0,
box_value: box val,
type_code: $code,
}
}
}
impl TryFrom<TVMRetValue> for $type {
type Error = Error;
fn try_from(ret: TVMRetValue) -> Result<$type> {
if let Ok(val) = ret.box_value.downcast::<$type>() {
Ok(*val)
} else {
bail!(ErrorKind::TryFromTVMRetValueError(
stringify!($type).to_string(),
ret.type_code
))
}
}
}
};
}
impl_prim_ret_value!(i8, 0);
impl_prim_ret_value!(u8, 1);
impl_prim_ret_value!(i16, 0);
impl_prim_ret_value!(u16, 1);
impl_prim_ret_value!(i32, 0);
impl_prim_ret_value!(u32, 1);
impl_prim_ret_value!(f32, 2);
impl_prim_ret_value!(i64, 0);
impl_prim_ret_value!(u64, 1);
impl_prim_ret_value!(f64, 2);
impl_prim_ret_value!(isize, 0);
impl_prim_ret_value!(usize, 1);
impl_boxed_ret_value!(String, 11);
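A short sketch of the conversions the two macros above generate (values are arbitrary; `TryFrom` is already imported at the top of this file and the function name is hypothetical):

// Sketch: round-tripping values through TVMRetValue using the impls above.
fn ret_value_examples() -> Result<()> {
  let r = TVMRetValue::from(42u32); // prim_value = 42, type_code = 1
  assert_eq!(u64::try_from(r)?, 42); // any unsigned integer type (code 1) works
  let s = TVMRetValue::from("hello".to_string()); // boxed, type_code = 11
  assert_eq!(String::try_from(s)?, "hello");
  Ok(())
}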
// @see `WrapPackedFunc` in `llvm_module.cc`.
pub(super) fn wrap_backend_packed_func(func: BackendPackedCFunc) -> PackedFunc {
box move |args: &[TVMArgValue]| {
func(
args
.iter()
.map(|ref arg| arg.value)
.collect::<Vec<TVMValue>>()
.as_ptr(),
args
.iter()
.map(|ref arg| arg.type_code as i32)
.collect::<Vec<i32>>()
.as_ptr() as *const i32,
args.len() as i32,
);
TVMRetValue::default()
}
}
use std::{
ffi::CString,
os::raw::{c_char, c_int},
};
use errors::Result;
use ffi::runtime::TVMValue;
use runtime::{threading::sgx_join_threads, SystemLibModule, TVMArgValue, TVMRetValue};
pub use runtime::threading::tvm_run_worker as run_worker;
#[macro_export]
macro_rules! tvm_ocall {
($func: expr) => {
match $func {
0 => Ok(()),
err => Err(format!("SGX error: {}", err)),
}
};
}
pub type SgxStatus = u32;
#[cfg(target_env = "sgx")]
extern "C" {
fn tvm_ocall_packed_func(
name: *const c_char,
arg_values: *const TVMValue,
type_codes: *const c_int,
num_args: c_int,
ret_val: *mut TVMValue,
ret_type_code: *mut c_int,
) -> SgxStatus;
}
pub fn ocall_packed_func<S: AsRef<str>>(fn_name: S, args: &[TVMArgValue]) -> Result<TVMRetValue> {
let mut ret_val = TVMValue { v_int64: 0 };
let mut ret_type_code = 0 as c_int;
unsafe {
tvm_ocall!(tvm_ocall_packed_func(
CString::new(fn_name.as_ref()).unwrap().as_ptr(),
args
.iter()
.map(|ref arg| arg.value)
.collect::<Vec<TVMValue>>()
.as_ptr(),
args
.iter()
.map(|ref arg| arg.type_code as i32)
.collect::<Vec<i32>>()
.as_ptr() as *const i32,
args.len() as i32,
&mut ret_val as *mut TVMValue,
&mut ret_type_code as *mut c_int,
))?;
}
Ok(TVMRetValue::from_tvm_value(ret_val, ret_type_code as i64))
}
#[macro_export]
macro_rules! ocall_packed {
($fn_name:expr, $($args:expr),+) => {
::runtime::sgx::ocall_packed_func($fn_name, &[$($args.into(),)+])
.expect(concat!("Error calling `", $fn_name, "`"))
};
($fn_name:expr) => {
::runtime::sgx::ocall_packed_func($fn_name, &Vec::new())
.expect(concat!("Error calling `", $fn_name, "`"))
}
}
pub fn shutdown() {
if env!("TVM_NUM_THREADS") != "0" {
sgx_join_threads()
}
}
impl Drop for SystemLibModule {
fn drop(&mut self) {
shutdown()
}
}
use std::{
os::raw::{c_int, c_void},
sync::{
atomic::{AtomicUsize, Ordering, ATOMIC_USIZE_INIT},
Arc, Barrier,
},
};
#[cfg(not(target_env = "sgx"))]
use num_cpus;
#[cfg(not(target_env = "sgx"))]
use std::{
env,
thread::{self, JoinHandle},
};
#[cfg(target_env = "sgx")]
use std::{collections::VecDeque, ptr, sync::Mutex};
use bounded_spsc_queue::{self, Producer};
use super::super::errors::*;
use ffi::runtime::TVMParallelGroupEnv;
#[cfg(target_env = "sgx")]
use super::{TVMArgValue, TVMRetValue};
type FTVMParallelLambda =
extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32;
/// Holds a parallel job request made by a TVM library function.
struct Job {
cb: FTVMParallelLambda,
cdata: *const c_void,
req_num_tasks: usize,
pending: Arc<AtomicUsize>,
}
impl Job {
/// Splits this job into a number of `Task`s which can be scheduled.
fn tasks(&self, num_workers: usize) -> Vec<Task> {
let num_tasks = if self.req_num_tasks == 0 {
num_workers
} else {
self.req_num_tasks.min(num_workers)
};
self.pending.store(num_tasks, Ordering::SeqCst);
let barrier = Arc::new(Barrier::new(num_tasks));
(0..num_tasks)
.map(move |i| Task {
id: i,
flambda: self.cb,
penv: TVMParallelGroupEnv {
sync_handle: &Arc::clone(&barrier) as *const _ as *mut c_void,
num_task: num_tasks as i32,
},
cdata: self.cdata,
pending: Arc::clone(&self.pending),
}).collect()
}
/// Waits for all tasks in this `Job` to be completed.
fn wait(&self) -> Result<()> {
while self.pending.load(Ordering::Acquire) > 0 {
#[cfg(not(target_env = "sgx"))]
thread::yield_now();
}
Ok(())
}
}
/// A chunk of work requested by a TVM function.
struct Task {
id: usize,
flambda: FTVMParallelLambda,
penv: TVMParallelGroupEnv,
cdata: *const c_void,
pending: Arc<AtomicUsize>,
}
unsafe impl Send for Task {}
unsafe impl Sync for Task {}
impl FnOnce<()> for Task {
type Output = i32;
extern "rust-call" fn call_once(self, _args: ()) -> Self::Output {
let status = (self.flambda)(self.id, &self.penv as *const _, self.cdata);
self.pending.fetch_sub(1, Ordering::AcqRel);
status
}
}
#[derive(Default)]
struct Threads {
#[allow(unused)]
#[cfg(not(target_env = "sgx"))]
handles: Vec<JoinHandle<()>>,
queues: Vec<Producer<Task>>,
}
impl<'a> Threads {
#[cfg(not(target_env = "sgx"))]
fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
num_threads: usize,
cb: F,
) -> Self {
let (handles, queues) = (0..num_threads)
.map(|_| {
let (p, c) = bounded_spsc_queue::make(2);
let handle = thread::spawn(move || cb(c.into()));
(handle, p)
}).unzip();
Threads {
handles: handles,
queues: queues,
}
}
#[cfg(target_env = "sgx")]
fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
num_threads: usize,
_cb: F,
) -> Self {
let mut consumer_queues = SGX_QUEUES.lock().unwrap();
let queues = (0..num_threads)
.map(|_| {
let (p, c) = bounded_spsc_queue::make(2);
consumer_queues.push_back(c.into());
p
}).collect();
ocall_packed!("__sgx_thread_group_launch__", num_threads as u64);
Threads { queues: queues }
}
}
struct ThreadPool {
num_workers: usize,
#[allow(unused)]
threads: Threads,
}
thread_local!(static THREAD_POOL: ThreadPool = ThreadPool::new());
impl ThreadPool {
fn new() -> Self {
let num_workers = max_concurrency();
ThreadPool {
num_workers: num_workers,
threads: Threads::launch(num_workers, ThreadPool::run_worker),
}
}
fn launch(&self, job: Job) {
let mut tasks = job.tasks(self.num_workers + 1);
for (i, task) in tasks.split_off(1).into_iter().enumerate() {
self.threads.queues[i].push(task);
}
tasks.pop().unwrap()();
job.wait().unwrap();
}
fn run_worker(queue: Consumer<Task>) {
loop {
let task = queue.pop();
let result = task();
if result == <i32>::min_value() {
break;
} else if result != 0 {
panic!("Error running task.");
}
}
}
}
// Send + Sync wrapper for bounded_spsc_queue::Consumer
struct Consumer<T> {
consumer: bounded_spsc_queue::Consumer<T>,
}
impl<T> From<bounded_spsc_queue::Consumer<T>> for Consumer<T> {
fn from(c: bounded_spsc_queue::Consumer<T>) -> Self {
Consumer { consumer: c }
}
}
impl<T> Consumer<T> {
fn pop(&self) -> T {
self.consumer.pop()
}
}
unsafe impl<T> Send for Consumer<T> {}
unsafe impl<T> Sync for Consumer<T> {}
#[cfg(target_env = "sgx")]
lazy_static! {
/// Holds tasks for untrusted threads which re-enter the enclave to execute.
static ref SGX_QUEUES: Mutex<VecDeque<Consumer<Task>>> = Mutex::new(VecDeque::new());
}
#[cfg(all(not(target_arch = "wasm32"), not(target_env = "sgx")))]
fn max_concurrency() -> usize {
if let Ok(threads_str) = env::var("TVM_NUM_THREADS").or(env::var("OMP_NUM_THREADS")) {
if let Ok(threads) = usize::from_str_radix(&threads_str, 10) {
return threads;
}
}
num_cpus::get_physical()
}
#[cfg(target_env = "sgx")]
fn max_concurrency() -> usize {
usize::from_str_radix(env!("TVM_NUM_THREADS"), 10).unwrap_or(1)
}
#[cfg(target_arch = "wasm32")]
fn max_concurrency() -> usize {
0 // wasm doesn't support threads yet
}
#[cfg(target_env = "sgx")]
pub fn tvm_run_worker(_args: &[TVMArgValue]) -> TVMRetValue {
let q = {
let mut qs = SGX_QUEUES.lock().unwrap();
qs.pop_front()
// `qs: MutexGuard` needs to be dropped here since `run_worker` won't return
};
if let Some(q) = q {
ThreadPool::run_worker(q);
}
TVMRetValue::default()
}
#[no_mangle]
pub extern "C" fn TVMBackendParallelLaunch(
cb: FTVMParallelLambda,
cdata: *const c_void,
num_task: usize,
) -> c_int {
if max_concurrency() == 0 {
let penv = TVMParallelGroupEnv {
sync_handle: 0 as *mut c_void,
num_task: 1,
};
cb(0, &penv as *const _, cdata);
} else {
THREAD_POOL.with(|pool| {
pool.launch(Job {
cb: cb,
cdata: cdata,
req_num_tasks: num_task,
pending: Arc::new(ATOMIC_USIZE_INIT),
});
});
}
return 0;
}
#[cfg(target_env = "sgx")]
pub(crate) fn sgx_join_threads() {
extern "C" fn poison_pill(
_task_id: usize,
_penv: *const TVMParallelGroupEnv,
_cdata: *const c_void,
) -> i32 {
<i32>::min_value()
}
THREAD_POOL.with(|pool| {
pool.launch(Job {
cb: poison_pill,
cdata: ptr::null(),
req_num_tasks: 0,
pending: Arc::new(ATOMIC_USIZE_INIT),
});
});
ocall_packed!("__sgx_thread_group_join__", 0);
}
// @see https://github.com/dmlc/tvm/issues/988 for information on why this function is used.
#[no_mangle]
pub extern "C" fn TVMBackendParallelBarrier(_task_id: usize, penv: *const TVMParallelGroupEnv) {
let barrier: &Arc<Barrier> = unsafe { &*((*penv).sync_handle as *const Arc<Barrier>) };
barrier.wait();
}
#[cfg(test)]
mod tests {
use std::{ptr, thread, time::Duration};
use super::*;
#[test]
fn test_max_concurrency() {
env::set_var("TVM_NUM_THREADS", "42");
env::set_var("OMP_NUM_THREADS", "24");
assert_eq!(max_concurrency(), 42);
env::remove_var("TVM_NUM_THREADS");
assert_eq!(max_concurrency(), 24);
}
extern "C" fn flambda(
task_id: usize,
penv: *const TVMParallelGroupEnv,
cdata: *const c_void,
) -> i32 {
if cdata == ptr::null() {
return 0;
}
unsafe {
let &(ref counter, ref task_ids_sum) = &*(cdata as *const (AtomicUsize, AtomicUsize));
thread::sleep(Duration::from_millis(50 * task_id as u64));
counter.fetch_add(1, Ordering::SeqCst);
task_ids_sum.fetch_add(task_id, Ordering::SeqCst);
assert_eq!((*penv).num_task, 3);
}
0
}
#[test]
fn test_parallel_launch() {
TVMBackendParallelLaunch(flambda, ptr::null(), 6);
let counter = ATOMIC_USIZE_INIT;
let task_ids_sum = ATOMIC_USIZE_INIT;
let cdata = (counter, task_ids_sum);
let num_tasks = 3;
TVMBackendParallelLaunch(flambda, &cdata as *const _ as *const c_void, num_tasks);
assert_eq!(cdata.0.load(Ordering::SeqCst), num_tasks);
assert_eq!(
cdata.1.load(Ordering::SeqCst),
(0..num_tasks).sum::<usize>()
);
}
}
use std::{
cell::RefCell,
os::raw::{c_int, c_void},
ptr,
};
use super::allocator::Allocation;
use errors::*;
const WS_ALIGN: usize = 64; // taken from `kTempAllocaAlignment` in `device_api.h`
struct WorkspacePool {
workspaces: Vec<Allocation>,
free: Vec<usize>,
in_use: Vec<usize>,
}
impl WorkspacePool {
fn new() -> Self {
WorkspacePool {
workspaces: Vec::new(),
free: Vec::new(),
in_use: Vec::new(),
}
}
fn alloc_new(&mut self, size: usize) -> Result<*mut u8> {
self.workspaces.push(Allocation::new(size, Some(WS_ALIGN))?);
self.in_use.push(self.workspaces.len() - 1);
Ok(self.workspaces[self.workspaces.len() - 1].as_mut_ptr())
}
fn alloc(&mut self, size: usize) -> Result<*mut u8> {
if self.free.len() == 0 {
return self.alloc_new(size);
}
let idx = self
.free
.iter()
.fold(None, |cur_ws_idx: Option<usize>, &idx| {
let ws_size = self.workspaces[idx].size();
if ws_size < size {
return cur_ws_idx;
}
cur_ws_idx.or(Some(idx)).and_then(|cur_idx| {
let cur_size = self.workspaces[cur_idx].size();
Some(match ws_size <= cur_size {
true => idx,
false => cur_idx,
})
})
});
match idx {
Some(idx) => {
self.free.remove_item(&idx).unwrap();
self.in_use.push(idx);
Ok(self.workspaces[idx].as_mut_ptr())
}
None => self.alloc_new(size),
}
}
fn free(&mut self, ptr: *mut u8) -> Result<()> {
let mut ws_idx = None;
for i in 0..self.in_use.len() {
let idx = self.in_use[i];
if self.workspaces[idx].as_mut_ptr() == ptr {
self.in_use.remove(i);
ws_idx = Some(idx);
break;
}
}
Ok(
self
.free
.push(ws_idx.ok_or("Tried to free nonexistent workspace.")?),
)
}
}
thread_local!(static WORKSPACE_POOL: RefCell<WorkspacePool> = RefCell::new(WorkspacePool::new()));
const WORKSPACE_PAGE_SIZE: usize = 4 << 10;
#[no_mangle]
pub extern "C" fn TVMBackendAllocWorkspace(
_device_type: c_int,
_device_id: c_int,
size: u64,
_dtype_code_hint: c_int,
_dtype_bits_hint: c_int,
) -> *mut c_void {
let nbytes = if size == 0 {
WORKSPACE_PAGE_SIZE
} else {
size as usize
};
WORKSPACE_POOL.with(|pool_cell| {
pool_cell
.borrow_mut()
.alloc(nbytes as usize)
.unwrap_or(ptr::null_mut()) as *mut c_void
})
}
#[no_mangle]
pub extern "C" fn TVMBackendFreeWorkspace(
_device_type: c_int,
_device_id: c_int,
ptr: *mut c_void,
) -> c_int {
WORKSPACE_POOL.with(|pool_cell| {
(match pool_cell.borrow_mut().free(ptr as *mut u8) {
Ok(()) => 0,
Err(_) => -1,
}) as c_int
})
}
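These two entry points are what TVM-generated kernels call for temporary scratch memory; a sketch of the expected calling pattern (the function name is hypothetical, and device type/id and dtype hints are ignored by this pool implementation):

// Sketch: how generated code drives the workspace pool defined above.
fn workspace_example() {
  let ws: *mut c_void = TVMBackendAllocWorkspace(1 /* kDLCPU */, 0, 256, 0, 32);
  assert!(!ws.is_null());
  // ... the kernel writes into `ws` here ...
  assert_eq!(TVMBackendFreeWorkspace(1, 0, ws), 0);
}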
*.json
*.params
*.o
"""Builds a simple NNVM graph for testing."""
from os import path as osp
import nnvm
from nnvm import sym
from nnvm.compiler import graph_util
from nnvm.testing import init
import numpy as np
import tvm
CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
def _get_model(dshape):
data = sym.Variable('data', shape=dshape)
fc1 = sym.dense(data, units=dshape[-1]*2, use_bias=True)
left, right = sym.split(fc1, indices_or_sections=2, axis=1)
return sym.Group(((left + 1), (right - 1)))
def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
if isinstance(graph, sym.Symbol):
graph = nnvm.graph.create(graph)
ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
param_shapes = dict(zip(graph.index.input_names, ishapes))
np.random.seed(seed)
params = {}
for param, shape in param_shapes.items():
if param in {'data', 'label'} or not shape:
continue
init_value = np.empty(shape).astype('float32')
initializer(param, init_value)
params[param] = tvm.nd.array(init_value)
return params
def main():
dshape = (32, 16)
net = _get_model(dshape)
ishape_dict = {'data': dshape}
params = _init_params(net, ishape_dict)
graph, lib, params = nnvm.compiler.build(net, 'llvm',
shape=ishape_dict,
params=params,
dtype='float32')
with open(osp.join(CWD, 'graph.json'), 'w') as f_resnet:
f_resnet.write(graph.json())
with open(osp.join(CWD, 'graph.params'), 'wb') as f_params:
f_params.write(nnvm.compiler.save_param_dict(params))
if __name__ == '__main__':
main()
#![feature(try_from)]
extern crate serde;
extern crate serde_json;
extern crate tvm;
use std::{convert::TryFrom, fs, io::Read};
use tvm::runtime::Graph;
#[test]
fn test_load_graph() {
let mut params_bytes = Vec::new();
fs::File::open(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.params"))
.expect("Could not find TVM graph. Did you run `tests/build_model.py`?")
.read_to_end(&mut params_bytes)
.unwrap();
let _params = tvm::runtime::load_param_dict(&params_bytes);
let graph = Graph::try_from(
&fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.json")).unwrap(),
).unwrap();
assert_eq!(graph.nodes[3].op, "tvm_op");
assert_eq!(
graph.nodes[3]
.attrs
.as_ref()
.unwrap()
.get("func_name")
.unwrap(),
"fuse_dense"
);
assert_eq!(graph.nodes[5].inputs[0].index, 0);
assert_eq!(graph.nodes[6].inputs[0].index, 1);
assert_eq!(graph.heads.len(), 2);
}
[package]
name = "test-nnvm"
version = "0.0.0"
license = "Apache-2.0"
authors = ["Nick Hynes <nhynes@berkeley.edu>"]
[dependencies]
ndarray = "0.11.2"
tvm = { path = "../../" }
serde = "1.0.59"
serde_json = "1.0.17"
[build-dependencies]
ar = "0.6.0"
extern crate ar;
use std::{env, path::PathBuf, process::Command};
use ar::Builder;
use std::fs::File;
fn main() {
let out_dir = env::var("OUT_DIR").unwrap();
let output = Command::new(concat!(
env!("CARGO_MANIFEST_DIR"),
"/src/build_test_graph.py"
)).arg(&out_dir)
.output()
.expect("Failed to execute command");
if output.stderr.len() > 0 {
panic!(String::from_utf8(output.stderr).unwrap());
}
let in_path: PathBuf = [&out_dir, "graph.o"].iter().collect();
let out_path: PathBuf = [&out_dir, "libgraph.a"].iter().collect();
let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
builder.append_path(in_path.to_str().unwrap()).unwrap();
println!("cargo:rustc-link-lib=static=graph");
println!("cargo:rustc-link-search=native={}", out_dir);
}
#!/usr/bin/env python3
"""Builds a simple NNVM graph for testing."""
from os import path as osp
import sys
import nnvm
from nnvm import sym
from nnvm.compiler import graph_util
from nnvm.testing import init
import numpy as np
import tvm
def _get_model(dshape):
data = sym.Variable('data', shape=dshape)
fc = sym.dense(data, units=dshape[-1]*2, use_bias=True)
left, right = sym.split(fc, indices_or_sections=2, axis=1)
return sym.Group(((left + 1), (right - 1), fc))
def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
if isinstance(graph, sym.Symbol):
graph = nnvm.graph.create(graph)
ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
param_shapes = dict(zip(graph.index.input_names, ishapes))
np.random.seed(seed)
params = {}
for param, shape in param_shapes.items():
if param in {'data', 'label'} or not shape:
continue
init_value = np.arange(np.product(shape), 0, -1).reshape(*shape).astype('float32')
if param.endswith('_bias'):
params[param] = tvm.nd.array(init_value)
continue
init_value = np.empty(shape).astype('float32')
initializer(param, init_value)
# init_value /= init_value.sum() + 1e-10
params[param] = tvm.nd.array(init_value)
return params
def main():
dshape = (4, 8)
net = _get_model(dshape)
ishape_dict = {'data': dshape}
params = _init_params(net, ishape_dict)
graph, lib, params = nnvm.compiler.build(net, 'llvm --system-lib',
shape=ishape_dict,
params=params,
dtype='float32')
out_dir = sys.argv[1]
lib.save(osp.join(sys.argv[1], 'graph.o'))
with open(osp.join(out_dir, 'graph.json'), 'w') as f_resnet:
f_resnet.write(graph.json())
with open(osp.join(out_dir, 'graph.params'), 'wb') as f_params:
f_params.write(nnvm.compiler.save_param_dict(params))
if __name__ == '__main__':
main()
#![feature(try_from)]
#[macro_use]
extern crate ndarray;
extern crate serde;
extern crate serde_json;
extern crate tvm;
use std::{collections::HashMap, convert::TryFrom, fs, io::Read};
use ndarray::Array;
use tvm::runtime::{Graph, GraphExecutor, SystemLibModule, Tensor};
const BATCH_SIZE: usize = 4;
const IN_DIM: usize = 8;
macro_rules! check_sum {
($e:expr, $a:ident, $b:ident) => {
let a = Array::try_from($e.get_input(stringify!($a)).unwrap()).unwrap();
check_sum!(a, $b);
};
($e:expr, $a:expr, $b:ident) => {
let a = Array::try_from($e.get_output($a).unwrap()).unwrap();
check_sum!(a, $b);
};
($a:ident, $b:ident) => {
let a_sum: f32 = $a.scalar_sum();
let b_sum: f32 = $b.scalar_sum();
assert!((a_sum - b_sum).abs() < 1e-2, "{} != {}", a_sum, b_sum);
};
}
fn main() {
let syslib = SystemLibModule::default();
let mut params_bytes = Vec::new();
fs::File::open(concat!(env!("OUT_DIR"), "/graph.params"))
.unwrap()
.read_to_end(&mut params_bytes)
.unwrap();
let params = tvm::runtime::load_param_dict(&params_bytes)
.unwrap()
.into_iter()
.map(|(k, v)| (k, v.to_owned()))
.collect::<HashMap<String, Tensor<'static>>>();
let graph =
Graph::try_from(&fs::read_to_string(concat!(env!("OUT_DIR"), "/graph.json")).unwrap()).unwrap();
let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
let x = Array::from_shape_vec(
(BATCH_SIZE, IN_DIM),
(0..BATCH_SIZE * IN_DIM)
.map(|x| x as f32)
.collect::<Vec<f32>>(),
).unwrap();
let w = Array::try_from(params.get("dense0_weight").unwrap())
.unwrap()
.into_shape((IN_DIM * 2, IN_DIM))
.unwrap();
let b = Array::try_from(params.get("dense0_bias").unwrap()).unwrap();
let dense = x.dot(&w.t()) + &b;
let left = dense.slice(s![.., 0..IN_DIM]);
let right = dense.slice(s![.., IN_DIM..]);
let expected_o0 = &left + 1f32;
let expected_o1 = &right - 1f32;
exec.load_params(params);
exec.set_input("data", x.clone().into());
check_sum!(exec, data, x);
check_sum!(exec, dense0_weight, w);
check_sum!(exec, dense0_bias, b);
exec.run();
check_sum!(exec, 0, expected_o0);
check_sum!(exec, 1, expected_o1);
check_sum!(exec, 2, dense);
}
[package]
name = "test-tvm-basic"
version = "0.0.0"
license = "Apache-2.0"
authors = ["Nick Hynes <nhynes@berkeley.edu>"]
[dependencies]
ndarray = "0.11.2"
tvm = { path = "../../" }
[build-dependencies]
ar = "0.6.0"
extern crate ar;
use std::{env, path::PathBuf, process::Command};
use ar::Builder;
use std::fs::File;
fn main() {
let out_dir = env::var("OUT_DIR").unwrap();
let output = Command::new(concat!(
env!("CARGO_MANIFEST_DIR"),
"/src/build_test_lib.py"
)).arg(&out_dir)
.output()
.expect("Failed to execute command");
if output.stderr.len() > 0 {
panic!(String::from_utf8(output.stderr).unwrap());
}
let in_path: PathBuf = [&out_dir, "test.o"].iter().collect();
let out_path: PathBuf = [&out_dir, "libtest.a"].iter().collect();
let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
builder.append_path(in_path.to_str().unwrap()).unwrap();
println!("cargo:rustc-link-lib=static=test");
println!("cargo:rustc-link-search=native={}", out_dir);
}
#!/usr/bin/env python3
"""Prepares a simple TVM library for testing."""
from os import path as osp
import sys
import tvm
def main():
n = tvm.var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
s = tvm.create_schedule(C.op)
s[C].parallel(s[C].op.axis[0])
print(tvm.lower(s, [A, B, C], simple_mode=True))
tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
if __name__ == '__main__':
main()
extern crate ndarray;
#[macro_use]
extern crate tvm;
use ndarray::Array;
use tvm::{
ffi::runtime::DLTensor,
runtime::{Module, SystemLibModule},
};
fn main() {
let syslib = SystemLibModule::default();
let add = syslib
.get_function("default_function")
.expect("main function not found");
let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
let mut c = Array::from_vec(vec![0f32; 4]);
let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
let mut a_dl: DLTensor = (&mut a).into();
let mut b_dl: DLTensor = (&mut b).into();
let mut c_dl: DLTensor = (&mut c).into();
call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl);
assert!(c.all_close(&e, 1e-8f32));
}