[RUNTIME] Add fp16/fp32 conversion functions (#1766)

bde53033 · Lianmin Zheng · Tianqi Chen · e8d6d9aa · bde53033 · cf6090ae
Commit bde53033 authored Sep 24, 2018 by Lianmin Zheng Committed by Tianqi Chen Sep 24, 2018
24 changed files
--- a/.gitmodules
+++ b/.gitmodules
 [submodule "dmlc-core"]
-	path = dmlc-core
+	path = 3rdparty/dmlc-core
 	url = https://github.com/dmlc/dmlc-core
 [submodule "HalideIR"]
-	path = HalideIR
+	path = 3rdparty/HalideIR
 	url = https://github.com/dmlc/HalideIR
 [submodule "dlpack"]
-	path = dlpack
+	path = 3rdparty/dlpack
 	url = https://github.com/dmlc/dlpack
--- a/HalideIR @ cf6090ae
+++ b/HalideIR @ cf6090ae
+Subproject commit cf6090aeaeb782d1daff54b0ca5c2c281d7008db
--- a/3rdparty/compiler-rt/builtin_fp16.h
+++ b/3rdparty/compiler-rt/builtin_fp16.h
+/*
+ * Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.h
+ * \brief Functions for conversion between fp32 and fp16, adapted from compiler-rt.
+ */
+
+#include <cstdint>
+/*!
+ * \brief Count the leading zero bits of a 32-bit unsigned value.
+ *
+ * Narrows the position of the highest set bit by halving the search width
+ * (16, 8, 4, 2, 1 bits) instead of scanning one bit at a time.
+ *
+ * \param x The value to inspect.
+ * \return Number of zero bits above the highest set bit; 32 when x is 0.
+ *
+ * NOTE(review): names containing a double underscore are reserved for the
+ * implementation in C++, so this helper could clash with a compiler or
+ * platform builtin of the same name -- confirm it is safe on all targets.
+ */
+static inline uint32_t __clz(uint32_t x) {
+  // count leading zeros
+  int n = 32;  // upper bound on the answer; reduced as set bits are found
+  uint32_t y;  // scratch: the upper half of the bits still under inspection
+
+  y = x >>16; if (y) { n = n -16; x = y; }
+  y = x >> 8; if (y) { n = n - 8; x = y; }
+  y = x >> 4; if (y) { n = n - 4; x = y; }
+  y = x >> 2; if (y) { n = n - 2; x = y; }
+  y = x >> 1; if (y) return n - 2;  // bit 1 of the remaining two bits is set
+  return n - x;  // x is 0 or 1 here, so this yields n or n - 1
+}
+/*!
+ * \brief Narrow a floating point value to a smaller binary floating point
+ *        format by operating directly on the bit patterns.
+ *
+ * Rounding is to nearest, ties to even.  Values too large for the
+ * destination become infinity, values too small become a denormal or zero,
+ * and NaN inputs produce a quiet NaN carrying the truncated payload.
+ *
+ * \tparam SRC_T Type of the argument; its size gives the source width.
+ * \tparam SRC_REP_T Integer type used to view the source bits.
+ * \tparam SRC_SIG_BITS Number of stored significand bits in the source.
+ * \tparam DST_T Type of the return value; its size gives the destination width.
+ * \tparam DST_REP_T Integer type used to build the destination bits.
+ * \tparam DST_SIG_BITS Number of stored significand bits in the destination.
+ * \param a The value to convert.
+ * \return The converted value, carrying the sign of a.
+ *
+ * NOTE(review): the range check below depends on SRC_REP_T arithmetic
+ * wrapping as an unsigned type at least as wide as int -- confirm for any
+ * instantiation other than a 32-bit unsigned source representation.
+ */
+template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+          typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __truncXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;  // one bit is the sign
+  const int srcInfExp = (1 << srcExpBits) - 1;  // all-ones exponent field
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcSignificandMask = srcMinNormal - 1;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T roundMask = (SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS)) - 1;  // bits dropped by the narrowing
+  const SRC_REP_T halfway = SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS - 1);  // exactly half of one destination ulp
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T) * 8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const int underflowExponent = srcExpBias + 1 - dstExpBias;  // source exponent of the smallest destination normal
+  const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;  // source exponent that maps to destination infinity
+  const SRC_REP_T underflow = (SRC_REP_T)underflowExponent << SRC_SIG_BITS;
+  const SRC_REP_T overflow = (SRC_REP_T)overflowExponent << SRC_SIG_BITS;
+
+  const DST_REP_T dstQNaN = DST_REP_T(1) << (DST_SIG_BITS - 1);
+  const DST_REP_T dstNaNCode = dstQNaN - 1;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};  // NOTE(review): designated initializers are a compiler extension before C++20
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  if (aAbs - underflow < aAbs - overflow) {  // with unsigned wraparound: underflow <= aAbs < overflow
+    // The exponent of a is within the range of normal numbers in the
+    // destination format.  We can convert by simply right-shifting with
+    // rounding and adjusting the exponent.
+    absResult = aAbs >> (SRC_SIG_BITS - DST_SIG_BITS);  // exponent is still source-biased here
+    absResult -= (DST_REP_T)(srcExpBias - dstExpBias) << DST_SIG_BITS;  // rebias the exponent for the destination
+
+    const SRC_REP_T roundBits = aAbs & roundMask;
+    // Round to nearest
+    if (roundBits > halfway)
+      absResult++;  // a carry out of the significand bumps the exponent field
+      // Ties to even
+    else if (roundBits == halfway)
+      absResult += absResult & 1;  // adds 1 only when the kept low bit is odd
+  }
+  else if (aAbs > srcInfinity) {
+    // a is NaN.
+    // Conjure the result by beginning with infinity, setting the qNaN
+    // bit and inserting the (truncated) trailing NaN field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= dstQNaN;
+    absResult |= ((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode;
+  }
+  else if (aAbs >= overflow) {
+    // a overflows to infinity.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+  }
+  else {
+    // a underflows on conversion to the destination type or is an exact
+    // zero.  The result may be a denormal or zero.  Extract the exponent
+    // to get the shift amount for the denormalization.
+    const int aExp = aAbs >> SRC_SIG_BITS;
+    const int shift = srcExpBias - dstExpBias - aExp + 1;  // extra right shift needed to express a at the destination's minimum exponent
+
+    const SRC_REP_T significand = (aRep & srcSignificandMask) | srcMinNormal;  // significand with the implicit leading 1 made explicit
+
+    // Right shift by the denormalization amount with sticky.
+    if (shift > SRC_SIG_BITS) {
+      absResult = 0;
+    } else {
+      const bool sticky = significand << (srcBits - shift);  // true when any of the low `shift` bits about to be discarded is set
+      SRC_REP_T denormalizedSignificand = significand >> shift | sticky;
+      absResult = denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS);
+      const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
+      // Round to nearest
+      if (roundBits > halfway)
+        absResult++;
+        // Ties to even
+      else if (roundBits == halfway)
+        absResult += absResult & 1;
+    }
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | sign >> (srcBits - dstBits);  // shift moves the sign down to the destination's top bit
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};  // reinterpret the assembled bits as DST_T
+  return dst_rep.f;
+}
+/*!
+ * \brief Widen a floating point value to a larger binary floating point
+ *        format by operating directly on the bit patterns.
+ *
+ * Handles four cases in turn: normal numbers, infinity/NaN, denormals
+ * (which are renormalized) and zero.  The sign bit is moved to the top of
+ * the destination.
+ *
+ * \tparam SRC_T Type of the argument; its size gives the source width.
+ * \tparam SRC_REP_T Integer type used to view the source bits.
+ * \tparam SRC_SIG_BITS Number of stored significand bits in the source.
+ * \tparam DST_T Type of the return value; its size gives the destination width.
+ * \tparam DST_REP_T Integer type used to build the destination bits.
+ * \tparam DST_SIG_BITS Number of stored significand bits in the destination.
+ * \param a The value to convert.
+ * \return The converted value, carrying the sign of a.
+ */
+template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+         typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __extendXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;  // one bit is the sign
+  const int srcInfExp = (1 << srcExpBits) - 1;  // all-ones exponent field
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T)*8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};  // NOTE(review): designated initializers are a compiler extension before C++20
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  // If sizeof(SRC_REP_T) < sizeof(int), the subtraction result is promoted
+  // to (signed) int.  To avoid that, explicitly cast to SRC_REP_T.
+  if ((SRC_REP_T)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {  // i.e. srcMinNormal <= aAbs < srcInfinity
+    // a is a normal number.
+    // Extend to the destination type by shifting the significand and
+    // exponent into the proper position and rebiasing the exponent.
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult += (DST_REP_T)(dstExpBias - srcExpBias) << DST_SIG_BITS;
+  }
+
+  else if (aAbs >= srcInfinity) {
+    // a is NaN or infinity.
+    // Conjure the result by beginning with infinity, then setting the qNaN
+    // bit (if needed) and right-aligning the rest of the trailing NaN
+    // payload field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
+  }
+  else if (aAbs) {
+    // a is denormal.
+    // renormalize the significand and clear the leading bit, then insert
+    // the correct adjusted exponent in the destination type.
+    const int scale = __clz(aAbs) - __clz(srcMinNormal);  // __clz takes uint32_t; both operands widen alike, so the difference holds for SRC_REP_T up to 32 bits
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS + scale);
+    absResult ^= dstMinNormal;  // clears the leading 1 that the shift placed at bit DST_SIG_BITS
+    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
+    absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
+  }
+  else {
+    // a is zero.
+    absResult = 0;
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);  // shift moves the sign up to the destination's top bit
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};  // reinterpret the assembled bits as DST_T
+  return dst_rep.f;
+}
--- a/dlpack @ bee4d1dd
+++ b/dlpack @ bee4d1dd
+Subproject commit bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3
--- a/dmlc-core @ 4f0564ec
+++ b/dmlc-core @ 4f0564ec
+Subproject commit 4f0564ec769477c66d480dd966088f172050c874
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,8 +50,9 @@ tvm_option(USE_RANDOM "Build with random support" OFF)

 # include directories
 include_directories("include")
-include_directories("dlpack/include")
-include_directories("dmlc-core/include")
+include_directories("3rdparty/dlpack/include")
+include_directories("3rdparty/dmlc-core/include")
+include_directories("3rdparty/compiler-rt")

 # initial variables
 set(TVM_LINKER_LIBS "")
@@ -87,8 +88,8 @@ else(MSVC)
 endif(MSVC)

 # add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
-FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "3rdparty/HalideIR/src/*.h"
                                "nnvm/src/*.h" "nnvm/include/*.h")
 assign_source_group("Source" ${GROUP_SOURCE})
 assign_source_group("Include" ${GROUP_INCLUDE})
@@ -127,7 +128,7 @@ file(GLOB_RECURSE NNVM_COMPILER_SRCS
 file(GLOB TOPI_SRCS
    topi/src/*.cc
 )
-file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
+file(GLOB_RECURSE HALIDEIR_SRCS 3rdparty/HalideIR/src/*.cpp)
 list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
 file(GLOB RUNTIME_SRCS src/runtime/*.cc)

@@ -194,7 +195,7 @@ target_link_libraries(nnvm_compiler tvm)
 # Related headers
 target_include_directories(
  tvm
-  PUBLIC "HalideIR/src"
+  PUBLIC "3rdparty/HalideIR/src"
  PUBLIC "topi/include")
 target_include_directories(
  tvm_topi
@@ -244,12 +245,12 @@ if (INSTALL_DEV)
    PATTERN "*.h"
  )
  install(
-    DIRECTORY "HalideIR/src/." DESTINATION "include/HalideIR"
+    DIRECTORY "3rdparty/HalideIR/src/." DESTINATION "include/HalideIR"
    FILES_MATCHING
    PATTERN "*.h"
  )
  install(
-    DIRECTORY "dlpack/include/." DESTINATION "include"
+    DIRECTORY "3rdparty/dlpack/include/." DESTINATION "include"
    FILES_MATCHING
    PATTERN "*.h"
    )

--- a/HalideIR @ cf6090ae
+++ b/HalideIR @ cf6090ae
-Subproject commit cf6090aeaeb782d1daff54b0ca5c2c281d7008db
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,11 @@ ROOTDIR = $(CURDIR)
 	 cython cython2 cython3 web runtime vta

 ifndef DMLC_CORE_PATH
-  DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
+  DMLC_CORE_PATH = $(ROOTDIR)/3rdparty/dmlc-core
 endif

 ifndef DLPACK_PATH
-  DLPACK_PATH = $(ROOTDIR)/dlpack
+  DLPACK_PATH = $(ROOTDIR)/3rdparty/dlpack
 endif

 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
@@ -50,10 +50,10 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc

 # Lint scripts
 cpplint:
-	python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
-	python3 dmlc-core/scripts/lint.py topi cpp topi/include;
-	python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
-	python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
+	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
+	python3 3rdparty/dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src verilog\
 	 examples/extension/src examples/graph_executor/src

 pylint:
@@ -63,7 +63,7 @@ pylint:
 	python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc

 jnilint:
-	python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src

 lint: cpplint pylint jnilint


--- a/apps/android_deploy/app/src/main/jni/Android.mk
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog

 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                    $(ROOT_PATH)/topi/include

 LOCAL_MODULE = tvm4j_runtime_packed

--- a/apps/android_rpc/app/src/main/jni/Android.mk
+++ b/apps/android_rpc/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog

 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                    $(ROOT_PATH)/topi/include

 LOCAL_MODULE = tvm4j_runtime_packed

--- a/apps/extension/Makefile
+++ b/apps/extension/Makefile
@@ -2,9 +2,9 @@
 TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
-	-I${TVM_ROOT}/dmlc-core/include\
-	-I${TVM_ROOT}/dlpack/include\
-	-I${TVM_ROOT}/HalideIR/src
+	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/HalideIR/src

 PKG_LDFLAGS =-L${TVM_ROOT}/lib
 UNAME_S := $(shell uname -s)

--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
 # Makefile Example to deploy TVM modules.
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core

 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\

 PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread


--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -8,8 +8,8 @@
 *  - Compile with -std=c++11
 *  - Add the following include path
 *     - /path/to/tvm/include/
- *     - /path/to/tvm/dmlc-core/include/
- *     - /path/to/tvm/dlpack/include/
+ *     - /path/to/tvm/3rdparty/dmlc-core/include/
+ *     - /path/to/tvm/3rdparty/dlpack/include/
 *   - Add -lpthread -ldl to the linked library.
 *   - You are good to go.
 *   - See the Makefile in the same folder for example.

--- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
+++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
@@ -386,8 +386,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -406,8 +406,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -422,9 +422,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
@@ -440,9 +440,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";

--- a/apps/rocm_rpc/Makefile
+++ b/apps/rocm_rpc/Makefile
@@ -3,12 +3,12 @@ ROCM_PATH=/opt/rocm

 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core

 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 	-I${ROCM_PATH}/include

 PKG_LDFLAGS = -L${ROCM_PATH}/lib -L${TVM_ROOT}/lib -ldl -lpthread -lhip_hcc -lMIOpen

--- a/apps/sgx/Makefile
+++ b/apps/sgx/Makefile
@@ -23,7 +23,7 @@ uservice_library_name := sgx_uae_service$(sgx_sim)
 pkg_cflags := -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 	-I.\
 	-DDMLC_LOG_STACK_TRACE=0\
 	-fmax-errors=4

--- a/dlpack @ bee4d1dd
+++ b/dlpack @ bee4d1dd
-Subproject commit bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3
--- a/dmlc-core @ 4f0564ec
+++ b/dmlc-core @ 4f0564ec
-Subproject commit 4f0564ec769477c66d480dd966088f172050c874
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -13,7 +13,7 @@ TVMPATH = ..

 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
-CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/dlpack/include -I$(TVMPATH)/HalideIR/src -I$(TVMPATH)/topi/include
+CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/3rdparty/dlpack/include -I$(TVMPATH)/3rdparty/HalideIR/src -I$(TVMPATH)/topi/include

 ifdef DMLC_CORE_PATH
  CFLAGS += -I$(DMLC_CORE_PATH)/include

--- a/nnvm/amalgamation/Makefile
+++ b/nnvm/amalgamation/Makefile
@@ -4,7 +4,7 @@ export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
 ifdef DMLC_CORE_PATH
  CFLAGS += -I$(DMLC_CORE_PATH)/include
 else
-  CFLAGS += -I$(CURDIR)/../dmlc-core/include
+  CFLAGS += -I$(CURDIR)/../3rdparty/dmlc-core/include
 endif

 .PHONY: all clean

--- a/python/setup.py
+++ b/python/setup.py
@@ -74,8 +74,8 @@ def config_cython():
                "tvm._ffi.%s.%s" % (subdir, fn[:-4]),
                ["tvm/_ffi/_cython/%s" % fn],
                include_dirs=["../include/",
-                              "../dmlc-core/include",
-                              "../dlpack/include",
+                              "../3rdparty/dmlc-core/include",
+                              "../3rdparty/dlpack/include",
                ],
                library_dirs=library_dirs,
                libraries=libraries,

--- a/src/runtime/builtin_fp16.cc
+++ b/src/runtime/builtin_fp16.cc
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.cc
+ * \brief Functions for conversion between fp32 and fp16
+*/
+
+#include <builtin_fp16.h>
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief Convert a single precision float to half precision, returned as
+ *        its raw 16-bit pattern.
+ *
+ * Instantiates __truncXfYf2__ for a float source (32-bit representation,
+ * 23 significand bits) and a 16-bit destination with 10 significand bits.
+ * Declared extern "C", so the symbol is exported under this exact name
+ * without C++ name mangling.
+ *
+ * NOTE(review): presumably the helper that compiler-generated code calls
+ * for float-to-half conversion -- confirm against the code generator.
+ *
+ * \param a The single precision value to convert.
+ * \return The half precision bit pattern.
+ */
+extern "C"  uint16_t __gnu_f2h_ieee(float a) {
+  return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
+}
+/*!
+ * \brief Convert a half precision value, given as its raw 16-bit pattern,
+ *        to a single precision float.
+ *
+ * Instantiates __extendXfYf2__ for a 16-bit source with 10 significand
+ * bits and a float destination (32-bit representation, 23 significand
+ * bits).  Declared extern "C", so the symbol is exported under this exact
+ * name without C++ name mangling.
+ *
+ * \param a The half precision bit pattern to convert.
+ * \return The single precision value.
+ */
+extern "C" float __gnu_h2f_ieee(uint16_t a) {
+  return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
+}
+
+}  // namespace runtime
+}  // namespace tvm
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -35,5 +35,26 @@ def test_nd_create():
        ctx.sync()


+def test_fp16_conversion():  # compares float32<->float16 casts in an 'llvm'-built function against numpy's casts
+    n = 100  # number of elements in each test vector
+
+    for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]:  # exercise both conversion directions
+        A = tvm.placeholder((n,), dtype=src)
+        B = tvm.compute((n,), lambda i: A[i].astype(dst))  # element-wise cast from src to dst
+
+        s = tvm.create_schedule([B.op])
+        func = tvm.build(s, [A, B], 'llvm')
+
+        x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50)  # random input; unseeded, so values differ per run
+        y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50)  # output buffer; presumably overwritten by func -- TODO confirm
+
+        func(x_tvm, y_tvm)
+
+        expected = x_tvm.asnumpy().astype(dst)  # numpy's own cast serves as the reference result
+        real = y_tvm.asnumpy()
+
+        np.testing.assert_allclose(expected, real)
+
 if __name__ == "__main__":
    test_nd_create()
+    test_fp16_conversion()
--- a/vta/python/vta/pkg_config.py
+++ b/vta/python/vta/pkg_config.py
@@ -42,8 +42,8 @@ class PkgConfig(object):
        self.include_path = [
            "-I%s/include" % proj_root,
            "-I%s/vta/include" % proj_root,
-            "-I%s/dlpack/include" % proj_root,
-            "-I%s/dmlc-core/include" % proj_root
+            "-I%s/3rdparty/dlpack/include" % proj_root,
+            "-I%s/3rdparty/dmlc-core/include" % proj_root
        ]
        # List of source files that can be used to build standalone library.
        self.lib_source = []