Unverified Commit fd39c5c0 by jmorrill Committed by GitHub

Implemented kDLCPUPinned (cudaMallocHost) (#4985)

* implement kDLCPUPinned

* Fix line endings

* Fix whitespace for linter

* clean up AllocDataSpace method
parent 41e1d5f9
......@@ -207,6 +207,7 @@ inline const char* DeviceName(int type) {
switch (type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLCPUPinned: return "cpu_pinned";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLAOCL: return "aocl";
......
......@@ -112,17 +112,25 @@ class CUDADeviceAPI final : public DeviceAPI {
size_t nbytes,
size_t alignment,
DLDataType type_hint) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CHECK_EQ(256 % alignment, 0U)
<< "CUDA space is aligned at 256 bytes";
<< "CUDA space is aligned at 256 bytes";
void *ret;
CUDA_CALL(cudaMalloc(&ret, nbytes));
if (ctx.device_type == kDLCPUPinned) {
CUDA_CALL(cudaMallocHost(&ret, nbytes));
} else {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaMalloc(&ret, nbytes));
}
return ret;
}
void FreeDataSpace(TVMContext ctx, void* ptr) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaFree(ptr));
if (ctx.device_type == kDLCPUPinned) {
CUDA_CALL(cudaFreeHost(ptr));
} else {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CUDA_CALL(cudaFree(ptr));
}
}
void CopyDataFromTo(const void* from,
......@@ -137,6 +145,21 @@ class CUDADeviceAPI final : public DeviceAPI {
cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
from = static_cast<const char*>(from) + from_offset;
to = static_cast<char*>(to) + to_offset;
if (ctx_from.device_type == kDLCPUPinned) {
ctx_from.device_type = kDLCPU;
}
if (ctx_to.device_type == kDLCPUPinned) {
ctx_to.device_type = kDLCPU;
}
// In case there is a copy from host mem to host mem
if (ctx_to.device_type == kDLCPU && ctx_from.device_type == kDLCPU) {
memcpy(to, from, size);
return;
}
if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLGPU) {
CUDA_CALL(cudaSetDevice(ctx_from.device_id));
if (ctx_from.device_id == ctx_to.device_id) {
......@@ -235,5 +258,11 @@ TVM_REGISTER_GLOBAL("device_api.gpu")
*rv = static_cast<void*>(ptr);
});
// Pinned host memory is allocated/freed through the CUDA runtime
// (cudaMallocHost / cudaFreeHost in AllocDataSpace / FreeDataSpace),
// so the "cpu_pinned" (kDLCPUPinned) device type is served by the same
// CUDADeviceAPI singleton that handles the GPU device type.
TVM_REGISTER_GLOBAL("device_api.cpu_pinned")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    DeviceAPI* ptr = CUDADeviceAPI::Global().get();
    *rv = static_cast<void*>(ptr);
  });
} // namespace runtime
} // namespace tvm
......@@ -233,7 +233,9 @@ void NDArray::CopyFromTo(const DLTensor* from,
CHECK(from->ctx.device_type == to->ctx.device_type
|| from->ctx.device_type == kDLCPU
|| to->ctx.device_type == kDLCPU)
|| to->ctx.device_type == kDLCPU
|| from->ctx.device_type == kDLCPUPinned
|| to->ctx.device_type == kDLCPUPinned)
<< "Can not copy across different ctx types directly";
// Use the context that is *not* a cpu context to get the correct device
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment