Commit 9d64d321 by Thierry Moreau Committed by Tianqi Chen

[VTA] Runtime refactor to allow for non-shared memory FPGAs (e.g. F1) (#3554)

* updated runtime to support non-shared memory FPGAs for instruction and micro-op kernels

* adding driver-defined memcpy function to handle F1 cases

* refactor to include flush/invalidate in memcpy driver function

* update tsim driver

* bug fixes

* cleanup

* pre-allocate fpga readable buffers to improve perf

* fix

* remove instruction stream address rewrite pass for micro op kernels

* fix:

* white spaces

* fix lint

* avoid signed/unsigned compilation warning

* avoid signed/unsigned compilation warning

* fix

* fix

* addressing comments

* whitespace

* moving flush/invalidate out of memmove

* cleanup

* fix

* cosmetic

* rename API

* comment fix
parent 4d314833
......@@ -98,7 +98,7 @@ int VTADeviceRun(VTADeviceHandle device,
#endif
/*!
* \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
* \brief Allocates physically contiguous region in memory readable/writeable by FPGA.
* \param size Size of the region in Bytes.
* \param cached Region can be set to not cached (write-back) if set to 0.
* \return A pointer to the allocated region.
......@@ -106,7 +106,7 @@ int VTADeviceRun(VTADeviceHandle device,
void* VTAMemAlloc(size_t size, int cached);
/*!
* \brief Frees a physically contiguous region in memory.
* \brief Frees a physically contiguous region in memory readable/writeable by FPGA.
* \param buf Buffer to free.
*/
void VTAMemFree(void* buf);
......@@ -119,6 +119,22 @@ void VTAMemFree(void* buf);
vta_phy_addr_t VTAMemGetPhyAddr(void* buf);
/*!
* \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
* \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
* \param src The source buffer in host memory.
* \param size Size of the region in Bytes.
*/
void VTAMemCopyFromHost(void* dst, const void* src, size_t size);
/*!
* \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
* \param dst The destination buffer in host memory.
* \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
* \param size Size of the region in Bytes.
*/
void VTAMemCopyToHost(void* dst, const void* src, size_t size);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This needs to be the physical address.
......
......@@ -105,6 +105,7 @@ TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd,
uint32_t elem_bits,
uint32_t start,
uint32_t extent);
/*!
* \brief Perform a read barrier to a memory region visible to VTA.
* \param cmd The VTA command handle.
......
......@@ -29,10 +29,13 @@
// Allocates a physically contiguous, FPGA-accessible memory region.
// size: region size in bytes (bounded by VTA_MAX_XFER).
// cached: non-zero to allocate a cacheable buffer, 0 for non-cached.
// Returns a pointer to the allocated region.
void* VTAMemAlloc(size_t size, int cached) {
// A single allocation may not exceed the maximum transfer size.
assert(size <= VTA_MAX_XFER);
// Rely on the pynq-specific cma (contiguous memory allocator) library
return cma_alloc(size, cached);
}
// Frees a buffer previously obtained from VTAMemAlloc().
void VTAMemFree(void* buf) {
// Rely on the pynq-specific cma library
cma_free(buf);
}
......@@ -40,11 +43,25 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return cma_get_phy_addr(buf);
}
// Copies `size` bytes from host memory (`src`) into an FPGA-accessible
// buffer (`dst`) allocated with VTAMemAlloc().
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
// For SoC-based FPGAs that share memory with the CPU, a plain memcpy() suffices
memcpy(dst, src, size);
}
// Copies `size` bytes from an FPGA-accessible buffer (`src`, allocated with
// VTAMemAlloc()) back into host memory (`dst`).
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
// For SoC-based FPGAs that share memory with the CPU, a plain memcpy() suffices
memcpy(dst, src, size);
}
// Flushes `size` bytes at physical address `buf` out of the CPU cache to DRAM.
void VTAFlushCache(vta_phy_addr_t buf, int size) {
// Call xlnkFlushCache on the CMA buffer
// so that the FPGA can read the buffer data.
xlnkFlushCache(reinterpret_cast<void*>(buf), size);
}
// Invalidates `size` bytes at physical address `buf` in the CPU cache.
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
// Call xlnkInvalidateCache on the CMA buffer
// so that the host can observe the data the FPGA wrote to DRAM.
xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
}
......@@ -54,7 +71,7 @@ void *VTAMapRegister(uint32_t addr, size_t length) {
// Calculate base address offset w.r.t the base address
uint32_t virt_offset = addr - virt_base;
// Open file and mmap
uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC);
uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
return mmap(NULL,
(length+virt_offset),
PROT_READ|PROT_WRITE,
......
......@@ -56,13 +56,6 @@ void VTAUnmapRegister(void *vta, size_t length);
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief (Pynq only) Path to /dev/mem */
#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
/*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_LENGTH 4
/*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
......
......@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
......@@ -44,7 +44,7 @@ namespace vta {
static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
"VTA_UOP_WIDTH do not match VTAUop size");
/*! \brief Enable coherent access between VTA and CPU. */
/*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */
static const bool kBufferCoherent = true;
/*!
......@@ -80,6 +80,24 @@ struct DataBuffer {
}
}
/*!
* \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
* \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
* \param src The source buffer in host memory.
* \param size Size of the region in Bytes.
*/
void MemCopyFromHost(void* dst, const void* src, size_t size) {
// Delegate to the driver, which knows how to reach FPGA memory.
VTAMemCopyFromHost(dst, src, size);
}
/*!
* \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
* \param dst The destination buffer in host memory.
* \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
* \param size Size of the region in Bytes.
*/
void MemCopyToHost(void* dst, const void* src, size_t size) {
// Delegate to the driver, which knows how to reach FPGA memory.
VTAMemCopyToHost(dst, src, size);
}
/*!
* \brief Allocate a buffer of a given size.
* \param size The size of the buffer.
*/
......@@ -274,7 +292,7 @@ class UopKernel {
template<int, bool, bool>
friend class UopQueue;
friend class CommandQueue;
// SRAM location if begin != end.
// SRAM location if begin != end
uint32_t sram_begin_{0};
uint32_t sram_end_{0};
// The signature used for verification
......@@ -290,11 +308,12 @@ class UopKernel {
/*!
* \brief Base class of all queues to send and recv serial data.
*/
template <class T>
class BaseQueue {
public:
~BaseQueue() {
if (dram_buffer_ != nullptr) {
VTAMemFree(dram_buffer_);
if (fpga_buff_ != nullptr) {
VTAMemFree(fpga_buff_);
}
}
/*! \return Content of DRAM buffer. */
......@@ -303,7 +322,8 @@ class BaseQueue {
}
/*! \return Physical address of DRAM. */
vta_phy_addr_t dram_phy_addr() const {
return dram_phy_addr_;
CHECK(fpga_buff_phy_);
return fpga_buff_phy_;
}
/*! \return Whether there is pending information. */
bool pending() const {
......@@ -314,43 +334,23 @@ class BaseQueue {
coherent_ = coherent;
always_cache_ = always_cache;
elem_bytes_ = elem_bytes;
dram_buffer_ = static_cast<char*>(VTAMemAlloc(
max_bytes, coherent || always_cache_));
CHECK(dram_buffer_ != nullptr);
dram_phy_addr_ = VTAMemGetPhyAddr(dram_buffer_);
// Allocate buffer ahead of time
fpga_buff_ = static_cast<char*>(VTAMemAlloc(
max_bytes, coherent_ || always_cache_));
CHECK(fpga_buff_ != nullptr);
fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_);
}
/*!
* \brief Reset the pointer of the buffer.
* Set SRAM pointer to be the current end.
*/
void Reset() {
dram_begin_ = dram_end_ = 0;
dram_buffer_.clear();
sram_begin_ = sram_end_;
}
void AutoReadBarrier() {
ReadBarrier(elem_bytes_ * 8, 0, dram_end_);
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAFlushCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
/*! \brief Read barrier to make sure that data written by VTA is visible to CPU. */
void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAInvalidateCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
protected:
// Cache coherence access
// Cache coherence access (shared memory only)
bool coherent_{false};
// Make the buffer cacheable
bool always_cache_{false};
......@@ -360,21 +360,19 @@ class BaseQueue {
uint32_t sram_begin_{0};
// End location of current SRAM write in FIFO mode
uint32_t sram_end_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_begin_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_end_{0};
// The buffer in DRAM
char* dram_buffer_{nullptr};
// Physics address of the buffer
vta_phy_addr_t dram_phy_addr_;
std::vector<T> dram_buffer_;
// FPGA accessible buffer
void* fpga_buff_{NULL};
// Physical address of the FPGA buffer
vta_phy_addr_t fpga_buff_phy_{0};
};
/*!
* \brief Micro op buffer that manages the micro op cache.
*/
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class UopQueue : public BaseQueue {
class UopQueue : public BaseQueue<VTAUop> {
public:
void InitSpace() {
BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
......@@ -382,17 +380,20 @@ class UopQueue : public BaseQueue {
// Push data to the queue
template<typename FAutoSync>
void Push(UopKernel* kernel, FAutoSync fautosync) {
// if the micro-op is cached in VTA SRAM, skip
if (kernel->cached()) return;
// check if we've exceeded the size of the allocated FPGA readable buffer
size_t num_op = kernel->size();
if (dram_end_ + num_op > kMaxElems) {
if (dram_buffer_.size() + num_op > kMaxElems) {
fautosync();
CHECK(dram_end_ <= kMaxElems);
CHECK(dram_buffer_.size() <= kMaxElems);
}
// Cannot have a micro-op kernel larger than SRAM buffer
CHECK(num_op <= kMaxNumUop);
uint32_t uop_begin = 0;
if (sram_end_ + num_op > kMaxNumUop) {
// Need to evict
cache_ptr_ = 0;
cache_idx_ = 0;
sram_begin_ = 0;
sram_end_ = num_op;
} else {
......@@ -400,51 +401,81 @@ class UopQueue : public BaseQueue {
sram_end_ += num_op;
}
// Simple eviction policy
uint32_t evict_begin = cache_ptr_;
for (; cache_ptr_ < cache_.size(); ++cache_ptr_) {
if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break;
cache_[cache_ptr_]->sram_begin_ = 0;
cache_[cache_ptr_]->sram_end_ = 0;
uint32_t evict_begin = cache_idx_;
for (; cache_idx_ < cache_.size(); ++cache_idx_) {
if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break;
// Mark the kernel as "invalid"
cache_[cache_idx_]->sram_begin_ = 0;
cache_[cache_idx_]->sram_end_ = 0;
}
memcpy(dram_buffer_ + dram_end_ * kElemBytes,
kernel->data(),
num_op * kElemBytes);
dram_end_ += num_op;
// Increase size of buffer
kernel->sram_begin_ = uop_begin;
kernel->sram_end_ = sram_end_;
CHECK(kernel->cached());
CHECK(uop_begin != sram_end_);
cache_.insert(cache_.begin() + cache_ptr_, kernel);
cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_);
cache_ptr_ = evict_begin + 1;
cache_.insert(cache_.begin() + cache_idx_, kernel);
cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_);
cache_idx_ = evict_begin + 1;
}
// Flush as weight load
// Flush micro op load instruction
void FlushUopLoad(VTAMemInsn* insn) {
if (sram_begin_ != sram_end_) {
CHECK((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_));
// Derive offset in FPGA-readable buffer
int32_t offset = 0;
for (uint32_t i = 0; i < cache_idx_ - 1; ++i) {
offset += cache_[i]->size() * kElemBytes;
}
insn->memory_type = VTA_MEM_ID_UOP;
insn->sram_base = sram_begin_;
// Update cache idx to physical address map
#ifdef USE_TSIM
insn->dram_base = (uint32_t) dram_phy_addr_ + dram_begin_*kElemBytes;
insn->dram_base = fpga_buff_phy_ + offset;
#else
insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_;
insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes;
#endif
insn->y_size = 1;
insn->x_size = (dram_end_ - dram_begin_);
insn->x_stride = (dram_end_ - dram_begin_);
insn->x_size = (sram_end_ - sram_begin_);
insn->x_stride = (sram_end_ - sram_begin_);
insn->y_pad_0 = 0;
insn->y_pad_1 = 0;
insn->x_pad_0 = 0;
insn->x_pad_1 = 0;
// Reset indices
sram_begin_ = sram_end_;
dram_begin_ = dram_end_;
}
}
// Convenience wrapper: issue a read barrier over the whole uop buffer.
void AutoReadBarrier() {
ReadBarrier();
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA.
*  Copies all cached uop kernels into the FPGA-readable buffer and, when the
*  interface is non-coherent, flushes the CPU cache so VTA reads valid data.
*/
void ReadBarrier() {
CHECK(fpga_buff_ != nullptr);
CHECK(fpga_buff_phy_);
// Compute total size of all cached kernels; it must fit in the
// pre-allocated FPGA-readable buffer.
uint32_t buff_size = 0;
for (uint32_t i = 0; i < cache_.size(); ++i) {
buff_size += cache_[i]->size() * kElemBytes;
}
CHECK(buff_size <= kMaxBytes);
// Move kernel contents to FPGA readable buffer, packed back to back.
uint32_t offset = 0;
for (uint32_t i = 0; i < cache_.size(); ++i) {
uint32_t ksize = cache_[i]->size() * kElemBytes;
VTAMemCopyFromHost(static_cast<char*>(fpga_buff_) + offset,
cache_[i]->data(),
ksize);
// Update offset
offset += ksize;
}
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, offset);
}
}
private:
// Cache pointer
uint32_t cache_ptr_{0};
uint32_t cache_idx_{0};
// Cached ring, sorted by sram_begin
std::vector<UopKernel*> cache_;
// Constants
......@@ -485,7 +516,7 @@ enum PipelineStage : int {
// Instruction Queue
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class InsnQueue : public BaseQueue {
class InsnQueue : public BaseQueue<VTAGenericInsn> {
public:
/*! \brief Initialize the space. */
void InitSpace() {
......@@ -496,11 +527,11 @@ class InsnQueue : public BaseQueue {
}
/*! \return The data pointer. */
VTAGenericInsn* data() {
return reinterpret_cast<VTAGenericInsn*>(dram_buffer_);
return dram_buffer_.data();
}
/*! \return Number of instructions. */
uint32_t count() {
return dram_end_;
return dram_buffer_.size();
}
// Insert dependency push of load
void DepPop(int from, int to) {
......@@ -524,9 +555,8 @@ class InsnQueue : public BaseQueue {
void DepPush(int from, int to) {
// NOTE: this instruction executes on queue[from]
this->CommitPendingPop(from);
if (dram_end_ != 0) {
VTAMemInsn* mptr =
reinterpret_cast<VTAMemInsn*>(dram_buffer_) + dram_end_ - 1;
if (!dram_buffer_.empty()) {
VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(&dram_buffer_.back());
if (GetPipelineStage(mptr) == from) {
if (from < to && !mptr->push_next_dep) {
// push(LD->C) or push(C->ST)
......@@ -600,7 +630,6 @@ class InsnQueue : public BaseQueue {
}
}
}
// Helper function: Get Opcode string
const char* getOpcodeString(int opcode, bool use_imm) {
// The string name
......@@ -628,7 +657,6 @@ class InsnQueue : public BaseQueue {
return "unknown op";
}
// Dump instructions in the queue
void DumpInsn() {
// Keep tabs on dependence queues
......@@ -790,7 +818,6 @@ class InsnQueue : public BaseQueue {
printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
}
}
// Commit all pending pop of corresponding stage
void CommitPendingPop(int stage) {
// Handle the LD<->compute queue
......@@ -805,13 +832,11 @@ class InsnQueue : public BaseQueue {
pending_pop_next_[stage] = 0;
}
}
void CommitPending() {
for (int i = kLoadStage; i <= kStoreStage; ++i) {
CommitPendingPop(i);
}
}
bool PendingPop() {
for (int i = kLoadStage; i <= kStoreStage; ++i) {
if (pending_pop_prev_[i]) return true;
......@@ -819,14 +844,32 @@ class InsnQueue : public BaseQueue {
}
return false;
}
// Convenience wrapper: issue a read barrier over the whole instruction buffer.
void AutoReadBarrier() {
ReadBarrier();
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA.
*  Copies the host-side instruction buffer into the FPGA-readable buffer and,
*  when the interface is non-coherent, flushes the CPU cache.
*/
void ReadBarrier() {
CHECK(fpga_buff_ != nullptr);
CHECK(fpga_buff_phy_);
// Total bytes of instructions accumulated so far; must fit in the
// pre-allocated FPGA-readable buffer.
uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
CHECK(buff_size <= kMaxBytes);
// Copy contents of DRAM buffer to FPGA buff
VTAMemCopyFromHost(fpga_buff_,
dram_buffer_.data(),
buff_size);
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, buff_size);
}
}
protected:
/*! \return Add new instruction to the buffer. */
VTAGenericInsn* NextInsn() {
VTAGenericInsn* insn = data() + dram_end_;
++dram_end_;
CHECK(dram_end_ < kMaxElems);
return insn;
VTAGenericInsn insn;
dram_buffer_.push_back(insn);
return &dram_buffer_.back();
}
// Create a new instruction for a given stage
VTAGenericInsn* Create(PipelineStage stage) {
......@@ -859,7 +902,7 @@ class InsnQueue : public BaseQueue {
if (insn->opcode == VTA_OPCODE_STORE) {
// FIXME: Right now memory_type is a 2-bit field which means that
// VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
// checking the memory_type to avoid an CHECKion error...
// checking the memory_type to avoid a CHECK error...
return kStoreStage;
}
LOG(FATAL) << "not reached";
......@@ -938,7 +981,7 @@ class CommandQueue {
}
/*
* elements size should not larger than VTA_PAGE_BYTES.
*
*
*/
CHECK_GE(VTA_PAGE_BYTES, elem_bytes);
return elem_bytes;
......@@ -1256,7 +1299,7 @@ class CommandQueue {
// Internal debug flag
int debug_flag_{0};
// The kernel we currently recording
// The kernel we are currently recording
UopKernel* record_kernel_{nullptr};
// Micro op queue
UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
......@@ -1303,14 +1346,18 @@ void VTABufferCopy(const void* from,
to_buffer = vta::DataBuffer::FromHandle(to);
to = to_buffer->virt_addr();
}
if (from_buffer) {
// This is an FPGA to host mem transfer
from_buffer->InvalidateCache(from_offset, size);
}
memcpy(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
if (to_buffer) {
from_buffer->MemCopyToHost(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
} else if (to_buffer) {
// This is a host to FPGA mem transfer
to_buffer->MemCopyFromHost(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
to_buffer->FlushCache(to_offset, size);
}
}
......
......@@ -607,6 +607,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return vta::sim::DRAM::Global()->GetPhyAddr(buf);
}
// Simulator driver: host and "FPGA" memory are the same address space,
// so a plain memcpy implements the host-to-device copy.
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
// Simulator driver: device-to-host copy is likewise a plain memcpy.
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
// No-op: the simulator has no CPU/FPGA cache split to flush.
void VTAFlushCache(vta_phy_addr_t buf, int size) {
}
......
......@@ -220,6 +220,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
}
// TSIM driver: shared address space, so host-to-device is a plain memcpy.
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
// TSIM driver: device-to-host is likewise a plain memcpy.
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
// No-op: the TSIM backend has no CPU/FPGA cache split to flush.
void VTAFlushCache(vta_phy_addr_t buf, int size) {
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment