Commit 9d64d321 by Thierry Moreau Committed by Tianqi Chen

[VTA] Runtime refactor to allow for non-shared memory FPGAs (e.g. F1) (#3554)

* updated runtime to support non-shared memory FPGAs for instruction and micro-op kernels

* adding driver-defined memcpy function to handle F1 cases

* refactor to include flush/invalidate in memcpy driver function

* update tsim driver

* bug fixes

* cleanup

* pre-allocate fpga readable buffers to improve perf

* fix

* remove instruction stream address rewrite pass for micro op kernels

* fix

* white spaces

* fix lint

* avoid signed/unsigned compilation warning

* avoid signed/unsigned compilation warning

* fix

* fix

* addressing comments

* whitespace

* moving flush/invalidate out of memmove

* cleanup

* fix

* cosmetic

* rename API

* comment fix
parent 4d314833
...@@ -98,7 +98,7 @@ int VTADeviceRun(VTADeviceHandle device, ...@@ -98,7 +98,7 @@ int VTADeviceRun(VTADeviceHandle device,
#endif #endif
/*! /*!
* \brief Allocates physically contiguous region in memory (limited by MAX_XFER). * \brief Allocates physically contiguous region in memory readable/writeable by FPGA.
* \param size Size of the region in Bytes. * \param size Size of the region in Bytes.
* \param cached Region can be set to not cached (write-back) if set to 0. * \param cached Region can be set to not cached (write-back) if set to 0.
* \return A pointer to the allocated region. * \return A pointer to the allocated region.
...@@ -106,7 +106,7 @@ int VTADeviceRun(VTADeviceHandle device, ...@@ -106,7 +106,7 @@ int VTADeviceRun(VTADeviceHandle device,
void* VTAMemAlloc(size_t size, int cached); void* VTAMemAlloc(size_t size, int cached);
/*! /*!
* \brief Frees a physically contiguous region in memory. * \brief Frees a physically contiguous region in memory readable/writeable by FPGA.
* \param buf Buffer to free. * \param buf Buffer to free.
*/ */
void VTAMemFree(void* buf); void VTAMemFree(void* buf);
...@@ -119,6 +119,22 @@ void VTAMemFree(void* buf); ...@@ -119,6 +119,22 @@ void VTAMemFree(void* buf);
vta_phy_addr_t VTAMemGetPhyAddr(void* buf); vta_phy_addr_t VTAMemGetPhyAddr(void* buf);
/*! /*!
* \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
 * \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
* \param src The source buffer in host memory.
* \param size Size of the region in Bytes.
*/
void VTAMemCopyFromHost(void* dst, const void* src, size_t size);
/*!
* \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
* \param dst The destination buffer in host memory.
* \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc.
* \param size Size of the region in Bytes.
*/
void VTAMemCopyToHost(void* dst, const void* src, size_t size);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM. * \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This need to be the physical address. * This need to be the physical address.
......
...@@ -105,6 +105,7 @@ TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd, ...@@ -105,6 +105,7 @@ TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd,
uint32_t elem_bits, uint32_t elem_bits,
uint32_t start, uint32_t start,
uint32_t extent); uint32_t extent);
/*! /*!
* \brief Perform a read barrier to a memory region visible to VTA. * \brief Perform a read barrier to a memory region visible to VTA.
* \param cmd The VTA command handle. * \param cmd The VTA command handle.
......
...@@ -29,10 +29,13 @@ ...@@ -29,10 +29,13 @@
void* VTAMemAlloc(size_t size, int cached) { void* VTAMemAlloc(size_t size, int cached) {
assert(size <= VTA_MAX_XFER);
// Rely on the pynq-specific cma library
return cma_alloc(size, cached); return cma_alloc(size, cached);
} }
void VTAMemFree(void* buf) { void VTAMemFree(void* buf) {
// Rely on the pynq-specific cma library
cma_free(buf); cma_free(buf);
} }
...@@ -40,11 +43,25 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { ...@@ -40,11 +43,25 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return cma_get_phy_addr(buf); return cma_get_phy_addr(buf);
} }
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
// For SoC-based FPGAs that use shared memory with the CPU, use memcpy()
memcpy(dst, src, size);
}
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
// For SoC-based FPGAs that use shared memory with the CPU, use memcpy()
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) { void VTAFlushCache(vta_phy_addr_t buf, int size) {
// Call the xlnkFlushCache on the CMA buffer
// so that the FPGA can read the buffer data.
xlnkFlushCache(reinterpret_cast<void*>(buf), size); xlnkFlushCache(reinterpret_cast<void*>(buf), size);
} }
void VTAInvalidateCache(vta_phy_addr_t buf, int size) { void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
// Call the xlnkInvalidateCache on the CMA buffer
// so that the host needs to read the buffer data.
xlnkInvalidateCache(reinterpret_cast<void*>(buf), size); xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
} }
...@@ -54,7 +71,7 @@ void *VTAMapRegister(uint32_t addr, size_t length) { ...@@ -54,7 +71,7 @@ void *VTAMapRegister(uint32_t addr, size_t length) {
// Calculate base address offset w.r.t the base address // Calculate base address offset w.r.t the base address
uint32_t virt_offset = addr - virt_base; uint32_t virt_offset = addr - virt_base;
// Open file and mmap // Open file and mmap
uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC); uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC);
return mmap(NULL, return mmap(NULL,
(length+virt_offset), (length+virt_offset),
PROT_READ|PROT_WRITE, PROT_READ|PROT_WRITE,
......
...@@ -56,13 +56,6 @@ void VTAUnmapRegister(void *vta, size_t length); ...@@ -56,13 +56,6 @@ void VTAUnmapRegister(void *vta, size_t length);
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val); void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset); uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief (Pynq only) Path to /dev/mem */
#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
/*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_LENGTH 4
/*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief VTA configuration register address range */ /*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100 #define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */ /*! \brief VTA configuration register start value */
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the * to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance * "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at * with the License. You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
...@@ -44,7 +44,7 @@ namespace vta { ...@@ -44,7 +44,7 @@ namespace vta {
static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8, static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
"VTA_UOP_WIDTH do not match VTAUop size"); "VTA_UOP_WIDTH do not match VTAUop size");
/*! \brief Enable coherent access between VTA and CPU. */ /*! \brief Enable coherent access between VTA and CPU (used on shared mem systems). */
static const bool kBufferCoherent = true; static const bool kBufferCoherent = true;
/*! /*!
...@@ -80,6 +80,24 @@ struct DataBuffer { ...@@ -80,6 +80,24 @@ struct DataBuffer {
} }
} }
/*! /*!
* \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
 * \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
* \param src The source buffer in host memory.
* \param size Size of the region in Bytes.
*/
void MemCopyFromHost(void* dst, const void* src, size_t size) {
VTAMemCopyFromHost(dst, src, size);
}
/*!
* \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
 * \param dst The destination buffer in host memory.
* \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
* \param size Size of the region in Bytes.
*/
void MemCopyToHost(void* dst, const void* src, size_t size) {
VTAMemCopyToHost(dst, src, size);
}
/*!
* \brief Allocate a buffer of a given size. * \brief Allocate a buffer of a given size.
* \param size The size of the buffer. * \param size The size of the buffer.
*/ */
...@@ -274,7 +292,7 @@ class UopKernel { ...@@ -274,7 +292,7 @@ class UopKernel {
template<int, bool, bool> template<int, bool, bool>
friend class UopQueue; friend class UopQueue;
friend class CommandQueue; friend class CommandQueue;
// SRAM location if begin != end. // SRAM location if begin != end
uint32_t sram_begin_{0}; uint32_t sram_begin_{0};
uint32_t sram_end_{0}; uint32_t sram_end_{0};
// The signature used for verification // The signature used for verification
...@@ -290,11 +308,12 @@ class UopKernel { ...@@ -290,11 +308,12 @@ class UopKernel {
/*! /*!
* \brief Base class of all queues to send and recv serial data. * \brief Base class of all queues to send and recv serial data.
*/ */
template <class T>
class BaseQueue { class BaseQueue {
public: public:
~BaseQueue() { ~BaseQueue() {
if (dram_buffer_ != nullptr) { if (fpga_buff_ != nullptr) {
VTAMemFree(dram_buffer_); VTAMemFree(fpga_buff_);
} }
} }
/*! \return Content of DRAM buffer. */ /*! \return Content of DRAM buffer. */
...@@ -303,7 +322,8 @@ class BaseQueue { ...@@ -303,7 +322,8 @@ class BaseQueue {
} }
/*! \return Physical address of DRAM. */ /*! \return Physical address of DRAM. */
vta_phy_addr_t dram_phy_addr() const { vta_phy_addr_t dram_phy_addr() const {
return dram_phy_addr_; CHECK(fpga_buff_phy_);
return fpga_buff_phy_;
} }
/*! \return Whether there is pending information. */ /*! \return Whether there is pending information. */
bool pending() const { bool pending() const {
...@@ -314,43 +334,23 @@ class BaseQueue { ...@@ -314,43 +334,23 @@ class BaseQueue {
coherent_ = coherent; coherent_ = coherent;
always_cache_ = always_cache; always_cache_ = always_cache;
elem_bytes_ = elem_bytes; elem_bytes_ = elem_bytes;
dram_buffer_ = static_cast<char*>(VTAMemAlloc( // Allocate buffer ahead of time
max_bytes, coherent || always_cache_)); fpga_buff_ = static_cast<char*>(VTAMemAlloc(
CHECK(dram_buffer_ != nullptr); max_bytes, coherent_ || always_cache_));
dram_phy_addr_ = VTAMemGetPhyAddr(dram_buffer_); CHECK(fpga_buff_ != nullptr);
fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_);
} }
/*! /*!
* \brief Reset the pointer of the buffer. * \brief Reset the pointer of the buffer.
* Set SRAM pointer to be the current end. * Set SRAM pointer to be the current end.
*/ */
void Reset() { void Reset() {
dram_begin_ = dram_end_ = 0; dram_buffer_.clear();
sram_begin_ = sram_end_; sram_begin_ = sram_end_;
} }
void AutoReadBarrier() {
ReadBarrier(elem_bytes_ * 8, 0, dram_end_);
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
void ReadBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAFlushCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
/*! \brief Read barrier to make sure that data written by VTA is visible to CPU. */
void WriteBarrier(uint32_t elem_bits, uint32_t dram_begin, uint32_t dram_extent) {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAInvalidateCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
protected: protected:
// Cache coherence access // Cache coherence access (shared memory only)
bool coherent_{false}; bool coherent_{false};
// Make the buffer cacheable // Make the buffer cacheable
bool always_cache_{false}; bool always_cache_{false};
...@@ -360,21 +360,19 @@ class BaseQueue { ...@@ -360,21 +360,19 @@ class BaseQueue {
uint32_t sram_begin_{0}; uint32_t sram_begin_{0};
// End location of current SRAM write in FIFO mode // End location of current SRAM write in FIFO mode
uint32_t sram_end_{0}; uint32_t sram_end_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_begin_{0};
// The current pending offset in DRAM in FIFO mode
uint32_t dram_end_{0};
// The buffer in DRAM // The buffer in DRAM
char* dram_buffer_{nullptr}; std::vector<T> dram_buffer_;
// Physics address of the buffer // FPGA accessible buffer
vta_phy_addr_t dram_phy_addr_; void* fpga_buff_{NULL};
// Physical address of the FPGA buffer
vta_phy_addr_t fpga_buff_phy_{0};
}; };
/*! /*!
* \brief Micro op buffer that manages the micro op cache. * \brief Micro op buffer that manages the micro op cache.
*/ */
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache> template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class UopQueue : public BaseQueue { class UopQueue : public BaseQueue<VTAUop> {
public: public:
void InitSpace() { void InitSpace() {
BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
...@@ -382,17 +380,20 @@ class UopQueue : public BaseQueue { ...@@ -382,17 +380,20 @@ class UopQueue : public BaseQueue {
// Push data to the queue // Push data to the queue
template<typename FAutoSync> template<typename FAutoSync>
void Push(UopKernel* kernel, FAutoSync fautosync) { void Push(UopKernel* kernel, FAutoSync fautosync) {
// if the micro-op is cached in VTA SRAM, skip
if (kernel->cached()) return; if (kernel->cached()) return;
// check if we've exceeded the size of the allocated FPGA readable buffer
size_t num_op = kernel->size(); size_t num_op = kernel->size();
if (dram_end_ + num_op > kMaxElems) { if (dram_buffer_.size() + num_op > kMaxElems) {
fautosync(); fautosync();
CHECK(dram_end_ <= kMaxElems); CHECK(dram_buffer_.size() <= kMaxElems);
} }
// Cannot have a micro-op kernel larger than SRAM buffer
CHECK(num_op <= kMaxNumUop); CHECK(num_op <= kMaxNumUop);
uint32_t uop_begin = 0; uint32_t uop_begin = 0;
if (sram_end_ + num_op > kMaxNumUop) { if (sram_end_ + num_op > kMaxNumUop) {
// Need to evict // Need to evict
cache_ptr_ = 0; cache_idx_ = 0;
sram_begin_ = 0; sram_begin_ = 0;
sram_end_ = num_op; sram_end_ = num_op;
} else { } else {
...@@ -400,51 +401,81 @@ class UopQueue : public BaseQueue { ...@@ -400,51 +401,81 @@ class UopQueue : public BaseQueue {
sram_end_ += num_op; sram_end_ += num_op;
} }
// Simple eviction policy // Simple eviction policy
uint32_t evict_begin = cache_ptr_; uint32_t evict_begin = cache_idx_;
for (; cache_ptr_ < cache_.size(); ++cache_ptr_) { for (; cache_idx_ < cache_.size(); ++cache_idx_) {
if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break; if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break;
cache_[cache_ptr_]->sram_begin_ = 0; // Mark the kernel as "invalid"
cache_[cache_ptr_]->sram_end_ = 0; cache_[cache_idx_]->sram_begin_ = 0;
cache_[cache_idx_]->sram_end_ = 0;
} }
memcpy(dram_buffer_ + dram_end_ * kElemBytes, // Increase size of buffer
kernel->data(),
num_op * kElemBytes);
dram_end_ += num_op;
kernel->sram_begin_ = uop_begin; kernel->sram_begin_ = uop_begin;
kernel->sram_end_ = sram_end_; kernel->sram_end_ = sram_end_;
CHECK(kernel->cached()); CHECK(kernel->cached());
CHECK(uop_begin != sram_end_); cache_.insert(cache_.begin() + cache_idx_, kernel);
cache_.insert(cache_.begin() + cache_ptr_, kernel); cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_);
cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_ptr_); cache_idx_ = evict_begin + 1;
cache_ptr_ = evict_begin + 1;
} }
// Flush as weight load // Flush micro op load instruction
void FlushUopLoad(VTAMemInsn* insn) { void FlushUopLoad(VTAMemInsn* insn) {
if (sram_begin_ != sram_end_) { if (sram_begin_ != sram_end_) {
CHECK((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_)); // Derive offset in FPGA-readable buffer
int32_t offset = 0;
for (uint32_t i = 0; i < cache_idx_ - 1; ++i) {
offset += cache_[i]->size() * kElemBytes;
}
insn->memory_type = VTA_MEM_ID_UOP; insn->memory_type = VTA_MEM_ID_UOP;
insn->sram_base = sram_begin_; insn->sram_base = sram_begin_;
// Update cache idx to physical address map
#ifdef USE_TSIM #ifdef USE_TSIM
insn->dram_base = (uint32_t) dram_phy_addr_ + dram_begin_*kElemBytes; insn->dram_base = fpga_buff_phy_ + offset;
#else #else
insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_; insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes;
#endif #endif
insn->y_size = 1; insn->y_size = 1;
insn->x_size = (dram_end_ - dram_begin_); insn->x_size = (sram_end_ - sram_begin_);
insn->x_stride = (dram_end_ - dram_begin_); insn->x_stride = (sram_end_ - sram_begin_);
insn->y_pad_0 = 0; insn->y_pad_0 = 0;
insn->y_pad_1 = 0; insn->y_pad_1 = 0;
insn->x_pad_0 = 0; insn->x_pad_0 = 0;
insn->x_pad_1 = 0; insn->x_pad_1 = 0;
// Reset indices // Reset indices
sram_begin_ = sram_end_; sram_begin_ = sram_end_;
dram_begin_ = dram_end_; }
}
void AutoReadBarrier() {
ReadBarrier();
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
void ReadBarrier() {
CHECK(fpga_buff_ != nullptr);
CHECK(fpga_buff_phy_);
// Iterate over caches; allocate buffer in FPGA-readable memory
uint32_t buff_size = 0;
for (uint32_t i = 0; i < cache_.size(); ++i) {
buff_size += cache_[i]->size() * kElemBytes;
}
CHECK(buff_size <= kMaxBytes);
// Move kernel contents to FPGA readable buffer
uint32_t offset = 0;
for (uint32_t i = 0; i < cache_.size(); ++i) {
uint32_t ksize = cache_[i]->size() * kElemBytes;
VTAMemCopyFromHost(static_cast<char*>(fpga_buff_) + offset,
cache_[i]->data(),
ksize);
// Update offset
offset += ksize;
}
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, offset);
} }
} }
private: private:
// Cache pointer // Cache pointer
uint32_t cache_ptr_{0}; uint32_t cache_idx_{0};
// Cached ring, sorted by sram_begin // Cached ring, sorted by sram_begin
std::vector<UopKernel*> cache_; std::vector<UopKernel*> cache_;
// Constants // Constants
...@@ -485,7 +516,7 @@ enum PipelineStage : int { ...@@ -485,7 +516,7 @@ enum PipelineStage : int {
// Instruction Queue // Instruction Queue
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache> template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class InsnQueue : public BaseQueue { class InsnQueue : public BaseQueue<VTAGenericInsn> {
public: public:
/*! \brief Initialize the space. */ /*! \brief Initialize the space. */
void InitSpace() { void InitSpace() {
...@@ -496,11 +527,11 @@ class InsnQueue : public BaseQueue { ...@@ -496,11 +527,11 @@ class InsnQueue : public BaseQueue {
} }
/*! \return The data pointer. */ /*! \return The data pointer. */
VTAGenericInsn* data() { VTAGenericInsn* data() {
return reinterpret_cast<VTAGenericInsn*>(dram_buffer_); return dram_buffer_.data();
} }
/*! \return Number of instructions. */ /*! \return Number of instructions. */
uint32_t count() { uint32_t count() {
return dram_end_; return dram_buffer_.size();
} }
// Insert dependency push of load // Insert dependency push of load
void DepPop(int from, int to) { void DepPop(int from, int to) {
...@@ -524,9 +555,8 @@ class InsnQueue : public BaseQueue { ...@@ -524,9 +555,8 @@ class InsnQueue : public BaseQueue {
void DepPush(int from, int to) { void DepPush(int from, int to) {
// NOTE: this instruction executes on queue[from] // NOTE: this instruction executes on queue[from]
this->CommitPendingPop(from); this->CommitPendingPop(from);
if (dram_end_ != 0) { if (!dram_buffer_.empty()) {
VTAMemInsn* mptr = VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(&dram_buffer_.back());
reinterpret_cast<VTAMemInsn*>(dram_buffer_) + dram_end_ - 1;
if (GetPipelineStage(mptr) == from) { if (GetPipelineStage(mptr) == from) {
if (from < to && !mptr->push_next_dep) { if (from < to && !mptr->push_next_dep) {
// push(LD->C) or push(C->ST) // push(LD->C) or push(C->ST)
...@@ -600,7 +630,6 @@ class InsnQueue : public BaseQueue { ...@@ -600,7 +630,6 @@ class InsnQueue : public BaseQueue {
} }
} }
} }
// Helper function: Get Opcode string // Helper function: Get Opcode string
const char* getOpcodeString(int opcode, bool use_imm) { const char* getOpcodeString(int opcode, bool use_imm) {
// The string name // The string name
...@@ -628,7 +657,6 @@ class InsnQueue : public BaseQueue { ...@@ -628,7 +657,6 @@ class InsnQueue : public BaseQueue {
return "unknown op"; return "unknown op";
} }
// Dump instructions in the queue // Dump instructions in the queue
void DumpInsn() { void DumpInsn() {
// Keep tabs on dependence queues // Keep tabs on dependence queues
...@@ -790,7 +818,6 @@ class InsnQueue : public BaseQueue { ...@@ -790,7 +818,6 @@ class InsnQueue : public BaseQueue {
printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
} }
} }
// Commit all pending pop of corresponding stage // Commit all pending pop of corresponding stage
void CommitPendingPop(int stage) { void CommitPendingPop(int stage) {
// Handle the LD<->compute queue // Handle the LD<->compute queue
...@@ -805,13 +832,11 @@ class InsnQueue : public BaseQueue { ...@@ -805,13 +832,11 @@ class InsnQueue : public BaseQueue {
pending_pop_next_[stage] = 0; pending_pop_next_[stage] = 0;
} }
} }
void CommitPending() { void CommitPending() {
for (int i = kLoadStage; i <= kStoreStage; ++i) { for (int i = kLoadStage; i <= kStoreStage; ++i) {
CommitPendingPop(i); CommitPendingPop(i);
} }
} }
bool PendingPop() { bool PendingPop() {
for (int i = kLoadStage; i <= kStoreStage; ++i) { for (int i = kLoadStage; i <= kStoreStage; ++i) {
if (pending_pop_prev_[i]) return true; if (pending_pop_prev_[i]) return true;
...@@ -819,14 +844,32 @@ class InsnQueue : public BaseQueue { ...@@ -819,14 +844,32 @@ class InsnQueue : public BaseQueue {
} }
return false; return false;
} }
void AutoReadBarrier() {
ReadBarrier();
}
/*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
void ReadBarrier() {
CHECK(fpga_buff_ != nullptr);
CHECK(fpga_buff_phy_);
uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
CHECK(buff_size <= kMaxBytes);
// Copy contents of DRAM buffer to FPGA buff
VTAMemCopyFromHost(fpga_buff_,
dram_buffer_.data(),
buff_size);
// Flush if we're using a shared memory system
// and if interface is non-coherent
if (!coherent_ && always_cache_) {
VTAFlushCache(fpga_buff_phy_, buff_size);
}
}
protected: protected:
/*! \return Add new instruction to the buffer. */ /*! \return Add new instruction to the buffer. */
VTAGenericInsn* NextInsn() { VTAGenericInsn* NextInsn() {
VTAGenericInsn* insn = data() + dram_end_; VTAGenericInsn insn;
++dram_end_; dram_buffer_.push_back(insn);
CHECK(dram_end_ < kMaxElems); return &dram_buffer_.back();
return insn;
} }
// Create a new instruction for a given stage // Create a new instruction for a given stage
VTAGenericInsn* Create(PipelineStage stage) { VTAGenericInsn* Create(PipelineStage stage) {
...@@ -859,7 +902,7 @@ class InsnQueue : public BaseQueue { ...@@ -859,7 +902,7 @@ class InsnQueue : public BaseQueue {
if (insn->opcode == VTA_OPCODE_STORE) { if (insn->opcode == VTA_OPCODE_STORE) {
// FIXME: Right now memory_type is a 2-bit field which means that // FIXME: Right now memory_type is a 2-bit field which means that
// VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
// checking the memory_type to avoid an CHECKion error... // checking the memory_type to avoid an CHECK error...
return kStoreStage; return kStoreStage;
} }
LOG(FATAL) << "not reached"; LOG(FATAL) << "not reached";
...@@ -938,7 +981,7 @@ class CommandQueue { ...@@ -938,7 +981,7 @@ class CommandQueue {
} }
/* /*
* elements size should not larger than VTA_PAGE_BYTES. * elements size should not larger than VTA_PAGE_BYTES.
* *
*/ */
CHECK_GE(VTA_PAGE_BYTES, elem_bytes); CHECK_GE(VTA_PAGE_BYTES, elem_bytes);
return elem_bytes; return elem_bytes;
...@@ -1256,7 +1299,7 @@ class CommandQueue { ...@@ -1256,7 +1299,7 @@ class CommandQueue {
// Internal debug flag // Internal debug flag
int debug_flag_{0}; int debug_flag_{0};
// The kernel we currently recording // The kernel we are currently recording
UopKernel* record_kernel_{nullptr}; UopKernel* record_kernel_{nullptr};
// Micro op queue // Micro op queue
UopQueue<VTA_MAX_XFER, true, true> uop_queue_; UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
...@@ -1303,14 +1346,18 @@ void VTABufferCopy(const void* from, ...@@ -1303,14 +1346,18 @@ void VTABufferCopy(const void* from,
to_buffer = vta::DataBuffer::FromHandle(to); to_buffer = vta::DataBuffer::FromHandle(to);
to = to_buffer->virt_addr(); to = to_buffer->virt_addr();
} }
if (from_buffer) { if (from_buffer) {
// This is an FPGA to host mem transfer
from_buffer->InvalidateCache(from_offset, size); from_buffer->InvalidateCache(from_offset, size);
} from_buffer->MemCopyToHost(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
memcpy(static_cast<char*>(to) + to_offset, size);
static_cast<const char*>(from) + from_offset, } else if (to_buffer) {
size); // This is a host to FPGA mem transfer
if (to_buffer) { to_buffer->MemCopyFromHost(static_cast<char*>(to) + to_offset,
static_cast<const char*>(from) + from_offset,
size);
to_buffer->FlushCache(to_offset, size); to_buffer->FlushCache(to_offset, size);
} }
} }
......
...@@ -607,6 +607,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { ...@@ -607,6 +607,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return vta::sim::DRAM::Global()->GetPhyAddr(buf); return vta::sim::DRAM::Global()->GetPhyAddr(buf);
} }
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) { void VTAFlushCache(vta_phy_addr_t buf, int size) {
} }
......
...@@ -220,6 +220,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { ...@@ -220,6 +220,14 @@ vta_phy_addr_t VTAMemGetPhyAddr(void* buf) {
return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf)); return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
} }
void VTAMemCopyFromHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAMemCopyToHost(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
void VTAFlushCache(vta_phy_addr_t buf, int size) { void VTAFlushCache(vta_phy_addr_t buf, int size) {
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment