Commit 0faefafc by Tianqi Chen

[DRIVER][RUNTIME] Make runtime fully device agnostic (#23)

parent dea167a8
/*! /*!
* Copyright (c) 2018 by Contributors * Copyright (c) 2018 by Contributors
* \file vta_driver.h * \file vta_driver.h
* \brief General driver interface. * \brief Driver interface that is used by runtime.
*
* Driver's implementation is device specific.
*/ */
#ifndef VTA_DRIVER_H_ #ifndef VTA_DRIVER_H_
...@@ -11,16 +13,50 @@ ...@@ -11,16 +13,50 @@
extern "C" { extern "C" {
#endif #endif
#include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h>
/*! \brief Memory management constants */ /*! \brief Memory management constants for cached memory */
#define VTA_CACHED 1 #define VTA_CACHED 1
/*! \brief Memory management constants */ /*! \brief Memory management constants for non-cached memory */
#define VTA_NOT_CACHED 0 #define VTA_NOT_CACHED 0
/*! \brief VTA command handle */ /*! \brief Physically contiguous buffer size limit */
typedef void * VTAHandle; #ifndef VTA_MAX_XFER
#define VTA_MAX_XFER (1<<22)
#endif
/*! \brief Device resource context */
typedef void * VTADeviceHandle;
/*! \brief physical address */
typedef uint32_t vta_phy_addr_t;
/*!
* \brief Allocate a device resource handle
* \return The device handle.
*/
VTADeviceHandle VTADeviceAlloc();
/*!
* \brief Free a device handle
* \param handle The device handle to be freed.
*/
void VTADeviceFree(VTADeviceHandle handle);
/*!
* \brief Launch the instructions block until done.
* \param device The device handle.
* \param insn_phy_addr The physical address of instruction stream.
* \param insn_count Instruction count.
* \param wait_cycles The maximum number of cycles to wait.
*
* \return 0 if running is successful, 1 if timeout.
*/
int VTADeviceRun(VTADeviceHandle device,
vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles);
/*! /*!
* \brief Allocates physically contiguous region in memory (limited by MAX_XFER). * \brief Allocates physically contiguous region in memory (limited by MAX_XFER).
...@@ -41,52 +77,23 @@ void VTAMemFree(void* buf); ...@@ -41,52 +77,23 @@ void VTAMemFree(void* buf);
* \param buf Pointer to memory region allocated with VTAMemAlloc. * \param buf Pointer to memory region allocated with VTAMemAlloc.
* \return The physical address of the memory region. * \return The physical address of the memory region.
*/ */
uint32_t VTAGetMemPhysAddr(void* buf); vta_phy_addr_t VTAGetMemPhysAddr(void* buf);
/*! /*!
* \brief Flushes the region of memory out of the CPU cache to DRAM. * \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed. * \param buf Pointer to memory region allocated with VTAMemAlloc to be flushed.
* This needs to be the physical address.
* \param size Size of the region to flush in Bytes. * \param size Size of the region to flush in Bytes.
*/ */
void VTAFlushCache(void* buf, int size); void VTAFlushCache(vta_phy_addr_t buf, int size);
/*! /*!
* \brief Invalidates the region of memory that is cached. * \brief Invalidates the region of memory that is cached.
* \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated. * \param buf Pointer to memory region allocated with VTAMemAlloc to be invalidated.
* This needs to be the physical address.
* \param size Size of the region to invalidate in Bytes. * \param size Size of the region to invalidate in Bytes.
*/ */
void VTAInvalidateCache(void* buf, int size); void VTAInvalidateCache(vta_phy_addr_t buf, int size);
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *VTAMapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void VTAUnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
/*! /*!
* \brief Programming the bit stream on the FPGA. * \brief Programming the bit stream on the FPGA.
......
...@@ -35,7 +35,7 @@ struct DataBuffer { ...@@ -35,7 +35,7 @@ struct DataBuffer {
*/ */
void InvalidateCache(size_t offset, size_t size) { void InvalidateCache(size_t offset, size_t size) {
if (!kBufferCoherent) { if (!kBufferCoherent) {
VTAInvalidateCache(reinterpret_cast<void*>(phy_addr_ + offset), size); VTAInvalidateCache(phy_addr_ + offset, size);
} }
} }
/*! /*!
...@@ -45,7 +45,7 @@ struct DataBuffer { ...@@ -45,7 +45,7 @@ struct DataBuffer {
*/ */
void FlushCache(size_t offset, size_t size) { void FlushCache(size_t offset, size_t size) {
if (!kBufferCoherent) { if (!kBufferCoherent) {
VTAFlushCache(reinterpret_cast<void*>(phy_addr_ + offset), size); VTAFlushCache(phy_addr_ + offset, size);
} }
} }
/*! /*!
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
*/ */
#include <vta/driver.h> #include <vta/driver.h>
#include <thread>
#include "./pynq_driver.h" #include "./pynq_driver.h"
...@@ -16,16 +17,16 @@ void VTAMemFree(void* buf) { ...@@ -16,16 +17,16 @@ void VTAMemFree(void* buf) {
cma_free(buf); cma_free(buf);
} }
uint32_t VTAGetMemPhysAddr(void* buf) { vta_phy_addr_t VTAGetMemPhysAddr(void* buf) {
return cma_get_phy_addr(buf); return cma_get_phy_addr(buf);
} }
void VTAFlushCache(void* buf, int size) { void VTAFlushCache(vta_phy_addr_t buf, int size) {
xlnkFlushCache(buf, size); xlnkFlushCache(reinterpret_cast<void*>(buf), size);
} }
void VTAInvalidateCache(void* buf, int size) { void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
xlnkInvalidateCache(buf, size); xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
} }
void *VTAMapRegister(uint32_t addr, size_t length) { void *VTAMapRegister(uint32_t addr, size_t length) {
...@@ -57,6 +58,85 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { ...@@ -57,6 +58,85 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)); return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
} }
/*!
 * \brief Device context that holds the register maps of the four VTA
 *  stages (fetch, load, compute, store).
 *
 * The four regions are mapped with VTAMapRegister in the constructor and
 * released with VTAUnmapRegister in the destructor. Copying is disabled:
 * a copy would hold the same four mappings and unmap them a second time
 * when it is destroyed.
 */
class VTADevice {
 public:
  /*! \brief Map the configuration registers of every VTA stage. */
  VTADevice() {
    // VTA stage handles
    vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
    vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
    vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
    vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
  }

  // The destructor unmaps the handles, so a copied object would unmap twice.
  VTADevice(const VTADevice&) = delete;
  VTADevice& operator=(const VTADevice&) = delete;

  /*! \brief Unmap the configuration registers of every VTA stage. */
  ~VTADevice() {
    // Close VTA stage handle
    VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
    VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
    VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
    VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
  }

  /*!
   * \brief Program the stage registers, start the VTA and poll until done.
   * \param insn_phy_addr The physical address of the instruction stream.
   * \param insn_count Instruction count.
   * \param wait_cycles The maximum number of polling iterations.
   * \return 0 if VTA_DONE was observed within wait_cycles polls, 1 otherwise.
   *  With wait_cycles == 0 the device is still started, but no poll is
   *  made and 1 is returned.
   */
  int Run(vta_phy_addr_t insn_phy_addr,
          uint32_t insn_count,
          uint32_t wait_cycles) {
    // NOTE: Register address map is derived from the auto-generated
    // driver files available under hardware/build/vivado/<design>/export/driver
    // FETCH @ 0x10 : Data signal of insn_count_V
    VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count);
    // FETCH @ 0x18 : Data signal of insns_V
    VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr);
    // LOAD @ 0x10 : Data signal of inputs_V
    VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
    // LOAD @ 0x18 : Data signal of weight_V
    VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
    // COMPUTE @ 0x20 : Data signal of uops_V
    VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
    // COMPUTE @ 0x28 : Data signal of biases_V
    VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
    // STORE @ 0x10 : Data signal of outputs_V
    VTAWriteMappedReg(vta_store_handle_, 0x10, 0);

    // VTA start
    VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
    VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
    VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
    VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);

    // Loop until the VTA is done: poll COMPUTE @ 0x18 for VTA_DONE,
    // yielding the CPU between polls.
    unsigned t = 0;
    unsigned flag = 0;
    for (; t < wait_cycles; ++t) {
      flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
      if (flag == VTA_DONE) break;
      std::this_thread::yield();
    }
    // Report error if timeout: t only reaches wait_cycles when no poll
    // ever observed VTA_DONE.
    return t < wait_cycles ? 0 : 1;
  }

 private:
  // VTA handles (register maps)
  void* vta_fetch_handle_{nullptr};
  void* vta_load_handle_{nullptr};
  void* vta_compute_handle_{nullptr};
  void* vta_store_handle_{nullptr};
};
/*!
 * \brief Create a VTADevice and return it as an opaque device handle.
 * \return The handle; pass it to VTADeviceFree to destroy the device.
 */
VTADeviceHandle VTADeviceAlloc() {
  VTADevice* dev = new VTADevice();
  return dev;
}
/*!
 * \brief Destroy the VTADevice behind an opaque device handle.
 * \param handle The device handle to be freed.
 */
void VTADeviceFree(VTADeviceHandle handle) {
  VTADevice* dev = static_cast<VTADevice*>(handle);
  delete dev;
}
/*!
 * \brief Forward a run request to the VTADevice behind the handle.
 * \param handle The device handle.
 * \param insn_phy_addr The physical address of the instruction stream.
 * \param insn_count Instruction count.
 * \param wait_cycles The maximum number of cycles to wait.
 * \return The result of VTADevice::Run (0 on success, 1 on timeout).
 */
int VTADeviceRun(VTADeviceHandle handle,
                 vta_phy_addr_t insn_phy_addr,
                 uint32_t insn_count,
                 uint32_t wait_cycles) {
  VTADevice* dev = static_cast<VTADevice*>(handle);
  const int status = dev->Run(insn_phy_addr, insn_count, wait_cycles);
  return status;
}
void VTAProgram(const char* bitstream) { void VTAProgram(const char* bitstream) {
int elem; int elem;
FILE *src, *dst, *partial; FILE *src, *dst, *partial;
......
...@@ -32,6 +32,11 @@ void xlnkFlushCache(void* buf, int size); ...@@ -32,6 +32,11 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size); void xlnkInvalidateCache(void* buf, int size);
#endif #endif
/*!
 * \brief Returns a memory map to FPGA configuration registers.
 * \param addr The base physical address of the configuration registers.
 * \param length The size of the memory mapped region in bytes.
 * \return A pointer to the memory mapped region.
 */
void *VTAMapRegister(uint32_t addr, size_t length);
/*!
 * \brief Deletes the configuration register memory map.
 * \param vta The memory mapped region.
 * \param length The size of the memory mapped region in bytes.
 */
void VTAUnmapRegister(void *vta, size_t length);
/*!
 * \brief Writes to a memory mapped configuration register.
 * \param base_addr The handle to the memory mapped configuration registers.
 * \param offset The offset of the register to write to.
 * \param val The value to be written to the memory mapped register.
 */
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
/*!
 * \brief Reads from the memory mapped configuration register.
 * \param base_addr The handle to the memory mapped configuration registers.
 * \param offset The offset of the register to read from.
 * \return The value read from the memory mapped register.
 */
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief (Pynq only) Partial bitstream status file path */ /*! \brief (Pynq only) Partial bitstream status file path */
#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream" #define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief (Pynq only) Bitstream destination file path */ /*! \brief (Pynq only) Bitstream destination file path */
...@@ -44,9 +49,6 @@ void xlnkInvalidateCache(void* buf, int size); ...@@ -44,9 +49,6 @@ void xlnkInvalidateCache(void* buf, int size);
/*! \brief (Pynq only) MMIO driver constant */ /*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1)) #define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief Physically contiguous buffer size limit */
#define VTA_MAX_XFER (1<<22)
/*! \brief VTA configuration register address range */ /*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100 #define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */ /*! \brief VTA configuration register start value */
......
/*! /*!
* Copyright (c) 2018 by Contributors * Copyright (c) 2018 by Contributors
* \file runtime.cc * \file runtime.cc
* \brief VTA runtime for PYNQ in C++11 * \brief Generic VTA runtime in C++11.
*
* The runtime depends on specific instruction
* stream spec as specified in hw_spec.h
* It is intended to be used as a dynamic library
* to enable hot swapping of hardware configurations.
*/ */
#ifdef VTA_PYNQ_TARGET
#include "./pynq/pynq_driver.h"
#endif // VTA_PYNQ_TARGET
#include <vta/driver.h> #include <vta/driver.h>
#include <vta/hw_spec.h> #include <vta/hw_spec.h>
#include <vta/runtime.h> #include <vta/runtime.h>
...@@ -245,7 +245,7 @@ class BaseQueue { ...@@ -245,7 +245,7 @@ class BaseQueue {
if (!coherent_ && always_cache_ && dram_extent != 0) { if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8; dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8; dram_extent = dram_extent * elem_bits / 8;
VTAFlushCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin), VTAFlushCache(dram_phy_addr_ + dram_begin,
dram_extent); dram_extent);
} }
} }
...@@ -254,7 +254,7 @@ class BaseQueue { ...@@ -254,7 +254,7 @@ class BaseQueue {
if (!coherent_ && always_cache_ && dram_extent != 0) { if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8; dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8; dram_extent = dram_extent * elem_bits / 8;
VTAInvalidateCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin), VTAInvalidateCache(dram_phy_addr_ + dram_begin,
dram_extent); dram_extent);
} }
} }
...@@ -818,20 +818,13 @@ class CommandQueue { ...@@ -818,20 +818,13 @@ class CommandQueue {
void InitSpace() { void InitSpace() {
uop_queue_.InitSpace(); uop_queue_.InitSpace();
insn_queue_.InitSpace(); insn_queue_.InitSpace();
// VTA stage handles device_ = VTADeviceAlloc();
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE); assert(device_ != nullptr);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
printf("Initialize VTACommandHandle...\n"); printf("Initialize VTACommandHandle...\n");
} }
~CommandQueue() { ~CommandQueue() {
// Close VTA stage handle VTADeviceFree(device_);
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
printf("Close VTACommandhandle...\n"); printf("Close VTACommandhandle...\n");
} }
...@@ -951,44 +944,14 @@ class CommandQueue { ...@@ -951,44 +944,14 @@ class CommandQueue {
assert(reinterpret_cast<VTAMemInsn*>( assert(reinterpret_cast<VTAMemInsn*>(
insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH); insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH);
#ifdef VTA_PYNQ_TARGET
// Make sure that we don't exceed contiguous physical memory limits // Make sure that we don't exceed contiguous physical memory limits
assert(insn_queue_.count() < VTA_MAX_XFER); assert(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
int timeout = VTADeviceRun(
// NOTE: Register address map is derived from the auto-generated device_,
// driver files available under hardware/build/vivado/<design>/export/driver insn_queue_.dram_phy_addr(),
// FETCH @ 0x10 : Data signal of insn_count_V insn_queue_.count(),
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count()); wait_cycles);
// FETCH @ 0x18 : Data signal of insns_V assert(timeout == 0);
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr());
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
// Report error if timeout
assert(t < wait_cycles);
#endif // VTA_PYNQ_TARGET
// Reset buffers // Reset buffers
uop_queue_.Reset(); uop_queue_.Reset();
insn_queue_.Reset(); insn_queue_.Reset();
...@@ -1147,7 +1110,7 @@ class CommandQueue { ...@@ -1147,7 +1110,7 @@ class CommandQueue {
void CheckInsnOverFlow() { void CheckInsnOverFlow() {
// At each API call, we can at most commit: // At each API call, we can at most commit:
// one pending store, one pending load, and one uop // one pending store, one pending load, and one uop
if (insn_queue_.count() >= VTA_MAX_XFER) { if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
this->AutoSync(); this->AutoSync();
} }
} }
...@@ -1155,11 +1118,7 @@ class CommandQueue { ...@@ -1155,11 +1118,7 @@ class CommandQueue {
void AutoSync() { void AutoSync() {
this->Synchronize(1 << 31); this->Synchronize(1 << 31);
} }
// VTA handles (register maps)
VTAHandle vta_fetch_handle_{nullptr};
VTAHandle vta_load_handle_{nullptr};
VTAHandle vta_compute_handle_{nullptr};
VTAHandle vta_store_handle_{nullptr};
// Internal debug flag // Internal debug flag
int debug_flag_{0}; int debug_flag_{0};
// The kernel we currently recording // The kernel we currently recording
...@@ -1168,6 +1127,8 @@ class CommandQueue { ...@@ -1168,6 +1127,8 @@ class CommandQueue {
UopQueue<VTA_MAX_XFER, true, true> uop_queue_; UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
// instruction queue // instruction queue
InsnQueue<VTA_MAX_XFER, true, true> insn_queue_; InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
// Device handle
VTADeviceHandle device_{nullptr};
}; };
} // namespace vta } // namespace vta
...@@ -1302,11 +1263,3 @@ void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { ...@@ -1302,11 +1263,3 @@ void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
static_cast<vta::CommandQueue*>(cmd)-> static_cast<vta::CommandQueue*>(cmd)->
Synchronize(wait_cycles); Synchronize(wait_cycles);
} }
extern "C" int VTARuntimeDynamicMagic() {
#ifdef VTA_DYNAMIC_MAGIC
return VTA_DYNAMIC_MAGIC;
#else
return 0;
#endif
}
...@@ -11,10 +11,6 @@ ...@@ -11,10 +11,6 @@
#include "../../nnvm/tvm/src/runtime/workspace_pool.h" #include "../../nnvm/tvm/src/runtime/workspace_pool.h"
extern "C" {
typedef void (*FShutdown)();
typedef int (*FDynamicMagic)();
}
namespace tvm { namespace tvm {
namespace runtime { namespace runtime {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment