Commit 0faefafc by Tianqi Chen

[DRIVER][RUNTIME] Make runtime fully device agnostic (#23)

parent dea167a8
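The substance of this change is a small device API (VTADeviceAlloc, VTADeviceRun, VTADeviceFree) that hides all register-map and MMIO details behind an opaque handle, so the runtime no longer needs PYNQ-specific code paths. To illustrate how little a new backend has to provide, here is a minimal sketch of a hypothetical simulator-style backend; the SimDevice struct and its always-succeeds behaviour are illustrative assumptions, not part of this commit.

/* Hypothetical sketch of a minimal backend behind the new device API.
 * SimDevice and its "always succeeds" behaviour are illustrative only. */
#include <vta/driver.h>

namespace {
struct SimDevice {
  // A real backend would map registers or open a device node here.
  int Run(vta_phy_addr_t insn_phy_addr, uint32_t insn_count,
          uint32_t wait_cycles) {
    (void)insn_phy_addr; (void)insn_count; (void)wait_cycles;
    return 0;  // 0 = success, 1 = timeout, mirroring VTADeviceRun's contract
  }
};
}  // namespace

VTADeviceHandle VTADeviceAlloc() {
  return new SimDevice();
}

void VTADeviceFree(VTADeviceHandle handle) {
  delete static_cast<SimDevice*>(handle);
}

int VTADeviceRun(VTADeviceHandle handle,
                 vta_phy_addr_t insn_phy_addr,
                 uint32_t insn_count,
                 uint32_t wait_cycles) {
  return static_cast<SimDevice*>(handle)->Run(insn_phy_addr, insn_count, wait_cycles);
}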
/*!
* Copyright (c) 2018 by Contributors
* \file vta_driver.h
* \brief General driver interface.
* \brief Driver interface used by the runtime.
*
* The driver's implementation is device-specific.
*/
#ifndef VTA_DRIVER_H_
......@@ -11,16 +13,50 @@
extern "C" {
#endif
#include <stdlib.h>
#include <stdint.h>
#include <stdlib.h>
/*! \brief Memory management constants */
/*! \brief Memory management constants for cached memory */
#define VTA_CACHED 1
/*! \brief Memory management constants */
/*! \brief Memory management constants for non-cached memory */
#define VTA_NOT_CACHED 0
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*! \brief Physically contiguous buffer size limit */
#ifndef VTA_MAX_XFER
#define VTA_MAX_XFER (1<<22)
#endif
/*! \brief Device resource context */
typedef void * VTADeviceHandle;
/*! \brief physical address */
typedef uint32_t vta_phy_addr_t;
/*!
* \brief Allocate a device resource handle
* \return The device handle.
*/
VTADeviceHandle VTADeviceAlloc();
/*!
* \brief Free a device handle
* \param handle The device handle to be freed.
*/
void VTADeviceFree(VTADeviceHandle handle);
/*!
* \brief Launch the instruction stream and block until it completes.
* \param device The device handle.
* \param insn_phy_addr The physical address of the instruction stream.
* \param insn_count The instruction count.
* \param wait_cycles The maximum number of cycles to wait.
*
* \return 0 on success, 1 on timeout.
*/
int VTADeviceRun(VTADeviceHandle device,
vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles);
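For context, a host-side caller is expected to drive this API roughly as sketched below. This is a hedged sketch: the VTAMemAlloc(size, cached) signature is assumed from the rest of the driver header (it is not shown in this hunk), and the buffer handling plus the 1 << 20 cycle budget are illustrative choices, not values the API mandates.

/* Hedged usage sketch: sizes, the cached flag, and the cycle budget are
 * illustrative; VTAMemAlloc(size, cached) is assumed from the full header. */
#include <vta/driver.h>
#include <assert.h>
#include <string.h>

void RunInsnStream(const void* insns, uint32_t insn_count, size_t nbytes) {
  assert(nbytes <= VTA_MAX_XFER);
  // Stage the instruction stream in physically contiguous memory.
  void* buf = VTAMemAlloc(nbytes, VTA_CACHED);
  memcpy(buf, insns, nbytes);
  // Make the stream visible to the accelerator before launching it.
  VTAFlushCache(VTAGetMemPhysAddr(buf), static_cast<int>(nbytes));
  VTADeviceHandle device = VTADeviceAlloc();
  int timeout = VTADeviceRun(device, VTAGetMemPhysAddr(buf), insn_count, 1 << 20);
  assert(timeout == 0);  // 0 on success, 1 on timeout
  VTADeviceFree(device);
  VTAMemFree(buf);
}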
/*!
* \brief Allocates a physically contiguous region in memory (limited by VTA_MAX_XFER).
......@@ -41,52 +77,23 @@ void VTAMemFree(void* buf);
* \param buf Pointer to memory region allocated with VTAMemAlloc.
* \return The physical address of the memory region.
*/
uint32_t VTAGetMemPhysAddr(void* buf);
vta_phy_addr_t VTAGetMemPhysAddr(void* buf);
/*!
* \brief Flushes the region of memory out of the CPU cache to DRAM.
* \param buf Physical address of the memory region allocated with VTAMemAlloc to be flushed.
* This needs to be the physical address returned by VTAGetMemPhysAddr.
* \param size Size of the region to flush, in bytes.
*/
void VTAFlushCache(void* buf, int size);
void VTAFlushCache(vta_phy_addr_t buf, int size);
/*!
* \brief Invalidates the region of memory that is cached.
* \param buf Physical address of the memory region allocated with VTAMemAlloc to be invalidated.
* This needs to be the physical address returned by VTAGetMemPhysAddr.
* \param size Size of the region to invalidate, in bytes.
*/
void VTAInvalidateCache(void* buf, int size);
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *VTAMapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void VTAUnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void VTAWriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned VTAReadMappedReg(VTAHandle vta_base, unsigned offset);
void VTAInvalidateCache(vta_phy_addr_t buf, int size);
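The invalidate call is the mirror image of the flush: once the accelerator has written results into a cached buffer, the host must drop stale cache lines before reading them back. A minimal sketch, assuming the buffer was obtained from VTAMemAlloc with VTA_CACHED:

/* Hedged sketch of the invalidate side of the contract; out_bytes is
 * whatever the accelerator wrote, and the buffer is assumed to come
 * from VTAMemAlloc(..., VTA_CACHED). */
#include <vta/driver.h>

void ReadBackResults(void* out_buf, int out_bytes) {
  // Drop stale CPU cache lines so the host sees the device's writes.
  VTAInvalidateCache(VTAGetMemPhysAddr(out_buf), out_bytes);
  // out_buf can now be read safely on the host side.
}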
/*!
* \brief Programs the bitstream onto the FPGA.
......
......@@ -35,7 +35,7 @@ struct DataBuffer {
*/
void InvalidateCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAInvalidateCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
VTAInvalidateCache(phy_addr_ + offset, size);
}
}
/*!
......@@ -45,7 +45,7 @@ struct DataBuffer {
*/
void FlushCache(size_t offset, size_t size) {
if (!kBufferCoherent) {
VTAFlushCache(reinterpret_cast<void*>(phy_addr_ + offset), size);
VTAFlushCache(phy_addr_ + offset, size);
}
}
/*!
......
......@@ -5,6 +5,7 @@
*/
#include <vta/driver.h>
#include <thread>
#include "./pynq_driver.h"
......@@ -16,16 +17,16 @@ void VTAMemFree(void* buf) {
cma_free(buf);
}
uint32_t VTAGetMemPhysAddr(void* buf) {
vta_phy_addr_t VTAGetMemPhysAddr(void* buf) {
return cma_get_phy_addr(buf);
}
void VTAFlushCache(void* buf, int size) {
xlnkFlushCache(buf, size);
void VTAFlushCache(vta_phy_addr_t buf, int size) {
xlnkFlushCache(reinterpret_cast<void*>(buf), size);
}
void VTAInvalidateCache(void* buf, int size) {
xlnkInvalidateCache(buf, size);
void VTAInvalidateCache(vta_phy_addr_t buf, int size) {
xlnkInvalidateCache(reinterpret_cast<void*>(buf), size);
}
void *VTAMapRegister(uint32_t addr, size_t length) {
......@@ -57,6 +58,85 @@ uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
}
class VTADevice {
public:
VTADevice() {
// VTA stage handles
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
}
~VTADevice() {
// Close VTA stage handles
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
}
int Run(vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
// NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_count);
// FETCH @ 0x18 : Data signal of insns_V
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_phy_addr);
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
// Return 1 on timeout, 0 otherwise
return t < wait_cycles ? 0 : 1;
}
private:
// VTA handles (register maps)
void* vta_fetch_handle_{nullptr};
void* vta_load_handle_{nullptr};
void* vta_compute_handle_{nullptr};
void* vta_store_handle_{nullptr};
};
VTADeviceHandle VTADeviceAlloc() {
return new VTADevice();
}
void VTADeviceFree(VTADeviceHandle handle) {
delete static_cast<VTADevice*>(handle);
}
int VTADeviceRun(VTADeviceHandle handle,
vta_phy_addr_t insn_phy_addr,
uint32_t insn_count,
uint32_t wait_cycles) {
return static_cast<VTADevice*>(handle)->Run(
insn_phy_addr, insn_count, wait_cycles);
}
void VTAProgram(const char* bitstream) {
int elem;
FILE *src, *dst, *partial;
......
......@@ -32,6 +32,11 @@ void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
void *VTAMapRegister(uint32_t addr, size_t length);
void VTAUnmapRegister(void *vta, size_t length);
void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
/*! \brief (Pynq only) Partial bitstream status file path */
#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief (Pynq only) Bitstream destination file path */
......@@ -44,9 +49,6 @@ void xlnkInvalidateCache(void* buf, int size);
/*! \brief (Pynq only) MMIO driver constant */
#define VTA_PYNQ_MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief Physically contiguous buffer size limit */
#define VTA_MAX_XFER (1<<22)
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
......
/*!
* Copyright (c) 2018 by Contributors
* \file runtime.cc
* \brief VTA runtime for PYNQ in C++11
* \brief Generic VTA runtime in C++11.
*
* The runtime depends on the specific instruction
* stream spec defined in hw_spec.h.
* It is intended to be used as a dynamic library
* to enable hot swapping of hardware configurations.
*/
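The "dynamic library" remark above means that a build of the runtime matching a particular hardware configuration can be loaded at run time and swapped for another. A hedged sketch using plain dlopen is shown below; the library path ./libvta.so and the choice of symbol to probe are illustrative assumptions, not paths defined by this commit.

/* Hedged sketch: loading a configuration-specific runtime build at run time.
 * The library path and the probed symbol are illustrative assumptions. */
#include <dlfcn.h>
#include <cstdio>

int main() {
  void* lib = dlopen("./libvta.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // The driver API is declared extern "C", so symbol names are unmangled.
  typedef void* (*FDeviceAlloc)();
  FDeviceAlloc device_alloc =
      reinterpret_cast<FDeviceAlloc>(dlsym(lib, "VTADeviceAlloc"));
  if (device_alloc != nullptr) {
    void* device = device_alloc();
    (void)device;  // a real caller would keep and later free this handle
  }
  dlclose(lib);
  return 0;
}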
#ifdef VTA_PYNQ_TARGET
#include "./pynq/pynq_driver.h"
#endif // VTA_PYNQ_TARGET
#include <vta/driver.h>
#include <vta/hw_spec.h>
#include <vta/runtime.h>
......@@ -245,7 +245,7 @@ class BaseQueue {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAFlushCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
VTAFlushCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
......@@ -254,7 +254,7 @@ class BaseQueue {
if (!coherent_ && always_cache_ && dram_extent != 0) {
dram_begin = dram_begin * elem_bits / 8;
dram_extent = dram_extent * elem_bits / 8;
VTAInvalidateCache(reinterpret_cast<void*>(dram_phy_addr_ + dram_begin),
VTAInvalidateCache(dram_phy_addr_ + dram_begin,
dram_extent);
}
}
......@@ -818,20 +818,13 @@ class CommandQueue {
void InitSpace() {
uop_queue_.InitSpace();
insn_queue_.InitSpace();
// VTA stage handles
vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
device_ = VTADeviceAlloc();
assert(device_ != nullptr);
printf("Initialize VTACommandHandle...\n");
}
~CommandQueue() {
// Close VTA stage handle
VTAUnmapRegister(vta_fetch_handle_, VTA_RANGE);
VTAUnmapRegister(vta_load_handle_, VTA_RANGE);
VTAUnmapRegister(vta_compute_handle_, VTA_RANGE);
VTAUnmapRegister(vta_store_handle_, VTA_RANGE);
VTADeviceFree(device_);
printf("Close VTACommandhandle...\n");
}
......@@ -951,44 +944,14 @@ class CommandQueue {
assert(reinterpret_cast<VTAMemInsn*>(
insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH);
#ifdef VTA_PYNQ_TARGET
// Make sure that we don't exceed contiguous physical memory limits
assert(insn_queue_.count() < VTA_MAX_XFER);
// NOTE: Register address map is derived from the auto-generated
// driver files available under hardware/build/vivado/<design>/export/driver
// FETCH @ 0x10 : Data signal of insn_count_V
VTAWriteMappedReg(vta_fetch_handle_, 0x10, insn_queue_.count());
// FETCH @ 0x18 : Data signal of insns_V
VTAWriteMappedReg(vta_fetch_handle_, 0x18, insn_queue_.dram_phy_addr());
// LOAD @ 0x10 : Data signal of inputs_V
VTAWriteMappedReg(vta_load_handle_, 0x10, 0);
// LOAD @ 0x18 : Data signal of weight_V
VTAWriteMappedReg(vta_load_handle_, 0x18, 0);
// COMPUTE @ 0x20 : Data signal of uops_V
VTAWriteMappedReg(vta_compute_handle_, 0x20, 0);
// COMPUTE @ 0x28 : Data signal of biases_V
VTAWriteMappedReg(vta_compute_handle_, 0x28, 0);
// STORE @ 0x10 : Data signal of outputs_V
VTAWriteMappedReg(vta_store_handle_, 0x10, 0);
// VTA start
VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START);
VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART);
VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART);
// Loop until the VTA is done
unsigned t, flag = 0;
for (t = 0; t < wait_cycles; ++t) {
flag = VTAReadMappedReg(vta_compute_handle_, 0x18);
if (flag == VTA_DONE) break;
std::this_thread::yield();
}
// Report error if timeout
assert(t < wait_cycles);
#endif // VTA_PYNQ_TARGET
assert(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
int timeout = VTADeviceRun(
device_,
insn_queue_.dram_phy_addr(),
insn_queue_.count(),
wait_cycles);
assert(timeout == 0);
// Reset buffers
uop_queue_.Reset();
insn_queue_.Reset();
......@@ -1147,7 +1110,7 @@ class CommandQueue {
void CheckInsnOverFlow() {
// At each API call, we can commit at most:
// one pending store, one pending load, and one uop.
if (insn_queue_.count() >= VTA_MAX_XFER) {
if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
this->AutoSync();
}
}
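For a sense of scale, the overflow check now reasons in bytes rather than raw instruction counts. The arithmetic below assumes sizeof(VTAGenericInsn) == 16 (a 128-bit instruction word, which is typical for VTA but not stated in this diff); under that assumption the queue auto-syncs after roughly 262,140 queued instructions.

/* Hedged arithmetic sketch of the new overflow check, assuming
 * sizeof(VTAGenericInsn) == 16 (128-bit instructions), which is not
 * stated in this diff. */
#include <cstdio>
#include <cstdint>

int main() {
  const uint64_t kMaxXfer = 1ULL << 22;  // VTA_MAX_XFER, in bytes
  const uint64_t kInsnBytes = 16;        // assumed sizeof(VTAGenericInsn)
  // AutoSync fires once (count + 4) * kInsnBytes reaches kMaxXfer.
  uint64_t threshold = kMaxXfer / kInsnBytes - 4;
  std::printf("auto-sync after ~%llu queued instructions\n",
              static_cast<unsigned long long>(threshold));
  return 0;
}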
......@@ -1155,11 +1118,7 @@ class CommandQueue {
void AutoSync() {
this->Synchronize(1 << 31);
}
// VTA handles (register maps)
VTAHandle vta_fetch_handle_{nullptr};
VTAHandle vta_load_handle_{nullptr};
VTAHandle vta_compute_handle_{nullptr};
VTAHandle vta_store_handle_{nullptr};
// Internal debug flag
int debug_flag_{0};
// The kernel we currently recording
......@@ -1168,6 +1127,8 @@ class CommandQueue {
UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
// instruction queue
InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
// Device handle
VTADeviceHandle device_{nullptr};
};
} // namespace vta
......@@ -1302,11 +1263,3 @@ void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
static_cast<vta::CommandQueue*>(cmd)->
Synchronize(wait_cycles);
}
extern "C" int VTARuntimeDynamicMagic() {
#ifdef VTA_DYNAMIC_MAGIC
return VTA_DYNAMIC_MAGIC;
#else
return 0;
#endif
}
......@@ -11,10 +11,6 @@
#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
extern "C" {
typedef void (*FShutdown)();
typedef int (*FDynamicMagic)();
}
namespace tvm {
namespace runtime {
......