""" Quick Start - End-to-End Tutorial for NNVM/TVM Pipeline ======================================================= **Author**: `Yao Wang <https://github.com/kevinthesun>`_ This example shows how to build a neural network with NNVM python frontend and generate runtime library for Nvidia GPU and Raspberry Pi with TVM. (Thanks to Tianqi's `tutorial for cuda <http://nnvm.tvmlang.org/tutorials/get_started.html>`_ and Ziheng's `tutorial for Raspberry Pi <http://nnvm.tvmlang.org/tutorials/deploy_model_on_rasp.html>`_) To run this notebook, you need to install tvm and nnvm following `these instructions <https://github.com/dmlc/nnvm/blob/master/docs/how_to/install.md>`_. Notice that you need to build tvm with cuda and llvm. """ ###################################################################### # Overview for Supported Hardware Backend of TVM # ----------------------------- # The image below shows hardware backend currently supported by TVM: # # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png # :align: center # :scale: 100% # # In this tutorial, we'll choose cuda and llvm as target backends. # To begin with, let's import NNVM and TVM. import tvm import nnvm.compiler import nnvm.testing ###################################################################### # Define Neural Network in NNVM # ----------------------------- # First, let's define a neural network with nnvm python frontend. # For simplicity, we'll use pre-defined resnet-18 network in NNVM. # Parameters are initialized with Xavier initializer. # NNVM also supports other model formats such as MXNet, CoreML and ONNX. # # In this tutorial, we assume we will do inference on our device # and the batch size is set to be 1. Input images are RGB color # images of size 224 * 224. We can call the :any:`nnvm.symbol.debug_str` # to show the network structure. batch_size = 1 num_class = 1000 image_shape = (3, 224, 224) data_shape = (batch_size,) + image_shape out_shape = (batch_size, num_class) net, params = nnvm.testing.resnet.get_workload(batch_size=batch_size, image_shape=image_shape) print(net.debug_str()) ###################################################################### # Compilation # ---------------------------- # Next step is to compile the model using the NNVM/TVM pipeline. # Users can specify the optimization level of the compilation. # Currently this value can be 0 to 2, which corresponds to # "SimplifyInference", "OpFusion" and "PrecomputePrune" respectively. # In this example we set optimization level to be 0 # and use Raspberry Pi as compile target. # # :any:`nnvm.compiler.build` returns three components: the execution graph in # json format, the TVM module library of compiled functions specifically # for this graph on the target hardware, and the parameter blobs of # the model. During the compilation, NNVM does the graph-level # optimization while TVM does the tensor-level optimization, resulting # in an optimized runtime module for model serving. # # We'll first compile for Nvidia GPU. opt_level = 0 target = tvm.target.cuda() with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build( net, target, shape={"data": data_shape}, params=params) ###################################################################### # Save Compiled Module # ---------------------------- # After compilation, we can save the graph, lib and params into separate files # and deploy them to Nvidia GPU. 
from tvm.contrib import util

temp = util.tempdir()
path_lib = temp.relpath("deploy_lib.so")
lib.export_library(path_lib)
with open(temp.relpath("deploy_graph.json"), "w") as fo:
    fo.write(graph.json())
with open(temp.relpath("deploy_param.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# Deploy Locally to Nvidia GPU
# ----------------------------
# Now we can load the module back.

import numpy as np
from tvm.contrib import graph_runtime

loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy_graph.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())

module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)
input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# print the first 10 elements of the output
print(out.asnumpy()[0][0:10])

######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# ----------------------------------------------------------------
# Following the steps above, we can also compile the model for a Raspberry Pi.
# TVM provides an rpc module to help with remote deployment.
#
# For demonstration, we simply start an RPC server on the same machine
# if :code:`use_rasp` is False. If you have set up the remote
# environment, change the three lines below: set :code:`use_rasp` to
# True, and replace the host and port with your device's host address
# and port number.
#
# If we run the example locally for demonstration, we can simply set the
# compilation target to `llvm`.
# To run it on the Raspberry Pi, you need to specify its instruction set.
# `llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon`
# is the recommended compilation configuration, thanks to Ziheng's work.

from tvm.contrib import rpc

use_rasp = False
host = 'rasp0'
port = 9090

if not use_rasp:
    # run the server locally
    host = 'localhost'
    port = 9090
    server = rpc.Server(host=host, port=port)

# compile and save the model library
if use_rasp:
    target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"
else:
    target = "llvm"

# use `with tvm.target.rasp()` for some target-specific optimization
with tvm.target.rasp():
    graph, lib, params = nnvm.compiler.build(
        net, target, shape={"data": data_shape}, params=params)

temp = util.tempdir()
path_lib = temp.relpath("deploy_lib_rasp.o")
lib.save(path_lib)

# connect to the server
remote = rpc.connect(host, port)

# upload the library to the remote device and load it
remote.upload(path_lib)
rlib = remote.load_module('deploy_lib_rasp.o')

ctx = remote.cpu(0)
# upload the parameters
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

# create the remote runtime module
module = graph_runtime.create(graph, rlib, ctx)
# set the parameters
module.set_input(**rparams)
# set input data
input_data = np.random.uniform(size=data_shape)
module.set_input('data', tvm.nd.array(input_data.astype('float32')))
# run
module.run()
out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
# print the first 10 elements of the output
print(out.asnumpy()[0][0:10])

if not use_rasp:
    # terminate the local server
    server.terminate()
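
######################################################################
# Note that the local RPC server above is only for demonstration. To deploy
# to a real Raspberry Pi, an RPC server has to be listening on the device
# itself. Below is a minimal sketch of what to run on the Pi (assuming the
# TVM runtime has been built there); it mirrors the `rpc.Server` call used
# above, and the bind address `0.0.0.0` is just an illustrative choice:
#
# .. code-block:: python
#
#     # run this on the Raspberry Pi, not on the host machine
#     from tvm.contrib import rpc
#     server = rpc.Server(host='0.0.0.0', port=9090)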