""" Get Started with NNVM ===================== **Author**: `Tianqi Chen <https://tqchen.github.io/>`_ This article is an introductory tutorial to workflow in NNVM. """ import nnvm.compiler import nnvm.symbol as sym ###################################################################### # Declare Computation # ------------------- # We start by describing our need using computational graph. # Most deep learning frameworks use computation graph to describe # their computation. In this example, we directly use # NNVM's API to construct the computational graph. # # .. note:: # # In a typical deep learning compilation workflow, # we can get the models from :any:`nnvm.frontend` # # The following code snippet describes :math:`z = x + \sqrt{y}` # and creates a nnvm graph from the description. # We can print out the graph ir to check the graph content. x = sym.Variable("x") y = sym.Variable("y") z = sym.elemwise_add(x, sym.sqrt(y)) compute_graph = nnvm.graph.create(z) print("-------compute graph-------") print(compute_graph.ir()) ###################################################################### # Compile # ------- # We can call :any:`nnvm.compiler.build` to compile the graph. # The build function takes a shape parameter which specifies the # input shape requirement. Here we only need to pass in shape of ``x`` # and the other one will be inferred automatically by NNVM. # # The function returns three values. ``deploy_graph`` contains # the final compiled graph structure. ``lib`` is a :any:`tvm.module.Module` # that contains compiled CUDA functions. We do not need the ``params`` # in this case. shape = (4,) deploy_graph, lib, params = nnvm.compiler.build( compute_graph, target="cuda", shape={"x": shape}, dtype="float32") ###################################################################### # We can print out the IR of ``deploy_graph`` to understand what just # happened under the hood. We can find that ``deploy_graph`` only # contains a single operator ``tvm_op``. This is because NNVM # automatically fused the operator together into one operator. # print("-------deploy graph-------") print(deploy_graph.ir()) ###################################################################### # Let us also peek into content of ``lib``. # Typically a compiled TVM CUDA module contains a host module(lib) # and a device module(``lib.imported_modules[0]``) that contains the CUDA code. # We print out the the generated device code here. # This is exactly a fused CUDA version of kernel that the graph points to. # print("-------deploy library-------") print(lib.imported_modules[0].get_source()) ###################################################################### # Deploy and Run # -------------- # Now that we have have compiled module, let us run it. # We can use :any:`graph_runtime <tvm.contrib.graph_runtime.create>` # in tvm to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`. # We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`, # :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and # :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` function # to set the input, execute the graph and get the output we need. 
######################################################################
# Deploy and Run
# --------------
# Now that we have a compiled module, let us run it.
# We can use :any:`graph_runtime <tvm.contrib.graph_runtime.create>`
# in TVM to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`.
# We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`,
# :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and
# :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` functions
# to set the input, execute the graph, and get the output we need.
#
import tvm
import numpy as np
from tvm.contrib import graph_runtime, util

module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
x_np = np.array([1, 2, 3, 4]).astype("float32")
y_np = np.array([4, 4, 4, 4]).astype("float32")
# set input to the graph module
module.set_input(x=x_np, y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())

######################################################################
# Provide Model Parameters
# ------------------------
# Most deep learning models contain two types of inputs: parameters
# that remain fixed during inference, and data inputs that change
# with each inference task. It is helpful to provide this
# information to NNVM. Let us assume that ``y`` is the parameter
# in our example. We can provide the model parameter information
# via the ``params`` argument to :any:`nnvm.compiler.build`.
#
deploy_graph, lib, params = nnvm.compiler.build(
    compute_graph, target="cuda", shape={"x": shape}, params={"y": y_np})

######################################################################
# This time we will need the ``params`` value returned by :any:`nnvm.compiler.build`.
# NNVM applies optimizations to pre-compute the intermediate values in
# the graph that can be determined from the parameters. In this case,
# :math:`\sqrt{y}` can be pre-computed. The pre-computed values
# are returned as the new ``params``. We can print out the new compiled library
# to confirm that the fused kernel now only contains the add operation.
#
print("-----optimized params-----")
print(params)
print("-------deploy library-------")
print(lib.imported_modules[0].get_source())

######################################################################
# Save the Deployed Module
# ------------------------
# We can save the ``deploy_graph``, ``lib`` and ``params`` separately
# and load them back later. We can use the ``export_library`` method of
# :any:`tvm.module.Module` to export the compiled library.
# ``deploy_graph`` is saved in JSON format and ``params``
# is serialized into a bytearray.
#
temp = util.tempdir()
path_lib = temp.relpath("deploy.so")
lib.export_library(path_lib)
with open(temp.relpath("deploy.json"), "w") as fo:
    fo.write(deploy_graph.json())
with open(temp.relpath("deploy.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# We can load the module back.

loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
params = nnvm.compiler.load_param_dict(loaded_params)
# directly load parameters from the serialized byte array
module.load_params(loaded_params)
module.run(x=x_np)
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())
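######################################################################
# As a quick sanity check (a sketch added for illustration, not part
# of the original workflow), we can compare the output of the loaded
# module against a NumPy reference of :math:`z = x + \sqrt{y}`.

# the pre-computed sqrt(y) is baked into params, so only x is fed at runtime
np.testing.assert_allclose(out.asnumpy(), x_np + np.sqrt(y_np), rtol=1e-5)
print("result matches the numpy reference")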
######################################################################
# Deploy using Another Language
# -----------------------------
# We use Python in this example for demonstration.
# We can also deploy the compiled modules with other languages
# supported by TVM, such as C++, Java, and JavaScript.
# The graph module itself is fully embedded in the TVM runtime.
#
# The following block demonstrates how we can directly use TVM's
# runtime API to execute the compiled module.
# You can find a similar runtime API in the TVM runtime of other languages.
#
fcreate = tvm.get_global_func("tvm.graph_runtime.create")
ctx = tvm.gpu(0)
# create the graph runtime module from the loaded graph, library and context
gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
# fetch the packed functions exposed by the graph runtime module
set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
set_input("x", tvm.nd.array(x_np))
gmodule["load_params"](loaded_params)
run()
out = tvm.nd.empty(shape)
get_output(0, out)
print(out.asnumpy())
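######################################################################
# As a final sanity check (again, a sketch for illustration), the raw
# runtime API path should produce the same values as the
# ``GraphModule`` path above.

np.testing.assert_allclose(out.asnumpy(), x_np + np.sqrt(y_np), rtol=1e-5)
print("runtime API result matches the numpy reference")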