Commit dddd8d1a by Tianqi Chen

[DOCS] Add intro tutorial (#45)

parent 505f9dd1
...@@ -9,10 +9,10 @@ NNVM is a reusable computational graph compilation stack for deep learning syste ...@@ -9,10 +9,10 @@ NNVM is a reusable computational graph compilation stack for deep learning syste
- Optimize computation graphs to improve performance. - Optimize computation graphs to improve performance.
- Compile into executable modules and deploy to different hardware backends with minimum dependency. - Compile into executable modules and deploy to different hardware backends with minimum dependency.
NNVM is designed to add new frontend, operators and graph optimizations in a decentralized fashion without changing the core interface. NNVM is part of [TVM stack](https://github.com/dmlc/tvm). NNVM compiler toolchain can target hardware backends supported by TVM. NNVM is designed to add new frontend, operators and graph optimizations in a decentralized fashion without changing the core interface. It is part of [TVM stack](https://github.com/dmlc/tvm). The compiler toolchain can target hardware backends supported by TVM.
The compiled module can be deployed to server, mobile, embedded devices and browsers with minimum dependency, in languages including c++, python, javascript, java, objective-c. The compiled module can be deployed to server, mobile, embedded devices and browsers with minimum dependency, in languages including c++, python, javascript, java, objective-c.
The following code snippet demonstrates the general workflow of nnvm compiler toolchain. The following code snippet demonstrates the general workflow of nnvm.
```python ```python
import tvm import tvm
......
...@@ -16,6 +16,7 @@ import os, subprocess ...@@ -16,6 +16,7 @@ import os, subprocess
import shlex import shlex
import recommonmark import recommonmark
import sphinx_gallery import sphinx_gallery
from tvm.contrib import rpc, graph_runtime
from recommonmark.parser import CommonMarkParser from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify from recommonmark.transform import AutoStructify
......
...@@ -54,7 +54,7 @@ def set_dtype_inputs(g, dtype): ...@@ -54,7 +54,7 @@ def set_dtype_inputs(g, dtype):
""" """
if isinstance(dtype, dict): if isinstance(dtype, dict):
list_dtype = [ list_dtype = [
DTYPE_TO_TCODE[dtype.get(name, "default")] DTYPE_TO_TCODE[str(dtype.get(name, "default"))]
for name in g.index.input_names] for name in g.index.input_names]
else: else:
list_dtype = [DTYPE_TO_TCODE[dtype]] * len(g.index.input_names) list_dtype = [DTYPE_TO_TCODE[dtype]] * len(g.index.input_names)
......
"""
Get Started with NNVM
=====================
**Author**: `Tianqi Chen <https://tqchen.github.io/>`_
This article is an introductory tutorial to workflow in NNVM.
"""
import nnvm.compiler
import nnvm.symbol as sym
######################################################################
# Declare Computation
# -------------------
# We start by describing our need using computational graph.
# Most deep learning frameworks use computation graph to describe
# their computation. In this example, we directly use
# NNVM's API to construct the computational graph.
#
# .. note::
#
# In a typical deep learning compilation workflow,
# we can get the models from :any:`nnvm.frontend`
#
# The following code snippet describes :math:`z = x + \sqrt{y}`
# and creates a nnvm graph from the description.
# We can print out the graph ir to check the graph content.
x = sym.Variable("x")
y = sym.Variable("y")
z = sym.elemwise_add(x, sym.sqrt(y))
compute_graph = nnvm.graph.create(z)
print("-------compute graph-------")
print(compute_graph.ir())
######################################################################
# Compile
# -------
# We can call :any:`nnvm.compiler.build` to compile the graph.
# The build function takes a shape parameter which specifies the
# input shape requirement. Here we only need to pass in shape of ``x``
# and the other one will be inferred automatically by NNVM.
#
# The function returns three values. ``deploy_graph`` contains
# the final compiled graph structure. ``lib`` is a :any:`tvm.module.Module`
# that contains compiled CUDA functions. We do not need the ``params``
# in this case.
shape = (4,)
deploy_graph, lib, params = nnvm.compiler.build(
compute_graph, target="cuda", shape={"x": shape}, dtype="float32")
######################################################################
# We can print out the IR of ``deploy_graph`` to understand what just
# happened under the hood. We can find that ``deploy_graph`` only
# contains a single operator ``tvm_op``. This is because NNVM
# automatically fused the operator together into one operator.
#
print("-------deploy graph-------")
print(deploy_graph.ir())
######################################################################
# Let us also peek into content of ``lib``.
# Typically a compiled TVM CUDA module contains a host module(lib)
# and a device module(``lib.imported_modules[0]``) that contains the CUDA code.
# We print out the the generated device code here.
# This is exactly a fused CUDA version of kernel that the graph points to.
#
print("-------deploy library-------")
print(lib.imported_modules[0].get_source())
######################################################################
# Deploy and Run
# --------------
# Now that we have have compiled module, let us run it.
# We can use :any:`graph_runtime <tvm.contrib.graph_runtime.create>`
# in tvm to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`.
# We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`,
# :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and
# :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` function
# to set the input, execute the graph and get the output we need.
#
import tvm
import numpy as np
from tvm.contrib import graph_runtime, util
module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
x_np = np.array([1, 2, 3, 4]).astype("float32")
y_np = np.array([4, 4, 4, 4]).astype("float32")
# set input to the graph module
module.set_input(x=x_np, y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())
######################################################################
# Provide Model Parameters
# ------------------------
# Most deep learning models contains two types of inputs: parameters
# that remains fixed during inference and data input that need to
# change for each inference task. It is helpful to provide these
# information to NNVM. Let us assume that ``y`` is the parameter
# in our example. We can provide the model parameter information
# by the params argument to :any:`nnvm.compiler.build`.
#
deploy_graph, lib, params = nnvm.compiler.build(
compute_graph, target="cuda", shape={"x": shape}, params={"y": y_np})
######################################################################
# This time we will need params value returned by :any:`nnvm.compiler.build`.
# NNVM applys optimization to pre-compute the intermediate values in
# the graph that can be determined by parameters. In this case
# :math:`\sqrt{y}` can be pre-computed. The pre-computed values
# are returned as new params. We can print out the new compiled library
# to confirm that the fused kernel only now contains add.
#
print("-----optimized params-----")
print(params)
print("-------deploy library-------")
print(lib.imported_modules[0].get_source())
######################################################################
# Save the Deployed Module
# ------------------------
# We can save the ``deploy_graph``, ``lib`` and ``params`` separately
# and load them back later. We can use :any:`tvm.module.Module` to export
# the compiled library. ``deploy_graph`` is saved in json format and ``params``
# is serialized into a bytearray.
#
temp = util.tempdir()
path_lib = temp.relpath("deploy.so")
lib.export_library(path_lib)
with open(temp.relpath("deploy.json"), "w") as fo:
fo.write(deploy_graph.json())
with open(temp.relpath("deploy.params"), "wb") as fo:
fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())
######################################################################
# We can load the module back.
loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
params = nnvm.compiler.load_param_dict(loaded_params)
# directly load from byte array
module.load_params(loaded_params)
module.run(x=x_np)
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())
######################################################################
# Deploy using Another Language
# -----------------------------
# We use python in this example for demonstration.
# We can also deploy the compiled modules with other languages
# supported by TVM such as c++, java, javascript.
# The graph module itself is fully embedded in TVM runtime.
#
# The following block demonstrates how we can directly use TVM's
# runtime API to execute the compiled module.
# You can find similar runtime API in TVMRuntime of other languages.
#
fcreate = tvm.get_global_func("tvm.graph_runtime.create")
ctx = tvm.gpu(0)
gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
set_input("x", tvm.nd.array(x_np))
gmodule["load_params"](loaded_params)
run()
out = tvm.nd.empty(shape)
get_output(0, out)
print(out.asnumpy())
...@@ -17,19 +17,24 @@ import nnvm.testing ...@@ -17,19 +17,24 @@ import nnvm.testing
# --------------------------------- # ---------------------------------
# NNVM optimizes the graph and relies on TVM to generate fast GPU code. # NNVM optimizes the graph and relies on TVM to generate fast GPU code.
# To get the maximum performance, we need to enable nvcc's compiler hook. # To get the maximum performance, we need to enable nvcc's compiler hook.
# This gives better performance than nvrtc mode. # This usually gives better performance than nvrtc mode.
@tvm.register_func @tvm.register_func
def tvm_callback_cuda_compile(code): def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"]) ptx = nvcc.compile_cuda(code, target="ptx")
return ptx return ptx
###################################################################### ######################################################################
# Prepare the Benchmark # Prepare the Benchmark
# --------------------- # ---------------------
# We construct a standard imagenet inference benchmark. # We construct a standard imagenet inference benchmark.
# We use nnvm's testing utility to produce the model description and random parameters so that the example does not # NNVM needs two things to compile a deep learning model:
# depend on a specific front-end framework. #
# - net: the graph representation of the computation
# - params: a dictionary of str to parameters
#
# We use nnvm's testing utility to produce the model description and random parameters
# so that the example does not depend on a specific front-end framework.
# #
# .. note:: # .. note::
# #
...@@ -48,11 +53,6 @@ net, params = nnvm.testing.mobilenet.get_workload( ...@@ -48,11 +53,6 @@ net, params = nnvm.testing.mobilenet.get_workload(
###################################################################### ######################################################################
# Compile the Graph # Compile the Graph
# ----------------- # -----------------
# NNVM needs two things to compile a deep learning model:
#
# - net: the graph representation of the computation
# - params: a dictionary of str to parameters
#
# To compile the graph, we call the build function with the graph # To compile the graph, we call the build function with the graph
# configuration and parameters. # configuration and parameters.
# When parameters are provided, NNVM will pre-compute certain part of the graph if possible (e.g. simplify batch normalization to scale shift), # When parameters are provided, NNVM will pre-compute certain part of the graph if possible (e.g. simplify batch normalization to scale shift),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment