"""
Compile GPU Inference
=====================
**Author**: `Yuwei Hu <https://huyuwei.github.io/>`_

This is an example of using NNVM to compile a MobileNet/ResNet model and deploy
its inference on the GPU.

To begin with, we import nnvm (for compilation) and TVM (for deployment).
"""
import tvm
import numpy as np
from tvm.contrib import nvcc, graph_runtime
import nnvm.compiler
import nnvm.testing

######################################################################
# Register the NVCC Compiler Option
# ---------------------------------
# NNVM optimizes the graph and relies on TVM to generate fast GPU code.
# To get the maximum performance, we need to enable nvcc's compiler hook.
# This usually gives better performance than nvrtc mode.

@tvm.register_func
def tvm_callback_cuda_compile(code):
    ptx = nvcc.compile_cuda(code, target="ptx")
    return ptx

######################################################################
# Prepare the Benchmark
# ---------------------
# We construct a standard ImageNet inference benchmark.
# NNVM needs two things to compile a deep learning model:
#
# - net: the graph representation of the computation
# - params: a dictionary mapping parameter names (str) to parameter values
#
# We use nnvm's testing utility to produce the model description and random parameters
# so that the example does not depend on a specific front-end framework.
#
# .. note::
#
#   In a typical workflow, we can get this pair from :any:`nnvm.frontend`,
#   as sketched below.
#
target = "cuda"
ctx = tvm.gpu(0)
batch_size = 1
num_classes = 1000
image_shape = (3, 224, 224)
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_classes)
# To use ResNet for inference, run the following instead:
#net, params = nnvm.testing.resnet.get_workload(
#    batch_size=1, image_shape=image_shape)
net, params = nnvm.testing.mobilenet.get_workload(
    batch_size=1, image_shape=image_shape)
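# In a typical workflow the (net, params) pair comes from a front-end importer
# in nnvm.frontend instead of the testing workloads. A minimal sketch, assuming
# a pretrained MXNet checkpoint saved under the prefix "resnet-18" (the file
# names here are illustrative):
#
# import mxnet as mx
# import nnvm.frontend
# sym, arg_params, aux_params = mx.model.load_checkpoint("resnet-18", 0)
# net, params = nnvm.frontend.from_mxnet(sym, arg_params, aux_params)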

######################################################################
# Compile the Graph
# -----------------
# To compile the graph, we call the build function with the graph
# configuration and parameters.
# When parameters are provided, NNVM will pre-compute certain parts of the graph
# if possible (e.g. simplify batch normalization to a scale-shift), and return
# the updated parameters.

graph, lib, params = nnvm.compiler.build(
    net, target, shape={"data": data_shape}, params=params)
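# The returned graph is an nnvm.graph.Graph. To inspect what the compiler
# produced (a sketch, purely for illustration), its JSON form can be dumped:
#
# print(graph.json())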


######################################################################
# Run the Compiled Module
# -----------------------
#
# To deploy the module, we call :any:`tvm.contrib.graph_runtime.create`,
# passing in the graph, the lib, and the context.
# Thanks to TVM, we can deploy the compiled module to many platforms and languages.
# The deployment module is designed to have minimal dependencies.
# This example runs on the same machine.
#
# Note that the code below no longer depends on NNVM; it only relies on TVM's
# runtime to run (deploy) the model.
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_runtime.create(graph, lib, ctx)
# set input
module.set_input(**params)
module.set_input("data", data)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))
# convert to numpy
out.asnumpy()
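
# Since the compiled module only depends on the TVM runtime, its pieces can be
# saved to disk and shipped to another machine. A minimal sketch (the file
# names are illustrative):
#
# lib.export_library("deploy.so")
# with open("deploy.json", "w") as f:
#     f.write(graph.json())
# with open("deploy.params", "wb") as f:
#     f.write(nnvm.compiler.save_param_dict(params))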