[WIP] WebGL Backend (#672)

Basic WebGL Backend

[WIP] WebGL Backend (#672)
Basic WebGL Backend
589831df · Zhixun Tan · Tianqi Chen · d4a46898 · 589831df · 589831df
Commit 589831df authored Jan 20, 2018 by Zhixun Tan Committed by Tianqi Chen Jan 20, 2018
47 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ endif()
 tvm_option(USE_CUDA "Build with CUDA" OFF)
 tvm_option(USE_OPENCL "Build with OpenCL" OFF)
+tvm_option(USE_OPENGL "Build with OpenGL" OFF)
 tvm_option(USE_METAL "Build with Metal" OFF)
 tvm_option(USE_RPC "Build with RPC" ON)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
@@ -61,8 +62,8 @@ if(MSVC)
 else(MSVC)
  include(CheckCXXCompilerFlag)
  check_cxx_compiler_flag("-std=c++11"    SUPPORT_CXX11)
-  set(CMAKE_C_FLAGS "-O3 -Wall -std=c++11 -fPIC")
+  set(CMAKE_C_FLAGS "-O3 -Wall -fPIC")
-  set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS})
+  set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -std=c++11")
 endif(MSVC)
 # add source group
@@ -87,6 +88,7 @@ file(GLOB RUNTIME_SRCS src/runtime/*.cc)
 file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
 file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
 file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
+file(GLOB RUNTIME_OPENGL_SRCS src/runtime/opengl/*.cc)
 file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
 file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
 file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
@@ -135,6 +137,18 @@ else(USE_OPENCL)
  add_definitions(-DTVM_OPENCL_RUNTIME=0)
 endif(USE_OPENCL)
+if(USE_OPENGL)
+  find_package(OpenGL QUIET REQUIRED)
+  find_package(glfw3 QUIET REQUIRED)
+  message(STATUS "Build with OpenGL support")
+  include_directories(${OPENGL_INCLUDE_DIRS})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenGL_LIBRARIES} glfw)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_OPENGL_SRCS})
+  add_definitions(-DTVM_OPENGL_RUNTIME=1)
+else(USE_OPENGL)
+  add_definitions(-DTVM_OPENGL_RUNTIME=0)
+endif(USE_OPENGL)
 if(USE_METAL)
  find_package(OpenCL QUIET REQUIRED)
  message(STATUS "Build with Metal support")

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -88,6 +88,7 @@ stage('Build') {
           echo USE_CUDNN=1 >> config.mk
           echo USE_CUDA=1 >> config.mk
           echo USE_OPENCL=1 >> config.mk
+           echo USE_OPENGL=1 >> config.mk
           echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
           echo USE_RPC=1 >> config.mk
           echo USE_GRAPH_RUNTIME=1 >> config.mk
@@ -120,6 +121,7 @@ stage('Build') {
           echo USE_CUDA=0 >> config.mk
           echo USE_OPENCL=0 >> config.mk
           echo USE_RPC=0 >> config.mk
+           echo USE_OPENGL=1 >> config.mk
           echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
           """
        make('cpu', '-j2')

--- a/Makefile
+++ b/Makefile
@@ -32,8 +32,8 @@ OBJCFLAGS = -fno-objc-arc
 EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\
 	-Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\
 	-s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\
+	-s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\
 	$(INCLUDE_FLAGS)
 # llvm configuration
 ifdef LLVM_CONFIG
 	LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
@@ -54,6 +54,7 @@ METAL_SRC = $(wildcard src/runtime/metal/*.mm)
 CUDA_SRC = $(wildcard src/runtime/cuda/*.cc)
 ROCM_SRC = $(wildcard src/runtime/rocm/*.cc)
 OPENCL_SRC = $(wildcard src/runtime/opencl/*.cc)
+OPENGL_SRC = $(wildcard src/runtime/opengl/*.cc)
 RPC_SRC = $(wildcard src/runtime/rpc/*.cc)
 GRAPH_SRC = $(wildcard src/runtime/graph/*.cc)
 RUNTIME_SRC = $(wildcard src/runtime/*.cc)
@@ -65,6 +66,7 @@ METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
 CUDA_OBJ = $(patsubst src/%.cc, build/%.o, $(CUDA_SRC))
 ROCM_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCM_SRC))
 OPENCL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENCL_SRC))
+OPENGL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENGL_SRC))
 RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC))
 GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC))
 CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ)
@@ -119,6 +121,19 @@ else
 	CFLAGS += -DTVM_OPENCL_RUNTIME=0
 endif
+ifeq ($(USE_OPENGL), 1)
+	CFLAGS += -DTVM_OPENGL_RUNTIME=1
+	EMCC_FLAGS += -DTVM_OPENGL_RUNTIME=1
+	ifeq ($(UNAME_S), Darwin)
+		FRAMEWORKS += -framework OpenGL
+	else
+		LDFLAGS += -lGL -lglfw
+	endif
+	RUNTIME_DEP += $(OPENGL_OBJ)
+else
+	CFLAGS += -DTVM_OPENGL_RUNTIME=0
+endif
 ifeq ($(USE_METAL), 1)
 	CFLAGS += -DTVM_METAL_RUNTIME=1
 	LDFLAGS += -lobjc

--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -55,9 +55,11 @@ typedef int64_t tvm_index_t;
 /*! \brief Extension device types in TVM */
 typedef enum {
+  kOpenGL = 11,
  // Extension DRAM type, used for quickly test extension device
  // The device api can differ depending on the xpu driver registered.
-  kExtDev = 12
+  kExtDev = 12,
  // AddExtraTVMType which is not in DLPack here
 } TVMDeviceExtType;

--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -55,11 +55,16 @@ class DeviceAPI {
  /*!
   * \brief Allocate a data space on device.
   * \param ctx The device context to perform operation.
-   * \param size The size of the memory
+   * \param nbytes The number of bytes in memory.
   * \param alignment The alignment of the memory.
-   * \return The allocated device pointer
+   * \param type_hint The type of elements. Only needed by certain backends such
+   * as OpenGL, as nbytes & alignment are sufficient for most backends.
+   * \return The allocated device pointer.
   */
-  virtual void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) = 0;
+  virtual void* AllocDataSpace(TVMContext ctx,
+                               size_t nbytes,
+                               size_t alignment,
+                               TVMType type_hint) = 0;
  /*!
   * \brief Free a data space on device.
   * \param ctx The device context to perform operation.

--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -214,6 +214,11 @@ class Stage : public NodeRef {
   */
  Stage& double_buffer();   // NOLINT(*)
  /*!
+   * \brief Schedule for OpenGL fragment shader.
+   * \return reference to self.
+   */
+  Stage& opengl(); // NOLINT(*)
+  /*!
   * \brief whether the stage has been scheduled.
   * \return whether the stage has been scheduled.
   */

--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -17,7 +17,7 @@ from . import ir_builder
 from . import target
 from . import ndarray as nd
-from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, ext_dev
+from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, opengl, ext_dev
 from ._ffi.runtime_ctypes import TypeCode
 from ._ffi.function import Function

--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -97,6 +97,7 @@ class TVMContext(ctypes.Structure):
        8 : 'metal',
        9 : 'vpi',
        10: 'rocm',
+        11: 'opengl',
        12: 'ext_dev',
    }
    STR2MASK = {
@@ -111,6 +112,7 @@ class TVMContext(ctypes.Structure):
        'metal': 8,
        'vpi': 9,
        'rocm': 10,
+        'opengl': 11,
        'ext_dev': 12,
    }
    def __init__(self, device_type, device_id):

--- a/python/tvm/contrib/rpc.py
+++ b/python/tvm/contrib/rpc.py
@@ -285,6 +285,10 @@ class RPCSession(object):
        """Construct remote Metal device."""
        return self.context(8, dev_id)
+    def opengl(self, dev_id=0):
+        """Construct remote OpenGL device."""
+        return self.context(11, dev_id)
    def ext_dev(self, dev_id=0):
        """Construct remote extension device."""
        return self.context(12, dev_id)

--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -120,6 +120,21 @@ def vpi(dev_id=0):
    """
    return TVMContext(9, dev_id)
+def opengl(dev_id=0):
+    """Construct a OpenGL device
+    Parameters
+    ----------
+    dev_id : int, optional
+        The integer device id
+    Returns
+    -------
+    ctx : TVMContext
+        The created context
+    """
+    return TVMContext(11, dev_id)
 def ext_dev(dev_id=0):
    """Construct a extension device

--- a/python/tvm/schedule.py
+++ b/python/tvm/schedule.py
@@ -611,4 +611,11 @@ class Stage(NodeBase):
        """
        _api_internal._StageDoubleBuffer(self)
+    def opengl(self):
+        """The special OpenGL schedule
+        Maps each output element to a pixel.
+        """
+        _api_internal._StageOpenGL(self)
 _init_api("tvm.schedule")
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -67,7 +67,7 @@ class Target(object):
    Parameters
    ----------
-    target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "ext_dev"}
+    target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"}
        The major target name.
    options : list of str, optional
@@ -119,6 +119,8 @@ class Target(object):
        elif target_name in ("metal",):
            self.keys += ("gpu",)
            self.max_num_threads = 256
+        elif target_name in ("opengl",):
+            self.keys += ("opengl",)
        elif target_name in ("stackvm", "ext_dev"):
            # Do not now class for stacvm or ext_dev
            pass

--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -399,6 +399,11 @@ TVM_REGISTER_API("_StageDoubleBuffer")
    args[0].operator Stage().double_buffer();
  });
+TVM_REGISTER_API("_StageOpenGL")
+  .set_body([](TVMArgs args, TVMRetValue *ret) {
+    args[0].operator Stage().opengl();
+  });
 TVM_REGISTER_API("_ScheduleNormalize")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
    *ret = args[0].operator Schedule()

--- a/src/codegen/build_opengl.cc
+++ b/src/codegen/build_opengl.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ *  Build opengl modules from source.
+ * \file build_opengl.cc
+ */
+#include <tvm/base.h>
+#include "./codegen_opengl.h"
+#include "./build_common.h"
+namespace tvm {
+namespace codegen {
+runtime::Module BuildOpenGL(Array<LoweredFunc> funcs) {
+  bool output_ssa = false;
+  CodeGenOpenGL cg;
+  cg.Init(output_ssa);
+  for (LoweredFunc f : funcs) {
+    cg.AddFunction(f);
+  }
+  auto shaders = cg.Finish();
+#if TVM_OPENGL_RUNTIME
+  return OpenGLModuleCreate(shaders, "gl", ExtractFuncInfo(funcs));
+#else
+  LOG(WARNING) << "OpenGL runtime not enabled, return a source module...";
+  auto data = ToJSON(shaders);
+  return DeviceSourceModuleCreate(data, "gl", ExtractFuncInfo(funcs), "opengl");
+#endif  // TVM_OPENGL_RUNTIME
+}
+TVM_REGISTER_API("codegen.build_opengl")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = BuildOpenGL(args[0]);
+});
+}  // namespace codegen
+}  // namespace tvm
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -150,7 +150,7 @@ class CodeGenC :
  std::string GetStructRef(
      Type t, const Expr& buffer, const Expr& index, int kind);
  // print reference to a buffer as type t in index.
-  std::string GetBufferRef(
+  virtual std::string GetBufferRef(
      Type t, const Variable* buffer, Expr index);
  /*!
   * \brief If buffer is allocated as type t.

--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_opengl.cc
+ *
+ * We are targeting OpenGL 3.3. The reason of not targeting a recent version
+ * of OpenGL is to have better compatibility of WebGL 2.
+ */
+#include <tvm/runtime/config.h>
+#include <tvm/packed_func_ext.h>
+#include <vector>
+#include <string>
+#include "./codegen_opengl.h"
+#include "../runtime/thread_storage_scope.h"
+namespace tvm {
+namespace codegen {
+CodeGenOpenGL::CodeGenOpenGL()
+    : output_(nullptr), output_iter_var_(nullptr) {}
+void CodeGenOpenGL::InitFuncState(LoweredFunc f) {
+  CodeGenC::InitFuncState(f);
+  output_ = nullptr;
+  inputs_.clear();
+  output_iter_var_ = nullptr;
+  thread_extent_var_ = "";
+}
+void CodeGenOpenGL::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  this->decl_stream << "#version 300 es\n";
+  this->decl_stream << "precision highp float;\n";
+  // skip the first underscore, so SSA variable starts from _1
+  GetUniqueName("_");
+  // add to alloc buffer type.
+  for (const auto& kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+  // Allocate argument names. Store in `var_idmap_`.
+  for (auto arg : f->args) {
+    auto arg_name = GetUniqueName(arg.get()->name_hint);
+    var_idmap_[arg.get()] = arg_name;
+  }
+  thread_extent_var_ = GetUniqueName("thread_extent");
+  this->decl_stream << "uniform int " << thread_extent_var_ << ";\n";
+  this->stream << "void main() {\n";
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+  // Declare arguments.
+  for (auto arg : f->args) {
+    if (this->inputs_.find(arg.get()) != this->inputs_.cend()) {
+      // Declare input texture.
+      // Format:
+      // - Float: "uniform sampler2D {name};"
+      // - Int: "uniform isampler2D {name};"
+      // - UInt: "uniform usampler2D {name};"
+      auto arg_name = GetVarID(arg.get());
+      auto type_it = this->handle_data_type_.find(arg.get());
+      CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type.";
+      auto type = Type2TVMType(type_it->second);
+      CHECK_EQ(type.lanes, 1) << "Vector type not supported.";
+      switch (type.code) {
+        case kDLInt:
+          this->decl_stream << "uniform isampler2D " << arg_name << ";\n";
+          break;
+        case kDLUInt:
+          this->decl_stream << "uniform usampler2D " << arg_name << ";\n";
+          break;
+        case kDLFloat:
+          this->decl_stream << "uniform sampler2D " << arg_name << ";\n";
+          break;
+        default:
+          LOG(FATAL) << "Unsupported type code.";
+      }
+    } else if (this->output_ == arg.get()) {
+      // Declare output texture.
+      // Format: "out {type} {name};"
+      auto arg_name = GetVarID(arg.get());
+      auto type_it = this->handle_data_type_.find(arg.get());
+      CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type.";
+      auto type = type_it->second;
+      this->decl_stream << "out ";
+      PrintType(type, this->decl_stream);
+      this->decl_stream << " " << arg_name << ";\n";
+    } else {
+      // Declare uniform value.
+      // Format: "uniform {type} {name};"
+      auto arg_name = GetVarID(arg.get());
+      auto type = arg.get()->type;
+      this->decl_stream << "uniform ";
+      PrintType(type, this->decl_stream);
+      this->decl_stream << " " << arg_name << ";\n";
+    }
+  }
+  std::vector<std::string> arg_names;
+  std::vector<runtime::OpenGLArgKind> arg_kinds;
+  for (auto arg : f->args) {
+    std::string name = GetVarID(arg.get());
+    runtime::OpenGLArgKind kind;
+    if (inputs_.find(arg.get()) != inputs_.cend()) {
+      kind = runtime::OpenGLArgKind::kInputTexture;
+    } else if (output_ == arg.get()) {
+      kind = runtime::OpenGLArgKind::kOutputTexture;
+    } else {
+      kind = runtime::OpenGLArgKind::kUniform;
+    }
+    arg_names.push_back(name);
+    arg_kinds.push_back(kind);
+  }
+  shaders_[f->name] = runtime::OpenGLShader(
+      this->decl_stream.str() + this->stream.str(),
+      std::move(arg_names), std::move(arg_kinds),
+      this->thread_extent_var_);
+}
+std::unordered_map<std::string, runtime::OpenGLShader> CodeGenOpenGL::Finish() {
+  return shaders_;
+}
+void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) {
+  CHECK_EQ(iv->thread_tag, "threadIdx.x") << "Must be threadIdx.x";
+  CHECK(var_idmap_.find(iv->var.get()) == var_idmap_.end())
+    << "Only support one thread iter var";
+  CHECK(output_iter_var_ == nullptr) << "Only support one thread iter var";
+  var_idmap_[iv->var.get()] = iv->thread_tag;
+  output_iter_var_ = iv->var.get();
+  // Declare threadIdx local variable.
+  this->PrintIndent();
+  this->stream << "ivec2 threadIdx = ivec2(gl_FragCoord.xy);\n";
+  // Return directly if threadIdx.x >= thread_extent.
+  this->PrintIndent();
+  this->stream << "if (threadIdx.x >= " << thread_extent_var_ << ") {\n";
+  this->PrintIndent();
+  this->stream << "  return;\n";
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+// GLSL texture store is special. We can only store to one output texture, and
+// we must store to the index that matches the current "thread index".
+void CodeGenOpenGL::VisitStmt_(const Store* op) {
+  auto t = op->value.type();
+  auto buffer = op->buffer_var.get();
+  auto index = op->index;
+  if (t.lanes() == 1) {
+    // Store to a scalar.
+    CHECK(inputs_.find(buffer) == inputs_.cend())
+      << "Texture has been read from before. Must not store to it.";
+    if (output_ == nullptr) {
+      output_ = buffer;  // Record that this texture is the output.
+    } else {
+      CHECK(output_ == buffer) << "GLSL can only write to 1 texture.";
+    }
+    this->PrintIndent();
+    this->stream << GetBufferRef(t, buffer, index) << " = "
+                 << PrintExpr(op->value) << ";\n";
+  } else {
+    // Store to a vector.
+    LOG(FATAL) << "Vectorized store not implemented.";
+  }
+}
+// texelFetch(tex, ivec2(idx, 0), 0).r
+std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) {
+  std::ostringstream os;
+  os << "texelFetch(" << GetVarID(buffer) << ", ivec2(";
+  PrintExpr(index, os);
+  os << ", 0), 0).r";
+  return os.str();
+}
+// Print a reference expression to a buffer.
+// Format: texelFetch(buffer, index, 0).r
+std::string CodeGenOpenGL::GetBufferRef(
+    Type t, const Variable* buffer, Expr index) {
+  CHECK_EQ(t.lanes(), 1) << "Vector type not supported.";
+  CHECK(HandleTypeMatch(buffer, t)) << "Type mismatch not supported.";
+  if (buffer == this->output_) {
+    // This is the output texture.
+    CHECK_EQ(index.get(), output_iter_var_)
+      << "GLSL must access corresponding elem of output texture.";
+    return GetVarID(buffer);
+  } else {
+    // This is an input texture.
+    this->inputs_.insert(buffer);
+    return TexelFetch(buffer, index);
+  }
+}
+void CodeGenOpenGL::PrintType(Type t, std::ostream& os) {
+  switch (t.code()) {
+    case halideir_type_int:
+      CHECK_EQ(t.bits(), 32) << "Only support 32-bit int.";
+      os << "int";
+      break;
+    case halideir_type_uint:
+      CHECK_EQ(t.bits(), 32) << "Only support 32-bit uint.";
+      os << "uint";
+      break;
+    case halideir_type_float:
+      CHECK_EQ(t.bits(), 32) << "Only support 32-bit float.";
+      os << "float";
+      break;
+    default:
+      LOG(FATAL) << "Unsupported type code.";
+  }
+}
+// Codegen for immediate values
+void CodeGenOpenGL::VisitExpr_(const IntImm* op, std::ostream& os) {
+  CHECK_EQ(op->type, Int(32)) << "GLSL 3.0 only supports 32-bit ints.";
+  CodeGenC::VisitExpr_(op, os);
+}
+void CodeGenOpenGL::VisitExpr_(const UIntImm* op, std::ostream& os) {
+  CHECK_EQ(op->type, UInt(32)) << "GLSL 3.0 only supports 32-bit uints.";
+  CodeGenC::VisitExpr_(op, os);
+}
+void CodeGenOpenGL::VisitExpr_(const FloatImm* op, std::ostream& os) {
+  CHECK_EQ(op->type, Float(32)) << "GLSL 3.0 only supports 32-bit floats.";
+  CodeGenC::VisitExpr_(op, os);
+}
+void CodeGenOpenGL::VisitExpr_(const StringImm*, std::ostream& os) {
+  LOG(FATAL) << "GLSL 3.0 doesn't support strings.";
+}
+}  // namespace codegen
+}  // namespace tvm
--- a/src/codegen/codegen_opengl.h
+++ b/src/codegen/codegen_opengl.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_opengl.h
+ * \brief Generate OpenGL device code.
+ */
+#ifndef TVM_CODEGEN_CODEGEN_OPENGL_H_
+#define TVM_CODEGEN_CODEGEN_OPENGL_H_
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <string>
+#include "./codegen_c.h"
+#include "../runtime/opengl/opengl_module.h"
+namespace tvm {
+namespace codegen {
+class CodeGenOpenGL final : public CodeGenC {
+ public:
+  CodeGenOpenGL();
+  void AddFunction(LoweredFunc f);
+  std::unordered_map<std::string, runtime::OpenGLShader> Finish();
+  void InitFuncState(LoweredFunc f) final;
+  void BindThreadIndex(const IterVar& iv) final;
+  void VisitStmt_(const Store* op) final;
+  std::string TexelFetch(const Variable* buffer, Expr index);
+  std::string GetBufferRef(Type t, const Variable* buffer, Expr index) final;
+  void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
+  // Codegen for immediate values
+  void VisitExpr_(const IntImm* op, std::ostream& os) final;  // NOLINT(*)
+  void VisitExpr_(const UIntImm* op, std::ostream& os) final;  // NOLINT(*)
+  void VisitExpr_(const FloatImm* op, std::ostream& os) final;  // NOLINT(*)
+  void VisitExpr_(const StringImm* op, std::ostream& os) final;  // NOLINT(*)
+ private:
+  const Variable* output_{nullptr};
+  std::unordered_set<const Variable*> inputs_;
+  const Variable* output_iter_var_{nullptr};
+  std::unordered_map<std::string, runtime::OpenGLShader> shaders_;
+  std::string thread_extent_var_;
+};
+}  // namespace codegen
+}  // namespace tvm
+#endif  // TVM_CODEGEN_CODEGEN_OPENGL_H_
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -49,7 +49,10 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
      *rv = 1;
    }
  }
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t size,
+                       size_t alignment,
+                       TVMType type_hint) final {
    // always align to 32 bytes at least.
    CHECK_LE(alignment, runtime::kAllocAlignment);
    alignment = runtime::kAllocAlignment;

--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
    case kDLMetal: return "metal";
    case kDLVPI: return "vpi";
    case kDLROCM: return "rocm";
+    case kOpenGL: return "opengl";
    case kExtDev: return "ext_dev";
    default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
  }
@@ -95,7 +96,8 @@ DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) {
 }
 void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) {
-  return AllocDataSpace(ctx, size, kTempAllocaAlignment);
+  TVMType type_hint{kDLUInt, 8, 1};
+  return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
 }
 void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) {
@@ -365,7 +367,7 @@ int TVMArrayAlloc(const tvm_index_t* shape,
  size_t size = GetDataSize(arr);
  size_t alignment = GetDataAlignment(arr);
  arr->data = DeviceAPIManager::Get(arr->ctx)->AllocDataSpace(
-      arr->ctx, size, alignment);
+      arr->ctx, size, alignment, arr->dtype);
  *out = arr;
  API_END_HANDLE_ERROR(TVMArrayFree_(arr));
 }

--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -20,13 +20,16 @@ class CPUDeviceAPI final : public DeviceAPI {
      *rv = 1;
    }
  }
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final {
    void* ptr;
 #if _MSC_VER
-    ptr = _aligned_malloc(size, alignment);
+    ptr = _aligned_malloc(nbytes, alignment);
    if (ptr == nullptr) throw std::bad_alloc();
 #else
-    int ret = posix_memalign(&ptr, alignment, size);
+    int ret = posix_memalign(&ptr, alignment, nbytes);
    if (ret != 0) throw std::bad_alloc();
 #endif
    return ptr;

--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -54,12 +54,15 @@ class CUDADeviceAPI final : public DeviceAPI {
    }
    *rv = value;
  }
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final {
    CUDA_CALL(cudaSetDevice(ctx.device_id));
    CHECK_EQ(256 % alignment, 0U)
        << "CUDA space is aligned at 256 bytes";
    void *ret;
-    CUDA_CALL(cudaMalloc(&ret, size));
+    CUDA_CALL(cudaMalloc(&ret, nbytes));
    return ret;
  }

--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -63,7 +63,10 @@ class MetalWorkspace final : public DeviceAPI {
  // override device API
  void SetDevice(TVMContext ctx) final;
  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final;
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final;
  void FreeDataSpace(TVMContext ctx, void* ptr) final;
  void CopyDataFromTo(const void* from,
                      size_t from_size,

--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -123,12 +123,12 @@ void MetalWorkspace::SetDevice(TVMContext ctx) {
 }
 void* MetalWorkspace::AllocDataSpace(
-    TVMContext ctx, size_t size, size_t alignment) {
+    TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) {
  this->Init();
  id<MTLDevice> dev = GetDevice(ctx);
  // allocate buffer in GPU only mode.
  id<MTLBuffer> buf = [
-      dev newBufferWithLength:size
+      dev newBufferWithLength:nbytes
          options:MTLResourceStorageModePrivate];
  CHECK(buf != nil);
  return (__bridge void*)([buf retain]);

--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -115,6 +115,8 @@ bool RuntimeEnabled(const std::string& target) {
    f_name = "device_api.gpu";
  } else if (target == "cl" || target == "opencl") {
    f_name = "device_api.opencl";
+  } else if (target == "gl" || target == "opengl") {
+    f_name = "device_api.opengl";
  } else if (target == "mtl" || target == "metal") {
    f_name = "device_api.metal";
  } else if (target == "stackvm") {

--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -142,7 +142,10 @@ class OpenCLWorkspace final : public DeviceAPI {
  // override device API
  void SetDevice(TVMContext ctx) final;
  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final;
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t size,
+                       size_t alignment,
+                       TVMType type_hint) final;
  void FreeDataSpace(TVMContext ctx, void* ptr) final;
  void CopyDataFromTo(const void* from,
                      size_t from_offset,

--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -51,7 +51,7 @@ void OpenCLWorkspace::GetAttr(
 }
 void* OpenCLWorkspace::AllocDataSpace(
-    TVMContext ctx, size_t size, size_t alignment) {
+    TVMContext ctx, size_t size, size_t alignment, TVMType type_hint) {
  this->Init();
  CHECK(context != nullptr) << "No OpenCL device";
  cl_int err_code;

--- a/src/runtime/opengl/opengl_common.h
+++ b/src/runtime/opengl/opengl_common.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file opengl_common.h
+ * \brief OpenGL common header
+ */
+#ifndef TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_
+#define TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_
+#include <tvm/runtime/config.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/device_api.h>
+#include <dmlc/logging.h>
+#include <GL/gl.h>
+#include <GLFW/glfw3.h>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+namespace tvm {
+namespace runtime {
+namespace gl {
+// This file contains the following classes.
+class GLFunctionPointers;
+class OpenGLWorkspace;
+class Texture;
+class Program;
+inline GLFWglproc GetProcAddress(const char* procname) {
+  GLFWglproc proc = glfwGetProcAddress(procname);
+  CHECK(proc != nullptr) << "Cannot get function \"" << procname << "\"";
+  return proc;
+}
+#define SetGLFunctionPointer(NAME) \
+  NAME(decltype(NAME)(GetProcAddress("gl" #NAME)))
+/*!
+ * \brief The function pointers of all OpenGL APIs that are used.
+ * Must be constructed after creating an OpenGL context.
+ */
+class GLFunctionPointers {
+ public:
+  GLFunctionPointers()
+      : SetGLFunctionPointer(ActiveTexture),
+        SetGLFunctionPointer(AttachShader),
+        SetGLFunctionPointer(BindBuffer),
+        SetGLFunctionPointer(BindFramebuffer),
+        SetGLFunctionPointer(BindTexture),
+        SetGLFunctionPointer(BindVertexArray),
+        SetGLFunctionPointer(BufferData),
+        SetGLFunctionPointer(CheckFramebufferStatus),
+        SetGLFunctionPointer(Clear),
+        SetGLFunctionPointer(CompileShader),
+        SetGLFunctionPointer(CreateProgram),
+        SetGLFunctionPointer(CreateShader),
+        SetGLFunctionPointer(DeleteFramebuffers),
+        SetGLFunctionPointer(DeleteProgram),
+        SetGLFunctionPointer(DeleteShader),
+        SetGLFunctionPointer(DeleteTextures),
+        SetGLFunctionPointer(DetachShader),
+        SetGLFunctionPointer(DrawArrays),
+        SetGLFunctionPointer(DrawBuffers),
+        SetGLFunctionPointer(EnableVertexAttribArray),
+        SetGLFunctionPointer(Finish),
+        SetGLFunctionPointer(FramebufferTexture2D),
+        SetGLFunctionPointer(GenBuffers),
+        SetGLFunctionPointer(GenFramebuffers),
+        SetGLFunctionPointer(GenTextures),
+        SetGLFunctionPointer(GenVertexArrays),
+        SetGLFunctionPointer(GetAttribLocation),
+        SetGLFunctionPointer(GetError),
+        SetGLFunctionPointer(GetIntegerv),
+        SetGLFunctionPointer(GetProgramInfoLog),
+        SetGLFunctionPointer(GetProgramiv),
+        SetGLFunctionPointer(GetShaderInfoLog),
+        SetGLFunctionPointer(GetShaderiv),
+        SetGLFunctionPointer(GetString),
+        SetGLFunctionPointer(GetUniformLocation),
+        SetGLFunctionPointer(LinkProgram),
+        SetGLFunctionPointer(ReadPixels),
+        SetGLFunctionPointer(ShaderSource),
+        SetGLFunctionPointer(TexImage2D),
+        SetGLFunctionPointer(TexParameteri),
+        SetGLFunctionPointer(TexSubImage2D),
+        SetGLFunctionPointer(Uniform1f),
+        SetGLFunctionPointer(Uniform1i),
+        SetGLFunctionPointer(UseProgram),
+        SetGLFunctionPointer(VertexAttribPointer),
+        SetGLFunctionPointer(Viewport) {}
+  void (*ActiveTexture)(GLenum texture);
+  void (*AttachShader)(GLuint program, GLuint shader);
+  void (*BindBuffer)(GLenum target, GLuint buffer);
+  void (*BindFramebuffer)(GLenum target, GLuint framebuffer);
+  void (*BindTexture)(GLenum target, GLuint texture);
+  void (*BindVertexArray)(GLuint array);
+  void (*BufferData)(GLenum target, GLsizeiptr size, const GLvoid* data,
+                     GLenum usage);
+  GLenum (*CheckFramebufferStatus)(GLenum target);
+  void (*Clear)(GLbitfield mask);
+  void (*CompileShader)(GLuint shader);
+  GLuint (*CreateProgram)();
+  GLuint (*CreateShader)(GLenum shader_type);
+  void (*DeleteFramebuffers)(GLsizei n, const GLuint* framebuffers);
+  void (*DeleteProgram)(GLuint program);
+  void (*DeleteShader)(GLuint shader);
+  void (*DeleteTextures)(GLsizei n, const GLuint* textures);
+  void (*DetachShader)(GLuint program, GLuint shader);
+  void (*DrawArrays)(GLenum mode, GLint first, GLsizei count);
+  void (*DrawBuffers)(GLsizei n, const GLenum* bufs);
+  void (*EnableVertexAttribArray)(GLuint index);
+  void (*Finish)();
+  void (*FramebufferTexture2D)(GLenum target, GLenum attachment,
+                               GLenum textarget, GLuint texture, GLint level);
+  void (*GenBuffers)(GLsizei n, GLuint* buffers);
+  void (*GenFramebuffers)(GLsizei n, GLuint* ids);
+  void (*GenTextures)(GLsizei n, GLuint* textures);
+  void (*GenVertexArrays)(GLsizei n, GLuint* arrays);
+  GLint (*GetAttribLocation)(GLuint program, const GLchar* name);
+  GLenum (*GetError)();
+  void (*GetIntegerv)(GLenum pname, GLint* data);
+  void (*GetProgramInfoLog)(GLuint program, GLsizei maxLength, GLsizei* length,
+                            GLchar* info_log);
+  void (*GetProgramiv)(GLuint program, GLenum pname, GLint* params);
+  void (*GetShaderInfoLog)(GLuint shader, GLsizei max_length, GLsizei* length,
+                           GLchar* info_log);
+  void (*GetShaderiv)(GLuint shader, GLenum pname, GLint* params);
+  const GLubyte *(*GetString)(GLenum name);
+  GLint (*GetUniformLocation)(GLuint program, const GLchar* name);
+  void (*LinkProgram)(GLuint program);
+  void (*ReadPixels)(GLint x, GLint y, GLsizei width, GLsizei height,
+                     GLenum format, GLenum type, GLvoid* data);
+  void (*ShaderSource)(GLuint shader, GLsizei count, const GLchar** string,
+                       const GLint* length);
+  void (*TexImage2D)(GLenum target, GLint level, GLint internal_format,
+                     GLsizei width, GLsizei height, GLint border, GLenum format,
+                     GLenum type, const GLvoid* data);
+  void (*TexParameteri)(GLenum target, GLenum pname, GLint param);
+  void (*TexSubImage2D)(GLenum target, GLint level, GLint xoffset,
+                        GLint yoffset, GLsizei width, GLsizei height,
+                        GLenum format, GLenum type, const GLvoid* data);
+  void (*Uniform1f)(GLint location, GLfloat v0);
+  void (*Uniform1i)(GLint location, GLint v0);
+  void (*UseProgram)(GLuint program);
+  void (*VertexAttribPointer)(GLuint index, GLint size, GLenum type,
+                              GLboolean normalized, GLsizei stride,
+                              const GLvoid* pointer);
+  void (*Viewport)(GLint x, GLint y, GLsizei width, GLsizei height);
+};
+/*!
+ * \brief Process global OpenGL workspace.
+ */
+class OpenGLWorkspace final : public DeviceAPI {
+ public:
+  ~OpenGLWorkspace() final;
+  // override device API
+  void SetDevice(TVMContext ctx) final;
+  void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final;
+  void FreeDataSpace(TVMContext ctx, void* ptr) final;
+  void CopyDataFromTo(const void* from,
+                      size_t from_offset,
+                      void* to,
+                      size_t to_offset,
+                      size_t size,
+                      TVMContext ctx_from,
+                      TVMContext ctx_to,
+                      TVMStreamHandle stream) final;
+  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
+  void* AllocWorkspace(TVMContext ctx, size_t size) final;
+  void FreeWorkspace(TVMContext ctx, void* data) final;
+  /*!
+   * \brief Get the global OpenGL workspace.
+   * \return The global OpenGL workspace.
+   */
+  static const std::shared_ptr<OpenGLWorkspace>& Global();
+  /*!
+   * \brief Create an OpenGL program that uses the given fragment shader.
+   * \param fragment_shader The fragment shader **source**.
+   * \return The OpenGL program.
+   */
+  Program CreateProgram(const char* fragment_shader_src);
+  /*!
+   * \brief Create an OpenGL texture that stores an array.
+   * \param type Element type.
+   * \param nbytes Number of bytes in the array.
+   * \return The OpenGL texture.
+   */
+  Texture CreateTexture(TVMType type, size_t nbytes);
+  /*!
+   * \brief Upload user data into a sub-region of an OpenGL texture.
+   * \param texture The texture to be written to.
+   * \param begin The index of the first element to be written to.
+   * \param nelems The number of elements to be written to.
+   * \param data The user data.
+   */
+  void PutTextureData(Texture* texture,
+                      GLint begin,
+                      GLsizei nelems,
+                      const GLvoid* data);
+  /*!
+   * \brief Download a sub-region of an OpenGL texture.
+   * \param texture The texture to download from.
+   * \param begin The index of first element to download from.
+   * \param nelems The number of elements to download from.
+   * \param data The user buffer.
+   */
+  void GetTextureData(const Texture* texture,
+                      GLint begin,
+                      GLsizei nelems,
+                      GLvoid* data);
+  /*!
+   * \brief Set currently used OpenGL program.
+   */
+  void SetCurrentProgram(const Program& program);
+  /*!
+   * \brief Set uniform values for an OpenGL program.
+   * Must call SetCurrentProgram before calling this.
+   * \param program The OpenGL program.
+   * \param name The uniform argument name.
+   * \param type The type of the uniform.
+   * \param value The value to pass in.
+   */
+  void SetUniform(const Program& program,
+                  const std::string& name,
+                  TVMType type,
+                  void* value);
+  /*!
+   * \brief Set input texture for an OpenGL program.
+   * Must call SetCurrentProgram before calling this.
+   * \param program The OpenGL program.
+   * \param name The texture uniform argument name.
+   * \param unit The texture unit to use. Each input texture must occupy a
+   * different unit.
+   * \param texture The OpenGL texture to pass in.
+   */
+  void SetInputTexture(const Program& program,
+                       const std::string& name,
+                       GLuint unit,
+                       Texture* texture);
+  /*!
+   * \brief Render to a texture.
+   * \param output The output texture.
+   */
+  void Render(Texture* output);
+ private:
+  friend class Texture;
+  friend class Program;
+  // Global singleton. Hide constructor.
+  OpenGLWorkspace();
+  GLFWwindow* window_;
+  std::unique_ptr<GLFunctionPointers> gl;
+  GLuint vertex_shader_;
+  static const int kWindowWidth = 640;
+  static const int kWindowHeight = 480;
+  struct Vertex {
+    float x, y;
+  };
+  static constexpr size_t kNumVertices = 6;
+  static const Vertex vertices[kNumVertices];
+  static const char* vertex_shader_text_;
+  /*!
+   * \brief Bind a texture to a "texture unit".
+   * After calling this function, the "texture unit" becomes "active", and the
+   * texture is bound to GL_TEXTURE_2D in that "texture unit".
+   * \param unit The texture unit to activate.
+   * \param texture The texture to bind.
+   */
+  void BindTextureUnit(GLuint unit, GLuint texture);
+  /*!
+   * \brief Callback in Texture's destructor.
+   */
+  void OnDeleteTexture(GLuint texture);
+  /*!
+   * \brief Callback in Program's destructor.
+   */
+  void OnDeleteProgram(GLuint program);
+  /*!
+   * \brief Check if there is any outstanding OpenGL error. If there is, crash.
+   */
+  void CheckOpenGLError();
+  /*!
+   * \brief Get the maximum number of texture units.
+   */
+  GLuint NumTextureUnits();
+  /*!
+   * \brief Create and compile a shader from a source string.
+   * \param shader_kind The kind of shader.
+   * Could be GL_VERTEX_SHADER or GL_FRAGMENT_SHADER.
+   * \param shader_src The source string of the shader.
+   * \return The compiled shader ID.
+   */
+  GLuint CreateShader(GLenum shader_kind, const char* shader_src);
+  /*!
+   * \brief Create an OpenGL program that uses the given fragment shader.
+   * \param fragment_shader The **compiled** fragment shader.
+   * \return The OpenGL program.
+   */
+  Program CreateProgram(GLuint fragment_shader);
+};
+/*!
+ * \brief An OpenGL program, composed of a vertex shader and a fragment shader.
+ * In TVM, every program has the same vertex shader.
+ * So a program just corresponds to a fragment shader.
+ * A program can only be created by the workspace.
+ * This class is just a wrapper over an OpenGL program ID.
+ */
+class Program {
+ public:
+  // Move constructor.
+  Program(Program&& other) noexcept
+      : workspace_(other.workspace_), program_(other.program_) {
+    other.program_ = kInvalidProgram;
+  }
+  // Move assignment.
+  Program& operator=(Program&& other) noexcept {
+    workspace_ = other.workspace_;
+    program_ = other.program_;
+    other.program_ = kInvalidProgram;
+    return *this;
+  }
+  // Disallow copy.
+  Program(const Program& other) = delete;
+  Program& operator=(const Program& other) = delete;
+  // Destructor.
+  ~Program() {
+    if (program_ != kInvalidProgram) {
+      workspace_->OnDeleteProgram(program_);
+      program_ = kInvalidProgram;
+    }
+  }
+ private:
+  friend class OpenGLWorkspace;
+  // Only OpenGLWorkspace can create a Program.
+  // We enforce this to make sure OpenGL is initialized.
+  explicit Program(OpenGLWorkspace* workspace, GLuint program)
+      : workspace_(workspace), program_(program) {}
+  // The internal OpenGL program ID.
+  GLuint program() const { return program_; }
+  static constexpr GLuint kInvalidProgram = static_cast<GLuint>(-1);
+  OpenGLWorkspace* workspace_;
+  GLuint program_;
+};
+/*!
+ * \brief The storage format of a texture.
+ * The members match the API of glTexImage2D.
+ */
+struct TextureFormat {
+  TextureFormat(GLint internal_format, GLenum format, GLenum type)
+      : internal_format(internal_format), format(format), type(type) {}
+  GLsizei elemsz() const {
+    switch (type) {
+      case GL_BYTE: case GL_UNSIGNED_BYTE:
+        return 1;
+      case GL_SHORT: case GL_UNSIGNED_SHORT:
+        return 2;
+      case GL_INT: case GL_UNSIGNED_INT:
+        return 4;
+      case GL_FLOAT:
+        return 4;
+      default:
+        LOG(FATAL) << "Unsupported type";
+        return -1;
+    }
+  }
+  bool operator==(const TextureFormat& other) const {
+    return std::make_tuple(internal_format, format, type) ==
+        std::make_tuple(other.internal_format, other.format, other.type);
+  }
+  GLint internal_format;  // OpenGL says this is GLint, not GLenum.
+  GLenum format;
+  GLenum type;
+};
+/*!
+ * \brief An OpenGL texture represents a chunk of GPU memory.
+ * This is the way we represent tensors.
+ * We always use 2D textures.
+ */
+class Texture {
+ public:
+  // Move constructor.
+  Texture(Texture&& other) noexcept
+      : workspace_(other.workspace_), texture_(other.texture_),
+        format_(other.format_), width_(other.width_), height_(other.height_) {
+    other.texture_ = kInvalidTexture;
+  }
+  // Move assignment.
+  Texture& operator=(Texture&& other) noexcept {
+    workspace_ = other.workspace_;
+    texture_ = other.texture_;
+    format_ = other.format_;
+    width_ = other.width_;
+    height_ = other.height_;
+    other.texture_ = kInvalidTexture;
+    return *this;
+  }
+  // Disallow copy.
+  Texture(const Texture& other) = delete;
+  Texture& operator=(const Texture& other) = delete;
+  // Destructor.
+  ~Texture() {
+    if (texture_ != kInvalidTexture) {
+      workspace_->OnDeleteTexture(texture_);
+      texture_ = kInvalidTexture;
+    }
+  }
+  /*!
+   * \brief The width of the texture in number of pixels.
+   */
+  GLsizei width() const { return width_; }
+  /*!
+   * \brief The height of the texture in number of pixels.
+   */
+  GLsizei height() const { return height_; }
+  /*!
+   * \brief The number of bytes of each element in the array.
+   */
+  GLsizei elemsz() const { return format_.elemsz(); }
+ private:
+  friend class OpenGLWorkspace;
+  // Only OpenGLWorkspace can create a Texture.
+  // We enforce this to make sure OpenGL is initialized.
+  // Always only use the first dimension of a 2D texture.
+  // The reason is that texelFetch only supports 2D textures.
+  explicit Texture(OpenGLWorkspace* workspace, GLuint texture,
+                   TextureFormat format,
+                   GLsizei width, GLsizei height)
+      : workspace_(workspace), texture_(texture), format_(format),
+        width_(width), height_(height) {}
+  // The internal texture ID.
+  GLuint texture() const { return texture_; }
+  static constexpr GLuint kInvalidTexture = static_cast<GLuint>(-1);
+  OpenGLWorkspace* workspace_;
+  GLuint texture_;
+  TextureFormat format_;
+  GLsizei width_;
+  GLsizei height_;
+};
+}  // namespace gl
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_OPENGL_OPENGL_COMMON_H_
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file opengl_device_api.cc
+ */
+#include "./opengl_common.h"
+#if TVM_OPENGL_RUNTIME
+#include <tvm/runtime/registry.h>
+#include <cstring>
+namespace tvm {
+namespace runtime {
+namespace gl {
+/*!
+ * \brief Turn OpenGL error enum to string.
+ */
+static const char* GLGetErrorString(GLenum error) {
+  switch (error) {
+    case GL_NO_ERROR:
+      return "GL_NO_ERROR";
+    case GL_INVALID_ENUM:
+      return "GL_INVALID_ENUM";
+    case GL_INVALID_VALUE:
+      return "GL_INVALID_VALUE";
+    case GL_INVALID_OPERATION:
+      return "GL_INVALID_OPERATION";
+    case GL_STACK_OVERFLOW:
+      return "GL_STACK_OVERFLOW";
+    case GL_STACK_UNDERFLOW:
+      return "GL_STACK_UNDERFLOW";
+    case GL_OUT_OF_MEMORY:
+      return "GL_OUT_OF_MEMORY";
+    default:
+      return "Unknown OpenGL error code";
+  }
+}
+/*!
+ * \brief Get the latest error.
+ */
+void OpenGLWorkspace::CheckOpenGLError() {
+  GLenum err = gl->GetError();
+  CHECK_EQ(err, GL_NO_ERROR) << "OpenGL error, code=" << err << ": "
+                             << gl::GLGetErrorString(err);
+}
+/*!
+ * \brief Protected OpenGL call.
+ * \param func Expression to call.
+ */
+#define OPENGL_CALL(func)                                                      \
+  {                                                                            \
+    (func);                                                                    \
+    CheckOpenGLError();                                                        \
+  }
+/*!
+ * \brief The error handling callback passed to GLFW.
+ */
+void GlfwErrorCallback(int err, const char* str) {
+  LOG(FATAL) << "Error: [" << err << "] " << str;
+}
+const std::shared_ptr<OpenGLWorkspace>& OpenGLWorkspace::Global() {
+  static std::shared_ptr<OpenGLWorkspace> inst(new OpenGLWorkspace);
+  return inst;
+}
+void OpenGLWorkspace::SetDevice(TVMContext ctx) {
+  CHECK_EQ(ctx.device_type, static_cast<int>(kOpenGL))
+    << "Device type must be OpenGL.";
+  CHECK_EQ(ctx.device_id, 0) << "Only support 1 OpenGL \"device\".";
+}
+void OpenGLWorkspace::GetAttr(
+    TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) {
+  switch (kind) {
+    case kExist: {
+      *rv = static_cast<int>(ctx.device_id == 0);
+      break;
+    }
+    case kMaxThreadsPerBlock: {
+      GLint max_texture_size;
+      OPENGL_CALL(gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size));
+      break;
+    }
+    case kWarpSize: {
+      *rv = 1;
+      break;
+    }
+    case kComputeVersion: {
+      break;
+    }
+  }
+}
+void* OpenGLWorkspace::AllocDataSpace(
+    TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) {
+  return reinterpret_cast<void*>(new Texture(CreateTexture(type_hint, nbytes)));
+}
+void OpenGLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
+  delete reinterpret_cast<Texture*>(ptr);
+}
+void OpenGLWorkspace::CopyDataFromTo(const void* from,
+                                     size_t from_offset,
+                                     void* to,
+                                     size_t to_offset,
+                                     size_t size,
+                                     TVMContext ctx_from,
+                                     TVMContext ctx_to,
+                                     TVMStreamHandle stream) {
+  CHECK(stream == nullptr);
+  // TODO(zhixunt): This is a nasty hack to avoid comparison between
+  // incompatible enums. We should add kOpenGL to dlpack.
+  constexpr int gl_devtype = kOpenGL;
+  std::tuple<int, int> type_from_to(ctx_from.device_type, ctx_to.device_type);
+  if (type_from_to == std::make_tuple(gl_devtype, gl_devtype)) {
+    auto from_texture = static_cast<const Texture*>(from);
+    auto to_texture = static_cast<Texture*>(to);
+    auto temp_buffer = std::unique_ptr<char[]>(new char[size]);
+    CHECK(from_texture->format_ == to_texture->format_);
+    auto elemsz = from_texture->elemsz();
+    auto from_begin = static_cast<GLint>(from_offset / elemsz);
+    auto to_begin = static_cast<GLint>(to_offset / elemsz);
+    auto nelems = static_cast<GLsizei>(size / elemsz);
+    GetTextureData(from_texture, from_begin, nelems, temp_buffer.get());
+    PutTextureData(to_texture, to_begin, nelems, temp_buffer.get());
+  } else if (type_from_to == std::make_tuple(gl_devtype, kDLCPU)) {
+    auto texture = static_cast<const Texture*>(from);
+    void *data = static_cast<char *>(to) + to_offset;
+    auto elemsz = texture->elemsz();
+    auto begin = static_cast<GLint>(from_offset / elemsz);
+    auto nelems = static_cast<GLsizei>(size / elemsz);
+    GetTextureData(texture, begin, nelems, data);
+  } else if (type_from_to == std::make_tuple(kDLCPU, gl_devtype)) {
+    auto texture = reinterpret_cast<Texture*>(to);
+    const void* data = static_cast<const char*>(from) + from_offset;
+    auto elemsz = texture->elemsz();
+    auto begin = static_cast<GLint>(to_offset / elemsz);
+    auto nelems = static_cast<GLsizei>(size / elemsz);
+    PutTextureData(texture, begin, nelems, data);
+  } else {
+    LOG(FATAL) << "Expect copy from/to OpenGL or between OpenGL";
+  }
+}
+void OpenGLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {}
+void* OpenGLWorkspace::AllocWorkspace(TVMContext ctx, size_t size) {
+  LOG(FATAL) << "Cannot allocate OpenGL workspace.";
+  return nullptr;
+}
+void OpenGLWorkspace::FreeWorkspace(TVMContext ctx, void* data) {
+  LOG(FATAL) << "Cannot free OpenGL workspace.";
+}
+OpenGLWorkspace::OpenGLWorkspace() {
+  // Set an error handler.
+  // This can be called before glfwInit().
+  glfwSetErrorCallback(&GlfwErrorCallback);
+  // Initialize GLFW.
+  if (glfwInit() != GL_TRUE) {
+    LOG(FATAL) << "glfwInit() failed!";
+  }
+  // Create a window.
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+  glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+  glfwWindowHint(GLFW_VISIBLE, GL_FALSE);
+  window_ = glfwCreateWindow(kWindowWidth, kWindowHeight, "", nullptr, nullptr);
+  if (window_ == nullptr) {
+    LOG(FATAL) << "glfwCreateWindow() failed!";
+  }
+  // Before using any OpenGL API, we must specify a context.
+  glfwMakeContextCurrent(window_);
+  // Load all OpenGL API function pointers.
+  gl = std::unique_ptr<GLFunctionPointers>(new GLFunctionPointers);
+  CheckOpenGLError();
+  // We always render the same vertices and triangles.
+  GLuint vertex_buffer;
+  OPENGL_CALL(gl->GenBuffers(1, &vertex_buffer));
+  OPENGL_CALL(gl->BindBuffer(GL_ARRAY_BUFFER, vertex_buffer));
+  OPENGL_CALL(gl->BufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices,
+                             GL_STATIC_DRAW));
+  GLuint vertex_array;
+  OPENGL_CALL(gl->GenVertexArrays(1, &vertex_array));
+  OPENGL_CALL(gl->BindVertexArray(vertex_array));
+  OPENGL_CALL(gl->BindBuffer(GL_ARRAY_BUFFER, vertex_buffer));
+  // We always use the same vertex shader.
+  vertex_shader_ = CreateShader(GL_VERTEX_SHADER, vertex_shader_text_);
+  LOG(INFO) << "OpenGL initialized, version = " << gl->GetString(GL_VERSION);
+}
+OpenGLWorkspace::~OpenGLWorkspace() {
+  // Paired with glfwCreateWindow().
+  glfwDestroyWindow(window_);
+  // Paired with glfwInit().
+  glfwTerminate();
+}
+void OpenGLWorkspace::BindTextureUnit(GLuint unit, GLuint texture) {
+  OPENGL_CALL(gl->ActiveTexture(GL_TEXTURE0 + unit));
+  OPENGL_CALL(gl->BindTexture(GL_TEXTURE_2D, texture));
+}
+void OpenGLWorkspace::OnDeleteTexture(GLuint texture) {
+  OPENGL_CALL(gl->DeleteTextures(1, &texture));
+}
+void OpenGLWorkspace::OnDeleteProgram(GLuint program) {
+  OPENGL_CALL(gl->DeleteProgram(program));
+}
+GLuint OpenGLWorkspace::NumTextureUnits() {
+  GLint num_units;
+  OPENGL_CALL(gl->GetIntegerv(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS, &num_units));
+  return static_cast<GLuint>(num_units);
+}
+const OpenGLWorkspace::Vertex OpenGLWorkspace::vertices[OpenGLWorkspace::kNumVertices] = {
+    {-1.f, -1.f},
+    {1.0f, -1.f},
+    {1.0f, 1.0f},
+    {-1.f, -1.f},
+    {-1.f, 1.0f},
+    {1.0f, 1.0f},
+};
+// Don't need to change this.
+// The vertex shader only needs to take in the triangle points.
+// No need for point transformations.
+const char* OpenGLWorkspace::vertex_shader_text_ = "#version 300 es\n"
+    "in vec2 point; // input to vertex shader\n"
+    "void main() {\n"
+    "  gl_Position = vec4(point, 0.0, 1.0);\n"
+    "}\n";
+Program OpenGLWorkspace::CreateProgram(
+    const char* fragment_shader_src) {
+  // Create and compile the shaders.
+  GLuint fragment_shader = CreateShader(GL_FRAGMENT_SHADER,
+                                        fragment_shader_src);
+  // Link the shaders and create the program.
+  Program program = CreateProgram(fragment_shader);
+  OPENGL_CALL(gl->DeleteShader(fragment_shader));
+  return program;
+}
+GLuint OpenGLWorkspace::CreateShader(GLenum shader_kind,
+                                     const char* shader_src) {
+  // Create the shader.
+  GLuint shader = gl->CreateShader(shader_kind);
+  gl->ShaderSource(shader, 1, &shader_src, nullptr);
+  gl->CompileShader(shader);
+  // Check compile errors.
+  GLint err;
+  gl->GetShaderiv(shader, GL_COMPILE_STATUS, &err);
+  GLint info_log_len;
+  gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &info_log_len);
+  if (err != GL_TRUE) {
+    std::unique_ptr<char[]> err_msg(new char[info_log_len + 1]);
+    gl->GetShaderInfoLog(shader, info_log_len, nullptr, err_msg.get());
+    LOG(FATAL) << err_msg.get();
+    assert(false);
+  }
+  CheckOpenGLError();
+  return shader;
+}
+static TextureFormat GetTextureFormat(TVMType type) {
+  CHECK_EQ(type.lanes, 1) << "Not supporting multi-lane types.";
+  switch (type.code) {
+    case kDLInt: {
+      switch (type.bits) {
+        case 8:
+          return {GL_R8I, GL_RED_INTEGER, GL_BYTE};
+        case 16:
+          return {GL_R16I, GL_RED_INTEGER, GL_SHORT};
+        case 32:
+          return {GL_R32I, GL_RED_INTEGER, GL_INT};
+        default:
+          LOG(FATAL) << "Unsupported type bits " << type.bits;
+      }
+    }
+    case kDLUInt: {
+      switch (type.bits) {
+        case 8:
+          return {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE};
+        case 16:
+          return {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT};
+        case 32:
+          return {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT};
+        default:
+          LOG(FATAL) << "Unsupported type bits " << type.bits;
+      }
+    }
+    case kDLFloat: {
+      switch (type.bits) {
+        case 32:
+          return {GL_R32F, GL_RED, GL_FLOAT};
+        default:
+          LOG(FATAL) << "Unsupported type bits " << type.bits;
+      }
+    }
+    default:
+      LOG(FATAL) << "Unsupported type code" << type.code;
+  }
+  assert(false);
+}
+Texture OpenGLWorkspace::CreateTexture(TVMType type, size_t nbytes) {
+  // Create a texture.
+  GLuint texture;
+  OPENGL_CALL(gl->GenTextures(1, &texture));
+  BindTextureUnit(NumTextureUnits() - 1, texture);
+  // Use glTexImage2D with nullptr data to specify GPU data storage.
+  auto texture_format = GetTextureFormat(type);
+  auto width = static_cast<GLsizei>(nbytes / (type.bits / 8));
+  auto height = GLsizei(1);
+  OPENGL_CALL(gl->TexImage2D(GL_TEXTURE_2D, /*level=*/0,
+                             texture_format.internal_format,
+                             width, height, /*border=*/0,
+                             texture_format.format, texture_format.type,
+                             /*data=*/nullptr));
+  OPENGL_CALL(
+      gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
+  OPENGL_CALL(
+      gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
+  OPENGL_CALL(
+      gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
+  OPENGL_CALL(
+      gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
+  return Texture(this, texture, texture_format, width, height);
+}
+Program OpenGLWorkspace::CreateProgram(GLuint fragment_shader) {
+  // Create the program and link the shaders.
+  GLuint program = gl->CreateProgram();
+  gl->AttachShader(program, vertex_shader_);
+  gl->AttachShader(program, fragment_shader);
+  gl->LinkProgram(program);
+  // Check link errors.
+  GLint err;
+  gl->GetProgramiv(program, GL_LINK_STATUS, &err);
+  GLint info_log_len;
+  gl->GetProgramiv(program, GL_INFO_LOG_LENGTH, &info_log_len);
+  if (err != GL_TRUE) {
+    std::unique_ptr<char[]> err_msg(new char[info_log_len + 1]);
+    gl->GetProgramInfoLog(program, info_log_len, nullptr, err_msg.get());
+    LOG(FATAL) << err_msg.get();
+    assert(false);
+  }
+  CheckOpenGLError();
+  OPENGL_CALL(gl->DetachShader(program, vertex_shader_));
+  OPENGL_CALL(gl->DetachShader(program, fragment_shader));
+  auto point_attrib = GLuint(gl->GetAttribLocation(program, "point"));
+  OPENGL_CALL(gl->EnableVertexAttribArray(point_attrib));
+  OPENGL_CALL(gl->VertexAttribPointer(point_attrib, 2, GL_FLOAT, GL_FALSE,
+                                      sizeof(Vertex), nullptr));
+  return Program(this, program);
+}
+void OpenGLWorkspace::PutTextureData(Texture *texture,
+                                     GLint begin,
+                                     GLsizei nelems,
+                                     const GLvoid* data) {
+  // Bind to temporary unit.
+  BindTextureUnit(NumTextureUnits() - 1, texture->texture());
+  // Similar to cudaMemcpy.
+  OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0,
+                                /*xoffset=*/begin, /*yoffset=*/0,
+                                /*width=*/nelems, /*height=*/1,
+                                texture->format_.format, texture->format_.type,
+                                data));
+}
+void OpenGLWorkspace::GetTextureData(const Texture *texture,
+                                     GLint begin,
+                                     GLsizei nelems,
+                                     GLvoid* data) {
+  BindTextureUnit(NumTextureUnits() - 1, texture->texture());
+  // Create frame buffer.
+  GLuint frame_buffer;
+  OPENGL_CALL(gl->GenFramebuffers(1, &frame_buffer));
+  OPENGL_CALL(gl->BindFramebuffer(GL_FRAMEBUFFER, frame_buffer));
+  // Bind texture to framebuffer's attachment 0.
+  OPENGL_CALL(gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+                                       GL_TEXTURE_2D, texture->texture(), 0));
+  // Always check that our framebuffer is okay.
+  if (gl->CheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+    LOG(FATAL) << "Framebuffer not complete.";
+  }
+#ifdef __EMSCRIPTEN__
+  // WebGL2's glReadPixels API doesn't allow GL_RED user buffer format.
+  // Instead, We must use GL_RGBA. This means the data we retrieve has useless
+  // GBA channels. Here we are applying a dirty hack.
+  // TODO(zhixunt): We really want to utilize all RGBA channels in textures.
+  //
+  // WebGL2's glReadPixels API also doesn't allow GL_RED_INTEGER or
+  // GL_RGB_INTEGER user buffer format, which means we cannot retrieve integer
+  // texture data? (need to confirm)
+  CHECK_EQ(texture->format_.internal_format, GL_R32F)
+      << "Retrieving integer texture not supported yet.";
+  auto elemsz = texture->format_.elemsz();
+  auto nchannels = 4;
+  auto padded_data_size = nchannels * nelems * elemsz;
+  auto padded_data = std::unique_ptr<char[]>(new char[padded_data_size]);
+  OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems,
+                             /*height=*/1, GL_RGBA, GL_FLOAT,
+                             padded_data.get()));
+  for (GLsizei i = 0; i != nelems; ++i) {
+    auto dst = reinterpret_cast<char *>(data) + i * elemsz;
+    auto src = padded_data.get() + nchannels * i * elemsz;
+    std::memcpy(dst, src, elemsz);
+  }
+#else
+  OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems,
+                             /*height=*/1, texture->format_.format,
+                             texture->format_.type, data));
+#endif
+  OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer));
+}
+void OpenGLWorkspace::SetCurrentProgram(const Program& program) {
+  OPENGL_CALL(gl->UseProgram(program.program()));
+}
+void OpenGLWorkspace::SetUniform(const Program& program,
+                                 const std::string& name,
+                                 TVMType type,
+                                 void* value) {
+  GLint location = gl->GetUniformLocation(program.program(), name.c_str());
+  switch (type.code) {
+    case kDLInt: {
+      CHECK_EQ(type.bits, 32) << "Only support 32-bit int for uniform.";
+      GLint uniform_value = *reinterpret_cast<GLint*>(value);
+      OPENGL_CALL(gl->Uniform1i(location, uniform_value));
+      break;
+    }
+    case kDLUInt: {
+      LOG(FATAL) << "Strangely, emcc WebGL does not support glUniform1ui.";
+      break;
+    }
+    case kDLFloat: {
+      CHECK_EQ(type.bits, 32) << "Only support 32-bit float for uniform.";
+      GLfloat uniform_value = *reinterpret_cast<GLfloat*>(value);
+      OPENGL_CALL(gl->Uniform1f(location, uniform_value));
+      break;
+    }
+    default: {
+      LOG(FATAL) << "Unsupported type code for uniform.";
+      break;
+    }
+  }
+}
+void OpenGLWorkspace::SetInputTexture(const Program& program,
+                                      const std::string& name,
+                                      GLuint unit,
+                                      Texture* texture) {
+  // We always use the last texture unit as temporary.
+  // Therefore, we can have "NumTextureUnits() - 1" input textures.
+  CHECK_LT(unit, NumTextureUnits() - 1) << "Too many textures.";
+  BindTextureUnit(unit, texture->texture());
+  GLint location = gl->GetUniformLocation(program.program_, name.c_str());
+  OPENGL_CALL(gl->Uniform1i(location, unit));
+}
+void OpenGLWorkspace::Render(Texture* output) {
+  // Create frame buffer.
+  GLuint frame_buffer;
+  OPENGL_CALL(gl->GenFramebuffers(1, &frame_buffer));
+  OPENGL_CALL(gl->BindFramebuffer(GL_FRAMEBUFFER, frame_buffer));
+  // Set "renderedTexture" as our colour attachement 0.
+  OPENGL_CALL(gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+                                       GL_TEXTURE_2D, output->texture(), 0));
+  // Specify that we will render to color attachment 0.
+  GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};
+  OPENGL_CALL(gl->DrawBuffers(1, DrawBuffers));
+  // Always check that our framebuffer is okay.
+  if (gl->CheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+    LOG(FATAL) << "Framebuffer not complete.";
+  }
+  // Perform rendering.
+  OPENGL_CALL(gl->Viewport(0, 0, output->width(), output->height()));
+  OPENGL_CALL(gl->Clear(GL_COLOR_BUFFER_BIT));
+  OPENGL_CALL(gl->DrawArrays(GL_TRIANGLES, 0, 6));
+  OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer));
+}
+TVM_REGISTER_GLOBAL("device_api.opengl")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  DeviceAPI* ptr = OpenGLWorkspace::Global().get();
+  *rv = static_cast<void*>(ptr);
+});
+}  // namespace gl
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_OPENGL_RUNTIME
--- a/src/runtime/opengl/opengl_module.cc
+++ b/src/runtime/opengl/opengl_module.cc
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file opengl_module.cc
+ */
+#include <utility>
+#include "./opengl_common.h"
+#include "./opengl_module.h"
+#if TVM_OPENGL_RUNTIME
+#include <tvm/runtime/registry.h>
+#include "../pack_args.h"
+#include "../thread_storage_scope.h"
+#include "../file_util.h"
+namespace tvm {
+namespace runtime {
+class OpenGLModuleNode final : public ModuleNode {
+ public:
+  OpenGLModuleNode(std::unordered_map<std::string, OpenGLShader> shaders,
+                   std::string fmt,
+                   std::unordered_map<std::string, FunctionInfo> fmap);
+  ~OpenGLModuleNode() override = default;
+  const char* type_key() const final { return "opengl"; }
+  PackedFunc GetFunction(const std::string& name,
+                         const std::shared_ptr<ModuleNode>& sptr_to_self) final;
+  std::string GetSource(const std::string& format) final;
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final;
+  void SaveToBinary(dmlc::Stream* stream) final;
+  const gl::Program& GetProgram(const std::string& func_name) const;
+  const OpenGLShader& GetShader(const std::string& func_name) const;
+  const FunctionInfo& GetFunctionInfo(const std::string& func_name) const;
+  gl::OpenGLWorkspace& workspace() const { return *workspace_; }
+ private:
+  std::shared_ptr<gl::OpenGLWorkspace> workspace_;
+  std::unordered_map<std::string, OpenGLShader> shaders_;
+  std::string fmt_;
+  std::unordered_map<std::string, FunctionInfo> fmap_;
+  std::unordered_map<std::string, gl::Program> programs_;
+  DISALLOW_COPY_AND_ASSIGN(OpenGLModuleNode);
+};
+class OpenGLWrappedFunc {
+ public:
+  OpenGLWrappedFunc(OpenGLModuleNode* m,
+                    std::shared_ptr<ModuleNode> sptr,
+                    std::string func_name,
+                    std::vector<size_t> arg_size,
+                    const std::vector<std::string>& thread_axis_tags);
+  void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const;
+ private:
+  // The module
+  OpenGLModuleNode* m_;
+  // resource handle
+  std::shared_ptr<ModuleNode> sptr_;
+  // The name of the function.
+  std::string func_name_;
+  // convert code for void argument
+  std::vector<size_t> arg_size_;
+  // thread axis config
+  ThreadAxisConfig thread_axis_cfg_;
+};
+OpenGLModuleNode::OpenGLModuleNode(
+    std::unordered_map<std::string, OpenGLShader> shaders,
+    std::string fmt,
+    std::unordered_map<std::string, FunctionInfo> fmap)
+    : workspace_(gl::OpenGLWorkspace::Global()), shaders_(std::move(shaders)),
+      fmt_(std::move(fmt)), fmap_(std::move(fmap)), programs_() {
+  CHECK_EQ(fmt_, "gl") << "Unknown OpenGL format " << fmt_;
+  for (auto &pair : shaders_) {
+    auto &func_name = pair.first;
+    auto &shader = pair.second;
+    programs_.emplace(func_name,
+                      workspace_->CreateProgram(shader.source.c_str()));
+  }
+}
+PackedFunc OpenGLModuleNode::GetFunction(
+    const std::string& name,
+    const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  CHECK_EQ(sptr_to_self.get(), this);
+  CHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
+  auto func_info_it = fmap_.find(name);
+  if (func_info_it == fmap_.end()) { return PackedFunc(); }
+  auto &func_info = func_info_it->second;
+  std::vector<size_t> arg_size(func_info.arg_types.size());
+  for (size_t i = 0; i < func_info.arg_types.size(); ++i) {
+    TVMType t = func_info.arg_types[i];
+    CHECK_EQ(t.lanes, 1U);
+    uint32_t bits = t.bits;
+    CHECK_EQ(bits % 8, 0U);
+    arg_size[i] = bits / 8;
+  }
+  // Initialize the wrapped func.
+  OpenGLWrappedFunc f(this, sptr_to_self, name, arg_size,
+                      func_info.thread_axis_tags);
+  return PackFuncVoidAddr(f, func_info.arg_types);
+}
+std::string OpenGLModuleNode::GetSource(const std::string& format) {
+  if (format != fmt_ && fmt_ != "gl") { return ""; }
+  std::ostringstream os;
+  for (auto &pair : shaders_) {
+    auto &name = pair.first;
+    auto &shader = pair.second;
+    os << "[" << name << "]" << "\n";
+    os << shader.source <<"\n";
+  }
+  return os.str();
+}
+void OpenGLModuleNode::SaveToFile(const std::string& file_name,
+                                  const std::string& format) {
+  std::string fmt = GetFileFormat(file_name, format);
+  CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_;
+  std::string meta_file = GetMetaFilePath(file_name);
+  SaveMetaDataToFile(meta_file, fmap_);
+  SaveBinaryToFile(file_name, ToJSON(shaders_));
+}
+void OpenGLModuleNode::SaveToBinary(dmlc::Stream* stream) {
+  stream->Write(fmt_);
+  stream->Write(fmap_);
+  stream->Write(ToJSON(shaders_));
+}
+const gl::Program& OpenGLModuleNode::GetProgram(
+    const std::string& func_name) const {
+  auto it = programs_.find(func_name);
+  if (it == programs_.end()) {
+    LOG(FATAL) << "Cannot find program";
+  }
+  return it->second;
+}
+const OpenGLShader& OpenGLModuleNode::GetShader(
+    const std::string& func_name) const {
+  auto it = shaders_.find(func_name);
+  if (it == shaders_.end()) {
+    LOG(FATAL) << "Cannot find shader";
+  }
+  return it->second;
+}
+const FunctionInfo& OpenGLModuleNode::GetFunctionInfo(
+    const std::string& func_name) const {
+  auto it = fmap_.find(func_name);
+  if (it == fmap_.end()) {
+    LOG(FATAL) << "Cannot find shader";
+  }
+  return it->second;
+}
+OpenGLWrappedFunc::OpenGLWrappedFunc(
+    OpenGLModuleNode* m,
+    std::shared_ptr<ModuleNode> sptr,
+    std::string func_name,
+    std::vector<size_t> arg_size,
+    const std::vector<std::string>& thread_axis_tags)
+    : m_(m), sptr_(std::move(sptr)), func_name_(std::move(func_name)),
+      arg_size_(std::move(arg_size)) {
+  thread_axis_cfg_.Init(arg_size_.size(), thread_axis_tags);
+}
+void OpenGLWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv,
+                                   void** void_args) const {
+  auto &shader = m_->GetShader(func_name_);
+  auto &program = m_->GetProgram(func_name_);
+  auto &func_info = m_->GetFunctionInfo(func_name_);
+  size_t nargs = shader.arg_kinds.size();
+  // Must call this function before setting uniforms & input textures.
+  m_->workspace().SetCurrentProgram(program);
+  // Set all arguments.
+  GLuint texture_unit = 0;
+  gl::Texture* output = nullptr;
+  for (size_t i = 0; i != nargs; ++i) {
+    auto &name = shader.arg_names.at(i);
+    auto kind = shader.arg_kinds.at(i);
+    auto type = func_info.arg_types.at(i);
+    switch (kind) {
+      case OpenGLArgKind::kUniform: {
+        m_->workspace().SetUniform(program, name, type, void_args[i]);
+        break;
+      }
+      case OpenGLArgKind::kInputTexture: {
+        CHECK_EQ(type.code, kHandle) << "Type is not handle?";
+        auto texture = *static_cast<gl::Texture**>(void_args[i]);
+        m_->workspace().SetInputTexture(program, name, texture_unit, texture);
+        ++texture_unit;
+        break;
+      }
+      case OpenGLArgKind::kOutputTexture: {
+        CHECK_EQ(type.code, kHandle) << "Type is not handle?";
+        CHECK(output == nullptr) << "Can only have one output texture.";
+        output = *static_cast<gl::Texture**>(void_args[i]);
+        break;
+      }
+    }
+  }
+  // Set "thread_extent" uniform.
+  ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
+  std::unique_ptr<GLint> thread_extent(new GLint(wl.block_dim(0)));
+  m_->workspace().SetUniform(program, shader.thread_extent_var,
+                             TVMType{kDLInt, 32, 1},
+                             static_cast<void*>(thread_extent.get()));
+  m_->workspace().Render(output);
+}
+Module OpenGLModuleCreate(std::unordered_map<std::string, OpenGLShader> shaders,
+                          std::string fmt,
+                          std::unordered_map<std::string, FunctionInfo> fmap) {
+  auto n = std::make_shared<OpenGLModuleNode>(std::move(shaders),
+                                              std::move(fmt),
+                                              std::move(fmap));
+  return Module(n);
+}
+Module OpenGLModuleLoadFile(const std::string& file_name,
+                            const std::string& format) {
+  std::string data;
+  std::unordered_map<std::string, FunctionInfo> fmap;
+  std::string fmt = GetFileFormat(file_name, format);
+  std::string meta_file = GetMetaFilePath(file_name);
+  LoadBinaryFromFile(file_name, &data);
+  LoadMetaDataFromFile(meta_file, &fmap);
+  return OpenGLModuleCreate(FromJSON(data), fmt, fmap);
+}
+Module OpenGLModuleLoadBinary(void* strm) {
+  auto stream = static_cast<dmlc::Stream*>(strm);
+  std::string data;
+  std::unordered_map<std::string, FunctionInfo> fmap;
+  std::string fmt;
+  stream->Read(&fmt);
+  stream->Read(&fmap);
+  stream->Read(&data);
+  return OpenGLModuleCreate(FromJSON(data), fmt, fmap);
+}
+TVM_REGISTER_GLOBAL("module.loadfile_gl")
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = OpenGLModuleLoadFile(args[0], args[1]);
+  });
+TVM_REGISTER_GLOBAL("module.loadfile_glbin")
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = OpenGLModuleLoadFile(args[0], args[1]);
+  });
+TVM_REGISTER_GLOBAL("module.loadbinary_opengl")
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = OpenGLModuleLoadBinary(args[0]);
+  });
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_OPENGL_RUNTIME
--- a/src/runtime/opengl/opengl_module.h
+++ b/src/runtime/opengl/opengl_module.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file opengl_module.h
+ * \brief Execution handling of OpenGL kernels
+ */
+#ifndef TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
+#define TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
+#include <tvm/runtime/config.h>
+#include <tvm/runtime/packed_func.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "../meta_data.h"
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief Determines how we supply arguments.
+ */
+enum class OpenGLArgKind {
+  kInputTexture = 0,   // Bind to "gsampler2D" in GLSL.
+  kOutputTexture = 1,  // Bind to "out" in GLSL.
+  kUniform = 2,        // Bind to "uniform" in GLSL.
+};
+std::string OpenGLArgKind2String(OpenGLArgKind kind);
+OpenGLArgKind String2OpenGLArgKind(const std::string& str);
+/*!
+ * \brief The output of OpenGL codegen.
+ * Contains necessary information to build a fragment shader and bind arguments.
+ */
+struct OpenGLShader {
+  OpenGLShader() = default;
+  OpenGLShader(std::string source,
+               std::vector<std::string> arg_names,
+               std::vector<OpenGLArgKind> arg_kinds,
+               std::string thread_extent_var)
+      : source(std::move(source)), arg_names(std::move(arg_names)),
+        arg_kinds(std::move(arg_kinds)),
+        thread_extent_var(std::move(thread_extent_var)) {
+    CHECK_EQ(this->arg_names.size(), this->arg_kinds.size()) << "Invalid input";
+  }
+  std::string source;
+  std::vector<std::string> arg_names;    // Matches FunctionInfo.
+  std::vector<OpenGLArgKind> arg_kinds;  // Matches FunctionInfo.
+  std::string thread_extent_var;         // Stores the output length.
+  void Save(dmlc::JSONWriter* writer) const;
+  void Load(dmlc::JSONReader* reader);
+};
+std::string ToJSON(const std::unordered_map<std::string, OpenGLShader>& shaders);
+std::unordered_map<std::string, OpenGLShader> FromJSON(const std::string& str);
+/*!
+ * \brief Create an OpenGL module from data.
+ *
+ * \param data The module data.
+ * \param fmt The format of the data,
+ * \param fmap The map function information map of each function.
+ */
+Module OpenGLModuleCreate(std::unordered_map<std::string, OpenGLShader> shaders,
+                          std::string fmt,
+                          std::unordered_map<std::string, FunctionInfo> fmap);
+inline std::string OpenGLArgKind2String(OpenGLArgKind kind) {
+  switch (kind) {
+    case OpenGLArgKind::kOutputTexture:
+      return "output_texture";
+    case OpenGLArgKind::kInputTexture:
+      return "input_texture";
+    case OpenGLArgKind::kUniform:
+      return "uniform";
+  }
+  assert(false);
+}
+inline OpenGLArgKind String2OpenGLArgKind(const std::string& str) {
+  if (str == "output_texture") {
+    return OpenGLArgKind::kOutputTexture;
+  } else if (str == "input_texture") {
+    return OpenGLArgKind::kInputTexture;
+  } else if (str == "uniform") {
+    return OpenGLArgKind::kUniform;
+  } else {
+    LOG(FATAL) << "Invalid OpenGL arg kind.";
+    assert(false);
+  }
+}
+inline void OpenGLShader::Save(dmlc::JSONWriter* writer) const {
+  std::vector<std::string> arg_kind_strs;
+  for (auto kind : arg_kinds) {
+    arg_kind_strs.push_back(OpenGLArgKind2String(kind));
+  }
+  writer->BeginObject();
+  writer->WriteObjectKeyValue("arg_names", arg_names);
+  writer->WriteObjectKeyValue("arg_kinds", arg_kind_strs);
+  writer->WriteObjectKeyValue("source", source);
+  writer->WriteObjectKeyValue("thread_extent_var", thread_extent_var);
+  writer->EndObject();
+}
+inline void OpenGLShader::Load(dmlc::JSONReader* reader) {
+  std::vector<std::string> arg_kind_strs;
+  dmlc::JSONObjectReadHelper helper;
+  helper.DeclareField("arg_names", &arg_names);
+  helper.DeclareField("arg_kinds", &arg_kind_strs);
+  helper.DeclareField("source", &source);
+  helper.DeclareField("thread_extent_var", &thread_extent_var);
+  helper.ReadAllFields(reader);
+  arg_kinds.clear();
+  for (auto& str : arg_kind_strs) {
+    arg_kinds.push_back(String2OpenGLArgKind(str));
+  }
+}
+inline std::string ToJSON(
+    const std::unordered_map<std::string, OpenGLShader>& shaders) {
+  std::ostringstream os;
+  dmlc::JSONWriter writer(&os);
+  writer.BeginObject();
+  writer.WriteObjectKeyValue("shaders", shaders);
+  writer.EndObject();
+  return os.str();
+}
+inline std::unordered_map<std::string, OpenGLShader> FromJSON(
+    const std::string& str) {
+  std::unordered_map<std::string, OpenGLShader> shaders;
+  std::istringstream is(str);
+  dmlc::JSONReader reader(&is);
+  dmlc::JSONObjectReadHelper helper;
+  helper.DeclareField("shaders", &shaders);
+  helper.ReadAllFields(&reader);
+  return shaders;
+}
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -52,12 +52,15 @@ class ROCMDeviceAPI final : public DeviceAPI {
    }
    *rv = value;
  }
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final {
    ROCM_CALL(hipSetDevice(ctx.device_id));
    CHECK_EQ(256 % alignment, 0U)
        << "ROCM space is aligned at 256 bytes";
    void *ret;
-    ROCM_CALL(hipMalloc(&ret, size));
+    ROCM_CALL(hipMalloc(&ret, nbytes));
    return ret;
  }

--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -20,10 +20,13 @@ class RPCDeviceAPI final : public DeviceAPI {
    *rv = GetSess(ctx)->CallRemote(
        RPCCode::kDevGetAttr, ctx, static_cast<int>(kind));
  }
-  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+  void* AllocDataSpace(TVMContext ctx,
+                       size_t nbytes,
+                       size_t alignment,
+                       TVMType type_hint) final {
    auto sess = GetSess(ctx);
    void *data = sess->CallRemote(
-            RPCCode::kDevAllocData, ctx, size, alignment);
+            RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint);
    RemoteSpace* space = new RemoteSpace();
    space->data = data;
    space->sess = std::move(sess);

--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -887,9 +887,11 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
 void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) {
  TVMContext ctx = args[0];
-  uint64_t size = args[1];
+  uint64_t nbytes = args[1];
  uint64_t alignment = args[2];
-  void* data = DeviceAPI::Get(ctx)->AllocDataSpace(ctx, size, alignment);
+  TVMType type_hint = args[3];
+  void* data = DeviceAPI::Get(ctx)->AllocDataSpace(
+      ctx, nbytes, alignment, type_hint);
  *rv = data;
 }

--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -23,28 +23,29 @@ class WorkspacePool::Pool {
    allocated_.push_back(e);
  }
  // allocate from pool
-  void* Alloc(TVMContext ctx, DeviceAPI* device, size_t size) {
+  void* Alloc(TVMContext ctx, DeviceAPI* device, size_t nbytes) {
    // Allocate align to page.
-    size = (size + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
+    nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
-    if (size == 0) size = kWorkspacePageSize;
+    if (nbytes == 0) nbytes = kWorkspacePageSize;
    Entry e;
+    TVMType type = {.code = kDLUInt, .bits = 8, .lanes = 1};
    if (free_list_.size() == 2) {
      e = free_list_.back();
      free_list_.pop_back();
-      if (e.size < size) {
+      if (e.size < nbytes) {
        // resize the page
        device->FreeDataSpace(ctx, e.data);
-        e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
+        e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
-        e.size = size;
+        e.size = nbytes;
      }
    } else if (free_list_.size() == 1) {
-      e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
+      e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
-      e.size = size;
+      e.size = nbytes;
    } else {
-      if (free_list_.back().size >= size) {
+      if (free_list_.back().size >= nbytes) {
        // find smallest fit
        auto it = free_list_.end() - 2;
-        for (; it->size >= size; --it) {}
+        for (; it->size >= nbytes; --it) {}
        e = *(it + 1);
        free_list_.erase(it + 1);
      } else {
@@ -52,8 +53,8 @@ class WorkspacePool::Pool {
        e = free_list_.back();
        free_list_.pop_back();
        device->FreeDataSpace(ctx, e.data);
-        e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
+        e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
-        e.size = size;
+        e.size = nbytes;
      }
    }
    allocated_.push_back(e);

--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -397,6 +397,45 @@ Stage& Stage::double_buffer() {
  return *this;
 }
+Stage& Stage::opengl() {
+  CHECK(!is_scheduled()) << "Must be a fresh schedule";
+  StageNode *self = operator->();
+  auto all_iter_vars = self->all_iter_vars;  // curr version of all_iter_vars
+  CHECK(!all_iter_vars.empty()) << "At least one iter var";
+  // Fuse all data parallel dimensions to 1.
+  IterVar fused = all_iter_vars[0];
+  for (size_t i = 1; i != all_iter_vars.size(); ++i) {
+    auto iter_var = all_iter_vars[i];
+    switch (iter_var->iter_type) {
+      case IterVarType::kDataPar: {
+        fuse(fused, all_iter_vars[i], &fused);
+        break;
+      }
+      case IterVarType::kThreadIndex: {
+        LOG(ERROR) << "A fresh schedule shouldn't have thread index iter var";
+        break;
+      }
+      case IterVarType::kCommReduce:
+      case IterVarType::kOrdered:
+      case IterVarType::kOpaque: {
+        break;
+      }
+      default: {
+        LOG(ERROR) << "Invalid iter var type "
+                   << IterVarType2String(iter_var->iter_type);
+        break;
+      }
+    }
+  }
+  // Bind the only dimension to threadIdx.x.
+  bind(fused, thread_axis(Range(nullptr), "threadIdx.x"));
+  return *this;
+}
 Stage CopyStage(const Stage& s) {
  std::shared_ptr<StageNode> n =
      std::make_shared<StageNode>(*s.operator->());

--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -20,3 +20,6 @@ RUN bash /install/ubuntu_install_java.sh
 COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
 RUN bash /install/ubuntu_install_llvm.sh
+COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
+RUN bash /install/ubuntu_install_opengl.sh
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -37,6 +37,9 @@ RUN bash /install/ubuntu_install_nodejs.sh
 COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
 RUN bash /install/ubuntu_install_rocm.sh
+COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
+RUN bash /install/ubuntu_install_opengl.sh
 # Enable doxygen for c++ doc build
 RUN apt-get install -y doxygen graphviz

--- a/tests/ci_build/install/ubuntu_install_opengl.sh
+++ b/tests/ci_build/install/ubuntu_install_opengl.sh
+apt-get update --fix-missing
+apt-get install -y --no-install-recommends --force-yes \
+        libgl1-mesa-dev libglfw3-dev
\ No newline at end of file
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -7,7 +7,8 @@ def enabled_ctx_list():
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
-                ('vpi', tvm.vpi(0))]
+                ('vpi', tvm.vpi(0)),
+                ('opengl', tvm.opengl(0))]
    for k, v  in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
@@ -19,7 +20,8 @@ print("Testing using contexts:", ENABLED_CTX_LIST)
 def test_nd_create():
    for ctx in ENABLED_CTX_LIST:
-        for dtype in ["float32", "int8", "uint16"]:
+        for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32",
+                      "float32"]:
            x = np.random.randint(0, 10, size=(3, 4))
            x = np.array(x, dtype=dtype)
            y = tvm.nd.array(x, ctx=ctx)

--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -17,3 +17,5 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1
 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1
+TVM_FFI=cython python -m nose -v tests/webgl || exit -1
+TVM_FFI=ctypes python3 -m nose -v tests/webgl || exit -1
--- a/tests/webgl/README.md
+++ b/tests/webgl/README.md
+## Test cases for the WebGL backend
+Any test case with name `test_local_...` tests the C++ OpenGL backend on the
+local OS, which can be executed automatically.
+Any test case with name `test_remote_...` tests the WebGL backend within the
+browser, which must be run manually. See instruction within the test.
--- a/tests/webgl/test_local_gemm.py
+++ b/tests/webgl/test_local_gemm.py
+import tvm
+import numpy as np
+def test_local_gemm():
+    if not tvm.module.enabled("opengl"):
+        return
+    if not tvm.module.enabled("llvm"):
+        return
+    nn = 2
+    n = tvm.var('n')
+    n = tvm.convert(nn)
+    m = n
+    l = n
+    A = tvm.placeholder((n, l), name='A', dtype='int32')
+    B = tvm.placeholder((m, l), name='B', dtype='int32')
+    k = tvm.reduce_axis((0, l), name='k')
+    C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
+                    name='CC')
+    s = tvm.create_schedule(C.op)
+    s[C].opengl()
+    print(tvm.lower(s, [A, B, C], simple_mode=True))
+    f = tvm.build(s, [A, B, C], "opengl", name="gemm")
+    print("------opengl code------")
+    print(f.imported_modules[0].get_source(fmt="gl"))
+    ctx = tvm.opengl()
+    n, m, l = nn, nn, nn
+    a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype)
+    b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype)
+    a = tvm.nd.array(a_np, ctx)
+    b = tvm.nd.array(b_np, ctx)
+    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
+    f(a, b, c)
+    np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
+if __name__ == "__main__":
+    test_local_gemm()
--- a/tests/webgl/test_local_save_load.py
+++ b/tests/webgl/test_local_save_load.py
+import numpy as np
+import tvm
+from tvm.contrib import rpc, util, emscripten
+def test_local_save_load():
+    if not tvm.module.enabled("opengl"):
+        return
+    if not tvm.module.enabled("llvm"):
+        return
+    n = tvm.var("n")
+    A = tvm.placeholder((n,), name='A', dtype='int32')
+    B = tvm.placeholder((n,), name='B', dtype='int32')
+    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+    s = tvm.create_schedule(C.op)
+    s[C].opengl()
+    f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd")
+    ctx = tvm.opengl(0)
+    n = 10
+    a = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(B.dtype), ctx)
+    c = tvm.nd.array(np.zeros((n), dtype=C.dtype), ctx)
+    f(a, b, c)
+    temp = util.tempdir()
+    path_so = temp.relpath("myadd.so")
+    f.export_library(path_so)
+    f1 = tvm.module.load(path_so)
+    f1(a, b, c)
+    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+if __name__ == "__main__":
+    test_local_save_load()
--- a/tests/webgl/test_remote_save_load.py
+++ b/tests/webgl/test_remote_save_load.py
+"""
+The following instruction is based on web/README.md.
+Setup an RPC server:
+$ python -m tvm.exec.rpc_proxy --example-rpc=1
+Go to http://localhost:9190 in browser.
+Click "Connect To Proxy".
+Run this test script:
+$ python tests/webgl/test_remote_save_load.py
+"""
+import numpy as np
+import tvm
+from tvm.contrib import rpc, util, emscripten
+proxy_host = "localhost"
+proxy_port = 9090
+def try_remote_save_load():
+    if not tvm.module.enabled("rpc"):
+        return
+    if not tvm.module.enabled("opengl"):
+        return
+    if not tvm.module.enabled("llvm"):
+        return
+    # Build the module.
+    n = tvm.var("n")
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+    s = tvm.create_schedule(C.op)
+    s[C].opengl()
+    target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
+    f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")
+    remote = rpc.connect(proxy_host, proxy_port, key="js")
+    temp = util.tempdir()
+    ctx = remote.opengl(0)
+    path_obj = temp.relpath("myadd.bc")
+    path_dso = temp.relpath("myadd.js")
+    path_gl = temp.relpath("myadd.gl")
+    path_json = temp.relpath("myadd.tvm_meta.json")
+    f.save(path_obj)
+    emscripten.create_js(path_dso, path_obj, side_module=True)
+    f.imported_modules[0].save(path_gl)
+    remote.upload(path_dso, "myadd.dso")
+    remote.upload(path_gl)
+    remote.upload(path_json)
+    remote.download("myadd.dso")
+    remote.download("myadd.gl")
+    remote.download("myadd.tvm_meta.json")
+    print('Loading myadd.dso')
+    fhost = remote.load_module("myadd.dso")
+    print('Loading myadd.gl')
+    fdev = remote.load_module("myadd.gl")
+    print('import_module')
+    fhost.import_module(fdev)
+    print('running...')
+    a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
+    c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
+    fhost(a, b, c)
+    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+if __name__ == "__main__":
+    try_remote_save_load()
--- a/web/example_rpc.html
+++ b/web/example_rpc.html
@@ -36,5 +36,9 @@
   <button onclick="connect_rpc()">Connect To Proxy</button>
   <button onclick="clear_log()">Clear Log</button>
   <div id="log"></div>
+   <canvas id="canvas"></canvas>
+   <script>
+    Module["canvas"] = document.getElementById("canvas");
+   </script>
 </body>
 </html>
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -686,7 +686,8 @@ var tvm_runtime = tvm_runtime || {};
      2 : "gpu",
      4 : "opencl",
      8 : "metal",
-      9 : "vpi"
+      9 : "vpi",
+      11 : "opengl",
    };
    var CTX_STR2MASK = {
      "cpu": 1,
@@ -695,7 +696,8 @@ var tvm_runtime = tvm_runtime || {};
      "cl": 4,
      "opencl": 4,
      "metal": 8,
-      "vpi": 9
+      "vpi": 9,
+      "opengl": 11,
    };
    TVMContext.prototype = {
      toString : function() {

--- a/web/web_runtime.cc
+++ b/web/web_runtime.cc
@@ -18,6 +18,8 @@
 #include "../src/runtime/rpc/rpc_event_impl.cc"
 #include "../src/runtime/rpc/rpc_server_env.cc"
 #include "../src/runtime/graph/graph_runtime.cc"
+#include "../src/runtime/opengl/opengl_device_api.cc"
+#include "../src/runtime/opengl/opengl_module.cc"
 namespace tvm {
 namespace contrib {