Commit 589831df by Zhixun Tan, committed by Tianqi Chen

[WIP] WebGL Backend (#672)

Basic WebGL Backend
parent d4a46898
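For context, a minimal end-to-end use of the new backend looks roughly like the sketch below. This is only illustrative (it assumes TVM was built with `USE_OPENGL=1`) and mirrors the tests added under `tests/webgl/` further down in this patch.

```python
import numpy as np
import tvm

n = tvm.convert(16)
A = tvm.placeholder((n,), name='A', dtype='float32')
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')

s = tvm.create_schedule(B.op)
s[B].opengl()                      # map each output element to a pixel

f = tvm.build(s, [A, B], "opengl", name="add_one")

ctx = tvm.opengl(0)
a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(16, dtype=B.dtype), ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)
```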
......@@ -22,6 +22,7 @@ endif()
tvm_option(USE_CUDA "Build with CUDA" OFF)
tvm_option(USE_OPENCL "Build with OpenCL" OFF)
tvm_option(USE_OPENGL "Build with OpenGL" OFF)
tvm_option(USE_METAL "Build with Metal" OFF)
tvm_option(USE_RPC "Build with RPC" ON)
tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
......@@ -61,8 +62,8 @@ if(MSVC)
else(MSVC)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11)
set(CMAKE_C_FLAGS "-O3 -Wall -std=c++11 -fPIC")
set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS})
set(CMAKE_C_FLAGS "-O3 -Wall -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -std=c++11")
endif(MSVC)
# add source group
......@@ -87,6 +88,7 @@ file(GLOB RUNTIME_SRCS src/runtime/*.cc)
file(GLOB COMPILER_LLVM_SRCS src/codegen/llvm/*.cc)
file(GLOB RUNTIME_CUDA_SRCS src/runtime/cuda/*.cc)
file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
file(GLOB RUNTIME_OPENGL_SRCS src/runtime/opengl/*.cc)
file(GLOB RUNTIME_METAL_SRCS src/runtime/metal/*.mm)
file(GLOB RUNTIME_RPC_SRCS src/runtime/rpc/*.cc)
file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
......@@ -135,6 +137,18 @@ else(USE_OPENCL)
add_definitions(-DTVM_OPENCL_RUNTIME=0)
endif(USE_OPENCL)
if(USE_OPENGL)
find_package(OpenGL QUIET REQUIRED)
find_package(glfw3 QUIET REQUIRED)
message(STATUS "Build with OpenGL support")
include_directories(${OPENGL_INCLUDE_DIRS})
list(APPEND TVM_RUNTIME_LINKER_LIBS ${OPENGL_LIBRARIES} glfw)
list(APPEND RUNTIME_SRCS ${RUNTIME_OPENGL_SRCS})
add_definitions(-DTVM_OPENGL_RUNTIME=1)
else(USE_OPENGL)
add_definitions(-DTVM_OPENGL_RUNTIME=0)
endif(USE_OPENGL)
if(USE_METAL)
find_package(OpenCL QUIET REQUIRED)
message(STATUS "Build with Metal support")
......
......@@ -88,6 +88,7 @@ stage('Build') {
echo USE_CUDNN=1 >> config.mk
echo USE_CUDA=1 >> config.mk
echo USE_OPENCL=1 >> config.mk
echo USE_OPENGL=1 >> config.mk
echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
echo USE_RPC=1 >> config.mk
echo USE_GRAPH_RUNTIME=1 >> config.mk
......@@ -120,6 +121,7 @@ stage('Build') {
echo USE_CUDA=0 >> config.mk
echo USE_OPENCL=0 >> config.mk
echo USE_RPC=0 >> config.mk
echo USE_OPENGL=1 >> config.mk
echo LLVM_CONFIG=llvm-config-4.0 >> config.mk
"""
make('cpu', '-j2')
......
......@@ -32,8 +32,8 @@ OBJCFLAGS = -fno-objc-arc
EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\
-Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\
-s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap','getValue','setValue','addFunction']"\
-s USE_GLFW=3 -s USE_WEBGL2=1 -lglfw\
$(INCLUDE_FLAGS)
# llvm configuration
ifdef LLVM_CONFIG
LLVM_VERSION=$(shell $(LLVM_CONFIG) --version| cut -b 1,3)
......@@ -54,6 +54,7 @@ METAL_SRC = $(wildcard src/runtime/metal/*.mm)
CUDA_SRC = $(wildcard src/runtime/cuda/*.cc)
ROCM_SRC = $(wildcard src/runtime/rocm/*.cc)
OPENCL_SRC = $(wildcard src/runtime/opencl/*.cc)
OPENGL_SRC = $(wildcard src/runtime/opengl/*.cc)
RPC_SRC = $(wildcard src/runtime/rpc/*.cc)
GRAPH_SRC = $(wildcard src/runtime/graph/*.cc)
RUNTIME_SRC = $(wildcard src/runtime/*.cc)
......@@ -65,6 +66,7 @@ METAL_OBJ = $(patsubst src/%.mm, build/%.o, $(METAL_SRC))
CUDA_OBJ = $(patsubst src/%.cc, build/%.o, $(CUDA_SRC))
ROCM_OBJ = $(patsubst src/%.cc, build/%.o, $(ROCM_SRC))
OPENCL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENCL_SRC))
OPENGL_OBJ = $(patsubst src/%.cc, build/%.o, $(OPENGL_SRC))
RPC_OBJ = $(patsubst src/%.cc, build/%.o, $(RPC_SRC))
GRAPH_OBJ = $(patsubst src/%.cc, build/%.o, $(GRAPH_SRC))
CC_OBJ = $(patsubst src/%.cc, build/%.o, $(CC_SRC)) $(LLVM_OBJ)
......@@ -119,6 +121,19 @@ else
CFLAGS += -DTVM_OPENCL_RUNTIME=0
endif
ifeq ($(USE_OPENGL), 1)
CFLAGS += -DTVM_OPENGL_RUNTIME=1
EMCC_FLAGS += -DTVM_OPENGL_RUNTIME=1
ifeq ($(UNAME_S), Darwin)
FRAMEWORKS += -framework OpenGL
else
LDFLAGS += -lGL -lglfw
endif
RUNTIME_DEP += $(OPENGL_OBJ)
else
CFLAGS += -DTVM_OPENGL_RUNTIME=0
endif
ifeq ($(USE_METAL), 1)
CFLAGS += -DTVM_METAL_RUNTIME=1
LDFLAGS += -lobjc
......
......@@ -55,9 +55,11 @@ typedef int64_t tvm_index_t;
/*! \brief Extension device types in TVM */
typedef enum {
kOpenGL = 11,
// Extension DRAM type, used to quickly test extension devices.
// The device api can differ depending on the xpu driver registered.
kExtDev = 12
kExtDev = 12,
// Add extra TVM device types which are not in DLPack here.
} TVMDeviceExtType;
......
......@@ -55,11 +55,16 @@ class DeviceAPI {
/*!
* \brief Allocate a data space on device.
* \param ctx The device context to perform operation.
* \param size The size of the memory
* \param nbytes The number of bytes in memory.
* \param alignment The alignment of the memory.
* \return The allocated device pointer
* \param type_hint The type of elements. Only needed by certain backends such
* as OpenGL, as nbytes & alignment are sufficient for most backends.
* \return The allocated device pointer.
*/
virtual void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) = 0;
virtual void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) = 0;
/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
......
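At the Python level the new hint is simply the array's dtype: `TVMArrayAlloc` (changed later in this patch) now forwards `arr->dtype` to `AllocDataSpace`, which the OpenGL backend presumably needs in order to pick a texture format. A small sketch of what that means for a user, assuming the OpenGL runtime is enabled:

```python
import numpy as np
import tvm

if tvm.module.enabled("opengl"):
    ctx = tvm.opengl(0)
    # The array's dtype becomes the type_hint passed to AllocDataSpace.
    a = tvm.nd.array(np.zeros((16,), dtype="float32"), ctx)
```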
......@@ -214,6 +214,11 @@ class Stage : public NodeRef {
*/
Stage& double_buffer(); // NOLINT(*)
/*!
* \brief Schedule for OpenGL fragment shader.
* \return reference to self.
*/
Stage& opengl(); // NOLINT(*)
/*!
* \brief whether the stage has been scheduled.
* \return whether the stage has been scheduled.
*/
......
......@@ -17,7 +17,7 @@ from . import ir_builder
from . import target
from . import ndarray as nd
from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, ext_dev
from .ndarray import context, cpu, gpu, opencl, cl, metal, mtl, vpi, rocm, opengl, ext_dev
from ._ffi.runtime_ctypes import TypeCode
from ._ffi.function import Function
......
......@@ -97,6 +97,7 @@ class TVMContext(ctypes.Structure):
8 : 'metal',
9 : 'vpi',
10: 'rocm',
11: 'opengl',
12: 'ext_dev',
}
STR2MASK = {
......@@ -111,6 +112,7 @@ class TVMContext(ctypes.Structure):
'metal': 8,
'vpi': 9,
'rocm': 10,
'opengl': 11,
'ext_dev': 12,
}
def __init__(self, device_type, device_id):
......
......@@ -285,6 +285,10 @@ class RPCSession(object):
"""Construct remote Metal device."""
return self.context(8, dev_id)
def opengl(self, dev_id=0):
"""Construct remote OpenGL device."""
return self.context(11, dev_id)
def ext_dev(self, dev_id=0):
"""Construct remote extension device."""
return self.context(12, dev_id)
......
......@@ -120,6 +120,21 @@ def vpi(dev_id=0):
"""
return TVMContext(9, dev_id)
def opengl(dev_id=0):
"""Construct a OpenGL device
Parameters
----------
dev_id : int, optional
The integer device id
Returns
-------
ctx : TVMContext
The created context
"""
return TVMContext(11, dev_id)
def ext_dev(dev_id=0):
"""Construct a extension device
......
......@@ -611,4 +611,11 @@ class Stage(NodeBase):
"""
_api_internal._StageDoubleBuffer(self)
def opengl(self):
"""The special OpenGL schedule
Maps each output element to a pixel.
"""
_api_internal._StageOpenGL(self)
_init_api("tvm.schedule")
......@@ -67,7 +67,7 @@ class Target(object):
Parameters
----------
target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "ext_dev"}
target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"}
The major target name.
options : list of str, optional
......@@ -119,6 +119,8 @@ class Target(object):
elif target_name in ("metal",):
self.keys += ("gpu",)
self.max_num_threads = 256
elif target_name in ("opengl",):
self.keys += ("opengl",)
elif target_name in ("stackvm", "ext_dev"):
# Do not add any keys for stackvm or ext_dev
pass
......
......@@ -399,6 +399,11 @@ TVM_REGISTER_API("_StageDoubleBuffer")
args[0].operator Stage().double_buffer();
});
TVM_REGISTER_API("_StageOpenGL")
.set_body([](TVMArgs args, TVMRetValue *ret) {
args[0].operator Stage().opengl();
});
TVM_REGISTER_API("_ScheduleNormalize")
.set_body([](TVMArgs args, TVMRetValue* ret) {
*ret = args[0].operator Schedule()
......
/*!
* Copyright (c) 2017 by Contributors
* Build opengl modules from source.
* \file build_opengl.cc
*/
#include <tvm/base.h>
#include "./codegen_opengl.h"
#include "./build_common.h"
namespace tvm {
namespace codegen {
runtime::Module BuildOpenGL(Array<LoweredFunc> funcs) {
bool output_ssa = false;
CodeGenOpenGL cg;
cg.Init(output_ssa);
for (LoweredFunc f : funcs) {
cg.AddFunction(f);
}
auto shaders = cg.Finish();
#if TVM_OPENGL_RUNTIME
return OpenGLModuleCreate(shaders, "gl", ExtractFuncInfo(funcs));
#else
LOG(WARNING) << "OpenGL runtime not enabled, return a source module...";
auto data = ToJSON(shaders);
return DeviceSourceModuleCreate(data, "gl", ExtractFuncInfo(funcs), "opengl");
#endif // TVM_OPENGL_RUNTIME
}
TVM_REGISTER_API("codegen.build_opengl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = BuildOpenGL(args[0]);
});
} // namespace codegen
} // namespace tvm
......@@ -150,7 +150,7 @@ class CodeGenC :
std::string GetStructRef(
Type t, const Expr& buffer, const Expr& index, int kind);
// print reference to a buffer as type t in index.
std::string GetBufferRef(
virtual std::string GetBufferRef(
Type t, const Variable* buffer, Expr index);
/*!
* \brief If buffer is allocated as type t.
......
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_opengl.cc
*
* We are targeting OpenGL 3.3. The reason for not targeting a more recent
* version of OpenGL is better compatibility with WebGL 2.
*/
#include <tvm/runtime/config.h>
#include <tvm/packed_func_ext.h>
#include <vector>
#include <string>
#include "./codegen_opengl.h"
#include "../runtime/thread_storage_scope.h"
namespace tvm {
namespace codegen {
CodeGenOpenGL::CodeGenOpenGL()
: output_(nullptr), output_iter_var_(nullptr) {}
void CodeGenOpenGL::InitFuncState(LoweredFunc f) {
CodeGenC::InitFuncState(f);
output_ = nullptr;
inputs_.clear();
output_iter_var_ = nullptr;
thread_extent_var_ = "";
}
void CodeGenOpenGL::AddFunction(LoweredFunc f) {
// clear previous generated state.
this->InitFuncState(f);
this->decl_stream << "#version 300 es\n";
this->decl_stream << "precision highp float;\n";
// skip the first underscore, so SSA variable starts from _1
GetUniqueName("_");
// add to alloc buffer type.
for (const auto& kv : f->handle_data_type) {
RegisterHandleType(kv.first.get(), kv.second.type());
}
// Allocate argument names. Store in `var_idmap_`.
for (auto arg : f->args) {
auto arg_name = GetUniqueName(arg.get()->name_hint);
var_idmap_[arg.get()] = arg_name;
}
thread_extent_var_ = GetUniqueName("thread_extent");
this->decl_stream << "uniform int " << thread_extent_var_ << ";\n";
this->stream << "void main() {\n";
int func_scope = this->BeginScope();
this->PrintStmt(f->body);
this->EndScope(func_scope);
this->PrintIndent();
this->stream << "}\n\n";
// Declare arguments.
for (auto arg : f->args) {
if (this->inputs_.find(arg.get()) != this->inputs_.cend()) {
// Declare input texture.
// Format:
// - Float: "uniform sampler2D {name};"
// - Int: "uniform isampler2D {name};"
// - UInt: "uniform usampler2D {name};"
auto arg_name = GetVarID(arg.get());
auto type_it = this->handle_data_type_.find(arg.get());
CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type.";
auto type = Type2TVMType(type_it->second);
CHECK_EQ(type.lanes, 1) << "Vector type not supported.";
switch (type.code) {
case kDLInt:
this->decl_stream << "uniform isampler2D " << arg_name << ";\n";
break;
case kDLUInt:
this->decl_stream << "uniform usampler2D " << arg_name << ";\n";
break;
case kDLFloat:
this->decl_stream << "uniform sampler2D " << arg_name << ";\n";
break;
default:
LOG(FATAL) << "Unsupported type code.";
}
} else if (this->output_ == arg.get()) {
// Declare output texture.
// Format: "out {type} {name};"
auto arg_name = GetVarID(arg.get());
auto type_it = this->handle_data_type_.find(arg.get());
CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type.";
auto type = type_it->second;
this->decl_stream << "out ";
PrintType(type, this->decl_stream);
this->decl_stream << " " << arg_name << ";\n";
} else {
// Declare uniform value.
// Format: "uniform {type} {name};"
auto arg_name = GetVarID(arg.get());
auto type = arg.get()->type;
this->decl_stream << "uniform ";
PrintType(type, this->decl_stream);
this->decl_stream << " " << arg_name << ";\n";
}
}
std::vector<std::string> arg_names;
std::vector<runtime::OpenGLArgKind> arg_kinds;
for (auto arg : f->args) {
std::string name = GetVarID(arg.get());
runtime::OpenGLArgKind kind;
if (inputs_.find(arg.get()) != inputs_.cend()) {
kind = runtime::OpenGLArgKind::kInputTexture;
} else if (output_ == arg.get()) {
kind = runtime::OpenGLArgKind::kOutputTexture;
} else {
kind = runtime::OpenGLArgKind::kUniform;
}
arg_names.push_back(name);
arg_kinds.push_back(kind);
}
shaders_[f->name] = runtime::OpenGLShader(
this->decl_stream.str() + this->stream.str(),
std::move(arg_names), std::move(arg_kinds),
this->thread_extent_var_);
}
std::unordered_map<std::string, runtime::OpenGLShader> CodeGenOpenGL::Finish() {
return shaders_;
}
void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) {
CHECK_EQ(iv->thread_tag, "threadIdx.x") << "Must be threadIdx.x";
CHECK(var_idmap_.find(iv->var.get()) == var_idmap_.end())
<< "Only support one thread iter var";
CHECK(output_iter_var_ == nullptr) << "Only support one thread iter var";
var_idmap_[iv->var.get()] = iv->thread_tag;
output_iter_var_ = iv->var.get();
// Declare threadIdx local variable.
this->PrintIndent();
this->stream << "ivec2 threadIdx = ivec2(gl_FragCoord.xy);\n";
// Return directly if threadIdx.x >= thread_extent.
this->PrintIndent();
this->stream << "if (threadIdx.x >= " << thread_extent_var_ << ") {\n";
this->PrintIndent();
this->stream << " return;\n";
this->PrintIndent();
this->stream << "}\n";
}
// GLSL texture store is special. We can only store to one output texture, and
// we must store to the index that matches the current "thread index".
void CodeGenOpenGL::VisitStmt_(const Store* op) {
auto t = op->value.type();
auto buffer = op->buffer_var.get();
auto index = op->index;
if (t.lanes() == 1) {
// Store to a scalar.
CHECK(inputs_.find(buffer) == inputs_.cend())
<< "Texture has been read from before. Must not store to it.";
if (output_ == nullptr) {
output_ = buffer; // Record that this texture is the output.
} else {
CHECK(output_ == buffer) << "GLSL can only write to 1 texture.";
}
this->PrintIndent();
this->stream << GetBufferRef(t, buffer, index) << " = "
<< PrintExpr(op->value) << ";\n";
} else {
// Store to a vector.
LOG(FATAL) << "Vectorized store not implemented.";
}
}
// texelFetch(tex, ivec2(idx, 0), 0).r
std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) {
std::ostringstream os;
os << "texelFetch(" << GetVarID(buffer) << ", ivec2(";
PrintExpr(index, os);
os << ", 0), 0).r";
return os.str();
}
// Print a reference expression to a buffer.
// Format: texelFetch(buffer, index, 0).r
std::string CodeGenOpenGL::GetBufferRef(
Type t, const Variable* buffer, Expr index) {
CHECK_EQ(t.lanes(), 1) << "Vector type not supported.";
CHECK(HandleTypeMatch(buffer, t)) << "Type mismatch not supported.";
if (buffer == this->output_) {
// This is the output texture.
CHECK_EQ(index.get(), output_iter_var_)
<< "GLSL must access corresponding elem of output texture.";
return GetVarID(buffer);
} else {
// This is an input texture.
this->inputs_.insert(buffer);
return TexelFetch(buffer, index);
}
}
void CodeGenOpenGL::PrintType(Type t, std::ostream& os) {
switch (t.code()) {
case halideir_type_int:
CHECK_EQ(t.bits(), 32) << "Only support 32-bit int.";
os << "int";
break;
case halideir_type_uint:
CHECK_EQ(t.bits(), 32) << "Only support 32-bit uint.";
os << "uint";
break;
case halideir_type_float:
CHECK_EQ(t.bits(), 32) << "Only support 32-bit float.";
os << "float";
break;
default:
LOG(FATAL) << "Unsupported type code.";
}
}
// Codegen for immediate values
void CodeGenOpenGL::VisitExpr_(const IntImm* op, std::ostream& os) {
CHECK_EQ(op->type, Int(32)) << "GLSL 3.0 only supports 32-bit ints.";
CodeGenC::VisitExpr_(op, os);
}
void CodeGenOpenGL::VisitExpr_(const UIntImm* op, std::ostream& os) {
CHECK_EQ(op->type, UInt(32)) << "GLSL 3.0 only supports 32-bit uints.";
CodeGenC::VisitExpr_(op, os);
}
void CodeGenOpenGL::VisitExpr_(const FloatImm* op, std::ostream& os) {
CHECK_EQ(op->type, Float(32)) << "GLSL 3.0 only supports 32-bit floats.";
CodeGenC::VisitExpr_(op, os);
}
void CodeGenOpenGL::VisitExpr_(const StringImm*, std::ostream& os) {
LOG(FATAL) << "GLSL 3.0 doesn't support strings.";
}
} // namespace codegen
} // namespace tvm
/*!
* Copyright (c) 2017 by Contributors
* \file codegen_opengl.h
* \brief Generate OpenGL device code.
*/
#ifndef TVM_CODEGEN_CODEGEN_OPENGL_H_
#define TVM_CODEGEN_CODEGEN_OPENGL_H_
#include <tvm/codegen.h>
#include <tvm/packed_func_ext.h>
#include <string>
#include "./codegen_c.h"
#include "../runtime/opengl/opengl_module.h"
namespace tvm {
namespace codegen {
class CodeGenOpenGL final : public CodeGenC {
public:
CodeGenOpenGL();
void AddFunction(LoweredFunc f);
std::unordered_map<std::string, runtime::OpenGLShader> Finish();
void InitFuncState(LoweredFunc f) final;
void BindThreadIndex(const IterVar& iv) final;
void VisitStmt_(const Store* op) final;
std::string TexelFetch(const Variable* buffer, Expr index);
std::string GetBufferRef(Type t, const Variable* buffer, Expr index) final;
void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
// Codegen for immediate values
void VisitExpr_(const IntImm* op, std::ostream& os) final; // NOLINT(*)
void VisitExpr_(const UIntImm* op, std::ostream& os) final; // NOLINT(*)
void VisitExpr_(const FloatImm* op, std::ostream& os) final; // NOLINT(*)
void VisitExpr_(const StringImm* op, std::ostream& os) final; // NOLINT(*)
private:
const Variable* output_{nullptr};
std::unordered_set<const Variable*> inputs_;
const Variable* output_iter_var_{nullptr};
std::unordered_map<std::string, runtime::OpenGLShader> shaders_;
std::string thread_extent_var_;
};
} // namespace codegen
} // namespace tvm
#endif // TVM_CODEGEN_CODEGEN_OPENGL_H_
......@@ -49,7 +49,10 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
*rv = 1;
}
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* AllocDataSpace(TVMContext ctx,
size_t size,
size_t alignment,
TVMType type_hint) final {
// always align to 32 bytes at least.
CHECK_LE(alignment, runtime::kAllocAlignment);
alignment = runtime::kAllocAlignment;
......
......@@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
case kDLROCM: return "rocm";
case kOpenGL: return "opengl";
case kExtDev: return "ext_dev";
default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
}
......@@ -95,7 +96,8 @@ DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) {
}
void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size) {
return AllocDataSpace(ctx, size, kTempAllocaAlignment);
TVMType type_hint{kDLUInt, 8, 1};
return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
}
void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) {
......@@ -365,7 +367,7 @@ int TVMArrayAlloc(const tvm_index_t* shape,
size_t size = GetDataSize(arr);
size_t alignment = GetDataAlignment(arr);
arr->data = DeviceAPIManager::Get(arr->ctx)->AllocDataSpace(
arr->ctx, size, alignment);
arr->ctx, size, alignment, arr->dtype);
*out = arr;
API_END_HANDLE_ERROR(TVMArrayFree_(arr));
}
......
......@@ -20,13 +20,16 @@ class CPUDeviceAPI final : public DeviceAPI {
*rv = 1;
}
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) final {
void* ptr;
#if _MSC_VER
ptr = _aligned_malloc(size, alignment);
ptr = _aligned_malloc(nbytes, alignment);
if (ptr == nullptr) throw std::bad_alloc();
#else
int ret = posix_memalign(&ptr, alignment, size);
int ret = posix_memalign(&ptr, alignment, nbytes);
if (ret != 0) throw std::bad_alloc();
#endif
return ptr;
......
......@@ -54,12 +54,15 @@ class CUDADeviceAPI final : public DeviceAPI {
}
*rv = value;
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) final {
CUDA_CALL(cudaSetDevice(ctx.device_id));
CHECK_EQ(256 % alignment, 0U)
<< "CUDA space is aligned at 256 bytes";
void *ret;
CUDA_CALL(cudaMalloc(&ret, size));
CUDA_CALL(cudaMalloc(&ret, nbytes));
return ret;
}
......
......@@ -63,7 +63,10 @@ class MetalWorkspace final : public DeviceAPI {
// override device API
void SetDevice(TVMContext ctx) final;
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final;
void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
void CopyDataFromTo(const void* from,
size_t from_size,
......
......@@ -123,12 +123,12 @@ void MetalWorkspace::SetDevice(TVMContext ctx) {
}
void* MetalWorkspace::AllocDataSpace(
TVMContext ctx, size_t size, size_t alignment) {
TVMContext ctx, size_t nbytes, size_t alignment, TVMType type_hint) {
this->Init();
id<MTLDevice> dev = GetDevice(ctx);
// allocate buffer in GPU only mode.
id<MTLBuffer> buf = [
dev newBufferWithLength:size
dev newBufferWithLength:nbytes
options:MTLResourceStorageModePrivate];
CHECK(buf != nil);
return (__bridge void*)([buf retain]);
......
......@@ -115,6 +115,8 @@ bool RuntimeEnabled(const std::string& target) {
f_name = "device_api.gpu";
} else if (target == "cl" || target == "opencl") {
f_name = "device_api.opencl";
} else if (target == "gl" || target == "opengl") {
f_name = "device_api.opengl";
} else if (target == "mtl" || target == "metal") {
f_name = "device_api.metal";
} else if (target == "stackvm") {
......
......@@ -142,7 +142,10 @@ class OpenCLWorkspace final : public DeviceAPI {
// override device API
void SetDevice(TVMContext ctx) final;
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final;
void* AllocDataSpace(TVMContext ctx,
size_t size,
size_t alignment,
TVMType type_hint) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
void CopyDataFromTo(const void* from,
size_t from_offset,
......
......@@ -51,7 +51,7 @@ void OpenCLWorkspace::GetAttr(
}
void* OpenCLWorkspace::AllocDataSpace(
TVMContext ctx, size_t size, size_t alignment) {
TVMContext ctx, size_t size, size_t alignment, TVMType type_hint) {
this->Init();
CHECK(context != nullptr) << "No OpenCL device";
cl_int err_code;
......
/*!
* Copyright (c) 2017 by Contributors
* \file opengl_module.cc
*/
#include <utility>
#include "./opengl_common.h"
#include "./opengl_module.h"
#if TVM_OPENGL_RUNTIME
#include <tvm/runtime/registry.h>
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../file_util.h"
namespace tvm {
namespace runtime {
class OpenGLModuleNode final : public ModuleNode {
public:
OpenGLModuleNode(std::unordered_map<std::string, OpenGLShader> shaders,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap);
~OpenGLModuleNode() override = default;
const char* type_key() const final { return "opengl"; }
PackedFunc GetFunction(const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) final;
std::string GetSource(const std::string& format) final;
void SaveToFile(const std::string& file_name,
const std::string& format) final;
void SaveToBinary(dmlc::Stream* stream) final;
const gl::Program& GetProgram(const std::string& func_name) const;
const OpenGLShader& GetShader(const std::string& func_name) const;
const FunctionInfo& GetFunctionInfo(const std::string& func_name) const;
gl::OpenGLWorkspace& workspace() const { return *workspace_; }
private:
std::shared_ptr<gl::OpenGLWorkspace> workspace_;
std::unordered_map<std::string, OpenGLShader> shaders_;
std::string fmt_;
std::unordered_map<std::string, FunctionInfo> fmap_;
std::unordered_map<std::string, gl::Program> programs_;
DISALLOW_COPY_AND_ASSIGN(OpenGLModuleNode);
};
class OpenGLWrappedFunc {
public:
OpenGLWrappedFunc(OpenGLModuleNode* m,
std::shared_ptr<ModuleNode> sptr,
std::string func_name,
std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags);
void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const;
private:
// The module
OpenGLModuleNode* m_;
// resource handle
std::shared_ptr<ModuleNode> sptr_;
// The name of the function.
std::string func_name_;
// convert code for void argument
std::vector<size_t> arg_size_;
// thread axis config
ThreadAxisConfig thread_axis_cfg_;
};
OpenGLModuleNode::OpenGLModuleNode(
std::unordered_map<std::string, OpenGLShader> shaders,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap)
: workspace_(gl::OpenGLWorkspace::Global()), shaders_(std::move(shaders)),
fmt_(std::move(fmt)), fmap_(std::move(fmap)), programs_() {
CHECK_EQ(fmt_, "gl") << "Unknown OpenGL format " << fmt_;
for (auto &pair : shaders_) {
auto &func_name = pair.first;
auto &shader = pair.second;
programs_.emplace(func_name,
workspace_->CreateProgram(shader.source.c_str()));
}
}
PackedFunc OpenGLModuleNode::GetFunction(
const std::string& name,
const std::shared_ptr<ModuleNode>& sptr_to_self) {
CHECK_EQ(sptr_to_self.get(), this);
CHECK_NE(name, symbol::tvm_module_main) << "Device functions do not have main";
auto func_info_it = fmap_.find(name);
if (func_info_it == fmap_.end()) { return PackedFunc(); }
auto &func_info = func_info_it->second;
std::vector<size_t> arg_size(func_info.arg_types.size());
for (size_t i = 0; i < func_info.arg_types.size(); ++i) {
TVMType t = func_info.arg_types[i];
CHECK_EQ(t.lanes, 1U);
uint32_t bits = t.bits;
CHECK_EQ(bits % 8, 0U);
arg_size[i] = bits / 8;
}
// Initialize the wrapped func.
OpenGLWrappedFunc f(this, sptr_to_self, name, arg_size,
func_info.thread_axis_tags);
return PackFuncVoidAddr(f, func_info.arg_types);
}
std::string OpenGLModuleNode::GetSource(const std::string& format) {
if (format != fmt_ && fmt_ != "gl") { return ""; }
std::ostringstream os;
for (auto &pair : shaders_) {
auto &name = pair.first;
auto &shader = pair.second;
os << "[" << name << "]" << "\n";
os << shader.source <<"\n";
}
return os.str();
}
void OpenGLModuleNode::SaveToFile(const std::string& file_name,
const std::string& format) {
std::string fmt = GetFileFormat(file_name, format);
CHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_;
std::string meta_file = GetMetaFilePath(file_name);
SaveMetaDataToFile(meta_file, fmap_);
SaveBinaryToFile(file_name, ToJSON(shaders_));
}
void OpenGLModuleNode::SaveToBinary(dmlc::Stream* stream) {
stream->Write(fmt_);
stream->Write(fmap_);
stream->Write(ToJSON(shaders_));
}
const gl::Program& OpenGLModuleNode::GetProgram(
const std::string& func_name) const {
auto it = programs_.find(func_name);
if (it == programs_.end()) {
LOG(FATAL) << "Cannot find program";
}
return it->second;
}
const OpenGLShader& OpenGLModuleNode::GetShader(
const std::string& func_name) const {
auto it = shaders_.find(func_name);
if (it == shaders_.end()) {
LOG(FATAL) << "Cannot find shader";
}
return it->second;
}
const FunctionInfo& OpenGLModuleNode::GetFunctionInfo(
const std::string& func_name) const {
auto it = fmap_.find(func_name);
if (it == fmap_.end()) {
LOG(FATAL) << "Cannot find shader";
}
return it->second;
}
OpenGLWrappedFunc::OpenGLWrappedFunc(
OpenGLModuleNode* m,
std::shared_ptr<ModuleNode> sptr,
std::string func_name,
std::vector<size_t> arg_size,
const std::vector<std::string>& thread_axis_tags)
: m_(m), sptr_(std::move(sptr)), func_name_(std::move(func_name)),
arg_size_(std::move(arg_size)) {
thread_axis_cfg_.Init(arg_size_.size(), thread_axis_tags);
}
void OpenGLWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv,
void** void_args) const {
auto &shader = m_->GetShader(func_name_);
auto &program = m_->GetProgram(func_name_);
auto &func_info = m_->GetFunctionInfo(func_name_);
size_t nargs = shader.arg_kinds.size();
// Must call this function before setting uniforms & input textures.
m_->workspace().SetCurrentProgram(program);
// Set all arguments.
GLuint texture_unit = 0;
gl::Texture* output = nullptr;
for (size_t i = 0; i != nargs; ++i) {
auto &name = shader.arg_names.at(i);
auto kind = shader.arg_kinds.at(i);
auto type = func_info.arg_types.at(i);
switch (kind) {
case OpenGLArgKind::kUniform: {
m_->workspace().SetUniform(program, name, type, void_args[i]);
break;
}
case OpenGLArgKind::kInputTexture: {
CHECK_EQ(type.code, kHandle) << "Type is not handle?";
auto texture = *static_cast<gl::Texture**>(void_args[i]);
m_->workspace().SetInputTexture(program, name, texture_unit, texture);
++texture_unit;
break;
}
case OpenGLArgKind::kOutputTexture: {
CHECK_EQ(type.code, kHandle) << "Type is not handle?";
CHECK(output == nullptr) << "Can only have one output texture.";
output = *static_cast<gl::Texture**>(void_args[i]);
break;
}
}
}
// Set "thread_extent" uniform.
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
std::unique_ptr<GLint> thread_extent(new GLint(wl.block_dim(0)));
m_->workspace().SetUniform(program, shader.thread_extent_var,
TVMType{kDLInt, 32, 1},
static_cast<void*>(thread_extent.get()));
m_->workspace().Render(output);
}
Module OpenGLModuleCreate(std::unordered_map<std::string, OpenGLShader> shaders,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap) {
auto n = std::make_shared<OpenGLModuleNode>(std::move(shaders),
std::move(fmt),
std::move(fmap));
return Module(n);
}
Module OpenGLModuleLoadFile(const std::string& file_name,
const std::string& format) {
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt = GetFileFormat(file_name, format);
std::string meta_file = GetMetaFilePath(file_name);
LoadBinaryFromFile(file_name, &data);
LoadMetaDataFromFile(meta_file, &fmap);
return OpenGLModuleCreate(FromJSON(data), fmt, fmap);
}
Module OpenGLModuleLoadBinary(void* strm) {
auto stream = static_cast<dmlc::Stream*>(strm);
std::string data;
std::unordered_map<std::string, FunctionInfo> fmap;
std::string fmt;
stream->Read(&fmt);
stream->Read(&fmap);
stream->Read(&data);
return OpenGLModuleCreate(FromJSON(data), fmt, fmap);
}
TVM_REGISTER_GLOBAL("module.loadfile_gl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenGLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadfile_glbin")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenGLModuleLoadFile(args[0], args[1]);
});
TVM_REGISTER_GLOBAL("module.loadbinary_opengl")
.set_body([](TVMArgs args, TVMRetValue* rv) {
*rv = OpenGLModuleLoadBinary(args[0]);
});
} // namespace runtime
} // namespace tvm
#endif // TVM_OPENGL_RUNTIME
/*!
* Copyright (c) 2017 by Contributors
* \file opengl_module.h
* \brief Execution handling of OpenGL kernels
*/
#ifndef TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
#define TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
#include <tvm/runtime/config.h>
#include <tvm/runtime/packed_func.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "../meta_data.h"
namespace tvm {
namespace runtime {
/*!
* \brief Determines how we supply arguments.
*/
enum class OpenGLArgKind {
kInputTexture = 0, // Bind to "gsampler2D" in GLSL.
kOutputTexture = 1, // Bind to "out" in GLSL.
kUniform = 2, // Bind to "uniform" in GLSL.
};
std::string OpenGLArgKind2String(OpenGLArgKind kind);
OpenGLArgKind String2OpenGLArgKind(const std::string& str);
/*!
* \brief The output of OpenGL codegen.
* Contains necessary information to build a fragment shader and bind arguments.
*/
struct OpenGLShader {
OpenGLShader() = default;
OpenGLShader(std::string source,
std::vector<std::string> arg_names,
std::vector<OpenGLArgKind> arg_kinds,
std::string thread_extent_var)
: source(std::move(source)), arg_names(std::move(arg_names)),
arg_kinds(std::move(arg_kinds)),
thread_extent_var(std::move(thread_extent_var)) {
CHECK_EQ(this->arg_names.size(), this->arg_kinds.size()) << "Invalid input";
}
std::string source;
std::vector<std::string> arg_names; // Matches FunctionInfo.
std::vector<OpenGLArgKind> arg_kinds; // Matches FunctionInfo.
std::string thread_extent_var; // Stores the output length.
void Save(dmlc::JSONWriter* writer) const;
void Load(dmlc::JSONReader* reader);
};
std::string ToJSON(const std::unordered_map<std::string, OpenGLShader>& shaders);
std::unordered_map<std::string, OpenGLShader> FromJSON(const std::string& str);
/*!
* \brief Create an OpenGL module from data.
*
* \param shaders The generated shaders, keyed by function name.
* \param fmt The format of the module.
* \param fmap The function information map of each function.
*/
Module OpenGLModuleCreate(std::unordered_map<std::string, OpenGLShader> shaders,
std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap);
inline std::string OpenGLArgKind2String(OpenGLArgKind kind) {
switch (kind) {
case OpenGLArgKind::kOutputTexture:
return "output_texture";
case OpenGLArgKind::kInputTexture:
return "input_texture";
case OpenGLArgKind::kUniform:
return "uniform";
}
assert(false);
}
inline OpenGLArgKind String2OpenGLArgKind(const std::string& str) {
if (str == "output_texture") {
return OpenGLArgKind::kOutputTexture;
} else if (str == "input_texture") {
return OpenGLArgKind::kInputTexture;
} else if (str == "uniform") {
return OpenGLArgKind::kUniform;
} else {
LOG(FATAL) << "Invalid OpenGL arg kind.";
assert(false);
}
}
inline void OpenGLShader::Save(dmlc::JSONWriter* writer) const {
std::vector<std::string> arg_kind_strs;
for (auto kind : arg_kinds) {
arg_kind_strs.push_back(OpenGLArgKind2String(kind));
}
writer->BeginObject();
writer->WriteObjectKeyValue("arg_names", arg_names);
writer->WriteObjectKeyValue("arg_kinds", arg_kind_strs);
writer->WriteObjectKeyValue("source", source);
writer->WriteObjectKeyValue("thread_extent_var", thread_extent_var);
writer->EndObject();
}
inline void OpenGLShader::Load(dmlc::JSONReader* reader) {
std::vector<std::string> arg_kind_strs;
dmlc::JSONObjectReadHelper helper;
helper.DeclareField("arg_names", &arg_names);
helper.DeclareField("arg_kinds", &arg_kind_strs);
helper.DeclareField("source", &source);
helper.DeclareField("thread_extent_var", &thread_extent_var);
helper.ReadAllFields(reader);
arg_kinds.clear();
for (auto& str : arg_kind_strs) {
arg_kinds.push_back(String2OpenGLArgKind(str));
}
}
inline std::string ToJSON(
const std::unordered_map<std::string, OpenGLShader>& shaders) {
std::ostringstream os;
dmlc::JSONWriter writer(&os);
writer.BeginObject();
writer.WriteObjectKeyValue("shaders", shaders);
writer.EndObject();
return os.str();
}
inline std::unordered_map<std::string, OpenGLShader> FromJSON(
const std::string& str) {
std::unordered_map<std::string, OpenGLShader> shaders;
std::istringstream is(str);
dmlc::JSONReader reader(&is);
dmlc::JSONObjectReadHelper helper;
helper.DeclareField("shaders", &shaders);
helper.ReadAllFields(&reader);
return shaders;
}
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENGL_OPENGL_MODULE_H_
......@@ -52,12 +52,15 @@ class ROCMDeviceAPI final : public DeviceAPI {
}
*rv = value;
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) final {
ROCM_CALL(hipSetDevice(ctx.device_id));
CHECK_EQ(256 % alignment, 0U)
<< "ROCM space is aligned at 256 bytes";
void *ret;
ROCM_CALL(hipMalloc(&ret, size));
ROCM_CALL(hipMalloc(&ret, nbytes));
return ret;
}
......
......@@ -20,10 +20,13 @@ class RPCDeviceAPI final : public DeviceAPI {
*rv = GetSess(ctx)->CallRemote(
RPCCode::kDevGetAttr, ctx, static_cast<int>(kind));
}
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
void* AllocDataSpace(TVMContext ctx,
size_t nbytes,
size_t alignment,
TVMType type_hint) final {
auto sess = GetSess(ctx);
void *data = sess->CallRemote(
RPCCode::kDevAllocData, ctx, size, alignment);
RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint);
RemoteSpace* space = new RemoteSpace();
space->data = data;
space->sess = std::move(sess);
......
......@@ -887,9 +887,11 @@ void RPCDevGetAttr(TVMArgs args, TVMRetValue *rv) {
void RPCDevAllocData(TVMArgs args, TVMRetValue *rv) {
TVMContext ctx = args[0];
uint64_t size = args[1];
uint64_t nbytes = args[1];
uint64_t alignment = args[2];
void* data = DeviceAPI::Get(ctx)->AllocDataSpace(ctx, size, alignment);
TVMType type_hint = args[3];
void* data = DeviceAPI::Get(ctx)->AllocDataSpace(
ctx, nbytes, alignment, type_hint);
*rv = data;
}
......
......@@ -23,28 +23,29 @@ class WorkspacePool::Pool {
allocated_.push_back(e);
}
// allocate from pool
void* Alloc(TVMContext ctx, DeviceAPI* device, size_t size) {
void* Alloc(TVMContext ctx, DeviceAPI* device, size_t nbytes) {
// Allocate align to page.
size = (size + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
if (size == 0) size = kWorkspacePageSize;
nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
if (nbytes == 0) nbytes = kWorkspacePageSize;
Entry e;
TVMType type = {.code = kDLUInt, .bits = 8, .lanes = 1};
if (free_list_.size() == 2) {
e = free_list_.back();
free_list_.pop_back();
if (e.size < size) {
if (e.size < nbytes) {
// resize the page
device->FreeDataSpace(ctx, e.data);
e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
e.size = size;
e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.size = nbytes;
}
} else if (free_list_.size() == 1) {
e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
e.size = size;
e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.size = nbytes;
} else {
if (free_list_.back().size >= size) {
if (free_list_.back().size >= nbytes) {
// find smallest fit
auto it = free_list_.end() - 2;
for (; it->size >= size; --it) {}
for (; it->size >= nbytes; --it) {}
e = *(it + 1);
free_list_.erase(it + 1);
} else {
......@@ -52,8 +53,8 @@ class WorkspacePool::Pool {
e = free_list_.back();
free_list_.pop_back();
device->FreeDataSpace(ctx, e.data);
e.data = device->AllocDataSpace(ctx, size, kTempAllocaAlignment);
e.size = size;
e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.size = nbytes;
}
}
allocated_.push_back(e);
......
......@@ -397,6 +397,45 @@ Stage& Stage::double_buffer() {
return *this;
}
Stage& Stage::opengl() {
CHECK(!is_scheduled()) << "Must be a fresh schedule";
StageNode *self = operator->();
auto all_iter_vars = self->all_iter_vars; // curr version of all_iter_vars
CHECK(!all_iter_vars.empty()) << "At least one iter var";
// Fuse all data parallel dimensions to 1.
IterVar fused = all_iter_vars[0];
for (size_t i = 1; i != all_iter_vars.size(); ++i) {
auto iter_var = all_iter_vars[i];
switch (iter_var->iter_type) {
case IterVarType::kDataPar: {
fuse(fused, all_iter_vars[i], &fused);
break;
}
case IterVarType::kThreadIndex: {
LOG(ERROR) << "A fresh schedule shouldn't have thread index iter var";
break;
}
case IterVarType::kCommReduce:
case IterVarType::kOrdered:
case IterVarType::kOpaque: {
break;
}
default: {
LOG(ERROR) << "Invalid iter var type "
<< IterVarType2String(iter_var->iter_type);
break;
}
}
}
// Bind the only dimension to threadIdx.x.
bind(fused, thread_axis(Range(nullptr), "threadIdx.x"));
return *this;
}
Stage CopyStage(const Stage& s) {
std::shared_ptr<StageNode> n =
std::make_shared<StageNode>(*s.operator->());
......
......@@ -20,3 +20,6 @@ RUN bash /install/ubuntu_install_java.sh
COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
RUN bash /install/ubuntu_install_llvm.sh
COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
RUN bash /install/ubuntu_install_opengl.sh
......@@ -37,6 +37,9 @@ RUN bash /install/ubuntu_install_nodejs.sh
COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
RUN bash /install/ubuntu_install_rocm.sh
COPY install/ubuntu_install_opengl.sh /install/ubuntu_install_opengl.sh
RUN bash /install/ubuntu_install_opengl.sh
# Enable doxygen for c++ doc build
RUN apt-get install -y doxygen graphviz
......
apt-get update --fix-missing
apt-get install -y --no-install-recommends --force-yes \
libgl1-mesa-dev libglfw3-dev
\ No newline at end of file
......@@ -7,7 +7,8 @@ def enabled_ctx_list():
('cl', tvm.opencl(0)),
('metal', tvm.metal(0)),
('rocm', tvm.rocm(0)),
('vpi', tvm.vpi(0))]
('vpi', tvm.vpi(0)),
('opengl', tvm.opengl(0))]
for k, v in ctx_list:
assert tvm.context(k, 0) == v
ctx_list = [x[1] for x in ctx_list if x[1].exist]
......@@ -19,7 +20,8 @@ print("Testing using contexts:", ENABLED_CTX_LIST)
def test_nd_create():
for ctx in ENABLED_CTX_LIST:
for dtype in ["float32", "int8", "uint16"]:
for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32",
"float32"]:
x = np.random.randint(0, 10, size=(3, 4))
x = np.array(x, dtype=dtype)
y = tvm.nd.array(x, ctx=ctx)
......
......@@ -17,3 +17,5 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1
TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1
TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1
TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1
TVM_FFI=cython python -m nose -v tests/webgl || exit -1
TVM_FFI=ctypes python3 -m nose -v tests/webgl || exit -1
## Test cases for the WebGL backend
Any test case whose name starts with `test_local_...` tests the C++ OpenGL backend
on the local OS and can be executed automatically.
Any test case whose name starts with `test_remote_...` tests the WebGL backend within
the browser and must be run manually. See the instructions within the test.
import tvm
import numpy as np
def test_local_gemm():
if not tvm.module.enabled("opengl"):
return
if not tvm.module.enabled("llvm"):
return
nn = 2
n = tvm.var('n')
n = tvm.convert(nn)
m = n
l = n
A = tvm.placeholder((n, l), name='A', dtype='int32')
B = tvm.placeholder((m, l), name='B', dtype='int32')
k = tvm.reduce_axis((0, l), name='k')
C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
name='CC')
s = tvm.create_schedule(C.op)
s[C].opengl()
print(tvm.lower(s, [A, B, C], simple_mode=True))
f = tvm.build(s, [A, B, C], "opengl", name="gemm")
print("------opengl code------")
print(f.imported_modules[0].get_source(fmt="gl"))
ctx = tvm.opengl()
n, m, l = nn, nn, nn
a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype)
b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype)
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
f(a, b, c)
np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
if __name__ == "__main__":
test_local_gemm()
import numpy as np
import tvm
from tvm.contrib import rpc, util, emscripten
def test_local_save_load():
if not tvm.module.enabled("opengl"):
return
if not tvm.module.enabled("llvm"):
return
n = tvm.var("n")
A = tvm.placeholder((n,), name='A', dtype='int32')
B = tvm.placeholder((n,), name='B', dtype='int32')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
s = tvm.create_schedule(C.op)
s[C].opengl()
f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd")
ctx = tvm.opengl(0)
n = 10
a = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(high=10, size=(n)).astype(B.dtype), ctx)
c = tvm.nd.array(np.zeros((n), dtype=C.dtype), ctx)
f(a, b, c)
temp = util.tempdir()
path_so = temp.relpath("myadd.so")
f.export_library(path_so)
f1 = tvm.module.load(path_so)
f1(a, b, c)
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
if __name__ == "__main__":
test_local_save_load()
"""
The following instructions are based on web/README.md.
Setup an RPC server:
$ python -m tvm.exec.rpc_proxy --example-rpc=1
Go to http://localhost:9190 in browser.
Click "Connect To Proxy".
Run this test script:
$ python tests/webgl/test_remote_save_load.py
"""
import numpy as np
import tvm
from tvm.contrib import rpc, util, emscripten
proxy_host = "localhost"
proxy_port = 9090
def try_remote_save_load():
if not tvm.module.enabled("rpc"):
return
if not tvm.module.enabled("opengl"):
return
if not tvm.module.enabled("llvm"):
return
# Build the module.
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
s = tvm.create_schedule(C.op)
s[C].opengl()
target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")
remote = rpc.connect(proxy_host, proxy_port, key="js")
temp = util.tempdir()
ctx = remote.opengl(0)
path_obj = temp.relpath("myadd.bc")
path_dso = temp.relpath("myadd.js")
path_gl = temp.relpath("myadd.gl")
path_json = temp.relpath("myadd.tvm_meta.json")
f.save(path_obj)
emscripten.create_js(path_dso, path_obj, side_module=True)
f.imported_modules[0].save(path_gl)
remote.upload(path_dso, "myadd.dso")
remote.upload(path_gl)
remote.upload(path_json)
remote.download("myadd.dso")
remote.download("myadd.gl")
remote.download("myadd.tvm_meta.json")
print('Loading myadd.dso')
fhost = remote.load_module("myadd.dso")
print('Loading myadd.gl')
fdev = remote.load_module("myadd.gl")
print('import_module')
fhost.import_module(fdev)
print('running...')
a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
fhost(a, b, c)
np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
if __name__ == "__main__":
try_remote_save_load()
......@@ -36,5 +36,9 @@
<button onclick="connect_rpc()">Connect To Proxy</button>
<button onclick="clear_log()">Clear Log</button>
<div id="log"></div>
<canvas id="canvas"></canvas>
<script>
Module["canvas"] = document.getElementById("canvas");
</script>
</body>
</html>
......@@ -686,7 +686,8 @@ var tvm_runtime = tvm_runtime || {};
2 : "gpu",
4 : "opencl",
8 : "metal",
9 : "vpi"
9 : "vpi",
11 : "opengl",
};
var CTX_STR2MASK = {
"cpu": 1,
......@@ -695,7 +696,8 @@ var tvm_runtime = tvm_runtime || {};
"cl": 4,
"opencl": 4,
"metal": 8,
"vpi": 9
"vpi": 9,
"opengl": 11,
};
TVMContext.prototype = {
toString : function() {
......
......@@ -18,6 +18,8 @@
#include "../src/runtime/rpc/rpc_event_impl.cc"
#include "../src/runtime/rpc/rpc_server_env.cc"
#include "../src/runtime/graph/graph_runtime.cc"
#include "../src/runtime/opengl/opengl_device_api.cc"
#include "../src/runtime/opengl/opengl_module.cc"
namespace tvm {
namespace contrib {
......