Commit 163c4795 by Tianqi Chen, committed by GitHub

[CODEGEN] Bugfix multiple condition generation (#558)

parent 10faa893
@@ -131,26 +131,29 @@ class CodeGenAMDGPU : public CodeGenLLVM {
   }
 };
 
+inline int DetectROCMComputeVersion() {
+  TVMContext tvm_ctx;
+  tvm_ctx.device_type = kROCM;
+  tvm_ctx.device_id = 0;
+  TVMRetValue val;
+  tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(
+      tvm_ctx, tvm::runtime::kExist, &val);
+  if (val.operator int() == 1) {
+    tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(tvm_ctx, tvm::runtime::kComputeVersion, &val);
+    return val.operator int();
+  } else {
+    return 803;
+  }
+}
+
 runtime::Module BuildAMDGPU(Array<LoweredFunc> funcs, std::string target) {
-  CHECK(target.length(
-  ) >= 4 &&
-      target.substr(0, 4) == "rocm");
-  TVMContext tvmCtx;
-  tvmCtx.device_type = kROCM;
-  tvmCtx.device_id = 0;
-  TVMRetValue val;
-  tvm::runtime::DeviceAPI::Get(tvmCtx)->GetAttr(tvmCtx, tvm::runtime::kExist, &val);
-  if (val.operator int() == 1) {
-    tvm::runtime::DeviceAPI::Get(tvmCtx)->GetAttr(tvmCtx, tvm::runtime::kComputeVersion, &val);
-  } else {
-    val = 803;
-  }
-  llvm::TargetMachine* tm = \
-      GetLLVMTargetMachine("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx" + \
-      std::to_string(val.operator int()) + target.substr(4, target.length() - 4));
+  CHECK(target.length() >= 4 &&
+        target.substr(0, 4) == "rocm");
+  std::ostringstream config;
+  config << "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx"
+         << DetectROCMComputeVersion()
+         << target.substr(4, target.length() - 4);
+  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenAMDGPU> cg(new CodeGenAMDGPU());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
   cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
@@ -159,7 +162,6 @@ runtime::Module BuildAMDGPU(Array<LoweredFunc> funcs, std::string target) {
   }
   std::unique_ptr<llvm::Module> module = cg->Finish();
   llvm::SmallString<8> dataObj, data_ll, dataAsm;
   llvm::raw_svector_ostream destObj(dataObj), dest_ll(data_ll), destAsm(dataAsm);
   destObj.SetUnbuffered();
...
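BuildAMDGPU now asks the runtime whether a ROCm device is attached and falls back to gfx803 when none is found; the detected version is spliced into the LLVM target string together with whatever the caller appended after "rocm". A minimal Python sketch of that string assembly (the 803 fallback and the flag layout come from the C++ above; the helper name and the "-foo" suffix are purely illustrative):

def rocm_llvm_target(target, detected_version=803):
    # Mirror of the config string built in BuildAMDGPU above: strip the "rocm"
    # prefix, splice in the detected gfx version, keep any trailing flags.
    assert target[:4] == "rocm"
    return ("-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx%d%s"
            % (detected_version, target[4:]))

# rocm_llvm_target("rocm")           -> "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx803"
# rocm_llvm_target("rocm -foo", 900) -> "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx900 -foo"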
@@ -582,14 +582,16 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) {
     builder_->CreateCondBr(MakeValue(op->args[0]), then_block, else_block);
     builder_->SetInsertPoint(then_block);
     llvm::Value* then_value = MakeValue(op->args[1]);
+    BasicBlock* then_value_block = builder_->GetInsertBlock();
     builder_->CreateBr(end_block);
     builder_->SetInsertPoint(else_block);
     llvm::Value* else_value = MakeValue(op->args[2]);
+    BasicBlock* else_value_block = builder_->GetInsertBlock();
     builder_->CreateBr(end_block);
     builder_->SetInsertPoint(end_block);
     llvm::PHINode* value = builder_->CreatePHI(then_value->getType(), 2);
-    value->addIncoming(then_value, then_block);
-    value->addIncoming(else_value, else_block);
+    value->addIncoming(then_value, then_value_block);
+    value->addIncoming(else_value, else_value_block);
     return value;
   } else {
     LOG(FATAL) << "unknown intrinsic " << op->name;
...
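This hunk is the bugfix the commit title refers to. MakeValue(op->args[1]) can itself emit control flow (for example when the then/else value contains another condition), so by the time it returns the builder may be positioned in a block other than the original then_block. The block that actually branches to end_block is whatever GetInsertBlock() reports after the value has been generated, and that is the block the PHI node must record as its incoming edge. A small, self-contained Python sketch of the idea, using a toy builder rather than the TVM or LLVM APIs:

class ToyBuilder:
    """Toy stand-in for an IR builder: only tracks the current block name."""
    def __init__(self):
        self.current_block = "entry"
        self.counter = 0

    def new_block(self, prefix):
        self.counter += 1
        self.current_block = "%s.%d" % (prefix, self.counter)
        return self.current_block

def emit_cond_value(builder, depth):
    # Emitting a conditional value opens a then-block; if the value itself
    # contains another condition, more blocks are opened inside it.
    then_block = builder.new_block("then")
    if depth > 0:
        emit_cond_value(builder, depth - 1)
    # The correct incoming block for the phi is where the builder is *now*,
    # not the then_block opened at the start.
    then_value_block = builder.current_block
    return then_block, then_value_block

b = ToyBuilder()
opened, actual = emit_cond_value(b, depth=2)
print(opened, actual)  # "then.1" vs "then.3" -- they differ once conditions nest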
@@ -130,12 +130,34 @@ class CodeGenNVPTX : public CodeGenLLVM {
   }
 };
 
+inline int DetectCUDAComputeVersion() {
+  TVMContext tvm_ctx;
+  tvm_ctx.device_type = kGPU;
+  tvm_ctx.device_id = 0;
+  TVMRetValue val;
+  tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(
+      tvm_ctx, tvm::runtime::kExist, &val);
+  if (val.operator int() == 1) {
+    tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(
+        tvm_ctx, tvm::runtime::kComputeVersion, &val);
+    std::string version = val;
+    std::istringstream is(version);
+    double ver;
+    is >> ver;
+    return static_cast<int>(ver * 10);
+  } else {
+    return 20;
+  }
+}
+
 runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
   CHECK(target.length() >= 5 &&
         target.substr(0, 5) == "nvptx");
-  llvm::TargetMachine* tm = GetLLVMTargetMachine(
-      "-mtriple=nvptx64-nvidia-cuda -mcpu=sm_20" +
-      target.substr(5, target.length() - 5));
+  std::ostringstream config;
+  config << "-mtriple=nvptx64-nvidia-cuda -mcpu=sm_"
+         << DetectCUDAComputeVersion()
+         << target.substr(5, target.length() - 5);
+  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenNVPTX> cg(new CodeGenNVPTX());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
   cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
...
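DetectCUDAComputeVersion parses the compute-capability string reported by the runtime (e.g. "6.1") into the integer used for the -mcpu flag, and defaults to sm_20 when no CUDA device is visible. A Python sketch of the same parse, for illustration only:

def cuda_mcpu(version_str=None):
    # Mirrors DetectCUDAComputeVersion above: "6.1" -> 61 -> "sm_61";
    # no device (hence no version string) -> "sm_20".
    if version_str is None:
        return "sm_20"
    ver = float(version_str)          # the `is >> ver` parse
    return "sm_%d" % int(ver * 10)    # static_cast<int>(ver * 10)

# cuda_mcpu("6.1") -> "sm_61", cuda_mcpu("5.2") -> "sm_52", cuda_mcpu() -> "sm_20"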
@@ -22,6 +22,7 @@ def schedule_injective(outs):
     target = tvm.target.current_target(allow_none=False)
     if target.target_name != "llvm":
         raise RuntimeError("schedule_injective not registered for '%s'" % target)
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     x = outs[0]
     s = tvm.create_schedule([x.op for x in outs])
     tvm.schedule.AutoInlineInjective(s)
...
@@ -6,6 +6,7 @@ import tvm
 def _default_schedule(outs, auto_inline):
     """Default schedule for llvm."""
     target = tvm.target.current_target(allow_none=False)
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     if target.target_name != "llvm":
         raise RuntimeError("schedule_pool not registered for '%s'" % target)
     s = tvm.create_schedule([x.op for x in outs])
...
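Both schedule helpers above gain the same normalization, so callers may pass either a single output Tensor or a list of Tensors. A short sketch of the idiom in isolation, assuming the TVM 0.x API of that era (the placeholder/compute setup is illustrative; the isinstance check is the line added above):

import tvm

def _normalize_outs(outs):
    # Wrap a lone Tensor so downstream code can always iterate over a list.
    return [outs] if isinstance(outs, tvm.tensor.Tensor) else outs

A = tvm.placeholder((16,), name="A")
B = tvm.compute((16,), lambda i: A[i] + tvm.const(1, "float32"), name="B")

# Both call forms now reach the same code path:
s1 = tvm.create_schedule([x.op for x in _normalize_outs(B)])    # single Tensor
s2 = tvm.create_schedule([x.op for x in _normalize_outs([B])])  # list of Tensors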
@@ -125,10 +125,10 @@ def test_gemm():
         GFLOPS = num_flops / (t * 1e3) / 1e6
         print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS))
 
-    for device in ['cuda', 'opencl', 'rocm']:
+    for device in ["cuda", "opencl", "rocm"]:
         with tvm.build_config(auto_unroll_max_step=32,
                               auto_unroll_min_depth=0,
-                              unroll_explicit=device == 'rocm'):
+                              unroll_explicit=(device != "cuda")):
             check_device(device)
 
 if __name__ == "__main__":
...
@@ -74,11 +74,9 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
         for _ in range(1):
             foo(data_tvm, out_tvm)
         np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["cuda", "opencl", "metal", "llvm", "rocm"]:
+        check_device(device)
 
 def test_reduce_map():
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
...
@@ -3,6 +3,7 @@ import os
 import numpy as np
 import tvm
 import topi
+import logging
 from topi.util import get_const_tuple
 
 def verify_softmax(m, n):
@@ -42,8 +43,6 @@ def verify_log_softmax(m, n):
     # confirm lower works
     s = tvm.create_schedule([B.op])
     tvm.lower(s, [A, B], simple_mode=True)
-
-
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = topi.testing.log_softmax_python(a_np)
@@ -60,13 +59,15 @@ def verify_log_softmax(m, n):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm']:
+    for device in ["cuda", "opencl", "metal", "rocm"]:
         check_device(device)
 
 def test_log_softmax():
     verify_log_softmax(32, 10)
     verify_log_softmax(3, 4)
 
 if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
     test_softmax()
     test_log_softmax()
...
@@ -21,10 +21,8 @@ def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def verify_tranpose(in_shape, axes):
@@ -45,10 +43,9 @@ def verify_tranpose(in_shape, axes):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("cuda")
-    check_device("opencl")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def verify_reshape(src_shape, dst_shape):
     A = tvm.placeholder(shape=src_shape, name="A")
@@ -68,10 +65,9 @@ def verify_reshape(src_shape, dst_shape):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("cuda")
-    check_device("opencl")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def verify_squeeze(src_shape, axis):
     A = tvm.placeholder(shape=src_shape, name="A")
@@ -95,10 +91,8 @@ def verify_squeeze(src_shape, axis):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("cuda")
-    check_device("opencl")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def verify_concatenate(shapes, axis):
     tensor_l = []
@@ -120,10 +114,9 @@ def verify_concatenate(shapes, axis):
         foo(*(data_nds + [out_nd]))
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("cuda")
-    check_device("opencl")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def verify_split(src_shape, indices_or_sections, axis):
     A = tvm.placeholder(shape=src_shape, name="A")
@@ -144,10 +137,9 @@ def verify_split(src_shape, indices_or_sections, axis):
         for out_nd, out_npy in zip(out_nds, out_npys):
             np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("cuda")
-    check_device("opencl")
-    check_device("metal")
-    check_device("rocm")
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
+        check_device(device)
 
 def test_expand_dims():
     verify_expand_dims((3, 10), (3, 10, 1, 1), 2, 2)
@@ -175,6 +167,7 @@ def test_squeeze():
 
 def test_concatenate():
+    verify_concatenate([(2,), (2,), (2,)], 0)
     verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1)
     verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1)
     verify_concatenate([(5, 6, 7, 3),
@@ -190,9 +183,9 @@ def test_split():
     verify_split((10, 12, 24), [5, 7, 9], -1)
 
 if __name__ == "__main__":
+    test_concatenate()
     test_tranpose()
     test_expand_dims()
     test_reshape()
     test_squeeze()
-    test_concatenate()
     test_split()