Commit 41d4dd6e, authored by Wuwei Lin and committed by Tianqi Chen

Use int for int8x4 due to performance overhead of char4 (#1569)

* Use int for int8x4 due to performance overhead of char4

* Add a comment about using int

* Remove invalid test
parent 85483c37
...@@ -90,7 +90,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*) ...@@ -90,7 +90,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
if (t.lanes() == 4) { if (t.lanes() == 4) {
// directly 4 8 bit int in integer. // directly 4 8 bit int in integer.
enable_int8_ = true; enable_int8_ = true;
os << "char4"; return;
// We use int for int8x4 instead of char4 because using char4 is
// likely to produce extra instructions to pack four int8 elements
// into 32-bit data.
os << "int"; return;
} else if (t.lanes() == 8) { } else if (t.lanes() == 8) {
enable_int8_ = true; enable_int8_ = true;
os << "int2"; return; os << "int2"; return;
......
...@@ -31,7 +31,6 @@ def test_cuda_vectorize_add(): ...@@ -31,7 +31,6 @@ def test_cuda_vectorize_add():
check_cuda("float32", 64, 2) check_cuda("float32", 64, 2)
check_cuda("float16", 64, 2) check_cuda("float16", 64, 2)
check_cuda("int8", 64, 4)
def test_cuda_multiply_add(): def test_cuda_multiply_add():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment