Commit 2536465c by Andrew Tulloch Committed by Tianqi Chen

Vulkan2 Runtime API (#3849)

parent 06aecc60
......@@ -62,8 +62,7 @@
#endif
#ifdef TVM_VULKAN_RUNTIME
#include "../src/runtime/vulkan/vulkan_device_api.cc"
#include "../src/runtime/vulkan/vulkan_module.cc"
#include "../src/runtime/vulkan/vulkan.cc"
#endif
#ifdef USE_SORT
......
......@@ -18,6 +18,14 @@
# Be compatible with older version of CMake
find_vulkan(${USE_VULKAN})
# Extra Vulkan runtime options, exposed for advanced users.
tvm_option(USE_VULKAN_IMMEDIATE_MODE "Use Vulkan Immediate mode
(KHR_push_descriptor extension)" ON IF USE_VULKAN)
tvm_option(USE_VULKAN_DEDICATED_ALLOCATION "Use Vulkan dedicated allocations" ON
IF USE_VULKAN)
tvm_option(USE_VULKAN_VALIDATION "Enable Vulkan API validation layers" OFF
IF USE_VULKAN)
if(Vulkan_FOUND)
# always set the includedir
# avoid global retrigger of cmake
......@@ -28,12 +36,24 @@ if(USE_VULKAN)
if(NOT Vulkan_FOUND)
message(FATAL_ERROR "Cannot find Vulkan, USE_VULKAN=" ${USE_VULKAN})
endif()
message(STATUS "Build with VULKAN support")
file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/*.cc)
message(STATUS "Build with Vulkan support")
file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/vulkan.cc)
file(GLOB COMPILER_VULKAN_SRCS src/codegen/spirv/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_VULKAN_SRCS})
list(APPEND COMPILER_SRCS ${COMPILER_VULKAN_SRCS})
list(APPEND TVM_LINKER_LIBS ${Vulkan_SPIRV_TOOLS_LIBRARY})
list(APPEND TVM_RUNTIME_LINKER_LIBS ${Vulkan_LIBRARY})
if(USE_VULKAN_IMMEDIATE_MODE)
message(STATUS "Build with Vulkan immediate mode")
add_definitions(-DUSE_VULKAN_IMMEDIATE_MODE=1)
endif()
if(USE_VULKAN_DEDICATED_ALLOCATION)
message(STATUS "Build with Vulkan dedicated allocation")
add_definitions(-DUSE_VULKAN_DEDICATED_ALLOCATION=1)
endif()
if(USE_VULKAN_VALIDATION)
message(STATUS "Build with Vulkan API validation")
add_definitions(-DUSE_VULKAN_VALIDATION=1)
endif()
endif(USE_VULKAN)
......@@ -29,6 +29,8 @@
#include "codegen_spirv.h"
#include "../build_common.h"
#include "../../runtime/vulkan/vulkan_shader.h"
#include "../../runtime/vulkan/vulkan_module.h"
namespace tvm {
......
......@@ -33,7 +33,10 @@ namespace spirv {
void IRBuilder::InitHeader() {
CHECK_EQ(header_.size(), 0U);
header_.push_back(spv::MagicNumber);
header_.push_back(spv::Version);
// Use SPIR-V v1.0. This needs to be kept in sync (or at least behind)
// `VkApplicationInfo.apiVersion` in `vulkan.cc` to ensure Vulkan API
// validation passes.
header_.push_back(0x10000);
// generator: set to 0, unknown
header_.push_back(0U);
// Bound: set during Finalize
......
<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->
<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->
## Components
### VulkanDeviceAPI
Implements the TVM DeviceAPI interface. Owns the core Vulkan data structures, and
is responsible for initializing the Vulkan instance and devices and for querying
for possible extensions.
### VulkanThreadEntry
Thread-local state for the Vulkan runtime. Maintains a staging buffer (for
copies), and a VulkanStream per device.
### VulkanWrappedFunc
Responsible for launching computation kernels. Responsible for obtaining a
VulkanPipeline instance (from the VulkanModuleNode), and launches the kernel
(via immediate or deferred mode) on the active VulkanStream instance.
## Stream execution in the Vulkan programming model.
The natural model for the TVM DeviceAPI implementation and runtime follows the
CUDA API model. That is, we launch "kernels" onto an (implicit or explicit)
"stream"; kernels execute asynchronously with respect to the host but in order
with respect to their stream, and we explicitly synchronize the stream with
respect to the host.
We simulate this behaviour in the Vulkan model by maintaining a thread-local
`vkCommandBuffer` instance into which we queue up kernel invocations (or eagerly
execute them, depending on the availability of the `VK_KHR_push_descriptor`
extension). When we synchronize the stream, we end the command buffer recording,
submit it to the device queue, and wait on the corresponding fence.
......@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
......@@ -16,67 +16,22 @@
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
/*!
* Copyright (c) 2017 by Contributors
* \file metal_module.h
* \brief Execution handling of Metal kernels
*/
#ifndef TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
#define TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
#include <tvm/runtime/packed_func.h>
#include <dmlc/type_traits.h>
#include <memory>
#include <vector>
#include <string>
#include <unordered_map>
#include "../meta_data.h"
#include "vulkan_shader.h"
namespace tvm {
namespace runtime {
/*! \brief Maximum number of GPU supported in VulkanModule. */
static constexpr const int kVulkanMaxNumDevice = 8;
/*! \brief TVM Vulkan binary pack magic number */
static constexpr const int kVulkanModuleMagic = 0x02700027;
/*!
* \brief A single VK shader program
*
* Due to the global resource declaration.
* Current SPIRV only allows one entry program per shader,
* making it less useful for a Module like system.
*
* Instead we pass in map of str->VulkanShader until
* there is a native solution available.
*/
struct VulkanShader {
/*! \brief header flag */
uint32_t flag{0};
/*! \brief Data segment */
std::vector<uint32_t> data;
namespace vulkan {
Module VulkanModuleCreate(std::unordered_map<std::string, VulkanShader> smap,
std::unordered_map<std::string, FunctionInfo> fmap, std::string source);
void Save(dmlc::Stream *writer) const;
bool Load(dmlc::Stream *reader);
};
} // namespace vulkan
/*!
* \brief create a metal module from data.
*
* \param pmap The program map.
* \param fmap The function information map.
* \param source Optional, source code.
*/
Module VulkanModuleCreate(
std::unordered_map<std::string, VulkanShader> smap,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source);
using vulkan::VulkanModuleCreate;
} // namespace runtime
} // namespace tvm
namespace dmlc {
DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::VulkanShader, true);
} // namespace dmlc
#endif // TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include <dmlc/logging.h>
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/packed_func.h>
#include <vector>
namespace tvm {
namespace runtime {
namespace vulkan {
/*!
 * \brief A single compiled SPIR-V shader blob plus a header flag,
 *        serializable via dmlc::Stream.
 */
struct VulkanShader {
  /*! \brief header flag */
  uint32_t flag{0};
  /*! \brief Data segment (SPIR-V words) */
  std::vector<uint32_t> data;
  /*! \brief Serialize the flag followed by the data segment. */
  void Save(dmlc::Stream* writer) const {
    writer->Write(flag);
    writer->Write(data);
  }
  /*!
   * \brief Deserialize in the same order as Save.
   * \return false if the stream is truncated, true on success.
   */
  bool Load(dmlc::Stream* reader) {
    // Short-circuits: data is only read if the flag read succeeded.
    return reader->Read(&flag) && reader->Read(&data);
  }
};
} // namespace vulkan
using vulkan::VulkanShader;
} // namespace runtime
} // namespace tvm
namespace dmlc {
DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::vulkan::VulkanShader, true);
} // namespace dmlc
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include <functional>
#include <memory>
#include <vector>
#include "vulkan_common.h"
namespace tvm {
namespace runtime {
namespace vulkan {
// Per-stream recording state: one command buffer being recorded and the fence
// used to wait for its submission to complete (see VulkanStream::Synchronize).
class VulkanStreamState {
 public:
  VkCommandBuffer cmd_buffer_;
  VkFence fence_;
};
// Used to identify state that should only be used once-per-stream.
// Two tokens with the same descriptor set but different buffer bindings force a
// synchronization point (see VulkanStream::LaunchDeferred).
struct VulkanStreamToken {
  VkDescriptorSet descriptor_set_{VK_NULL_HANDLE};
  std::vector<VkBuffer> buffers_;
};
// A CUDA-stream-like abstraction over a Vulkan command buffer: kernels are
// either executed eagerly (immediate mode, when VK_KHR_push_descriptor is
// available) or queued and replayed at Synchronize() time.
class VulkanStream {
 public:
  // Creates a command pool and a single primary command buffer plus a fence on
  // the context's device, and immediately begins command-buffer recording.
  explicit VulkanStream(const VulkanContext* vctx)
      : vctx_(vctx), state_(new VulkanStreamState()) {
    // create command pool
    VkCommandPoolCreateInfo cmd_pool_cinfo;
    cmd_pool_cinfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
    cmd_pool_cinfo.pNext = nullptr;
    // Allows vkResetCommandBuffer on individual buffers (used in Synchronize).
    cmd_pool_cinfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
    cmd_pool_cinfo.queueFamilyIndex = vctx_->queue_family_index;
    VULKAN_CALL(vkCreateCommandPool(vctx_->device, &cmd_pool_cinfo, nullptr, &cmd_pool_));
    // Allocate the single primary command buffer backing this stream.
    VkCommandBufferAllocateInfo buffer_alloc_info;
    buffer_alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    buffer_alloc_info.pNext = nullptr;
    buffer_alloc_info.commandPool = cmd_pool_;
    buffer_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    buffer_alloc_info.commandBufferCount = 1;
    VULKAN_CALL(
        vkAllocateCommandBuffers(vctx_->device, &buffer_alloc_info, &(state_->cmd_buffer_)));
    // Fence starts unsignaled; it is signaled by the queue submission in
    // Synchronize() and reset afterwards.
    VkFenceCreateInfo fence_cinfo;
    fence_cinfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
    fence_cinfo.pNext = nullptr;
    fence_cinfo.flags = 0;  // VK_FENCE_CREATE_SIGNALED_BIT;
    VULKAN_CALL(vkCreateFence(vctx_->device, &fence_cinfo, nullptr, &(state_->fence_)));
    // Open the command buffer for recording; each recording is submitted once.
    VkCommandBufferBeginInfo cb_begin;
    cb_begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    cb_begin.pNext = nullptr;
    cb_begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    cb_begin.pInheritanceInfo = 0;
    VULKAN_CALL(vkBeginCommandBuffer(state_->cmd_buffer_, &cb_begin));
  }
  // Destroys the fence and the command pool (which frees the command buffer
  // allocated from it).
  ~VulkanStream() {
    vkDestroyFence(vctx_->device, state_->fence_, nullptr);
    vkDestroyCommandPool(vctx_->device, cmd_pool_, nullptr);
  }
  // Launch the kernel on the current stream. In immediate mode the kernel is
  // recorded into the command buffer right away; otherwise it is queued and
  // replayed during Synchronize().
  void Launch(const std::function<void(VulkanStreamState*)>& kernel) {
    if (vctx_->UseImmediate()) {
      kernel(state_.get());
    } else {
      deferred_kernels_.push_back(kernel);
    }
  }
  // Launch the kernel on the current stream (deferred mode only). The token
  // identifies the descriptor set / buffer bindings the kernel depends on, so
  // conflicting reuse of a descriptor set forces a synchronization first and
  // matching reuse skips the (redundant) initializer.
  void LaunchDeferred(const std::function<void()>& deferred_initializer,
                      const std::function<void(VulkanStreamState*)>& deferred_kernel,
                      const VulkanStreamToken& deferred_token) {
    CHECK(!vctx_->UseImmediate());
    // It is invalid to schedule this instance on the current stream if we already
    // have a matching descriptor set and a non-matching buffer set.
    if (std::any_of(deferred_tokens_.begin(), deferred_tokens_.end(),
                    [&](const VulkanStreamToken& token) {
                      return token.descriptor_set_ == deferred_token.descriptor_set_ &&
                             token.buffers_ != deferred_token.buffers_;
                    })) {
      Synchronize();
    }
    // It is unnecessary to invoke our initializer if we have a matching token.
    if (!std::any_of(deferred_tokens_.begin(), deferred_tokens_.end(),
                     [&](const VulkanStreamToken& token) {
                       // If we have a matching descriptor set
                       return token.descriptor_set_ == deferred_token.descriptor_set_ &&
                              token.buffers_ == deferred_token.buffers_;
                     })) {
      deferred_initializer();
    }
    deferred_kernels_.push_back(deferred_kernel);
    deferred_tokens_.push_back(deferred_token);
  }
  // Synchronize the current stream `state_` with respect to the host: replay
  // any deferred kernels, end recording, submit, wait on the fence, then reset
  // and reopen the command buffer for the next batch of work.
  void Synchronize() {
    if (!vctx_->UseImmediate()) {
      // Deferred mode: record all queued kernels into the command buffer now.
      for (const auto& deferred_kernel : deferred_kernels_) {
        deferred_kernel(state_.get());
      }
      deferred_kernels_.clear();
      deferred_tokens_.clear();
    } else {
      // Immediate mode must never have accumulated deferred work.
      DCHECK_EQ(deferred_kernels_.size(), 0);
      DCHECK_EQ(deferred_tokens_.size(), 0);
    }
    VULKAN_CALL(vkEndCommandBuffer(state_->cmd_buffer_));
    VkSubmitInfo cb_submit;
    cb_submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    cb_submit.pNext = nullptr;
    cb_submit.waitSemaphoreCount = 0;
    cb_submit.pWaitSemaphores = nullptr;
    cb_submit.pWaitDstStageMask = 0;
    cb_submit.commandBufferCount = 1;
    cb_submit.pCommandBuffers = &(state_->cmd_buffer_);
    cb_submit.signalSemaphoreCount = 0;
    cb_submit.pSignalSemaphores = nullptr;
    {
      // Multiple streams (on different threads) use the same VulkanContext
      // instance, so we need to externally synchronize accesses.
      std::lock_guard<std::mutex> g(*(vctx_->queue_mutex));
      VULKAN_CALL(vkQueueSubmit(vctx_->queue, 1, &cb_submit, state_->fence_));
    }
    // ~1.07s per wait; loop until the fence is actually signaled.
    uint64_t timeout = 1UL << 30UL;
    VkResult res;
    do {
      res = vkWaitForFences(vctx_->device, 1, &(state_->fence_), 0, timeout);
    } while (res == VK_TIMEOUT);
    VULKAN_CHECK_ERROR(res);
    VULKAN_CALL(vkResetCommandBuffer(state_->cmd_buffer_, 0));
    VULKAN_CALL(vkResetFences(vctx_->device, 1, &(state_->fence_)));
    // Re-initialize the command buffer
    VkCommandBufferBeginInfo cb_begin;
    cb_begin.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    cb_begin.pNext = nullptr;
    cb_begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    cb_begin.pInheritanceInfo = 0;
    VULKAN_CALL(vkBeginCommandBuffer(state_->cmd_buffer_, &cb_begin));
  }
 private:
  const VulkanContext* vctx_;
  std::unique_ptr<VulkanStreamState> state_;
  std::vector<VulkanStreamToken> deferred_tokens_;
  std::vector<std::function<void(VulkanStreamState*)>> deferred_kernels_;
  VkCommandPool cmd_pool_;
};
} // namespace vulkan
} // namespace runtime
} // namespace tvm
......@@ -16,6 +16,7 @@
# under the License.
import tvm
import re
import numpy as np
def test_vector_comparison():
......@@ -54,5 +55,119 @@ def test_vector_comparison():
check_correct_assembly('float16')
tx = tvm.thread_axis("threadIdx.x")
bx = tvm.thread_axis("blockIdx.x")
def test_vulkan_copy():
    """Round-trip host->device->host copies over random dtypes and sizes."""
    def verify(dtype, length):
        if not tvm.vulkan(0).exist or not tvm.module.enabled("vulkan"):
            print("skip because vulkan is not enabled..")
            return
        placeholder = tvm.placeholder((length,), name='A', dtype=dtype)
        device = tvm.vulkan(0)
        host = np.random.uniform(size=(length,)).astype(placeholder.dtype)
        dev_arr = tvm.nd.empty((length,), placeholder.dtype, device).copyfrom(host)
        round_tripped = dev_arr.asnumpy()
        tvm.testing.assert_allclose(host, round_tripped)
        # Read back a second time to ensure the copy is stable.
        tvm.testing.assert_allclose(host, dev_arr.asnumpy())

    # Same np.random call sequence as before: choice, randint, uniform per trial.
    for _ in range(100):
        chosen_dtype = np.random.choice(["float32", "float16", "int8", "int32"])
        exponent = np.random.randint(1, 15)
        scale = np.random.uniform(low=0.5, high=1.5)
        verify(chosen_dtype, int(scale * (2 ** exponent)))
def test_vulkan_vectorize_add():
    """Vectorized (multi-lane) elementwise add-one on the Vulkan backend."""
    num_thread = 8

    def verify(dtype, length, lanes):
        if not tvm.vulkan(0).exist or not tvm.module.enabled("vulkan"):
            print("skip because vulkan is not enabled..")
            return
        A = tvm.placeholder((length,), name='A', dtype="%sx%d" % (dtype, lanes))
        B = tvm.compute((length,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')
        sched = tvm.create_schedule(B.op)
        # Split across blocks/threads using the module-level bx/tx axes.
        outer, inner = sched[B].split(B.op.axis[0], factor=num_thread)
        sched[B].bind(outer, bx)
        sched[B].bind(inner, tx)
        fun = tvm.build(sched, [A, B], "vulkan")
        device = tvm.vulkan(0)
        a = tvm.nd.empty((length,), A.dtype, device).copyfrom(
            np.random.uniform(size=(length, lanes)))
        out = tvm.nd.empty((length,), B.dtype, device)
        fun(a, out)
        tvm.testing.assert_allclose(out.asnumpy(), a.asnumpy() + 1)

    verify("float32", 64, 2)
    verify("float16", 64, 2)
def test_vulkan_stress():
    """
    Launch a randomized test with multiple kernels per stream, multiple uses of
    kernels per stream, over multiple threads.
    """
    import random
    import threading
    n = 1024
    num_thread = 64
    def run_stress():
        def worker():
            # Skip when no Vulkan device or runtime support is available.
            if not tvm.vulkan(0).exist or not tvm.module.enabled("vulkan"):
                print("skip because vulkan is not enabled..")
                return
            A = tvm.placeholder((n,), name='A', dtype="float32")
            B = tvm.placeholder((n,), name='B', dtype="float32")
            # Candidate kernels as (compute builder, numpy reference) pairs.
            functions = [
                (lambda: tvm.compute((n,), lambda i: 2 * A[i] + 3 * B[i]),
                 lambda a, b: 2 * a + 3 * b),
                (lambda: tvm.compute((n,), lambda i: A[i]+B[i]),
                 lambda a, b: a + b),
                (lambda: tvm.compute((n,), lambda i: A[i]+2 * B[i]),
                 lambda a, b: a + 2 * b),
            ]
            def build_f(f_ref):
                # Build and compile one candidate kernel; return the compiled
                # function together with its numpy reference implementation.
                (C_f, ref) = f_ref
                C = C_f()
                s = tvm.create_schedule(C.op)
                xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
                s[C].bind(xo, bx)
                s[C].bind(xi, tx)
                fun = tvm.build(s, [A, B, C], "vulkan")
                return (fun, ref)
            # Random sample (with repetition) so one stream sees several
            # kernels and repeated uses of the same kernel.
            fs = [build_f(random.choice(functions))
                  for _ in range(np.random.randint(low=1, high=10))]
            ctx = tvm.vulkan(0)
            a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
                np.random.uniform(size=(n,)))
            b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(
                np.random.uniform(size=(n,)))
            cs = [tvm.nd.empty((n,), A.dtype, ctx) for _ in fs]
            # Launch every kernel before validating any results, so multiple
            # launches are queued on the stream at once.
            for ((f, _), c) in zip(fs, cs):
                f(a, b, c)
            for ((_, ref), c) in zip(fs, cs):
                tvm.testing.assert_allclose(
                    c.asnumpy(), ref(a.asnumpy(), b.asnumpy()))
        # Run a random number of workers concurrently; each thread gets its own
        # thread-local Vulkan stream in the runtime.
        ts = [threading.Thread(target=worker)
              for _ in range(np.random.randint(1, 10))]
        for t in ts:
            t.start()
        for t in ts:
            t.join()
    run_stress()
if __name__ == "__main__":
    # Run all Vulkan backend tests when executed as a script.
    test_vector_comparison()
    test_vulkan_copy()
    test_vulkan_vectorize_add()
    test_vulkan_stress()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment