import tvm


def test_lower_rfactor():
    """Check that a schedule using ``rfactor`` plus thread bindings lowers.

    Builds a row-wise sum ``B[i] = sum_k A[i, k]`` over symbolic extents,
    applies ``rfactor`` on the inner part of the split reduce axis, binds the
    resulting loops to thread axes, and calls ``tvm.lower``.  The test makes
    no assertion on the lowered result; it passes as long as none of the
    scheduling or lowering calls raise.
    """
    n = tvm.var("n")
    m = tvm.var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")

    s = tvm.create_schedule(B.op)

    # Split the reduce axis by 16 and factor the reduction on the inner part;
    # BF is the tensor returned by rfactor.
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)

    # Split B's output axis by 32 and bind the two halves to thread axes.
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B.op].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B.op].bind(xi, tvm.thread_axis("threadIdx.y"))

    # Re-read reduce_axis from the stage: this is B's reduce axis as it
    # stands after the rfactor call above, not the original `k`.
    s[B].bind(s[B].op.reduce_axis[0], tvm.thread_axis("threadIdx.x"))
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])

    # Lowering is the step under test; its return value is not inspected.
    tvm.lower(s, [A, B])


if __name__ == "__main__":
    test_lower_rfactor()