Commit 56c50d2d by Lianmin Zheng, committed by Tianqi Chen

trigger ci (#1620)

parent c98ba601
@@ -2,8 +2,9 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
 from .. import generic
-from .. import tag
+from ..util import traverse_inline
 
 @generic.schedule_softmax.register(["cpu"])
 def schedule_softmax(outs):
@@ -53,44 +54,38 @@ def schedule_dense(outs):
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
 
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-
+    def _callback(op):
         if 'dense' in op.tag:
-            C = op.output(0)
-            x, y = C.op.axis
+            output = outs[0]
+            dense = op.output(0)
             # Write cache for blocks
-            CC = s.cache_write(C, 'global')
+            if dense.op in s.outputs:
+                CC = s.cache_write(dense, 'local')
+            else:
+                CC = dense
             # Tile
             bnx = 1
             bny = 4
-            _, yo, _, yi = s[C].tile(x, y, bnx, bny)
-            s[CC].compute_at(s[C], yo)
+            x, y = output.op.axis
+            xo, yo, xi, yi = s[output].tile(x, y, bnx, bny)
             xc, yc = s[CC].op.axis
             k, = s[CC].op.reduce_axis
             ko, ki = s[CC].split(k, factor=4)
             s[CC].reorder(ko, xc, ki, yc)
             s[CC].unroll(ki)
             s[CC].vectorize(yc)
-            # Vectorization
-            s[C].vectorize(yi)
-            # Parallelization
-            s[C].parallel(yo)
-
-            scheduled_ops.append(op)
+            s[output].unroll(xi)
+            s[output].vectorize(yi)
+            fused = s[output].fuse(xo, yo)
+            s[output].parallel(fused)
+            s[CC].compute_at(s[output], fused)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
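
For context: the hand-written traverse helper is replaced by traverse_inline from topi.util, which walks the graph, inlines injective/broadcast stages that are not outputs, and invokes _callback on each op it visits, so only the dense-specific scheduling remains in this file. Below is a minimal usage sketch of how the rewritten schedule might be exercised; it assumes the TVM/TOPI Python API of this commit's era (tvm.placeholder, topi.nn.dense, topi.generic.schedule_dense), and the shapes and variable names are illustrative, not part of the commit.

# Minimal usage sketch (assumed API of this commit's era; shapes are illustrative).
import tvm
import topi

batch, in_dim, out_dim = 1, 512, 128
data = tvm.placeholder((batch, in_dim), name='data')
weight = tvm.placeholder((out_dim, in_dim), name='weight')

# topi.nn.dense produces an op tagged 'dense', which is what _callback keys on.
out = topi.nn.dense(data, weight)

with tvm.target.create('llvm'):
    # Dispatches to the x86 schedule_dense shown in the diff, which now relies
    # on traverse_inline instead of the manual traverse() helper.
    s = topi.generic.schedule_dense([out])
    func = tvm.build(s, [data, weight, out])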