Commit 6ea74d41 by Lianmin Zheng Committed by Tianqi Chen

[AUTOTVM] Core part of auto-tuning module (#1312)

parent 7e7154f1
......@@ -96,6 +96,7 @@ assign_source_group("Include" ${GROUP_INCLUDE})
file(GLOB COMPILER_SRCS
src/api/*.cc
src/arithmetic/*.cc
src/autotvm/*.cc
src/codegen/*.cc
src/codegen/stack_vm/*.cc
src/lang/*.cc
......
tvm.autotvm
-----------
.. automodule:: tvm.autotvm
tvm.autotvm.measure
~~~~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.measure.measure
.. autoclass:: tvm.autotvm.measure.MeasureInput
:members:
.. autoclass:: tvm.autotvm.measure.MeasureResult
:members:
.. autofunction:: tvm.autotvm.measure.measure_option
.. autofunction:: tvm.autotvm.measure.create_measure_batch
tvm.autotvm.tuner
~~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.tuner
:members:
.. autoclass:: tvm.autotvm.tuner.Tuner
:members:
.. autoclass:: tvm.autotvm.tuner.RandomTuner
:members:
:inherited-members:
.. autoclass:: tvm.autotvm.tuner.GridSearchTuner
:members:
:inherited-members:
.. autoclass:: tvm.autotvm.tuner.GATuner
:members:
:inherited-members:
.. autoclass:: tvm.autotvm.tuner.XGBTuner
:members:
:inherited-members:
.. automodule:: tvm.autotvm.tuner.callback
:members:
tvm.autotvm.task
~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.task
:members:
.. automodule:: tvm.autotvm.task.task
:members:
.. automodule:: tvm.autotvm.task.space
:members:
tvm.autotvm.record
~~~~~~~~~~~~~~~~~~
.. automodule:: tvm.autotvm.record
:members:
......@@ -14,6 +14,7 @@ Python API
ndarray
container
function
autotvm
graph_runtime
rpc
bridge
......
......@@ -191,6 +191,7 @@ gallery_dirs = ["tutorials", "vta/tutorials"]
subsection_order = ExplicitOrder(
['../tutorials/language',
'../tutorials/optimize',
'../tutorials/autotvm',
'../tutorials/vta',
'../tutorials/topi',
'../tutorials/deployment',
......
......@@ -488,7 +488,7 @@ bool VerifyMemory(LoweredFunc func, int device_type);
*
* "max_local_memory_per_block": Total amount of local memory per block (in bytes).
* "max_shared_memory_per_block": Total amount of shared memory per block (in bytes).
* "max_thread_per_block": Maximum number of threads per block.
* "max_threads_per_block": Maximum number of threads per block.
* "max_thread_x": Maximum length of threadIdx.x.
* "max_thread_y": Maximum length of threadIdx.y.
* "max_thread_z": Maximum length of threadIdx.z.
......
"""The auto-tuning module of tvm
This module includes:
* Tuning space definition API
* Efficient auto-tuners
* Tuning result and database support
* Distributed measurement to scale up tuning
"""
from . import database
from . import feature
from . import measure
from . import record
from . import task
from . import tuner
from . import util
# some shortcuts
from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
from .tuner import callback
from .task import template, get_config, create, ConfigSpace, ConfigEntity
from .record import ApplyHistoryBest as apply_history_best
# pylint: disable=consider-using-enumerate,invalid-name
"""
Database of MeasureInput/MeasureResult pair.
This can be used for replaying measurement.
"""
import os
from .record import encode, decode, measure_str_key
class Database(object):
    """Abstract store of MeasureInput/MeasureResult pairs.

    Concrete backends (e.g. a redis-backed database) implement `load`
    and `save`; this base class only defines the contract.
    """

    def load(self, inp, get_all=False):
        """Look up previously saved result(s) for a measure input.

        Parameters
        ----------
        inp: MeasureInput
            to be translated into key for RedisDB
        get_all: bool, optional
            Whether the latest result (or all matching results) should be returned

        Returns
        -------
        rec: MeasureResult if previously saved, otherwise None
        """
        raise NotImplementedError()

    def save(self, inp, res, extend=False):
        """Store a measurement result under an input's string key.

        Parameters
        ----------
        inp: MeasureInput
            to be translated into key for RedisDB
        res: MeasureResult
            to associate with key
        extend:
            Whether to extend existing MeasureResults if they exist
        """
        raise NotImplementedError()
def filter_inputs(db, measure_inputs, retry=False):
    """Split a batch of measure_inputs into cached and uncached entries.

    Parameters
    ----------
    db: Database
        database object
    measure_inputs: Array of MeasureInput
        measure_inputs as expected in measure_batch
    retry: bool
        whether to retry if the saved result is a failure

    Returns
    -------
    partial_results: Array of MeasureResult
        a full list of result, where None denotes no corresponding saved result
    unsaved: Array of MeasureInput
        a list that only contains unsaved inputs
    """
    partial_results = []
    unsaved = []
    for measure_inp in measure_inputs:
        saved = db.load(measure_inp)
        # a failed saved measurement counts as missing when retry is requested
        missing = saved is None or (retry and saved.error_no != 0)
        partial_results.append(None if missing else saved)
        if missing:
            unsaved.append(measure_inp)
    return partial_results, unsaved
class RedisDatabase(Database):
    """
    Redis version of record database
    """
    # redis db indices used by convention
    REDIS_PROD = 15
    REDIS_LOCA = 14
    REDIS_TEST = 13        # for unit test
    REDIS_NIGHT_TEMP = 12  # for nightly report (will be flushed after every workload)

    # separator used to join multiple encoded records stored under one key
    MAGIC_SPLIT = "$"

    def __init__(self, db_index=REDIS_PROD):
        # import here so redis is only required when this backend is used
        import redis

        if db_index == RedisDatabase.REDIS_TEST:
            host = 'localhost'
        else:
            host = os.environ.get('TVM_FLEET_HOST')
        self.db = redis.StrictRedis(host=host, port=6379, db=db_index)
        self.db_index = db_index

    def set(self, key, value):
        self.db.set(key, value)

    def get(self, key):
        return self.db.get(key)

    def load(self, inp, get_all=False):
        current = self.get(measure_str_key(inp))
        if current is not None:
            current = str(current)
            records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)]
            results = [rec[1] for rec in records]
            if get_all:
                return results
            # otherwise return only the most recent result
            return max(results, key=lambda result: result.timestamp)
        return current

    def save(self, inp, res, extend=False):
        current = self.get(measure_str_key(inp))
        if not extend or current is None:
            # overwrite (or create) the entry with a single record
            self.set(measure_str_key(inp),
                     RedisDatabase.MAGIC_SPLIT.join([encode(inp, res)]))
        else:
            # append the new record to the existing joined string
            current = current.split(RedisDatabase.MAGIC_SPLIT)
            self.set(measure_str_key(inp),
                     RedisDatabase.MAGIC_SPLIT.join(current + [encode(inp, res)]))

    def filter(self, func):
        """
        Dump all of the records for which the predicate holds

        Parameters
        ----------
        func: callable
            The signature of the function is bool (MeasureInput, Array of MeasureResult)

        Returns
        -------
        list of records (inp, result) matching the target

        Examples
        --------
        get records for a target
        >>> db.filter(lambda inp, results: "cuda" in inp.target.keys)
        """
        matched_records = list()
        # may consider filtering in iterator in the future
        for key in self.db:
            current = self.get(key)
            try:
                # BUG FIX: was `current.spilt(...)` (typo), which raised
                # AttributeError (not caught by `except TypeError`) on the
                # first record and made filter() unusable
                records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)]
            except TypeError:  # got a badly formatted/old format record
                continue
            inps, results = zip(*records)
            inp = inps[0]
            if not func(inp, results):
                continue
            # keep only the most recent result for each matching input
            result = max(results, key=lambda res: res.timestamp)
            matched_records.append((inp, result))
        return matched_records

    def flush(self):
        self.db.flushdb()
class DummyDatabase(RedisDatabase):
    """
    An in-memory, dictionary-backed database for testing.
    """
    def __init__(self):
        # pylint: disable=super-init-not-called
        # deliberately skip RedisDatabase.__init__ so no redis connection is made
        self.db = dict()

    def set(self, key, value):
        self.db[key] = value

    def get(self, key):
        return self.db.get(key)

    def flush(self):
        self.db = dict()
"""Global configuration/variable scope for autotvm"""

class AutotvmGlobalScope(object):
    """Holder for process-wide autotvm settings; the newest instance is current."""
    current = None

    def __init__(self):
        # remember the previous scope, then install this one as current
        self._old = AutotvmGlobalScope.current
        AutotvmGlobalScope.current = self

        # target architecture string used when compiling CUDA code
        self.cuda_target_arch = None

GLOBAL_SCOPE = AutotvmGlobalScope()
# pylint: disable=invalid-name
"""Extract feature of iter vars
There are two types of feature
1) Itervar feature
This feature is extracted based on loop variables.
Different loop structures will result in different shapes of feature
2) Curve sample feature (relation feature)
This feature is extracted by sampling relation curve.
This feature is invariant of loop structure.
"""
import struct
import numpy as np
from tvm import schedule, ir_pass, build_module, get_global_func, target as _target
def ana_lower(sch, args,
              binds=None,
              simple_mode=True):
    """Do lower while keeping all axes in IR
    i.e. Do not eliminate loop with extent of 1, do not vectorize, unroll or inject virtual threads

    Parameters
    ----------
    sch: tvm.schedule.Schedule
        the schedule to lower
    args: Array of tvm.tensor.Tensor
        buffer arguments used to build the binds map
    binds: dict, optional
        explicit tensor->buffer binds, merged with auto-generated ones

    Returns
    -------
    stmt: the lowered statement with the full loop nest preserved for analysis
    """
    binds, _ = build_module.get_binds(args, binds)
    sch = sch.normalize()
    # Phase 0: infer loop bounds and emit the initial loop nest
    bounds = schedule.InferBound(sch)
    # third arg True presumably keeps trivial (extent-1) loops, per the
    # docstring above -- TODO confirm against schedule.ScheduleOps
    stmt = schedule.ScheduleOps(sch, bounds, True)
    stmt = ir_pass.StorageFlatten(stmt, binds, 64)
    stmt = ir_pass.CanonicalSimplify(stmt)
    # only the analysis (non-codegen) path is supported
    assert simple_mode
    return stmt
# Load the C++ feature-extraction functions. If the TVM runtime was built
# without the autotvm C++ API, fall back to stubs that raise on first use so
# that importing this module still succeeds.
try:
    _get_buffer_curve_sample_flatten = get_global_func(
        "autotvm.feature.GetCurveSampleFeatureFlatten")
    _get_itervar_feature = get_global_func("autotvm.feature.GetItervarFeature")
    _get_itervar_feature_flatten = get_global_func("autotvm.feature.GetItervarFeatureFlatten")
except ValueError as e:
    def raise_error(*args, **kwargs):  # pylint: disable=unused-argument
        # stub used in place of every missing packed function
        raise RuntimeError("Cannot load autotvm c++ API")
    _get_buffer_curve_sample_flatten = _get_itervar_feature = _get_itervar_feature_flatten = \
        raise_error
def get_itervar_feature(sch, args, take_log=False):
    """get features of iter vars

    Parameters
    ----------
    sch: tvm.schedule.Schedule
    args: Array of tvm.tensor.Tensor
        the buffer args for lower
    take_log: bool
        whether take log of numerical statics

    Returns
    -------
    features of every axis in the IR, see doc/features.md for detail
    """
    stmt = ana_lower(sch, args, simple_mode=True)
    raw = _get_itervar_feature(stmt, take_log)

    # convert tvm node to python type
    converted = []
    for row in raw:
        # first entry of a row is the (name-node, var) header
        axis = [[row[0][0].value, row[0][1]]]
        axis.extend([entry[0].value] + [v.value for v in entry[1:]]
                    for entry in row[1:])
        converted.append(axis)
    return converted
def flatten_itervar_feature(fea):
    """flatten features into one-dimensional feature vectors

    Parameters
    ----------
    fea: list
        return value of get_itervar_feature

    Returns
    -------
    flatten_feature: np.ndarray
        one-dimensional vector
    """
    # skip each axis's name header (axis[0]) and each pair's key (pair[0]),
    # keeping only the numeric values
    chunks = [pair[1:] for axis in fea for pair in axis[1:]]
    return np.concatenate(chunks)
def get_itervar_feature_flatten(sch, args, take_log=True):
    """get flatten features of iter vars
    this is equivalent to get_itervar_feature + flatten_itervar_feature, but much faster.

    Parameters
    ----------
    sch: tvm.schedule.Schedule
    args: Array of tvm.tensor.Tensor
        the buffer args for lower
    take_log: bool
        whether take log of numerical statics

    Returns
    -------
    flatten_feature: np.ndarray
        one-dimensional vector
    """
    stmt = ana_lower(sch, args, simple_mode=True)
    packed = _get_itervar_feature_flatten(stmt, take_log)
    # the C++ side returns raw bytes of 32-bit floats; unpack them all
    # (NOTE: struct.unpack actually yields a tuple, not an ndarray)
    return struct.unpack('%df' % (len(packed) // 4), packed)
def get_flatten_name(fea):
    """ Get names of feature after flatten.

    Parameters
    ----------
    fea: list or str
        return value of get_itervar_feature or a line of logfile

    Returns
    -------
    feature_names: Array of str
    """
    # canonical column names for each known feature-pair key
    feature_name = {
        "_attr_": ["length", "nest_level", "topdown", "bottomup"] +
                  ["ann_%d" % i for i in range(20)],
        "_arith_": ["add", "mul", "div"],
        "buf_touch": ["stride", "mod", "count", "reuse", "T_count", "T_reuse"],
    }

    if isinstance(fea, str):
        # a raw log line: decode it and re-extract the itervar feature
        from .record import decode
        inp, _ = decode(fea)
        target = _target.create(inp.target)
        with target:
            s, args = inp.template.instantiate(inp.config)
        fea = get_itervar_feature(s, args)

    names = []
    counter = 0
    for row in fea:
        var_name = str(row[0][1])
        for pair in row[1:]:
            key = pair[0]
            # any unknown key is a buffer-touch feature
            name_list = feature_name.get(key, feature_name["buf_touch"])
            names.extend(".".join(["f%d" % counter, var_name, key, name_list[idx]])
                         for idx in range(len(pair[1:])))
            counter += 1
    return names
def get_buffer_curve_sample_flatten(sch, args, sample_n=30):
    """
    Get flatten curve sample feature (relation feature)

    Parameters
    ----------
    sch: tvm.schedule.Schedule
    args: Array of tvm.tensor.Tensor
        the buffer args for lower
    sample_n: int
        number of sample points along one dimension

    Returns
    -------
    flatten_feature: np.ndarray
        one-dimensional vector
    """
    stmt = ana_lower(sch, args, simple_mode=True)
    feas = _get_buffer_curve_sample_flatten(stmt, sample_n, False)
    # the C++ side returns raw bytes encoding 32-bit floats; unpack them all.
    # NOTE(review): struct.unpack returns a tuple, not the np.ndarray the
    # docstring claims -- callers appear to treat it as a plain sequence
    feas = struct.unpack('%df' % (len(feas)//4), feas)
    return feas
"""Distributed executor infrastructure to scale up the tuning"""
from .measure import MeasureInput, MeasureResult, MeasureErrorNo
from .measure import create_measure_batch, measure_option
from .measure_methods import request_remote
from .local_executor import LocalExecutor
from .executor import Future, Executor
""" Abstraction for asynchronous job execution """
class Executor(object):
    """Abstract interface for submitting jobs asynchronously.

    Concrete executors accept a callable plus its arguments and hand back a
    Future that can later be polled or blocked on for the result.
    """
    # default wall-clock limit (seconds) for jobs that may hang
    DEFAULT_TIMEOUT = 60

    def submit(self, func, *args, **kwargs):
        """Queue a task (function and its arguments) for execution.

        Parameters
        ----------
        func : callable
            function to be run by a worker
        args : list or tuple, optional
            positional arguments forwarded to `func`
        kwargs : dict, optional
            keyword arguments forwarded to `func`

        Returns
        -------
        future : Future
            handle used to poll for, or block on, the task's result
        """
        raise NotImplementedError()
class Future(object):
    """Handle for a task that runs asynchronously elsewhere.

    Subclasses wrap work submitted to another thread or worker process.
    A future can be polled with `done()` or drained with a blocking `get()`.
    """
    def done(self):
        """Return True if the job finished running (or was cancelled)."""
        raise NotImplementedError()

    def get(self, timeout=None):
        """Block until the result is available and return it.

        Parameters
        ----------
        timeout : int or float, optional
            Maximum number of seconds to wait. When omitted, block
            indefinitely until the result arrives.

        Returns
        -------
        result : Any
            The value returned by the submitted function.

        Raises
        ------
        TimeoutError : if the result call timeouts.
        """
        raise NotImplementedError()
class FutureError(RuntimeError):
    """Base error class of all future events"""


# pylint:disable=redefined-builtin
class TimeoutError(FutureError):
    """Raised when a task exceeds its time limit."""


class ExecutionError(FutureError):
    """Raised when future execution crashes or fails."""
"""Local based implementation of the executor using multiprocessing"""
import signal
from multiprocessing import Process, Queue
try:
from queue import Empty
except ImportError:
from Queue import Empty
import psutil
from . import executor
def kill_child_processes(parent_pid, sig=signal.SIGTERM):
    """Kill all child processes of a process, recursively.

    Parameters
    ----------
    parent_pid : int
        pid of the parent whose descendants are signalled
    sig : int, optional
        signal to send to each child (default SIGTERM)
    """
    try:
        parent = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        # parent already gone; nothing to kill
        return
    children = parent.children(recursive=True)
    for process in children:
        try:
            process.send_signal(sig)
        except psutil.NoSuchProcess:
            # BUG FIX: was `return`, which stopped signalling the remaining
            # children as soon as one of them had already exited on its own
            continue
def _execute_func(func, queue, args, kwargs):
"""execute function and return the result or exception to a queue"""
try:
res = func(*args, **kwargs)
except Exception as exc: # pylint: disable=broad-except
res = exc
queue.put(res)
def timeout_monitor(queue, timeout, func, args, kwargs):
    """A wrapper to support timeout of a function call

    Runs `func` in a child process, waits at most `timeout` seconds, then
    reaps the child and guarantees at least one item ends up in `queue`
    (the result, a TimeoutError, or an ExecutionError).
    """
    # start a new process for timeout (cannot use thread because we have c function)
    p = Process(target=_execute_func, args=(func, queue, args, kwargs))
    p.start()
    p.join(timeout=timeout)

    # record whether the worker was still running when the wait expired
    alive = p.is_alive()
    # kill grandchildren first so terminate() cannot orphan them
    kill_child_processes(p.pid)
    p.terminate()
    p.join()

    if alive:
        queue.put(executor.TimeoutError())
    else:
        # worker exited without producing anything: report a fatal error
        if queue.empty():
            queue.put(executor.ExecutionError("Fatal error in local executor"))
class LocalFuture(executor.Future):
    """Future for a job forked by LocalExecutor.

    Parameters
    ----------
    process: multiprocessing.Process
        process for running this task
    queue: multiprocessing.Queue
        queue for receiving the result of this task
    """
    def __init__(self, process, queue):
        self._done = False
        self._process = process
        self._queue = queue

    def done(self):
        # once anything is visible in the queue, the job has finished
        if not self._done:
            self._done = not self._queue.empty()
        return self._done

    def get(self, timeout=None):
        try:
            res = self._queue.get(block=True, timeout=timeout)
        except Empty:
            raise executor.TimeoutError()
        # reap the worker process and release queue resources
        if self._process.is_alive():
            kill_child_processes(self._process.pid)
            self._process.terminate()
        self._process.join()
        self._queue.close()
        self._queue.join_thread()
        self._done = True
        del self._queue
        del self._process
        return res
class LocalFutureNoFork(executor.Future):
    """Local wrapper for the future.
    This is a none-fork version of LocalFuture.
    Use this for the runtime that does not support fork (like cudnn)
    """
    def __init__(self, result):
        # the job already ran synchronously; just hold its finished result
        self._result = result

    def done(self):
        # always finished by construction
        return True

    def get(self, timeout=None):
        # timeout is irrelevant: the result is already here
        return self._result
class LocalExecutor(executor.Executor):
    """Local executor that runs workers on the same machine with multiprocessing."""
    def __init__(self, timeout=None):
        # fall back to the class-wide default when no timeout is given
        self.timeout = timeout or executor.Executor.DEFAULT_TIMEOUT

    def submit(self, func, *args, **kwargs):
        """
        Note
        ----------
        By default, the executor will fork a new process for a new job
        But some runtime does not support fork (e.g. cuda runtime, cudnn).
        In this circumstance, you should set 'fork_new_process' to False in kwargs
        """
        if not kwargs.pop('fork_new_process', True):
            # run synchronously in this process, wrap the finished result
            return LocalFutureNoFork(func(*args, **kwargs))

        queue = Queue(1)
        worker = Process(target=timeout_monitor,
                         args=(queue, self.timeout, func, args, kwargs))
        worker.start()
        return LocalFuture(worker, queue)
"""Task is a tunable composition of template functions.
Tuner takes a tunable task and optimizes the joint configuration
space of all the template functions in the task.
This module defines the task data structure, as well as a collection(zoo)
of typical tasks of interest.
"""
from .task import Task, create, register, template, get_config
from .space import ConfigSpace, ConfigEntity
from .code_hash import attach_code_hash, attach_code_hash_to_arg
from .dispatcher import DispatchContext, ApplyConfig, dispatcher
"""
Decorator functions for hashing schedule code.

Code hashing is used to check the consistency of schedule code and the parameters loaded from log.
"""
import inspect
import zlib
from tvm import schedule
def attach_code_hash(s):
    """Decorator for attaching a code hash to a schedule

    Parameters
    ----------
    s: Schedule
        tvm.schedule.Schedule to attach the hash to
    """
    def decorator(func):
        def wrapper(*args, **kwargs):
            func(*args, **kwargs)
            # hash the decorated function's own source text (crc32, hex, no '0x')
            source = ''.join(inspect.getsourcelines(func)[0])
            s.code_hash = hex(zlib.crc32(source.encode()))[2:]
        return wrapper
    return decorator
def attach_code_hash_to_arg(arg_idx=1):
    """Decorator for attaching a code hash to a schedule

    Parameters
    ----------
    arg_idx: int
        index of the argument (expected to be a Schedule) to attach the code
        hash to
    """
    def decorator(func):
        def wrapper(*args, **kwargs):
            func(*args, **kwargs)
            assert isinstance(args[arg_idx], schedule.Schedule)
            # hash the decorated function's own source text (crc32, hex, no '0x')
            source = ''.join(inspect.getsourcelines(func)[0])
            args[arg_idx].code_hash = hex(zlib.crc32(source.encode()))[2:]
        return wrapper
    return decorator
"""
Template dispatcher module.

A dispatcher is a function that can contain multiple behaviors.
Its specific behavior can be controlled by DispatchContext.

DispatchContext is used in two ways, usually via different implementations
of the DispatchContext base class.

- During search, we can use it to pass the current proposal from the tuner.
- During evaluation, we can use it to pick the best policy.
"""
from __future__ import absolute_import as _abs
from decorator import decorate
from tvm import target as _target
class DispatchContext(object):
    """
    Base class of dispatch context.

    DispatchContext enables the target- and workload-specific
    dispatch mechanism for templates.
    """
    current = None

    def query(self, target, workload):
        """
        Query the context to get the specific implementation.

        Parameters
        ----------
        target: Target
            The current target
        workload : Workload
            The current workload.

        Returns
        -------
        cfg : ConfigSpace
            The specific configuration.
        """
        raise NotImplementedError()

    def __enter__(self):
        # push: remember whichever context was active and install this one
        self._old_ctx = DispatchContext.current
        DispatchContext.current = self
        return self

    def __exit__(self, ptype, value, trace):
        # pop: restore the previously active context
        DispatchContext.current = self._old_ctx
class ApplyConfig(DispatchContext):
    """Apply a specific config entity during query.

    Parameters
    ----------
    config : ConfigSpace or ConfigEntity
        The specific configuration we care about.
    """
    def __init__(self, config):
        super(ApplyConfig, self).__init__()
        self._config = config
        # last workload that was queried, kept for inspection
        self.workload = None

    def query(self, target, workload):
        """Override query: always return the fixed config."""
        self.workload = workload
        return self._config
def dispatcher(fworkload):
    """Wrap a workload dispatcher function.

    Parameters
    ----------
    fworkload : function
        The workload extraction function from arguments.

    Returns
    -------
    fdispatcher : function
        A wrapped dispatcher function, which will
        dispatch based on DispatchContext and
        the current workload.
    """
    # template_key -> registered implementation function
    dispatch_dict = {}
    func_name = fworkload.__name__

    def register(key, func=None, override=False):
        """Register template function.

        Parameters
        ----------
        key : str or List of str
            The template key to identify the template
            under this dispatcher.
        func : function
            The function to be registered.
            The first argument of the function is always
            cfg returned by DispatchContext,
            the rest arguments are the same as the fworkload.
        override : bool
            Whether override existing registration.

        Returns
        -------
        The register function if necessary.
        """
        if isinstance(key, str):
            key = [key]

        def _do_reg(myf):
            for x in key:
                if x in dispatch_dict and not override:
                    raise ValueError(
                        "Key %s is already registered for %s" % (x, func_name))
                dispatch_dict[x] = myf
            return myf

        # usable both directly as register(key, func) and as @register(key)
        if func:
            return _do_reg(func)
        return _do_reg

    def dispatch_func(func, *args, **kwargs):
        """The wrapped dispatch function"""
        tgt = _target.current_target()
        context = DispatchContext.current
        if context is None:
            raise RuntimeError("DispatchContext is not initialized")
        # func is the original fworkload; its return value is the workload
        # used to look up a config, whose template_key selects the impl
        workload = func(*args, **kwargs)
        cfg = context.query(tgt, workload)
        return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)

    # decorate() makes the wrapper keep fworkload's original signature
    fdecorate = decorate(fworkload, dispatch_func)
    fdecorate.register = register
    return fdecorate
"""
A tuner takes a task as input. It proposes some promising :any:`ConfigEntity`
in the :any:`ConfigSpace` and measures them on the real hardware. Then it
proposes the next batch of :any:`ConfigEntity` according to the measured results.
This tuning loop is repeated.
"""
from . import callback
from .tuner import Tuner
from .gridsearch_tuner import GridSearchTuner, RandomTuner
from .ga_tuner import GATuner
from .xgboost_tuner import XGBTuner
# pylint: disable=consider-using-enumerate,invalid-name
"""Namespace of callback utilities of AutoTVM"""
import numpy as np
from .. import record
def log_to_file(file_out, protocol='json'):
    """Log the tuning records into file.
    The rows of the log are stored in the format of autotvm.record.encode.

    Parameters
    ----------
    file_out : File or str
        The file to log to.
    protocol: str, optional
        The log protocol. Can be 'json' or 'pickle'

    Returns
    -------
    callback : callable
        Callback function to do the logging.
    """
    def _callback(_, inputs, results):
        """Callback implementation"""
        lines = [record.encode(inp, res, protocol) + "\n"
                 for inp, res in zip(inputs, results)]
        if isinstance(file_out, str):
            # a path was given: append, opening the file on every call
            with open(file_out, "a") as f:
                f.writelines(lines)
        else:
            # an open file object was given: write to it directly
            file_out.writelines(lines)
    return _callback
def save_tuner_state(prefix, save_every_sample=100):
    """Save the state of tuner

    Parameters
    ----------
    prefix : str
        prefix of the filename to store state
    save_every_sample: int
        save the state every x samples

    Returns
    -------
    callback : function
        Callback function to do the auto saving.
    """
    # NOTE: the docstring previously declared `prefix : srt` (typo for str)
    def _callback(tuner, inputs, results):
        # one iteration per measured (input, result) pair
        for _, __ in zip(inputs, results):
            try:
                ct = len(tuner.visited)
            except AttributeError:
                # tuner keeps no visited set; treat the sample count as zero
                ct = 0
            if ct % save_every_sample == 0:
                tuner.save_state(prefix + "_%d.state" % ct)
    return _callback
def log_to_redis(host="localhost", port=6379, dbn=11):
    """Record the tuning record to a redis DB.

    Parameters
    ----------
    host: str, optional
        Host address of redis db
    port: int, optional
        Port of redis db
    dbn: int, optional
        which redis db to use, default 11
    """
    # import here so only depend on redis when necessary
    import redis
    red = redis.StrictRedis(host=host, port=port, db=dbn)

    def _callback(_, inputs, results):
        """Callback implementation"""
        for inp, result in zip(inputs, results):
            red.set(inp, result)
    return _callback
class Monitor(object):
    """A monitor to collect statistic during tuning"""
    def __init__(self):
        self.scores = []      # achieved FLOPS per trial (0 for failures)
        self.timestamps = []  # wall-clock timestamp per trial

    def __call__(self, tuner, inputs, results):
        for inp, res in zip(inputs, results):
            # failed measurements (error_no != 0) score zero
            if res.error_no == 0:
                self.scores.append(inp.task.flop / np.mean(res.costs))
            else:
                self.scores.append(0)
            self.timestamps.append(res.timestamp)

    def reset(self):
        """drop all collected statistics"""
        self.scores = []
        self.timestamps = []

    def trial_scores(self):
        """get scores (currently is flops) of all trials"""
        return np.array(self.scores)

    def trial_timestamps(self):
        """get wall clock time stamp of all trials"""
        return np.array(self.timestamps)
# pylint: disable=consider-using-enumerate,invalid-name,abstract-method
"""Tuner with genetic algorithm"""
import numpy as np
from .tuner import Tuner
from .model_based_tuner import knob2point, point2knob
class GATuner(Tuner):
    """Tuner with genetic algorithm.
    This tuner does not have a cost model so it always runs measurement on real machines.
    This tuner expands the :code:`ConfigEntity` as gene.

    Parameters
    ----------
    pop_size: int
        number of genes in one generation
    elite_num: int
        number of elite to keep
    mutation_prob: float
        probability of mutation of a knob in a gene
    """
    def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
        super(GATuner, self).__init__(task)

        # algorithm configurations
        self.pop_size = pop_size
        self.elite_num = elite_num
        self.mutation_prob = mutation_prob

        assert elite_num <= pop_size, "The number of elites must be less than population size"

        # space info
        self.space = task.config_space
        # one dimension (knob cardinality) per tunable knob
        self.dims = [len(x) for x in self.space.space_map.values()]

        # flattened indices already proposed, used to avoid duplicates
        self.visited = set([])

        # current generation
        self.genes = []          # genes of the generation being measured
        self.scores = []         # measured scores for this generation
        self.elites = []         # best genes carried over across generations
        self.elite_scores = []
        self.trial_pt = 0        # cursor into self.genes for next_batch

        # random initialization of the first population
        self.pop_size = min(self.pop_size, len(self.space))
        for _ in range(self.pop_size):
            tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
            while knob2point(tmp_gene, self.dims) in self.visited:
                tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)

            self.genes.append(tmp_gene)
            self.visited.add(knob2point(tmp_gene, self.dims))

    def next_batch(self, batch_size):
        # propose genes round-robin from the current population
        ret = []
        for _ in range(batch_size):
            gene = self.genes[self.trial_pt % self.pop_size]
            self.trial_pt += 1
            ret.append(self.space.get(knob2point(gene, self.dims)))
        return ret

    def update(self, inputs, results):
        # record a score per result: flop/cost for successes, 0 for failures
        for inp, res in zip(inputs, results):
            if res.error_no == 0:
                y = inp.task.flop / np.mean(res.costs)
                self.scores.append(y)
            else:
                self.scores.append(0)

        # once the whole generation has been measured, evolve the next one
        if len(self.scores) >= len(self.genes):
            genes = self.genes + self.elites
            scores = np.array(self.scores[:len(self.genes)] + self.elite_scores)

            # reserve elite
            self.elites, self.elite_scores = [], []
            elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num:]
            for ind in elite_indexes:
                self.elites.append(genes[ind])
                self.elite_scores.append(scores[ind])

            # cross over: parents chosen with probability proportional to score
            indices = np.arange(len(genes))
            scores /= np.max(scores)
            probs = scores / np.sum(scores)
            tmp_genes = []
            for _ in range(self.pop_size):
                p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
                p1, p2 = genes[p1], genes[p2]
                # single-point crossover at a random knob boundary
                point = np.random.randint(len(self.dims))
                tmp_gene = p1[:point] + p2[point:]
                tmp_genes.append(tmp_gene)

            # mutation: randomly reassign knobs, then re-draw until unvisited
            next_genes = []
            for tmp_gene in tmp_genes:
                for j, dim in enumerate(self.dims):
                    if np.random.random() < self.mutation_prob:
                        tmp_gene[j] = np.random.randint(dim)

                if len(self.visited) < len(self.space):
                    while knob2point(tmp_gene, self.dims) in self.visited:
                        j = np.random.randint(len(self.dims))
                        tmp_gene[j] = np.random.randint(self.dims[j])

                    next_genes.append(tmp_gene)
                    self.visited.add(knob2point(tmp_gene, self.dims))
                else:
                    # search space exhausted; stop producing new genes
                    break
            self.genes = next_genes
            self.trial_pt = 0
            self.scores = []

    def has_next(self):
        # true while some point in the space is neither visited nor pending
        return len(self.visited) - (len(self.genes) - self.trial_pt) < len(self.space)
# pylint: disable=abstract-method
"""Grid search tuner and random tuner"""
import numpy as np
from .tuner import Tuner
class GridSearchTuner(Tuner):
    """Enumerate the search space in a grid search order"""
    def __init__(self, task):
        super(GridSearchTuner, self).__init__(task)
        # index of the next configuration to propose
        self.counter = 0

    def next_batch(self, batch_size):
        space = self.task.config_space
        batch = []
        # stop early (returning a short batch) when the space is exhausted
        while len(batch) < batch_size and self.counter < len(space):
            batch.append(space.get(self.counter))
            self.counter += 1
        return batch

    def has_next(self):
        return self.counter < len(self.task.config_space)

    def __getstate__(self):
        return {"counter": self.counter}

    def __setstate__(self, state):
        self.counter = state['counter']
class RandomTuner(Tuner):
    """Enumerate the search space in a random order"""
    def __init__(self, task):
        super(RandomTuner, self).__init__(task)
        # indices of configurations that have already been proposed
        self.visited = set()

    def next_batch(self, batch_size):
        ret = []
        counter = 0
        while counter < batch_size:
            if len(self.visited) >= len(self.task.config_space):
                break
            # rejection sampling: re-draw until we hit an unvisited index
            index = np.random.randint(len(self.task.config_space))
            while index in self.visited:
                index = np.random.randint(len(self.task.config_space))

            ret.append(self.task.config_space.get(index))
            self.visited.add(index)
            counter += 1
        return ret

    def has_next(self):
        return len(self.visited) < len(self.task.config_space)

    def __getstate__(self):
        # BUG FIX: previously returned self.counter, an attribute that does
        # not exist on RandomTuner, so pickling raised AttributeError
        return {"visited": self.visited}

    def __setstate__(self, state):
        # BUG FIX: previously restored into self.counter, so the visited
        # set was lost (and next_batch crashed) after deserialization
        self.visited = state['visited']
# pylint: disable=invalid-name
"""Metrics for evaluating tuning process"""
import numpy as np
from ..util import get_rank
def max_curve(trial_scores):
    """ f(n) = max([s[i] for i < n])

    Parameters
    ----------
    trial_scores: Array of float
        the score of i th trial

    Returns
    -------
    curve: Array of float
        function values
    """
    curve = np.empty(len(trial_scores))
    best = -1e9  # effectively -inf for realistic scores
    for idx, score in enumerate(trial_scores):
        if score > best:
            best = score
        curve[idx] = best
    return curve
def mean_curve(trial_scores):
    """ f(n) = mean([s[i] for i < n])

    Parameters
    ----------
    trial_scores: Array of float
        the score of i th trial

    Returns
    -------
    curve: Array of float
        function values
    """
    curve = np.empty(len(trial_scores))
    running_sum = 0
    for idx, score in enumerate(trial_scores):
        running_sum += score
        curve[idx] = running_sum / (idx + 1)
    return curve
def recall_curve(trial_ranks, top=None):
    """
    if top is None, f(n) = sum([I(rank[i] < n) for i < n]) / n
    if top is K, f(n) = sum([I(rank[i] < K) for i < n]) / K

    Parameters
    ----------
    trial_ranks: Array of int
        the rank of i th trial in labels
    top: int or None
        top-n recall

    Returns
    -------
    curve: Array of float
        function values
    """
    ranks = trial_ranks if isinstance(trial_ranks, np.ndarray) else np.array(trial_ranks)
    curve = np.zeros(len(ranks))
    if top is None:
        for i in range(len(ranks)):
            curve[i] = np.sum(ranks[:i] <= i) / (i + 1)
    else:
        for i in range(len(ranks)):
            curve[i] = 1.0 * np.sum(ranks[:i] < top) / top
    return curve
def cover_curve(trial_ranks):
    """
    f(n) = max k s.t. {1,2,...,k} is a subset of {ranks[i] for i < n}

    Parameters
    ----------
    trial_ranks: Array of int
        the rank of i th trial in labels

    Returns
    -------
    curve: Array of float
        function values
    """
    curve = np.empty(len(trial_ranks))
    highest = -1
    seen = set()
    for i, rank in enumerate(trial_ranks):
        seen.add(rank)
        # extend the contiguous covered prefix 0..highest as far as possible
        while highest + 1 in seen:
            highest += 1
        curve[i] = highest + 1
    return curve / len(trial_ranks)
def average_recall(preds, labels, N):
    """evaluate average recall-n for predictions and labels"""
    # order trials from highest to lowest predicted score
    order = np.argsort(preds)[::-1]
    ranks = get_rank(labels[order])
    # average the recall curve over its first N points
    return np.sum(recall_curve(ranks)[:N]) / N
# pylint: disable=consider-using-enumerate
"""
Cost model optimizer based on simulated annealing
"""
import heapq
import logging
import time
import numpy as np
from ..util import sample_ints
from .model_based_tuner import ModelOptimizer, knob2point, point2knob
class SimulatedAnnealingOptimizer(ModelOptimizer):
    """parallel simulated annealing optimization algorithm
    Parameters
    ----------
    task: Task
        The tuning task
    n_iter: int
        The number of iterations of simulated annealing
    temp: float or Array of float
        If is a single float, then use a constant temperature.
        If is an Array, then perform linear cooling from temp[0] to temp[1]
    persistent: bool, optional
        Whether to keep the walker positions between calls to `find_maximums`
        (the next call continues the walk instead of resampling)
    parallel_size: int, optional
        Number of simultaneous annealing walkers
    early_stop: int, optional
        Stop iteration if the optimal set do not change in `early_stop` rounds
    verbose: int, optional
        Print log every `verbose` iterations
    """
    def __init__(self, task, n_iter=500, temp=(1, 0), persistent=True, parallel_size=128,
                 early_stop=30, verbose=50):
        super(SimulatedAnnealingOptimizer, self).__init__()
        self.task = task
        # size of every tunable dimension of the config space
        self.dims = [len(x) for x in self.task.config_space.space_map.values()]
        self.n_iter = n_iter
        self.temp = temp
        self.persistent = persistent
        self.parallel_size = parallel_size
        self.early_stop = early_stop
        self.verbose = verbose
        # walker positions carried across calls when persistent=True
        self.points = None
    def find_maximums(self, model, num, exclusive):
        """Find `num` indices maximizing `model.predict`, skipping `exclusive`.
        Parameters
        ----------
        model: object with a predict(points) -> scores method (cost model)
        num: int
            number of maximum points to return
        exclusive: set of int
            indices that must not appear in the result
        Returns
        -------
        list of int: indices of the best points found (sorted best-first)
        """
        tic = time.time()
        temp, n_iter, early_stop, verbose = self.temp, self.n_iter, self.early_stop, self.verbose
        if self.persistent and self.points is not None:
            points = self.points
        else:
            points = np.array(sample_ints(0, len(self.task.config_space), self.parallel_size))
        scores = model.predict(points)
        # build heap and insert initial points
        # placeholders use negative indices so they can never collide with
        # real config indices (which are >= 0)
        heap_items = [(float('-inf'), -i) for i in range(num)]
        heapq.heapify(heap_items)
        # `in_heap` tracks membership to keep heap entries unique;
        # it is seeded with `exclusive` so excluded points are never added
        in_heap = set(exclusive)
        in_heap.update([-i for i in range(num)])
        for s, p in zip(scores, points):
            if s > heap_items[0][0] and p not in in_heap:
                pop = heapq.heapreplace(heap_items, (s, p))
                in_heap.remove(pop[1])
                in_heap.add(p)
        k = 0
        k_last_modify = 0
        if isinstance(temp, (tuple, list, np.ndarray)):
            # linear cooling schedule from temp[0] down to temp[1]
            t = temp[0]
            cool = 1.0 * (temp[0] - temp[1]) / (n_iter + 1)
        else:
            # constant temperature
            t = temp
            cool = 0
        while k < n_iter and k < k_last_modify + early_stop:
            new_points = np.empty_like(points)
            for i, p in enumerate(points):
                new_points[i] = random_walk(p, self.dims)
            new_scores = model.predict(new_points)
            # Metropolis acceptance: always accept improvements, accept
            # worse moves with probability exp(delta / t)
            # NOTE(review): if t reaches 0 before the loop ends this divides
            # by zero — confirm temp schedules keep t > 0
            ac_prob = np.exp((new_scores - scores) / t)
            ac_index = np.random.random(len(ac_prob)) < ac_prob
            points[ac_index] = new_points[ac_index]
            scores[ac_index] = new_scores[ac_index]
            for s, p in zip(new_scores, new_points):
                if s > heap_items[0][0] and p not in in_heap:
                    pop = heapq.heapreplace(heap_items, (s, p))
                    in_heap.remove(pop[1])
                    in_heap.add(p)
                    # record the last round that improved the optimal set
                    k_last_modify = k
            k += 1
            t -= cool
            if verbose >= 1 and k % verbose == 0:
                t_str = "%.2f" % t
                logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\ttemp: %s\t"
                             "elapsed: %.2f",
                             k, k_last_modify, heap_items[0][0],
                             np.max([v for v, _ in heap_items]), t_str,
                             time.time() - tic)
        # sort best-first before returning
        heap_items.sort(key=lambda item: -item[0])
        if verbose:
            logging.info("SA iter: %d\tlast_update: %d\tmax-0: %.2f\tmax-1: %.2f\telapsed: %.2f",
                         k, k_last_modify, heap_items[-1][0], heap_items[0][0], time.time() - tic)
            logging.info("SA Maximums: %s", heap_items)
        if self.persistent:
            self.points = points
        return [x[1] for x in heap_items]
def random_walk(p, dims):
    """random walk as local transition

    Parameters
    ----------
    p: int
        index of the ConfigEntity
    dims: Array of int
        sizes of each dimension

    Returns
    -------
    new_p: int
        new neighborhood index
    """
    # decode the flat index into one knob value per dimension
    knobs = point2knob(p, dims)
    mutated = list(knobs)
    # resample a single dimension until the knob vector actually changes
    while mutated == knobs:
        dim = np.random.randint(len(knobs))
        mutated[dim] = np.random.randint(dims[dim])
    # encode the mutated knobs back to a flat index
    return knob2point(mutated, dims)
# pylint: disable=unused-argument, no-self-use, invalid-name
"""Base class of tuner"""
import logging
import numpy as np
from ..measure import MeasureInput
from ..measure import create_measure_batch
class Tuner(object):
    """Base class for tuners

    Parameters
    ----------
    task: autotvm.task.Task
        Tuning Task
    """
    def __init__(self, task, **kwargs):
        self.param = kwargs
        self.recorder = None
        self.task = task
        # keep the current best
        self.best_config = None
        self.best_flops = 0
        self.best_measure_pair = None

    def has_next(self):
        """Whether has next untried config in the space

        Returns
        -------
        has_next: bool
        """
        raise NotImplementedError()

    def next_batch(self, batch_size):
        """get the next batch of configs to be measure on real hardware

        Parameters
        ----------
        batch_size: int
            The size of the batch

        Returns
        -------
        a batch of configs
        """
        raise NotImplementedError()

    def update(self, inputs, results):
        """Update parameters of the tuner according to measurement results

        Parameters
        ----------
        inputs: Array of autotvm.measure.MeasureInput
            The input for measurement
        results: Array of autotvm.measure.MeasureResult
            result for measurement
        """
        pass

    def tune(self, n_trial, measure_option, verbose=1, callbacks=()):
        """Begin tuning

        Parameters
        ----------
        n_trial: int
            Maximum number of configs to try (measure on real hardware)
        measure_option: dict
            The options for how to measure generated code.
            You should use the return value ot autotvm.measure_option for this argument.
        verbose: int
            0: silent mode, no output
            1: print every measurement result
        callbacks: List of callable
            A list of callback functions. The signature of callback function is
            (Tuner, List of MeasureInput, List of MeasureResult)
            with no return value. These callback functions will be called on
            every measurement pair. See autotvm/tuner/callback.py for some examples.
        """
        measure_batch = create_measure_batch(self.task, measure_option)
        parallel_num = getattr(measure_batch, 'parallel_num', 1)
        i = 0
        while i < n_trial:
            if not self.has_next():
                break
            configs = self.next_batch(min(parallel_num, n_trial - i))
            inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
            results = measure_batch(inputs)
            # keep best config
            # bug fix: best_* tracking used to be nested inside the verbose
            # check, so silent mode (verbose=0) never recorded the best config
            for k, (inp, res) in enumerate(zip(inputs, results)):
                config = inp.config
                if res.error_no == 0:
                    flops = inp.task.flop / np.mean(res.costs)
                else:
                    flops = 0
                if flops > self.best_flops:
                    self.best_flops = flops
                    self.best_config = config
                    self.best_measure_pair = (inp, res)
                if verbose >= 1:
                    logging.info("No: %d\tGFLOPS: %.2f/%.2f\tresult: %s\t%s",
                                 i + k + 1, flops / 1e9, self.best_flops / 1e9,
                                 res, config)
            i += len(results)
            self.update(inputs, results)
            for callback in callbacks:
                callback(self, inputs, results)
        # release measurement resources (e.g. worker processes) eagerly
        del measure_batch

    def reset(self):
        """reset the status of tuner"""
        self.best_config = None
        self.best_flops = 0
        self.best_measure_pair = None

    def load_history(self, data_set):
        """load history data for transfer learning

        Parameters
        ----------
        data_set: Array of (MeasureInput, MeasureResult) pair
            Previous tuning records
        """
        raise NotImplementedError()
"""Tuner that uses xgboost as cost model"""
from .model_based_tuner import ModelBasedTuner, ModelOptimizer
from .xgboost_cost_model import XGBoostCostModel
from .sa_model_optimizer import SimulatedAnnealingOptimizer
class XGBTuner(ModelBasedTuner):
    """Tuner that uses xgboost as cost model

    Parameters
    ----------
    task: Task
        The tuning task
    plan_size: int
        The size of a plan. After `plan_size` trials, the tuner will refit a new cost model
        and do planing for the next `plan_size` trials.
    feature_type: str, optional
        If is 'itervar', use features extracted from IterVar (loop variable).
        If is 'knob', use flatten ConfigEntity directly.
        If is 'curve', use sampled curve feature (relation feature).

        Note on choosing feature type:
        For single task tuning, 'itervar' and 'knob' is good.
        'itervar' is more accurate but 'knob' is much faster.
        For cross-shape tuning (e.g. many convolutions with different shapes),
        'itervar' and 'curve' has better transferability,
        'knob' is faster.
        For cross-device or cross-operator tuning, you can use 'curve' only.
    loss_type: str
        If is 'reg', use regression loss to train cost model.
        The cost model predicts the normalized flops.
        If is 'rank', use pairwise rank loss to train cost model.
        The cost model predicts relative rank score.
    num_threads: int, optional
        The number of threads.
    optimizer: str or ModelOptimizer, optional
        If is 'sa', use a default simulated annealing optimizer.
        Otherwise it should be a ModelOptimizer object.
    diversity_filter_ratio: int or float, optional
        If is not None, the tuner will first select
        top-(plan_size * diversity_filter_ratio) candidates according to the cost model
        and then pick batch_size of them according to the diversity metric.
    """
    def __init__(self, task, plan_size=32,
                 feature_type='itervar', loss_type='rank', num_threads=None,
                 optimizer='sa', diversity_filter_ratio=None):
        cost_model = XGBoostCostModel(task,
                                      feature_type=feature_type,
                                      loss_type=loss_type,
                                      num_threads=num_threads)
        if optimizer == 'sa':
            optimizer = SimulatedAnnealingOptimizer(task)
        else:
            # bug fix: the original concatenated message lacked separating
            # spaces and rendered as "...name stringor a ModelOptimizer..."
            assert isinstance(optimizer, ModelOptimizer), "Optimizer must be " \
                "a supported name string " \
                "or a ModelOptimizer object."
        super(XGBTuner, self).__init__(task, cost_model, optimizer,
                                       plan_size, diversity_filter_ratio)
# pylint: disable=invalid-name
"""Utilities"""
import logging
import multiprocessing
import time
import numpy as np
from .. import expr, ir_pass
def get_rank(values):
    """get rank of items

    Parameters
    ----------
    values: Array

    Returns
    -------
    ranks: Array of int
        the rank of this item in the input (the largest value ranks first)
    """
    # double argsort: the first gives the descending order, the second
    # inverts that permutation, yielding each element's rank
    return np.argsort(np.argsort(-values))
def sample_ints(low, high, m):
    """
    Sample m different integer numbers from [low, high) without replacement
    This function is an alternative of `np.random.choice` when (high - low) > 2 ^ 32, in
    which case numpy does not work.

    Parameters
    ----------
    low: int
        low point of sample range
    high: int
        high point of sample range
    m: int
        The number of sampled int

    Returns
    -------
    ints: an array of size m
    """
    assert m <= high - low
    chosen = set()
    # rejection sampling: draw until m distinct values have been collected
    while len(chosen) < m:
        candidate = np.random.randint(low, high)
        if candidate not in chosen:
            chosen.add(candidate)
    return list(chosen)
def pool_map(func, args, batch_size, verbose=False, pool=None):
    """A wrapper of multiprocessing.pool.Pool.map to support small-batch mapping
    for large argument list. This can reduce memory usage

    Parameters
    ----------
    func: Func(arg) -> np.ndarray
        mapping function
    args: List
        list of arguments
    batch_size: int
        batch size in mapping
    verbose: bool, optional
        whether print progress
    pool: multiprocessing.Pool, optional
        pool objection

    Returns
    -------
    converted numpy array (None when `args` is empty)
    """
    tic = time.time()
    local_pool = pool or multiprocessing.Pool()
    if verbose:
        logging.info("mapping begin")
    try:
        # collect per-batch results and concatenate once at the end;
        # the old per-batch np.concatenate was O(n^2) in total copying
        chunks = []
        for i in range(0, len(args), batch_size):
            if verbose:
                logging.info("mapping %d/%d elapsed %.2f", i, len(args),
                             time.time() - tic)
            chunks.append(np.array(local_pool.map(func, args[i:i+batch_size])))
        if verbose:
            logging.info("mapping done")
        return np.concatenate(chunks) if chunks else None
    finally:
        # only tear down a pool we created ourselves; close+join also runs
        # on exceptions, which previously leaked worker processes
        if not pool:
            local_pool.close()
            local_pool.join()
def get_func_name(func):
    """Get name of a function

    Parameters
    ----------
    func: Function
        The function

    Returns
    -------
    name: str
        The name
    """
    # prefer the legacy Python 2 attribute, fall back to __name__
    try:
        return func.func_name
    except AttributeError:
        return func.__name__
def get_const_int(exp):
    """Verifies expr is integer and get the constant value.

    Parameters
    ----------
    exp : tvm.Expr or int
        The input expression.

    Returns
    -------
    out_value : int
        The output.

    Raises
    ------
    ValueError
        If the expression cannot be reduced to a constant integer.
    """
    if isinstance(exp, int):
        return exp
    if not isinstance(exp, (expr.IntImm, expr.UIntImm)):
        # bug fix: the original passed the `expr` MODULE to Simplify
        # instead of the argument `exp`
        exp = ir_pass.Simplify(exp)
    if not isinstance(exp, (expr.IntImm, expr.UIntImm)):
        raise ValueError("Expect value to be constant int")
    return exp.value
def get_const_tuple(in_tuple):
    """Verifies input tuple is IntImm, returns tuple of int.

    Parameters
    ----------
    in_tuple : tuple of Expr
        The input.

    Returns
    -------
    out_tuple : tuple of int
        The output.
    """
    # element-wise conversion through get_const_int
    return tuple(map(get_const_int, in_tuple))
......@@ -229,8 +229,14 @@ class TrackerSession(object):
res += "----------------------------\n"
res += "key\tfree\tpending\n"
res += "----------------------------\n"
for k, v in data["queue_info"].items():
res += "%s\t%d\t%g\n" % (k, v["free"], v["pending"])
queue_info = data['queue_info']
keys = list(queue_info.keys())
if keys:
keys.sort()
max_key_len = max([len(k) for k in keys])
for k in keys:
res += ("%%-%d" % max_key_len + "s\t%d\t%g\n") % \
(k, queue_info[k]["free"], queue_info[k]["pending"])
res += "----------------------------\n"
return res
......
/*!
* Copyright (c) 2018 by Contributors
* \file feature_visitor.cc
* \brief Base class for feature extractor.
* These features are used for machine learning cost model
*/
#include "feature_visitor.h"
namespace tvm {
namespace autotvm {
// for loop
void FeatureVisitor::Visit_(const For *op) {
const auto *extent = op->extent.as<IntImm>();
int64_t loop_extent = -1;
if (extent != nullptr)
loop_extent = extent->value;
AnnotationType ann = kSerial;
switch (op->for_type) {
case ForType ::Parallel:
ann = kParallel;
break;
case ForType::Unrolled:
ann = kUnrolled;
break;
case ForType::Vectorized:
ann = kVectorized;
break;
case ForType::Serial:
ann = kSerial;
break;
}
if (EnterItervar_(op->loop_var, loop_extent, ann)) {
IRVisitor::Visit_(op);
ExitItervar_();
}
}
// parallel axis, virtual thread
void FeatureVisitor::Visit_(const AttrStmt *op) {
  // GPU thread bindings and virtual threads are treated as implicit loops.
  if (op->attr_key == attr::thread_extent ||
      op->attr_key == attr::virtual_thread) {
    // The annotated node carries the bound iteration variable.
    VarExpr var = op->node.as<tvm::IterVarNode>()->var;
    const auto *extent = op->value.as<IntImm>();
    CHECK(extent);
    std::string name = var.get()->name_hint;
    AnnotationType ann = kParallel;
    if (op->attr_key == attr::thread_extent) {
      // Map the thread axis name to its one-hot annotation type.
      if (name == "blockIdx.x")
        ann = kBlockX;
      else if (name == "blockIdx.y")
        ann = kBlockY;
      else if (name == "blockIdx.z")
        ann = kBlockZ;
      else if (name == "threadIdx.x")
        ann = kThreadX;
      else if (name == "threadIdx.y")
        ann = kThreadY;
      else if (name == "threadIdx.z")
        ann = kThreadZ;
      else
        LOG(FATAL) << "invalid thread itervar " + name;
    } else {
      ann = kVirtualThread;
    }
    // Enter the axis like a loop; subclasses may skip the subtree.
    if (EnterItervar_(var, extent->value, ann)) {
      IRVisitor::Visit_(op);
      ExitItervar_();
    }
  } else {
    IRVisitor::Visit_(op);
  }
}
// memory access
void FeatureVisitor::Visit_(const Load *op) {
  // Record the read access, visit the index expression, then pop.
  EnterMem_(op->buffer_var, op->index);
  IRVisitor::Visit_(op);
  ExitMem_();
}
void FeatureVisitor::Visit_(const Store *op) {
  // Record the write access, visit value and index, then pop.
  EnterMem_(op->buffer_var, op->index);
  IRVisitor::Visit_(op);
  ExitMem_();
}
} // namespace autotvm
} // namespace tvm
/*!
* Copyright (c) 2018 by Contributors
* \file feature_visitor.h
* \brief Base class for feature extractor.
* These features are used for machine learning cost model
*/
#ifndef TVM_AUTOTVM_FEATURE_VISITOR_H_
#define TVM_AUTOTVM_FEATURE_VISITOR_H_
#include <tvm/ir.h>
#include <tvm/ir_visitor.h>
#include <string>
namespace tvm {
namespace autotvm {
using namespace tvm::ir;
/*!
* \brief Type of for loop, used as one-hot encoding in features
*/
enum AnnotationType {
  // GPU block / thread binding axes
  kBlockX, kBlockY, kBlockZ, kThreadX, kThreadY, kThreadZ,
  // loop annotations plus virtual threads
  kUnrolled, kVectorized, kParallel, kSerial, kVirtualThread,
  // sentinel: number of annotation kinds (presumably must stay last — confirm users)
  kNum,
};
/*!
* \brief A base class for feature extractor, used for processing
* for loop and memory access in the IR
*/
class FeatureVisitor : public IRVisitor {
 public:
  // for loop
  void Visit_(const For *op);
  void Visit_(const AttrStmt *op);
  // memory access
  void Visit_(const Load *op);
  void Visit_(const Store *op);

 protected:
  /*!
   * \brief Enter a for loop node
   * \param var The loop variable
   * \param length The constant extent of the loop (-1 when the extent is
   *        not a compile-time constant)
   * \param ann_type The type for the for loop
   * \return whether to visit the loop body; ExitItervar_ is only called
   *         when this returns true
   */
  virtual bool EnterItervar_(tvm::VarExpr var, int64_t length, AnnotationType ann_type) = 0;
  /*! \brief Exit a for loop subtree */
  virtual void ExitItervar_() = 0;
  /*!
   * \brief Enter a memory access node
   * \param buffer_var The buffer to access.
   * \param index Index expression
   */
  virtual void EnterMem_(tvm::VarExpr buffer_var, tvm::Expr index) = 0;
  /*! \brief Exit a memory access node */
  virtual void ExitMem_() = 0;
};
} // namespace autotvm
} // namespace tvm
#endif // TVM_AUTOTVM_FEATURE_VISITOR_H_
/*!
* Copyright (c) 2018 by Contributors
* \file touch_extractor.h
* \brief Extract feature of touch pattern of axes in lowered IR
*/
#ifndef TVM_AUTOTVM_TOUCH_EXTRACTOR_H_
#define TVM_AUTOTVM_TOUCH_EXTRACTOR_H_
#include <tvm/ir.h>
#include <tvm/ir_visitor.h>
#include <tvm/api_registry.h>
#include <stack>
#include <vector>
#include <map>
#include <string>
#include <deque>
#include "feature_visitor.h"
namespace tvm {
namespace autotvm {
using TouchedBuffer = std::string;
// touch pattern buf[(stride * var) % mod) + other]
struct TouchPattern {
  int64_t stride{0};        // stride of the access along the itervar
  int64_t mod{-1};          // -1 for +inf
  int64_t count{1};         // number of touches
  int64_t reuse{1};         // reuse ratio
  int64_t thread_count{0};  // count when move thread axis into innermost
  int64_t thread_reuse{0};  // reuse ratio move thread axis into innermost
};
// all the feature of an iter var
struct ItervarFeature {
  // NOTE(review): the `var` parameter is not stored — confirm it is
  // intentionally unused.
  ItervarFeature(VarExpr var,
                 int64_t extent,
                 int nest,
                 AnnotationType ann_type,
                 int64_t topdown,
                 int counter)
      : length(extent), nest_level(nest), ann(ann_type), topdown_product(topdown), order(counter) {}
  ItervarFeature() {}
  // Axis Attributes
  int64_t length;      // extent of the axis
  int nest_level;      // loop nesting depth
  AnnotationType ann;  // one-hot axis type
  int64_t topdown_product;  // accumulative product of axis length, in top-down order
  int64_t bottomup_product;  // accumulative product of axis length, in bottom-up order
  // bottomup_product = reuse * count for any touched buffer
  int order;  // used for sorting axis
  // Arithmetic feature
  int add_ct{0};  // float add/sub count (Sub also counts here)
  int mul_ct{0};  // float mul count
  int div_ct{0};  // float div/mod count (Mod also counts here)
  // Memory Touch Feature
  std::unordered_map<TouchedBuffer, TouchPattern> touch_feature;
};
// extract iter vars and their touch pattern from ir
class TouchExtractor : public FeatureVisitor {
 public:
  // Run extraction over a statement; results accumulate in itervar_map.
  void Analyze(Stmt stmt) {
    this->Visit(stmt);
  }
  // arithmetic stats
  // Float arithmetic is attributed to the innermost itervar on the stack.
  void Visit_(const Add *op) {
    if (op->type.is_float())
      itervar_map[itervar_stack_.back()].add_ct++;
    IRVisitor::Visit_(op);
  }
  // Sub shares the add counter.
  void Visit_(const Sub *op) {
    if (op->type.is_float())
      itervar_map[itervar_stack_.back()].add_ct++;
    IRVisitor::Visit_(op);
  }
  void Visit_(const Mul *op) {
    if (op->type.is_float())
      itervar_map[itervar_stack_.back()].mul_ct++;
    IRVisitor::Visit_(op);
  }
  void Visit_(const Div *op) {
    if (op->type.is_float())
      itervar_map[itervar_stack_.back()].div_ct++;
    IRVisitor::Visit_(op);
  }
  // Mod shares the div counter.
  void Visit_(const Mod *op) {
    if (op->type.is_float())
      itervar_map[itervar_stack_.back()].div_ct++;
    IRVisitor::Visit_(op);
  }
  // NOTE(review): std::unordered_map (via ItervarFeature too) is used but
  // <unordered_map> is not included directly in this header — relies on a
  // transitive include; confirm.
  std::unordered_map<VarExpr, ItervarFeature, tvm::ExprHash, tvm::ExprEqual> itervar_map;

 private:
  bool EnterItervar_(VarExpr var, int64_t length, AnnotationType ann_type);
  void ExitItervar_();
  void EnterMem_(VarExpr buffer_var, Expr index);
  void ExitMem_();
  // running product of loop extents from the root to the current axis
  int64_t topdown_product_{1};
  // presumably disambiguates buffers that share a name — confirm in the .cc
  std::map<std::string, size_t> buffer_counter_;
  size_t itervar_counter_{0};
  std::deque<VarExpr> itervar_stack_;  // use deque instead of stack for indexing
  std::deque<size_t> skip_stack_size_;
  using IRVisitor::Visit_;
};
} // namespace autotvm
} // namespace tvm
#endif // TVM_AUTOTVM_TOUCH_EXTRACTOR_H_
......@@ -73,7 +73,7 @@ Target CreateTarget(const std::string& target_name,
} else {
t->device_type = kDLROCM;
}
t->keys_array.push_back(ir::StringImm::make("rocm"));
t->keys_array.push_back(ir::StringImm::make(target_name));
t->keys_array.push_back(ir::StringImm::make("gpu"));
t->max_num_threads = 256;
if (t->device_name == "intel_graphics") {
......@@ -195,11 +195,7 @@ Target Target::create(const std::string& target_str) {
options.push_back(item);
}
if (device_name == "rasp") {
return target::rasp(options);
} else {
return CreateTarget(target_name, options);
}
return CreateTarget(target_name, options);
}
/*! \brief Entry to hold the Target context stack. */
......
......@@ -18,13 +18,13 @@ class GPUCodeVerifier : public IRVisitor {
bool Verify(tvm::Stmt stmt,
int64_t max_local_memory_per_block,
int64_t max_shared_memory_per_block,
int64_t max_thread_per_block,
int64_t max_threads_per_block,
int64_t max_thread_x,
int64_t max_thread_y,
int64_t max_thread_z) {
max_local_memory_per_block_ = static_cast<size_t>(max_local_memory_per_block);
max_shared_memory_per_block_ = static_cast<size_t>(max_shared_memory_per_block);
max_thread_per_block_ = static_cast<size_t>(max_thread_per_block);
max_threads_per_block_ = static_cast<size_t>(max_threads_per_block);
max_thread_x_ = static_cast<size_t>(max_thread_x);
max_thread_y_ = static_cast<size_t>(max_thread_y);
max_thread_z_ = static_cast<size_t>(max_thread_z);
......@@ -52,7 +52,7 @@ class GPUCodeVerifier : public IRVisitor {
if (nest_level_ == 0) {
// exit a kernel, check the validity
valid_ &= thread_per_block_ <= max_thread_per_block_;
valid_ &= thread_per_block_ <= max_threads_per_block_;
valid_ &= local_memory_per_block_ <= max_local_memory_per_block_;
valid_ &= shared_memory_per_block_ <= max_shared_memory_per_block_;
......@@ -117,7 +117,7 @@ class GPUCodeVerifier : public IRVisitor {
size_t max_local_memory_per_block_;
size_t max_shared_memory_per_block_;
size_t max_thread_per_block_;
size_t max_threads_per_block_;
size_t max_thread_x_, max_thread_y_, max_thread_z_;
bool valid_{true};
......@@ -137,26 +137,34 @@ bool VerifyGPUCode(Stmt stmt,
Map<std::string, Expr> constraints) {
GPUCodeVerifier verifier;
auto get_int = [&constraints](std::string key, int64_t def) {
auto iter = constraints.find(key);
if (iter != constraints.end()) {
return ((*iter).second).as<IntImm>()->value;
} else {
return def;
}
};
int64_t max_local_memory_per_block = get_int("max_local_memory_per_block", INT64_MAX);
int64_t max_shared_memory_per_block = get_int("max_shared_memory_per_block", INT64_MAX);
int64_t max_thread_per_block = get_int("max_thread_per_block", INT64_MAX);
int64_t max_thread_x = get_int("max_thread_x", INT64_MAX);
int64_t max_thread_y = get_int("max_thread_y", INT64_MAX);
int64_t max_thread_z = get_int("max_thread_z", INT64_MAX);
int64_t max_local_memory_per_block = INT64_MAX;
int64_t max_shared_memory_per_block = INT64_MAX;
int64_t max_threads_per_block = INT64_MAX;
int64_t max_thread_x = INT64_MAX;
int64_t max_thread_y = INT64_MAX;
int64_t max_thread_z = INT64_MAX;
for (auto iter : constraints) {
if (iter.first == "max_local_memory_per_block")
max_local_memory_per_block = (iter.second).as<IntImm>()->value;
else if (iter.first == "max_shared_memory_per_block")
max_shared_memory_per_block = (iter.second).as<IntImm>()->value;
else if (iter.first == "max_threads_per_block")
max_threads_per_block = (iter.second).as<IntImm>()->value;
else if (iter.first == "max_thread_x")
max_thread_x = (iter.second).as<IntImm>()->value;
else if (iter.first == "max_thread_y")
max_thread_y = (iter.second).as<IntImm>()->value;
else if (iter.first == "max_thread_z")
max_thread_z = (iter.second).as<IntImm>()->value;
else
LOG(FATAL) << "Invalid check item: " << iter.first;
}
return verifier.Verify(stmt,
max_local_memory_per_block,
max_shared_memory_per_block,
max_thread_per_block,
max_threads_per_block,
max_thread_x,
max_thread_y,
max_thread_z);
......
"""
Test the tuner
"""
import logging
import time
import tvm
from tvm import autotvm
from tvm.autotvm.tuner import RandomTuner
@autotvm.template
def conv2d_no_batching(N, H, W, CI, CO, KH, KW):
    """An example template for testing"""
    assert N == 1, "Only consider batch_size = 1 in this template"
    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    rc = tvm.reduce_axis((0, CI), name='rc')
    ry = tvm.reduce_axis((0, KH), name='ry')
    rx = tvm.reduce_axis((0, KW), name='rx')
    conv = tvm.compute(
        (N, CO, H - KH + 1, W - KW + 1),
        lambda nn, ff, yy, xx: tvm.sum(
            data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx],
            axis=[rc, ry, rx]), tag="conv2d_nchw")
    s = tvm.create_schedule([conv.op])
    output = conv
    OL = s.cache_write(conv, 'local')
    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])
    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    cfg = autotvm.get_config()
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel
    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)
    # tile and bind reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    # bug fix: 'tile_ry' belongs to axis ry and 'tile_rx' to axis rx
    # (they were applied to the wrong axes before)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)
    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)
    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
    # tune unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
    return s, [data, kernel, conv]
def get_sample_task(target=None, target_host=None):
    """return a sample task for testing

    Parameters
    ----------
    target: tvm.target.Target, optional
        defaults to a CUDA target; using None avoids building the Target
        object at module import time (the old default-arg form)
    target_host: optional host target
    """
    if target is None:
        target = tvm.target.cuda()
    task = autotvm.task.create(conv2d_no_batching,
                               args=(1, 7, 7, 512, 512, 3, 3),
                               target=target, target_host=target_host)
    return task, target
def test_task_tuner_without_measurement():
    """test task and tuner without measurement"""
    task, target = get_sample_task()

    def measure_batch(inputs):
        """Fake measurement: report the wall time of a tiny sleep."""
        from tvm.autotvm import MeasureResult
        fake_results = []
        for _ in inputs:
            start = time.time()
            # do nothing
            time.sleep(0.001)
            fake_results.append(MeasureResult([time.time() - start], 0,
                                              time.time() - start, time.time()))
        return fake_results

    measure_option = autotvm.measure_option(mode='custom',
                                            custom_measure_batch=measure_batch)
    logging.info("%s", task.config_space)

    # exercise both simple tuners against the fake measurer
    for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
        tuner = tuner_class(task)
        tuner.tune(n_trial=10, measure_option=measure_option)
def test_tuning_with_measure():
    """test tuning on a local device, skipping targets that are unavailable"""
    def check(target, target_host):
        ctx = tvm.context(target, 0)
        if not ctx.exist:
            # idiom fix: pass lazy %-style args to logging instead of
            # eagerly interpolating with `%`
            logging.info("Skip test because %s is not available", target)
            return
        # init task
        task, target = get_sample_task(target, target_host)
        logging.info("%s", task.config_space)
        measure_option = autotvm.measure_option(mode='local',
                                                timeout=4,
                                                number=2)
        tuner = RandomTuner(task)
        tuner.tune(n_trial=10, measure_option=measure_option)
    check("cuda", None)
    check("opencl", None)
if __name__ == "__main__":
    # only print log when invoked from main
    # (importers of this module keep the default logging configuration)
    logging.basicConfig(level=logging.INFO)
    test_task_tuner_without_measurement()
    test_tuning_with_measure()
"""Common utilities for testing autotvm"""
import time
import tvm
from tvm import autotvm
from tvm.autotvm import MeasureInput, MeasureResult
@autotvm.template
def matmul(N, L, M, dtype):
    """autotvm template for an (N, L) x (L, M) matmul with 2-way tile splits."""
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]
    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####
    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yo, xo, k, yi, xi)
    return s, [A, B, C]
def get_sample_task(n=128):
    """return a sample task for testing"""
    # square n x n matmul on the default llvm (CPU) target
    llvm_target = tvm.target.create("llvm")
    sample = autotvm.task.create(matmul, args=(n, n, n, 'float32'), target=llvm_target)
    return sample, llvm_target
def get_sample_records(n):
    """get sample records for testing"""
    task, target = get_sample_task()
    records = []
    # fabricate n (input, result) pairs with distinct costs/timestamps
    for idx in range(n):
        inp = MeasureInput(target, task, task.config_space.get(idx))
        res = MeasureResult((idx + 1,), 0, idx, time.time())
        records.append((inp, res))
    return records
"""Test database"""
import copy
import logging
import time
import numpy as np
import tvm
from tvm import autotvm
from tvm.autotvm import database
from tvm.autotvm.measure.measure_methods import HashMismatchError
from tvm.autotvm.record import encode, MeasureInput, MeasureResult
from test_autotvm_common import get_sample_task, get_sample_records
def test_save_load():
    """Round-trip records through a DummyDatabase and check lookups."""
    logging.info("test basic db load/save ...")
    records = get_sample_records(3)
    inp1, res1 = records[0]
    inp2, res2 = records[1]
    inp3, _ = records[2]

    db = database.DummyDatabase()
    db.flush()
    db.save(inp1, res1)
    db.save(inp2, res2)

    loaded1 = db.load(inp1)
    loaded2 = db.load(inp2)
    loaded3 = db.load(inp3)
    # saved inputs come back as saved; unsaved input yields None
    assert loaded1 == res1
    assert loaded2 == res2
    assert loaded3 is None
    assert loaded1 != loaded2
TRIAL_LIMIT = 2
def test_db_filter():
    """Check that results memoized in a replay DB are replayed, others re-measured."""
    logging.info("test db filter ...")
    # Pick a GPU target because there are more likely to be failures/invalid configs
    task, target = get_sample_task()
    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning("Skip this test because there is no supported device for test")
        # bug fix: the warning said "skip" but the test kept running
        return
    batch_size = 2
    measure_option = autotvm.measure_option(mode='local-nofork', timeout=2)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
    ct = 0
    all_inputs = list()
    all_results = list()
    batches = list()
    tuner = autotvm.tuner.RandomTuner(task)
    while ct < TRIAL_LIMIT:
        inputs = list()
        for i in range(batch_size):
            cfg = tuner.next_batch(1)[0]
            inputs.append((MeasureInput(target, task, cfg)))
            all_inputs.append(inputs[-1])
        batches.append(inputs)
        results = measure_batch(inputs)
        all_results += results
        ct += 1
    del measure_batch
    db = database.DummyDatabase()
    db.flush()
    # First setting, memoize one input at a time, check that each is saved and replayed
    measure_option = autotvm.measure_option(mode='local-nofork', timeout=2, replay_db=db)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
    for i in range(len(all_inputs)+1):
        db.flush()
        for j in range(i):
            db.save(all_inputs[j], all_results[j])
        for k in range(len(batches)):
            batch = batches[k]
            batch_result = measure_batch(batch)
            for l in range(batch_size):
                all_idx = k*batch_size + l
                assert batch_result[l] is not None
                if all_idx < i:
                    # memoized entries must be replayed verbatim
                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
                else:
                    # non-memoized entries must come from a fresh measurement
                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
                        "(no retry) EXPECTED MISMATCH, GOT MATCH"
    del measure_batch
def test_db_hash():
    """Records differing only in code_hash must be stored separately."""
    logging.info("test db hash check ...")
    inp1, res1 = get_sample_records(1)[0]
    inp2 = copy.deepcopy(inp1)
    inp1.config.code_hash = 'cafecafe'
    inp2.config.code_hash = 'dbffdbff'

    # clone res1 but mark it with a sentinel timestamp of -1
    fields = list(tuple(res1))
    fields[-1] = -1
    res2 = MeasureResult(*fields)

    db = database.DummyDatabase()
    db.flush()
    db.save(inp1, res1, extend=True)
    db.save(inp2, res2, extend=True)

    loaded1 = db.load(inp1)
    loaded2 = db.load(inp2)
    assert loaded1 != loaded2
    assert loaded1.timestamp != -1
    assert loaded2.timestamp == -1
def test_db_latest_all():
    """``load()`` returns the newest record for a key, while
    ``load(get_all=True)`` returns every record in save order."""
    logging.info("test db load w/ multiple results ...")
    inp1, res1 = get_sample_records(1)[0]

    # three copies of the same result, differing only in timestamp
    stamps = (0.0, 1.1, 9999.9999)
    base = list(res1)
    copies = []
    for ts in stamps:
        base[-1] = ts
        copies.append(MeasureResult(*base))
    res1, res2, res3 = copies

    _db = database.DummyDatabase()
    _db.flush()

    # after each save, load() must surface the most recent timestamp
    for res, expect_ts in zip((res1, res2, res3), stamps):
        _db.save(inp1, res, extend=True)
        assert _db.load(inp1).timestamp == expect_ts

    # get_all=True keeps every saved record, oldest first
    loaded_all = _db.load(inp1, get_all=True)
    for saved, loaded in zip((res1, res2, res3), loaded_all):
        assert encode(inp1, loaded) == encode(inp1, saved)
def test_db_save_replay():
    """Measure with ``save_to_replay_db`` and verify every result lands in
    the replay db and is reproduced byte-identically on later replays."""
    logging.info("test db save (from measure_batch) and replay ...")
    _db = database.DummyDatabase()
    _db.flush()
    task, target = get_sample_task()

    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning("Skip this test because there is no supported device for test")
        # BUG FIX: the original logged "skip" but fell through and ran the
        # test anyway on a non-existent device
        return

    measure_option = autotvm.measure_option(mode='local-nofork',
                                            timeout=2,
                                            replay_db=_db, save_to_replay_db=True)
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    batch_size = 2
    ct = 0
    all_inputs = list()
    all_results = list()
    batches = list()
    tuner = autotvm.tuner.RandomTuner(task)
    while ct < TRIAL_LIMIT:
        inputs = list()
        for _ in range(batch_size):
            cfg = tuner.next_batch(1)[0]
            inputs.append(MeasureInput(target, task, cfg))
            all_inputs.append(inputs[-1])
        batches.append(inputs)
        results = measure_batch(inputs)
        all_results += results
        ct += 1

    # every measured input must have been persisted to the replay db
    assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
        "%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)

    # replaying twice must reproduce the originally saved results exactly
    all_results_2 = measure_batch(all_inputs)
    all_results_3 = measure_batch(all_inputs)
    for i in range(len(all_results)):
        encr1 = encode(all_inputs[i], all_results[i])
        encr2 = encode(all_inputs[i], all_results_2[i])
        encr3 = encode(all_inputs[i], all_results_3[i])
        assert encr1 == encr2, "EXPECTED MATCH WITH SAVE REPLAY (first replay), got MISMATCH"
        assert encr2 == encr3, "EXPECTED MATCH WITH SAVE REPLAY (second replay), got MISMATCH"

    del measure_batch
def test_check_hashmismatch():
    """measure_batch must raise HashMismatchError when a config carries an
    invalid ``code_hash``."""
    logging.info("test hash mismatch check")
    task, target = get_sample_task()

    ctx = tvm.context(str(target))
    if not ctx.exist:
        logging.warning("Skip this test because there is no supported device for test")
        # BUG FIX: the original logged "skip" but fell through and ran the
        # test anyway on a non-existent device
        return

    measure_option = autotvm.measure_option(mode='local-nofork')
    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)

    inputs = list()
    cfg = task.config_space.get(np.random.randint(len(task.config_space)))
    # notvalidh is not a valid CRC32 hash (not hex)
    cfg.code_hash = 'notvalidh'
    inputs.append(MeasureInput(target, task, cfg))

    try:
        measure_batch(inputs)
        assert False, "HashMismatchError should be raised"
    except HashMismatchError:
        pass

    del measure_batch
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Run every database test in this file, in definition order.
    for _case in (test_save_load, test_db_filter, test_db_hash,
                  test_db_latest_all, test_db_save_replay,
                  test_check_hashmismatch):
        _case()
"""Test dispatcher.
The dispatcher can choose which template to use according
to the parameters of workload"""
from collections import namedtuple
from tvm.autotvm.task import dispatcher, DispatchContext
# Lightweight stand-ins for autotvm's workload/config objects: a workload is
# identified by a single key, and a config only needs the template_key that
# the dispatcher selects on.
SimpleWorkload = namedtuple("SimpleWorkload", ["key"])
SimpleConfig = namedtuple("SimpleConfig", ["template_key"])
def test_dispatch():
    """The dispatcher must route a call to the template implementation
    selected by the active DispatchContext."""
    @dispatcher
    def my_dispatcher(a, b):
        # workload key is just the sum of the arguments
        return SimpleWorkload(key=a + b)

    @my_dispatcher.register("spatial_pack")
    def _spatial_pack_impl(cfg, a, b):
        return b + 100

    @my_dispatcher.register("im2col")
    def _im2col_impl(cfg, a, b):
        return a + 1

    class SimpleDispatcher(DispatchContext):
        """Route small workload keys to im2col, larger ones to spatial_pack."""
        def query(self, target, workload):
            chosen = "im2col" if workload.key <= 2 else "spatial_pack"
            return SimpleConfig(chosen)

    with SimpleDispatcher():
        assert my_dispatcher(1, 0) == 2      # key=1  -> im2col       -> a + 1
        assert my_dispatcher(1, 100) == 200  # key=101 -> spatial_pack -> b + 100
if __name__ == "__main__":
    # Run the dispatcher test when executed as a script.
    for _case in (test_dispatch,):
        _case()
"""Test local executor"""
import time
from tvm.autotvm.measure import LocalExecutor, executor
def slow(n):
    """Sum the integers 0..n with an explicit Python-level loop.

    Deliberately slow: the executor test relies on this taking noticeably
    longer than the closed-form ``fast`` for the same ``n``.
    """
    total = 0
    i = 0
    while i <= n:
        total += i
        i += 1
    return total
def fast(n):
    """Closed-form (Gauss) sum of the integers 0..n."""
    return (n * n + n) // 2
def test_local_measure_async():
    """Submit a slow and a fast job; both run asynchronously and the fast
    one must finish first."""
    ex = LocalExecutor()
    fut_slow = ex.submit(slow, 9999999)
    fut_fast = ex.submit(fast, 9999999)

    done_slow = 0
    done_fast = 0
    # busy-poll both futures, recording each wall-clock completion time
    while not (done_slow and done_fast):
        if done_slow == 0 and fut_slow.done():
            done_slow = time.time()
        if done_fast == 0 and fut_fast.done():
            done_fast = time.time()

    assert done_fast < done_slow, "Expected fast async job to finish first!"
    assert fut_slow.get() == fut_fast.get()
def timeout_job(n):
    """Sleep 1.5x the requested duration so an executor timeout of ``n`` fires."""
    time.sleep(1.5 * n)
def test_timeout():
    """A job that overruns the executor timeout must yield a TimeoutError result."""
    limit = 0.5
    ex = LocalExecutor(timeout=limit)
    fut = ex.submit(timeout_job, limit)
    # spin until the executor gives up on the job
    while not fut.done():
        pass
    assert isinstance(fut.get(), executor.TimeoutError)
if __name__ == "__main__":
    # Run the executor tests when executed as a script.
    for _case in (test_local_measure_async, test_timeout):
        _case()
"""Test feature extraction"""
import numpy as np
import tvm
from tvm.autotvm import feature
def test_iter_feature_gemm():
    """Extract per-itervar features from a plain GEMM schedule and compare
    selected entries against hand-checked reference values."""
    N = 128
    k = tvm.reduce_axis((0, N), 'k')
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='B')
    C = tvm.compute(
        A.shape,
        lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    # take_log=False keeps raw counts so they can be compared exactly
    feas = feature.get_itervar_feature(s, [A, B, C], take_log=False)
    # Expected feature rows for the three itervars of the default schedule
    # (y, x, k). Only the keys present in each dict are checked below.
    expected = [
        {
            '_attr_': [128, 1, 128, 2097152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
            'A_0': [128, -1, 16384, 128, 0, 0], 'B_0': [0, -1, 16384, 128, 0, 0],
            'C_0': [128, -1, 16384, 128, 0, 0], 'C_1': [128, -1, 16384, 128, 0, 0],
        },
        {
            '_attr_': [128, 2, 16384, 16384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
            'A_0': [0, -1, 128, 128, 0, 0], 'B_0': [1, -1, 16384, 1, 0, 0],
            'C_0': [1, -1, 128, 128, 0, 0], 'C_1': [1, -1, 128, 128, 0, 0],
        },
        {
            '_attr_': [128, 3, 2097152, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
            'A_0': [1, -1, 128, 1, 0, 0], 'B_0': [128, -1, 128, 1, 0, 0],
            'C_1': [0, -1, 1, 128, 0, 0], 'C_2': [0, -1, 1, 128, 0, 0],
        }
    ]
    # each row is a list of (name, *values) tuples; compare only named keys
    for ans, row in zip(expected, feas):
        for pair in row:
            if pair[0] not in ans:
                continue
            assert ans[pair[0]] == pair[1:], "%s: %s vs %s" % (pair[0], ans[pair[0]], pair[1:])
def test_feature_shape():
    """test the dimensions of flatten feature are the same

    Builds the same GEMM under randomly permuted tilings (and GPU thread
    bindings where applicable) and checks the flattened feature vector
    length never changes for a given target.
    """
    N = 1024
    n_sample = 100
    def get_gemm_feature(target):
        # standard NxN matmul, tiled 8x8 and reordered by a random permutation
        k = tvm.reduce_axis((0, N), 'k')
        A = tvm.placeholder((N, N), name='A')
        B = tvm.placeholder((N, N), name='B')
        C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k),
                        name='C')
        s = tvm.create_schedule(C.op)
        y, x = s[C].op.axis
        axes = list(s[C].tile(y, x, 8, 8)) + [k]
        perm = np.random.permutation(5)
        # NOTE: the comprehension variable `x` shadows the axis `x` above;
        # harmless here since `x` is not used again afterwards
        axes = [axes[x] for x in perm]
        s[C].reorder(*axes)
        if "gpu" in target.keys:
            pick = []
            # filter out reduction axis
            for i in range(len(perm)):
                if perm[i] != 4:
                    pick.append(axes[i])
            s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
            s[C].bind(pick[1], tvm.thread_axis("vthread"))
            s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))
        with target:
            feas = feature.get_itervar_feature(s, [A, B, C])
            feas = feature.flatten_itervar_feature(feas)
        return feas
    targets = [
        tvm.target.cuda(),
        tvm.target.mali(),
        tvm.target.rasp(),
    ]
    # feature length must be invariant across random schedule configurations
    for target in targets:
        dim = len(get_gemm_feature(target))
        for i in range(n_sample):
            assert dim == len(get_gemm_feature(target)), "dimensions of feature do not match" \
                " for different configurations"
if __name__ == "__main__":
    # Run the feature-extraction tests when executed as a script.
    for _case in (test_iter_feature_gemm, test_feature_shape):
        _case()
"""Test flop calculation"""
import tvm
import numpy as np
from tvm.autotvm.task.task import compute_flop
def test_conv():
    """compute_flop on a valid (no-padding) convolution:
    2 * output_elements * reduction_size multiply-adds."""
    for _ in range(5):
        N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)]
        # clamp kernel extents to the input so the output extents stay positive
        KH = min(H, KH)
        KW = min(W, KW)
        D = tvm.placeholder((N, CI, H, W))
        K = tvm.placeholder((CO, CI, KH, KW))
        ci = tvm.reduce_axis((0, CI))
        kh = tvm.reduce_axis((0, KH))
        kw = tvm.reduce_axis((0, KW))
        OH = (H - KH) + 1
        OW = (W - KW) + 1
        # BUG FIX: the original indexed both tensors by (h, w), never using
        # the reduce axes kh/kw and reading K out of bounds whenever OH > KH.
        # Index the data window by (h+kh, w+kw) and the kernel by (kh, kw);
        # the flop count is unchanged because the reduce domain is the same.
        C = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:
            tvm.sum(D[n][ci][h + kh][w + kw] * K[co][ci][kh][kw],
                    axis=[ci, kh, kw]))
        s = tvm.create_schedule([C.op])
        # one multiply + one accumulate per reduction step
        assert compute_flop(s) == 2 * N * CO * OH * OW * CI * KH * KW
def test_pack_gemm():
    """compute_flop on a packed GEMM: 2 * N * L * M regardless of packing."""
    for _ in range(5):
        N, L, M = [np.random.randint(10, 128) * 4 for _ in range(3)]
        A = tvm.placeholder((N, L))
        B = tvm.placeholder((M, L))
        k = tvm.reduce_axis((0, L))
        bn = 4
        # BUG FIX (readability): the original pack lambdas named their third
        # index `k`, shadowing the reduce axis `k` used in C_pack below.
        A_pack = tvm.compute((N // bn, L, bn), lambda i, j, kk: A[i * bn + kk][j])
        B_pack = tvm.compute((M // bn, L, bn), lambda i, j, kk: B[i * bn + kk][j])
        C_pack = tvm.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj:
            tvm.sum(A_pack[i, k, ii] * B_pack[j, k, jj], axis=[k]))
        # unpack back to the plain (N, M) layout
        C = tvm.compute((N, M), lambda i, j: C_pack[i // bn][j // bn][i % bn][j % bn])
        s = tvm.create_schedule([C.op])
        assert compute_flop(s) == 2 * N * L * M
def test_outer_dot():
    """Outer product: one multiply per output element, no reduction."""
    for _ in range(5):
        n_rows, n_cols = (np.random.randint(10, 128) * 4 for _ in range(2))
        A = tvm.placeholder((n_rows,))
        B = tvm.placeholder((n_cols,))
        C = tvm.compute((n_rows, n_cols), lambda i, j: A[i] * B[j])
        s = tvm.create_schedule([C.op])
        assert compute_flop(s) == n_rows * n_cols
def test_move():
    """No float number operation in simple move. So the estimator should raise an error """
    N = 1024
    A = tvm.placeholder((N,))
    C = tvm.compute((N,), lambda i: A[i])
    s = tvm.create_schedule([C.op])
    raised = False
    try:
        compute_flop(s)
    except RuntimeError:
        raised = True
    assert raised
if __name__ == '__main__':
    # Run the flop-calculation tests when executed as a script.
    for _case in (test_conv, test_pack_gemm, test_outer_dot, test_move):
        _case()
"""test the correctness of dump and load of data log"""
import time
import tvm
from tvm.contrib import util
from tvm import autotvm
from tvm.autotvm.measure import MeasureInput, MeasureResult, MeasureErrorNo
from tvm.autotvm.record import encode, decode, ApplyHistoryBest, measure_str_key
from test_autotvm_common import get_sample_task
def test_load_dump():
    """encode followed by decode must round-trip a measurement record
    under both serialization protocols."""
    task, target = get_sample_task()
    inp = MeasureInput(target, task, task.config_space.get(0))
    result = MeasureResult((2.0, 2.23, 0.23, 0.123, 0.234, 0.123), MeasureErrorNo.NO_ERROR,
                           2.3, time.time())

    for protocol in ['json', 'pickle']:
        row = encode(inp, result, protocol=protocol)
        inp_back, result_back = decode(row, protocol=protocol)
        assert measure_str_key(inp) == measure_str_key(inp_back), \
            "%s vs %s" % (measure_str_key(inp), measure_str_key(inp_back))
        assert result.costs == result_back.costs
        assert result.error_no == result_back.error_no
        assert result.timestamp == result_back.timestamp
def test_file_io():
    """The log_to_file callback writes records that load_from_file reads back."""
    temp = util.tempdir()
    log_path = temp.relpath("temp.log")

    tsk, target = get_sample_task()
    inputs = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(10)]
    results = [MeasureResult((cost, ), 0, 0, 0) for cost in range(10)]

    with open(log_path, "w") as fo:
        autotvm.callback.log_to_file(fo)(None, inputs, results)

    # compare the stored results against what comes back from the log file
    for expected, loaded in zip(zip(inputs, results),
                                autotvm.record.load_from_file(log_path)):
        assert expected[1] == loaded[1]
def test_apply_history_best():
    """query() must return the config whose record has the lowest cost."""
    tsk, target = get_sample_task()
    # (config index, measured cost); index 2 has the smallest cost
    trials = [(0, 0.1), (1, 0.3), (2, 0.01), (4, 0.4)]
    records = [(MeasureInput(target, tsk, tsk.config_space.get(idx)),
                MeasureResult((cost,), 0, 2.3, 0))
               for idx, cost in trials]

    hist_best = ApplyHistoryBest(records)
    best = hist_best.query(target, tsk.workload)
    assert str(best) == str(tsk.config_space.get(2))
if __name__ == "__main__":
    # Run the record tests when executed as a script (original order kept).
    for _case in (test_load_dump, test_apply_history_best, test_file_io):
        _case()
"""Test space definition primitives"""
import tvm
from tvm.autotvm.task.space import ConfigSpace
def gemm_func(cfg, N):
    """Declare an NxN matmul schedule and register two 2-way split knobs
    (tile_y, tile_x) on its spatial axes. Returns (schedule, tensors)."""
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='B')
    k = tvm.reduce_axis((0, N), name='k')
    C = tvm.compute((N, N), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=[k]), name='C')

    s = tvm.create_schedule([C.op])
    axis_y, axis_x = s[C].op.axis
    for knob_name, axis in (('tile_y', axis_y), ('tile_x', axis_x)):
        cfg.define_split(knob_name, cfg.axis(axis), num_outputs=2)
    return s, [A, B, C]
def test_split():
    """A 128-long axis split into 2 factors has 8 choices; two independent
    knobs give 8 * 8 = 64 configurations."""
    cfg = ConfigSpace()
    gemm_func(cfg, 128)
    assert len(cfg) == 64
    assert len(cfg.space_map['tile_y']) == 8
if __name__ == '__main__':
    # Run the space-definition test when executed as a script.
    for _case in (test_split,):
        _case()
......@@ -31,14 +31,14 @@ def test_shared_memory():
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=4 * M - 1,
max_thread_per_block=M))]}):
max_threads_per_block=M))]}):
tvm.build(s, [A, B], target)
assert not valid[0]
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=4 * M,
max_thread_per_block=M))]}):
max_threads_per_block=M))]}):
tvm.build(s, [A, B], target)
assert valid[0]
......@@ -66,14 +66,14 @@ def test_local_memory():
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_local_memory_per_block=4 * M - 1,
max_thread_per_block=1))]}):
max_threads_per_block=1))]}):
tvm.build(s, [A, B], target)
assert not valid[0]
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_local_memory_per_block=4 * M,
max_thread_per_block=1))]}):
max_threads_per_block=1))]}):
tvm.build(s, [A, B], target)
assert valid[0]
......@@ -101,21 +101,21 @@ def test_num_thread():
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N - 1))]}):
max_threads_per_block=N - 1))]}):
tvm.build(s, [A, B], target)
assert not valid[0]
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N))]}):
max_threads_per_block=N))]}):
tvm.build(s, [A, B], target)
assert valid[0]
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N,
max_threads_per_block=N,
max_thread_y=M-1))]}):
tvm.build(s, [A, B], target)
assert not valid[0]
......@@ -123,7 +123,7 @@ def test_num_thread():
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N,
max_threads_per_block=N,
max_thread_y=M))]}):
tvm.build(s, [A, B], target)
assert valid[0]
......@@ -151,14 +151,14 @@ def test_multiple_kernels():
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N - 1))]}):
max_threads_per_block=N - 1))]}):
tvm.build(s, [A, C], target)
assert not valid[0]
with tvm.build_config(**{"add_lower_pass": [
(2, get_verify_pass(valid,
max_shared_memory_per_block=0,
max_thread_per_block=N))]}):
max_threads_per_block=N))]}):
tvm.build(s, [A, C], target)
assert valid[0]
......
......@@ -8,7 +8,7 @@ make doc
jsdoc web/tvm_runtime.js web/README.md || exit -1
mv out docs/_build/html/jsdoc || exit -1
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
cd docs
PYTHONPATH=`pwd`/../python make html || exit -1
......
......@@ -2,7 +2,7 @@
export PYTHONPATH=python:apps/extension/python
export LD_LIBRARY_PATH=build:${LD_LIBRARY_PATH}
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
# Test TVM
make cython || exit -1
......
......@@ -2,7 +2,7 @@
export PYTHONPATH=python:topi/python
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc
rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
TVM_FFI=ctypes python -m nose -v tests/python/unittest || exit -1
TVM_FFI=ctypes python3 -m nose -v tests/python/unittest || exit -1
......
Auto tuning
-------------
"""
How to get high performance convolution kernel on NVIDIA GPU by auto-tuning
=============================================================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_
This is an advanced tutorial for writing high performance tunable template for
NVIDIA GPU. By running auto-tuner on this template, we can outperform the
vendor provided library CuDNN in many cases.
"""
import logging
import sys
import tvm
import topi
from tvm import autotvm
######################################################################
# Step 1: Define the search space
# ---------------------------------
# There are plenty of useful schedule primitives in tvm. You can also find
# some tutorials that describe them in more details, such as
# (1). :doc:`Optimizing Conv2d on NVIDIA GPU <../optimize/opt_conv_cuda>`
# (2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.ai/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html>`_
#
# However, their implementations are manually tuned for some special input
# shapes. In this section, we build a large enough space to cover
# the techniques used in these tutorials. Then we rely on the efficient auto-tuner
# to search through this space and pick some good configurations.
#
# If you are familiar with writing cuda schedule, you can find the following
# template is very general. Actually this template can be easily modified
# to tune other operators such as depthwise convolution and gemm.
# In order to fully understand this template, you should be familiar with
# the schedule primitives and auto tuning API. You can refer to the above
# tutorials and :doc:`autotvm tutorial <tune_simple_template>`
#
# It is worth noting that the search space for a conv2d operator
# can be very large (at the level of 10^9 for some input shapes)
#
@autotvm.template
def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
    """Tunable conv2d (NCHW layout, batch size 1) template for NVIDIA GPU.

    Defines split/knob axes on the spatial and reduction dimensions and
    returns ``(schedule, [raw_data, kernel, conv])`` as autotvm requires.
    """
    assert N == 1, "Only consider batch_size = 1 in this template"

    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32')
    s = tvm.create_schedule([conv.op])

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    cfg = autotvm.get_config()
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel

    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile and bind reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
    # BUG FIX: each reduction axis must be split by its own knob; the
    # original applied 'tile_rx' to ry and 'tile_ry' to rx (swapped).
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [raw_data, kernel, conv]
######################################################################
# Step 2: Search through the space
# ---------------------------------
# We pick the last layer on resnet as test case.
# Since our space is very large, :code:`XGBTuner` is most suitable
# for our case. Here we only do 20 trials for demonstration.
# In practice, making 1000 trials usually can find some good kernels
# for this template
# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# the last layer in resnet
task = autotvm.task.create(conv2d_no_batching,
                           args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
                           target='cuda')
print(task.config_space)
# use local gpu, measure 10 times for every config to reduce variance
# run 8 parallel threads for compilation
measure_option = autotvm.measure_option(mode='local',
                                        number=10,
                                        parallel_num=8,
                                        timeout=20)
# begin tuning, log records to file `cache.tsv`
# n_trial=20 is only for demonstration; ~1000 trials are typical in practice
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])
# get best config from cache file
dispatch_context = autotvm.apply_history_best("cache.tsv")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment