Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
T
tic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wenyuanbo
tic
Commits
e0810512
Commit
e0810512
authored
Nov 22, 2019
by
Zhi
Committed by
Tianqi Chen
Nov 22, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TVM][RUNTIME] A minimum example to generate external library wrappers for DSOModule (#4280)
parent
74299972
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
923 additions
and
4 deletions
+923
-4
CMakeLists.txt
+6
-0
cmake/config.cmake
+3
-0
include/tvm/runtime/module.h
+1
-1
python/tvm/module.py
+6
-1
src/codegen/source_module.cc
+3
-0
src/runtime/contrib/example_ext_runtime/example_ext_runtime.cc
+344
-0
src/runtime/dso_module.cc
+1
-1
src/runtime/graph/graph_runtime.cc
+1
-1
tests/python/relay/test_external_runtime.py
+558
-0
No files found.
CMakeLists.txt
View file @
e0810512
...
@@ -232,6 +232,12 @@ if(USE_VM_PROFILER)
...
@@ -232,6 +232,12 @@ if(USE_VM_PROFILER)
list
(
APPEND RUNTIME_SRCS
${
RUNTIME_VM_PROFILER_SRCS
}
)
list
(
APPEND RUNTIME_SRCS
${
RUNTIME_VM_PROFILER_SRCS
}
)
endif
(
USE_VM_PROFILER
)
endif
(
USE_VM_PROFILER
)
if
(
USE_EXAMPLE_EXT_RUNTIME
)
message
(
STATUS
"Build with example external runtime..."
)
file
(
GLOB RUNTIME_EXAMPLE_EXTERNAL_SRCS src/runtime/contrib/example_ext_runtime/*.cc
)
list
(
APPEND RUNTIME_SRCS
${
RUNTIME_EXAMPLE_EXTERNAL_SRCS
}
)
endif
(
USE_EXAMPLE_EXT_RUNTIME
)
# Module rules
# Module rules
include
(
cmake/modules/VTA.cmake
)
include
(
cmake/modules/VTA.cmake
)
include
(
cmake/modules/CUDA.cmake
)
include
(
cmake/modules/CUDA.cmake
)
...
...
cmake/config.cmake
View file @
e0810512
...
@@ -181,3 +181,6 @@ set(USE_VTA_TSIM ON)
...
@@ -181,3 +181,6 @@ set(USE_VTA_TSIM ON)
# Whether to build VTA FPGA driver (device side only)
# Whether to build VTA FPGA driver (device side only)
set
(
USE_VTA_FPGA OFF
)
set
(
USE_VTA_FPGA OFF
)
# Whether to build the example external runtime module
set
(
USE_EXAMPLE_EXT_RUNTIME OFF
)
include/tvm/runtime/module.h
View file @
e0810512
...
@@ -111,7 +111,7 @@ class Module : public ObjectRef {
...
@@ -111,7 +111,7 @@ class Module : public ObjectRef {
*
*
* \endcode
* \endcode
*/
*/
class
ModuleNode
:
public
Object
{
class
TVM_DLL
ModuleNode
:
public
Object
{
public
:
public
:
/*! \brief virtual destructor */
/*! \brief virtual destructor */
virtual
~
ModuleNode
()
{}
virtual
~
ModuleNode
()
{}
...
...
python/tvm/module.py
View file @
e0810512
...
@@ -144,7 +144,12 @@ class Module(ModuleBase):
...
@@ -144,7 +144,12 @@ class Module(ModuleBase):
else
:
else
:
fcompile
=
_cc
.
create_shared
fcompile
=
_cc
.
create_shared
if
self
.
type_key
==
"c"
:
if
self
.
type_key
==
"c"
:
kwargs
.
update
({
'options'
:
[
"-I"
+
path
for
path
in
find_include_path
()]})
options
=
[]
if
"options"
in
kwargs
:
opts
=
kwargs
[
"options"
]
options
=
opts
if
isinstance
(
opts
,
(
list
,
tuple
))
else
[
opts
]
opts
=
options
+
[
"-I"
+
path
for
path
in
find_include_path
()]
kwargs
.
update
({
'options'
:
opts
})
fcompile
(
file_name
,
files
,
**
kwargs
)
fcompile
(
file_name
,
files
,
**
kwargs
)
def
time_evaluator
(
self
,
func_name
,
ctx
,
number
=
10
,
repeat
=
1
,
min_repeat_ms
=
0
):
def
time_evaluator
(
self
,
func_name
,
ctx
,
number
=
10
,
repeat
=
1
,
min_repeat_ms
=
0
):
...
...
src/codegen/source_module.cc
View file @
e0810512
...
@@ -185,5 +185,8 @@ runtime::Module DeviceSourceModuleCreate(
...
@@ -185,5 +185,8 @@ runtime::Module DeviceSourceModuleCreate(
TVM_REGISTER_GLOBAL
(
"module.source_module_create"
)
TVM_REGISTER_GLOBAL
(
"module.source_module_create"
)
.
set_body_typed
(
SourceModuleCreate
);
.
set_body_typed
(
SourceModuleCreate
);
TVM_REGISTER_GLOBAL
(
"module.csource_module_create"
)
.
set_body_typed
(
CSourceModuleCreate
);
}
// namespace codegen
}
// namespace codegen
}
// namespace tvm
}
// namespace tvm
src/runtime/contrib/example_ext_runtime/example_ext_runtime.cc
0 → 100644
View file @
e0810512
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file example_ext_runtime.cc
* \brief An example runtime module that interprets a json string.
*
* This is an example runtime employed to show how we can interpret and execute
* a json string that represents a simple computational (sub)graph. Users will
* mainly need to implement three functions as follows:
* - GetFunction. It is used to get the packed function from the json runtime
* module using a provided function name. This function returns a PackedFunc
* that can be directly invoked by feeding it with parameters.
* - SaveToBinary. This function is used to achieve the serialization purpose.
* The emitted binary stream can be directly saved to disk so that users can
* load it back when needed.
* - LoadFromBinary. This function uses a binary stream to load the json that
* was saved by SaveToBinary, which essentially performs deserialization.
*/
#include <dmlc/logging.h>
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/memory.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <fstream>
#include <cmath>
#include <map>
#include <sstream>
#include <string>
#include <vector>
namespace
tvm
{
namespace
runtime
{
/*!
 * \brief One operator node of the example graph: it consumes any number of
 *  inputs and produces exactly one output.
 */
struct NodeEntry {
  /*! \brief Identifier of the node. */
  int id;
  /*! \brief Index of the slot that receives the node's result. */
  int output;
  /*! \brief Indices of the slots the node reads its operands from. */
  std::vector<int> inputs;
};
/*!
* \brief The following 6 functions are examples for demonstration. Users need
* to provide their own API when they use the external library. The ones that
* accept TVMValue are wrappers used to bridge the PackedFunc and user-defined
* kernels.
*/
/*!
 * \brief Element-wise addition kernel: c[k] = a[k] + b[k].
 *
 * \param a The left operand buffer.
 * \param len_a First factor of the element count.
 * \param b The right operand buffer.
 * \param len_b Second factor of the element count.
 * \param c The output buffer.
 *
 * \note Exactly len_a * len_b elements are processed, so every buffer must
 *  hold at least that many floats.
 */
void Add_(float* a, int len_a, float* b, int len_b, float* c) {
  int k = 0;
  while (k < len_a * len_b) {
    c[k] = a[k] + b[k];
    ++k;
  }
}
/*!
 * \brief Packed-style wrapper that unpacks three tensor handles and forwards
 *  them to the Add_ kernel.
 *
 * \param value The packed arguments: two operands followed by the output.
 * \param type_code The type codes of the arguments (not inspected here).
 * \param nargs The number of arguments; must be 3.
 *
 * \return Always 0.
 */
int Add(TVMValue* value, int* type_code, int nargs) {
  CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n";

  auto* lhs = static_cast<DLTensor*>(value[0].v_handle);
  auto* rhs = static_cast<DLTensor*>(value[1].v_handle);
  auto* result = static_cast<DLTensor*>(value[2].v_handle);

  float* lhs_data = static_cast<float*>(lhs->data);
  float* rhs_data = static_cast<float*>(rhs->data);
  float* result_data = static_cast<float*>(result->data);

  // The kernel iterates over shape[0] of the first operand times shape[0] of
  // the second operand elements.
  Add_(lhs_data, lhs->shape[0], rhs_data, rhs->shape[0], result_data);
  return 0;
}
/*!
 * \brief Element-wise subtraction kernel: c[k] = a[k] - b[k].
 *
 * \param a The left operand buffer.
 * \param len_a First factor of the element count.
 * \param b The right operand buffer.
 * \param len_b Second factor of the element count.
 * \param c The output buffer.
 *
 * \note Exactly len_a * len_b elements are processed, so every buffer must
 *  hold at least that many floats.
 */
void Sub_(float* a, int len_a, float* b, int len_b, float* c) {
  int k = 0;
  while (k < len_a * len_b) {
    c[k] = a[k] - b[k];
    ++k;
  }
}
/*!
 * \brief Packed-style wrapper that unpacks three tensor handles and forwards
 *  them to the Sub_ kernel.
 *
 * \param value The packed arguments: two operands followed by the output.
 * \param type_code The type codes of the arguments (not inspected here).
 * \param nargs The number of arguments; must be 3.
 *
 * \return Always 0.
 */
int Sub(TVMValue* value, int* type_code, int nargs) {
  CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n";

  auto* lhs = static_cast<DLTensor*>(value[0].v_handle);
  auto* rhs = static_cast<DLTensor*>(value[1].v_handle);
  auto* result = static_cast<DLTensor*>(value[2].v_handle);

  float* lhs_data = static_cast<float*>(lhs->data);
  float* rhs_data = static_cast<float*>(rhs->data);
  float* result_data = static_cast<float*>(result->data);

  // The kernel iterates over shape[0] of the first operand times shape[0] of
  // the second operand elements.
  Sub_(lhs_data, lhs->shape[0], rhs_data, rhs->shape[0], result_data);
  return 0;
}
/*!
 * \brief Element-wise multiplication kernel: c[k] = a[k] * b[k].
 *
 * \param a The left operand buffer.
 * \param len_a First factor of the element count.
 * \param b The right operand buffer.
 * \param len_b Second factor of the element count.
 * \param c The output buffer.
 *
 * \note Exactly len_a * len_b elements are processed, so every buffer must
 *  hold at least that many floats.
 */
void Mul_(float* a, int len_a, float* b, int len_b, float* c) {
  int k = 0;
  while (k < len_a * len_b) {
    c[k] = a[k] * b[k];
    ++k;
  }
}
/*!
 * \brief Packed-style wrapper that unpacks three tensor handles and forwards
 *  them to the Mul_ kernel.
 *
 * \param value The packed arguments: two operands followed by the output.
 * \param type_code The type codes of the arguments (not inspected here).
 * \param nargs The number of arguments; must be 3.
 *
 * \return Always 0.
 */
int Mul(TVMValue* value, int* type_code, int nargs) {
  CHECK_EQ(nargs, 3U) << "Expect 3 args, but get " << nargs << "\n";

  auto* lhs = static_cast<DLTensor*>(value[0].v_handle);
  auto* rhs = static_cast<DLTensor*>(value[1].v_handle);
  auto* result = static_cast<DLTensor*>(value[2].v_handle);

  float* lhs_data = static_cast<float*>(lhs->data);
  float* rhs_data = static_cast<float*>(rhs->data);
  float* result_data = static_cast<float*>(result->data);

  // The kernel iterates over shape[0] of the first operand times shape[0] of
  // the second operand elements.
  Mul_(lhs_data, lhs->shape[0], rhs_data, rhs->shape[0], result_data);
  return 0;
}
/*!
* \brief The example json runtime module. Here we define a simple format for
* the computational graph using json for demonstration purpose. Users should
* customize their own format.
*/
class
ExampleJsonModule
:
public
ModuleNode
{
public
:
explicit
ExampleJsonModule
(
std
::
string
graph_json
)
{
this
->
graph_json_
=
graph_json
;
ParseJson
(
this
->
graph_json_
);
}
/*!
* \brief Get a PackedFunc from the example json module.
*
* \param name the name of the function.
* \param sptr_to_self The ObjectPtr that points to this module node.
*
* \return The function pointer when it is found, otherwise, PackedFunc(nullptr).
*/
PackedFunc
GetFunction
(
const
std
::
string
&
name
,
const
ObjectPtr
<
Object
>&
sptr_to_self
)
final
{
if
(
this
->
graph_
.
find
(
name
)
!=
this
->
graph_
.
end
())
{
this
->
curr_subgraph_
=
name
;
return
PackedFunc
([
sptr_to_self
,
this
](
TVMArgs
args
,
TVMRetValue
*
rv
)
{
for
(
auto
i
=
0
;
i
<
args
.
size
();
++
i
)
{
CHECK
(
args
[
i
].
type_code
()
==
kNDArrayContainer
||
args
[
i
].
type_code
()
==
kArrayHandle
)
<<
"Expect NDArray or DLTensor as inputs"
<<
"
\n
"
;
if
(
args
[
i
].
type_code
()
==
kArrayHandle
)
{
DLTensor
*
arg
=
args
[
i
];
this
->
data_entry_
[
i
].
CopyFrom
(
arg
);
}
else
{
NDArray
arg
=
args
[
i
];
this
->
data_entry_
[
i
].
CopyFrom
(
arg
);
}
}
for
(
const
auto
&
it
:
this
->
graph_
[
this
->
curr_subgraph_
])
{
this
->
Run
(
it
.
id
,
it
.
inputs
,
it
.
output
);
}
CHECK_GT
(
graph_
.
count
(
this
->
curr_subgraph_
),
0U
);
auto
out_idx
=
graph_
[
this
->
curr_subgraph_
].
back
().
output
;
if
(
args
[
args
.
size
()
-
1
].
type_code
()
==
kArrayHandle
)
{
DLTensor
*
arg
=
args
[
args
.
size
()
-
1
];
this
->
data_entry_
[
out_idx
].
CopyTo
(
arg
);
}
else
{
NDArray
arg
=
args
[
args
.
size
()
-
1
];
this
->
data_entry_
[
out_idx
].
CopyTo
(
arg
);
}
*
rv
=
data_entry_
.
back
();
});
}
else
{
LOG
(
FATAL
)
<<
"Unkown runtime type: "
<<
name
<<
"
\n
"
;
return
PackedFunc
();
}
}
/*!
* \brief Execute a function with provided arguments. The output will be
* packed to the last argument according to TVM's calling convention.
*
* \param id The id of the function.
* \param inputs The input indices that indicate where the data should be
* fetched in the data entry pool.
* \param output The output index.
*/
void
Run
(
int
id
,
const
std
::
vector
<
int
>&
inputs
,
int
output
)
{
std
::
vector
<
int
>
args
(
inputs
.
begin
(),
inputs
.
end
());
args
.
push_back
(
output
);
std
::
vector
<
TVMValue
>
values
(
args
.
size
());
std
::
vector
<
int
>
type_codes
(
args
.
size
());
TVMArgsSetter
setter
(
values
.
data
(),
type_codes
.
data
());
if
(
op_id_
[
id
]
==
"add"
||
op_id_
[
id
]
==
"sub"
||
op_id_
[
id
]
==
"mul"
)
{
for
(
size_t
i
=
0
;
i
<
args
.
size
();
i
++
)
{
setter
(
i
,
data_entry_
[
args
[
i
]]);
}
}
if
(
op_id_
[
id
]
==
"add"
)
{
Add
(
values
.
data
(),
type_codes
.
data
(),
args
.
size
());
}
else
if
(
op_id_
[
id
]
==
"sub"
)
{
Sub
(
values
.
data
(),
type_codes
.
data
(),
args
.
size
());
}
else
if
(
op_id_
[
id
]
==
"mul"
)
{
Mul
(
values
.
data
(),
type_codes
.
data
(),
args
.
size
());
}
else
{
LOG
(
FATAL
)
<<
"Unknown op: "
<<
op_id_
[
id
]
<<
"
\n
"
;
}
}
const
char
*
type_key
()
const
{
return
"examplejson"
;
}
/*!
* \brief Save the json runtime to a binary stream, which can then be
* serialized to disk.
*
* \param stream. The stream to save the binary.
*/
void
SaveToBinary
(
dmlc
::
Stream
*
stream
)
final
{
stream
->
Write
(
this
->
graph_json_
);
}
/*!
* \brief Parse the example json string.
*
* \param json. The json string that represents a simple computational graph.
*
* \Note this is a very simple json that only serves for demostration purpose.
* Users usually have their own format and they can serialize it using the
* SaveToBinary method and deserialize it using LoadFromFile.
*/
void
ParseJson
(
const
std
::
string
&
json
)
{
std
::
string
line
;
std
::
string
curr_subgraph
;
std
::
stringstream
ss
(
json
);
while
(
std
::
getline
(
ss
,
line
,
'\n'
))
{
std
::
stringstream
ss2
(
line
);
std
::
string
token
;
int
id
=
0
;
ss2
>>
token
;
if
(
token
.
find
(
"json_rt_"
)
!=
std
::
string
::
npos
)
{
curr_subgraph
=
token
;
continue
;
}
ss2
>>
id
;
if
(
op_id_
.
size
()
<=
static_cast
<
size_t
>
(
id
))
{
op_id_
.
resize
(
id
+
1
);
data_entry_
.
resize
(
id
+
1
);
}
int64_t
total_elements
=
1
;
std
::
vector
<
int64_t
>
shape
;
if
(
token
==
"input"
)
{
int64_t
size
=
0
;
while
(
ss2
>>
size
)
{
total_elements
*=
size
;
shape
.
push_back
(
size
);
}
}
else
{
op_id_
[
id
]
=
token
;
bool
shape_data
=
false
;
NodeEntry
entry
;
while
(
ss2
>>
token
)
{
if
(
token
==
"shape:"
)
{
shape_data
=
true
;
}
else
if
(
shape_data
)
{
total_elements
*=
std
::
stoll
(
token
);
shape
.
push_back
(
std
::
stoll
(
token
));
}
else
if
(
token
!=
"inputs:"
)
{
entry
.
inputs
.
push_back
(
std
::
stoi
(
token
));
}
}
entry
.
id
=
id
;
entry
.
output
=
id
;
graph_
[
curr_subgraph
].
push_back
(
entry
);
}
DLContext
ctx
;
ctx
.
device_type
=
static_cast
<
DLDeviceType
>
(
1
);
ctx
.
device_id
=
0
;
data_entry_
[
id
]
=
NDArray
::
Empty
(
shape
,
DLDataType
{
kDLFloat
,
32
,
1
},
ctx
);
}
}
/*!
* \brief Create a module from a file path of a serialized graph.
*
* \param path The file path contains a computational graph representation.
*
* \return The created json module.
*/
static
Module
Create
(
const
std
::
string
&
path
)
{
std
::
ifstream
filep
;
filep
.
open
(
path
,
std
::
ios
::
in
);
std
::
string
graph_json
;
std
::
string
line
;
while
(
std
::
getline
(
filep
,
line
))
{
graph_json
+=
line
;
graph_json
+=
"
\n
"
;
}
filep
.
close
();
auto
n
=
tvm
::
runtime
::
make_object
<
ExampleJsonModule
>
(
graph_json
);
return
Module
(
n
);
}
/*!
* \brief Load a json module from stream.
*
* \param strm The binary stream to load json.
*
* \return The created json module.
*/
static
Module
LoadFromBinary
(
void
*
strm
)
{
dmlc
::
Stream
*
stream
=
static_cast
<
dmlc
::
Stream
*>
(
strm
);
std
::
string
graph_json
;
stream
->
Read
(
&
graph_json
);
auto
n
=
tvm
::
runtime
::
make_object
<
ExampleJsonModule
>
(
graph_json
);
return
Module
(
n
);
}
private
:
/* \brief The json string that represents a computational graph. */
std
::
string
graph_json_
;
/* \brief The subgraph that being processed. */
std
::
string
curr_subgraph_
;
/*! \brief A simple graph from subgraph id to node entries. */
std
::
map
<
std
::
string
,
std
::
vector
<
NodeEntry
>
>
graph_
;
/* \brief A simple pool to contain the tensor for each node in the graph. */
std
::
vector
<
NDArray
>
data_entry_
;
/* \brief A mapping from node id to op name. */
std
::
vector
<
std
::
string
>
op_id_
;
};
// Loader used for files with the "examplejson" format: the first packed
// argument is the path of the serialized graph.
TVM_REGISTER_GLOBAL("module.loadfile_examplejson")
.set_body([](TVMArgs args, TVMRetValue* rv) {
    Module loaded = ExampleJsonModule::Create(args[0]);
    *rv = loaded;
  });

// Loader used to rebuild the module from a binary stream.
TVM_REGISTER_GLOBAL("module.loadbinary_examplejson")
.set_body_typed(ExampleJsonModule::LoadFromBinary);
}
// namespace runtime
}
// namespace tvm
src/runtime/dso_module.cc
View file @
e0810512
...
@@ -18,7 +18,7 @@
...
@@ -18,7 +18,7 @@
*/
*/
/*!
/*!
* \file dso_
dll_
module.cc
* \file dso_module.cc
* \brief Module to load from dynamic shared library.
* \brief Module to load from dynamic shared library.
*/
*/
#include <tvm/runtime/module.h>
#include <tvm/runtime/module.h>
...
...
src/runtime/graph/graph_runtime.cc
View file @
e0810512
...
@@ -396,7 +396,7 @@ std::pair<std::function<void()>, std::shared_ptr<GraphRuntime::OpArgs> > GraphRu
...
@@ -396,7 +396,7 @@ std::pair<std::function<void()>, std::shared_ptr<GraphRuntime::OpArgs> > GraphRu
// Get compiled function from the module that contains both host and device
// Get compiled function from the module that contains both host and device
// code.
// code.
tvm
::
runtime
::
PackedFunc
pf
=
module_
.
GetFunction
(
param
.
func_name
,
fals
e
);
tvm
::
runtime
::
PackedFunc
pf
=
module_
.
GetFunction
(
param
.
func_name
,
tru
e
);
CHECK
(
pf
!=
nullptr
)
<<
"no such function in module: "
<<
param
.
func_name
;
CHECK
(
pf
!=
nullptr
)
<<
"no such function in module: "
<<
param
.
func_name
;
auto
fexec
=
[
arg_ptr
,
pf
]()
{
auto
fexec
=
[
arg_ptr
,
pf
]()
{
...
...
tests/python/relay/test_external_runtime.py
0 → 100644
View file @
e0810512
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from
shutil
import
which
import
json
import
pytest
import
sys
import
numpy
as
np
import
tvm
from
tvm
import
relay
from
tvm
import
module
as
_tvm_module
from
tvm.contrib
import
util
# Temporary directory shared by the tests in this file; the generated header,
# the subgraph description and the exported libraries are written into it.
tmp_path = util.tempdir()
def generate_csource_module():
    """Mock the codegen with an external library (e.g., CBLAS/cuDNN).

    The generated C++ source defines two packed-function wrappers,
    ``json_rt_0`` and ``json_rt_1``. Each one computes ``((a + b) - c) * d``
    element-wise on 10x10 float buffers.

    Returns
    -------
    csource_module
        The module created by ``csource_module_create`` from the source with
        format "cc".
    """
    # <cstdio> and <cstdlib> are included explicitly because the generated
    # code calls printf, malloc and free; relying on transitive includes is
    # not portable across standard library implementations.
    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>

    #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)           \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          out[i] = a[i] p_OP_ b[i];                           \
        }                                                     \
      }

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_)  \
      extern "C" void p_ID_(float* a, float* b, float* out) { \
        for (int64_t i = 0; i < p_DIM1_; ++i) {               \
          for (int64_t j = 0; j < p_DIM2_; ++j) {             \
            int64_t k = i * p_DIM2_ + j;                      \
            out[k] = a[k] p_OP_ b[k];                         \
          }                                                   \
        }                                                     \
      }
    GCC_BINARY_OP_2D(gcc_1_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_1_2, +, 10, 10);

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
                           float* gcc_input6, float* gcc_input7, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_1_2(gcc_input4, gcc_input5, buf_0);
      gcc_1_1(buf_0, gcc_input6, buf_1);
      gcc_1_0(buf_1, gcc_input7, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }

    GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10);
    GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10);

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
                           float* gcc_input2, float* gcc_input3, float* out) {
      float* buf_0 = (float*)malloc(4 * 100);
      float* buf_1 = (float*)malloc(4 * 100);
      gcc_0_2(gcc_input0, gcc_input1, buf_0);
      gcc_0_1(buf_0, gcc_input2, buf_1);
      gcc_0_0(buf_1, gcc_input3, out);
      free(buf_0);
      free(buf_1);
    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }
    '''
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module
def generate_engine_module():
    """
    Mock the codegen of an external backend with its own runtime engine
    (e.g., MKL-DNN/TensorRT)

    The generated C++ source includes "gcc_engine.h", which is written to the
    temporary directory by ``gen_gcc_engine`` before the module is created.

    Returns
    -------
    csource_module
        The module created by ``csource_module_create`` from the source with
        format "cc".
    """
    # <cstdio> is included explicitly because the generated wrappers call
    # printf and "gcc_engine.h" does not provide it.
    code = r'''
    #include <tvm/runtime/c_runtime_api.h>
    #include <dlpack/dlpack.h>
    #include <cstdio>
    #include "gcc_engine.h"

    extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
            float* gcc_input6, float* gcc_input7, float* out) {

      std::string graph =
          "add_2d,10,10\n"
          "sub_2d,10,10\n"
          "mul_2d,10,10\n";

      Engine engine;
      engine.run(graph, {gcc_input4, gcc_input5, gcc_input6, gcc_input7}, out);
    }

    extern "C" int json_rt_1(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_1_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }

    extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1,
            float* gcc_input2, float* gcc_input3, float* out) {

      std::string graph =
          "add_2d,10,10\n"
          "sub_2d,10,10\n"
          "mul_2d,10,10\n";

      Engine engine;
      engine.run(graph, {gcc_input0, gcc_input1, gcc_input2, gcc_input3}, out);
    }

    extern "C" int json_rt_0(TVMValue* value, int* type_code, int nargs) {
      if (nargs != 5) {
        printf("Expect 5 args, but get %d", nargs);
        return 1;
      }
      DLTensor* arg0 = static_cast<DLTensor*>(value[0].v_handle);
      DLTensor* arg1 = static_cast<DLTensor*>(value[1].v_handle);
      DLTensor* arg2 = static_cast<DLTensor*>(value[2].v_handle);
      DLTensor* arg3 = static_cast<DLTensor*>(value[3].v_handle);
      DLTensor* out = static_cast<DLTensor*>(value[4].v_handle);
      gcc_0_(static_cast<float*>(arg0->data), static_cast<float*>(arg1->data),
             static_cast<float*>(arg2->data), static_cast<float*>(arg3->data),
             static_cast<float*>(out->data));
      return 0;
    }
    '''

    gen_gcc_engine()
    csource_module = _tvm_module.csource_module_create(code, "cc")
    return csource_module
def gen_gcc_engine():
    """An example of external backend runtime engine. This is supposed to be provided
    by third-party vendors and included when building the generated external kernel code.

    The header is written to ``gcc_engine.h`` inside the shared temporary
    directory.
    """
    # <cstdlib> is included explicitly because the engine calls malloc/free.
    code = r'''
    #ifndef _GCC_ENGINE_H_
    #define _GCC_ENGINE_H_
    #include <cstdint>
    #include <cstdlib>
    #include <string>
    #include <sstream>
    #include <vector>

    #define GCC_BINARY_OP_2D(p_ID_, p_OP_)                                        \
      void p_ID_(int64_t dim1, int64_t dim2, float* a, float* b, float* out) {    \
        for (int64_t i = 0; i < dim1; ++i) {                                      \
          for (int64_t j = 0; j < dim2; ++j) {                                    \
            int64_t k = i * dim2 + j;                                             \
            out[k] = a[k] p_OP_ b[k];                                             \
          }                                                                       \
        }                                                                         \
      }
    GCC_BINARY_OP_2D(add_2d, +);
    GCC_BINARY_OP_2D(sub_2d, -);
    GCC_BINARY_OP_2D(mul_2d, *);

    struct Layer {
      void (*op)(int64_t, int64_t, float*, float*, float*);
      std::vector<int64_t> shapes;
      std::vector<float*> args;
    };

    class Engine {
    public:
      float* alloc_buffer(int64_t size) {
        float* buf = (float*)malloc(sizeof(float) * size);
        buffers.push_back(buf);
        return buf;
      }
      void add(std::string op, int64_t dim1, int64_t dim2, float* in1, float* in2, float* out) {
        Layer layer;
        layer.shapes.push_back(dim1);
        layer.shapes.push_back(dim2);
        layer.args.push_back(in1);
        layer.args.push_back(in2);
        layer.args.push_back(out);

        if (op == "add_2d")
          layer.op = &add_2d;
        else if (op == "sub_2d")
          layer.op = &sub_2d;
        else if (op == "mul_2d")
          layer.op = &mul_2d;
        net.push_back(layer);
        return ;
      }

      void run(std::string graph, std::vector<float*> args, float* out) {
        std::stringstream ss(graph);
        std::string line;
        int layer_idx = 0;
        int arg_idx = 0;
        float* buf = nullptr;

        while (std::getline(ss, line, '\n')) {
          std::stringstream ss2(line);
          std::string token;
          std::vector<std::string> attrs;
          while (std::getline(ss2, token, ',')) {
            attrs.push_back(token);
          }
          int64_t dim1 = stoll(attrs[1]);
          int64_t dim2 = stoll(attrs[2]);
          auto out_buf = this->alloc_buffer(dim1 * dim2);

          if (layer_idx == 0) {
            this->add(attrs[0], dim1, dim2, args[0], args[1], out_buf);
            buf = out_buf;
            arg_idx = 2;
          }
          else {
            this->add(attrs[0], dim1, dim2, buf, args[arg_idx], out_buf);
            buf = out_buf;
            arg_idx++;
          }
          layer_idx++;
        }
        this->net.back().args.back() = out;

        for (auto layer : net) {
          (*layer.op)(layer.shapes[0], layer.shapes[1], layer.args[0], layer.args[1], layer.args[2]);
        }
      }
      ~Engine() {
        for (auto buf : buffers) {
          free(buf);
        }
      }
    private:
      std::vector<Layer> net;
      std::vector<float*> buffers;
    };

    #endif
    '''
    header_file = tmp_path.relpath("gcc_engine.h")
    with open(header_file, 'w') as f:
        f.write(code)
def get_synthetic_lib():
    """Build the host library with Relay for the "llvm" target.

    The function takes ``x`` and ``w0`` .. ``w7`` (all 10x10) and concatenates,
    along axis 0, the results of two primitive subgraph calls and of
    ``(x + w6) - w7``. Each subgraph has four parameters and returns a copy
    of its first one.

    Returns
    -------
    lib
        The library element of the tuple returned by ``relay.build``.
    """
    dims = (10, 10)

    def tensor(name):
        # Every variable in this graph has the same shape.
        return relay.var(name, shape=dims)

    def primitive_subgraph(first_idx):
        # Parameters are named gcc_input<first_idx> .. gcc_input<first_idx + 3>.
        params = [tensor('gcc_input' + str(first_idx + k)) for k in range(4)]
        fn = relay.Function(params, relay.copy(params[0]))
        return fn.set_attribute("Primitive", tvm.expr.IntImm("int32", 1))

    x = tensor('x')
    w0, w1, w2, w3, w4, w5, w6, w7 = [tensor('w' + str(k)) for k in range(8)]

    # The two subgraphs and their call sites.
    subgraph0_ret = relay.Call(primitive_subgraph(0), [x, w0, w1, w2])
    subgraph1_ret = relay.Call(primitive_subgraph(4), [x, w3, w4, w5])

    # Other ops that will be executed on TVM.
    sub2 = relay.subtract(relay.add(x, w6), w7)
    ret = relay.concatenate((subgraph0_ret, subgraph1_ret, sub2), 0)

    func = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], ret)
    mod = relay.Module.from_expr(func)
    _, lib, _ = relay.build(mod, "llvm")
    return lib
def get_whole_graph_json():
    """Return the JSON string of the hand-written whole graph.

    The graph has nine argument nodes (``x``, ``w0`` .. ``w7``), two nodes that
    call ``json_rt_0`` / ``json_rt_1`` and one fused node that produces the
    (30, 10) output.
    """
    def null_node(name):
        return {"op": "null", "name": name, "inputs": []}

    def op_node(name, input_ids):
        # Key order is kept stable so that the dumped string does not change.
        return {
            "op": "tvm_op",
            "name": name,
            "attrs": {
                "num_outputs": "1",
                "num_inputs": str(len(input_ids)),
                "func_name": name,
                "flatten_data": "0"
            },
            "inputs": [[idx, 0, 0] for idx in input_ids]
        }

    nodes = [null_node("x")]
    nodes.extend(null_node("w" + str(k)) for k in range(8))
    nodes.append(op_node("json_rt_0", [0, 1, 2, 3]))
    nodes.append(op_node("json_rt_1", [0, 4, 5, 6]))
    nodes.append(op_node("fused_add_subtract_concatenate", [9, 10, 0, 7, 8]))

    # Eleven 10x10 tensors followed by the concatenated 30x10 output.
    shapes = [[10, 10] for _ in range(11)]
    shapes.append([30, 10])

    attrs = {
        "shape": ["list_shape", shapes],
        "dltype": ["list_str", ["float32" for _ in range(12)]],
        "storage_id": ["list_int", list(range(12))],
    }

    graph = {
        "nodes": nodes,
        "arg_nodes": list(range(9)),
        "node_row_ptr": list(range(13)),
        "heads": [[11, 0, 0]],
        "attrs": attrs
    }
    return json.dumps(graph)
def run_extern(label, get_extern_src, **kwargs):
    """Build, load and run the synthetic graph with externally generated code.

    Parameters
    ----------
    label : str
        Used to name the intermediate object file and the exported library.

    get_extern_src : callable
        Called without arguments; returns the module holding the external
        source code.

    **kwargs
        Forwarded to ``export_library``. ``options`` (optional) is a list of
        compiler options; the object file of the host library is prepended.
    """
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        # Without a compiler the library cannot be exported, so stop here.
        return

    obj_name = "{}.o".format(label)
    lib_name = "external_{}.so".format(label)

    # Get Json and the compiled library.
    graph_json = get_whole_graph_json()
    lib = get_synthetic_lib()
    # Keep the object file in the temporary directory instead of the current
    # working directory.
    obj_path = tmp_path.relpath(obj_name)
    lib.save(obj_path)

    # library that contains external code.
    csource_module = get_extern_src()
    # "options" may be absent; treat that as an empty option list.
    kwargs["options"] = [obj_path] + list(kwargs.get("options", []))
    lib_path = tmp_path.relpath(lib_name)
    csource_module.export_library(lib_path, fcompile=False, **kwargs)

    # load module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        var = "w" + str(i)
        mod.set_input(var, data)
    mod.run()

    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)
    # Reference: the two external subgraphs followed by the TVM-side ops.
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))
def test_dso_extern():
    """Run the external-code flow with the plain C source module."""
    compile_flags = ["-O2", "-std=c++11"]
    run_extern("lib", generate_csource_module, options=compile_flags)
def test_engine_extern():
    """Run the external-code flow with the engine-based source module."""
    # The generated header lives in the temporary directory, so that
    # directory is added to the include path.
    include_flag = "-I" + tmp_path.relpath("")
    compile_flags = ["-O2", "-std=c++11", include_flag]
    run_extern("engine", generate_engine_module, options=compile_flags)
def test_json_extern():
    """Run the synthetic graph with the subgraphs executed by the example
    json runtime module.

    The subgraph description is written to a ``.examplejson`` file, loaded as
    a module, imported into the host library, exported, reloaded and executed.
    """
    if which("gcc") is None:
        print("Skip test because gcc is not available.")
        # Do not fall through into the rest of the test.
        return

    # Get subgraph Json.
    subgraph_json = ("json_rt_0\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10\n" +
                     "json_rt_1\n" +
                     "input 0 10 10\n" +
                     "input 1 10 10\n" +
                     "input 2 10 10\n" +
                     "input 3 10 10\n" +
                     "add 4 inputs: 0 1 shape: 10 10\n" +
                     "sub 5 inputs: 4 2 shape: 10 10\n" +
                     "mul 6 inputs: 5 3 shape: 10 10")

    subgraph_path = tmp_path.relpath('subgraph.examplejson')
    with open(subgraph_path, 'w') as f:
        f.write(subgraph_json)

    # Get Json and module.
    graph_json = get_whole_graph_json()

    lib = get_synthetic_lib()
    ext_lib = tvm.module.load(subgraph_path, "examplejson")
    lib.import_module(ext_lib)
    lib_name = 'external.so'
    lib_path = tmp_path.relpath(lib_name)
    lib.export_library(lib_path)

    # load module for execution.
    lib = tvm.module.load(lib_path)
    mod = tvm.contrib.graph_runtime.create(graph_json, lib, tvm.cpu(0))

    x_data = np.random.rand(10, 10).astype('float32')
    mod.set_input("x", x_data)
    w_data = []
    for i in range(8):
        data = np.random.rand(10, 10).astype('float32')
        w_data.append(data)
        var = "w" + str(i)
        mod.set_input(var, data)

    mod.run()
    out = tvm.nd.empty((30, 10), ctx=tvm.cpu())
    out = mod.get_output(0, out)
    # Reference: the two external subgraphs followed by the TVM-side ops.
    tvm.testing.assert_allclose(
        out.asnumpy(),
        np.concatenate((((x_data + w_data[0]) - w_data[1]) * w_data[2],
                        ((x_data + w_data[3]) - w_data[4]) * w_data[5],
                        x_data + w_data[6] - w_data[7]),
                       axis=0))
if __name__ == "__main__":
    # Run the tests in the same order as a direct invocation would.
    for _test in (test_dso_extern, test_engine_extern, test_json_extern):
        _test()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment