Commit bfb811c7 by Animesh Jain Committed by Zhi

[QNN][TFLite] Parsing TFLite quantized models. (#3900)

parent 7530e043
...@@ -24,6 +24,7 @@ from .. import analysis ...@@ -24,6 +24,7 @@ from .. import analysis
from .. import expr as _expr from .. import expr as _expr
from .. import module as _module from .. import module as _module
from .. import op as _op from .. import op as _op
from .. import qnn as _qnn
from ... import nd as _nd from ... import nd as _nd
from .common import ExprTable from .common import ExprTable
from .common import infer_shape as _infer_shape from .common import infer_shape as _infer_shape
...@@ -32,10 +33,11 @@ __all__ = ['from_tflite'] ...@@ -32,10 +33,11 @@ __all__ = ['from_tflite']
class TensorWrapper(object): class TensorWrapper(object):
"""Tensor wrapper for TFLite Tensor""" """Tensor wrapper for TFLite Tensor"""
def __init__(self, tensor_idx, tensor, buffer): def __init__(self, tensor_idx, tensor, buffer, qnn_params=None):
self.tensor_idx = tensor_idx self.tensor_idx = tensor_idx
self.tensor = tensor self.tensor = tensor
self.buffer = buffer self.buffer = buffer
self.qnn_params = qnn_params
class OperatorConverter(object): class OperatorConverter(object):
"""Operator Converter for converting TFLite ops to Relay ops""" """Operator Converter for converting TFLite ops to Relay ops"""
...@@ -161,7 +163,19 @@ class OperatorConverter(object): ...@@ -161,7 +163,19 @@ class OperatorConverter(object):
tensor = self.subgraph.Tensors(tensor_idx) tensor = self.subgraph.Tensors(tensor_idx)
buffer_idx = tensor.Buffer() buffer_idx = tensor.Buffer()
buffer = self.model.Buffers(buffer_idx) buffer = self.model.Buffers(buffer_idx)
return_list.append(TensorWrapper(tensor_idx, tensor, buffer))
# Check if the tensors are quantized. Parse if yes.
qnn_params = None
tflite_qnn_params = tensor.Quantization()
if tflite_qnn_params is not None:
scale = float(tflite_qnn_params.ScaleAsNumpy())
zero_point = int(tflite_qnn_params.ZeroPointAsNumpy())
# Check that the scale and zero points are valid.
if scale != 0 or zero_point != 0:
qnn_params = dict()
qnn_params['scale'] = scale
qnn_params['zero_point'] = zero_point
return_list.append(TensorWrapper(tensor_idx, tensor, buffer, qnn_params))
return return_list return return_list
def get_tensor_value(self, tensor_wrapper): def get_tensor_value(self, tensor_wrapper):
...@@ -206,6 +220,10 @@ class OperatorConverter(object): ...@@ -206,6 +220,10 @@ class OperatorConverter(object):
raise NotImplementedError("Tensor type {} is currently not supported" raise NotImplementedError("Tensor type {} is currently not supported"
.format(str(tensor_type))) .format(str(tensor_type)))
def has_same_qnn_params(self, lhs_tensor, rhs_tensor):
    """Check whether two tensors carry identical quantization parameters.

    Parameters
    ----------
    lhs_tensor : TensorWrapper
        First tensor; its ``qnn_params`` is either ``None`` or a dict
        holding ``'scale'`` and ``'zero_point'``.
    rhs_tensor : TensorWrapper
        Second tensor, same convention as ``lhs_tensor``.

    Returns
    -------
    bool
        True when both tensors have equal scale and zero point, or when
        neither tensor has quantization parameters; False otherwise.
    """
    lhs_params = lhs_tensor.qnn_params
    rhs_params = rhs_tensor.qnn_params
    # An unquantized tensor has qnn_params set to None. Subscripting it would
    # raise TypeError, hiding the caller's own assertion message, so a
    # quantized/unquantized pair is reported as a plain mismatch instead.
    if lhs_params is None or rhs_params is None:
        return lhs_params is None and rhs_params is None
    return (lhs_params['scale'] == rhs_params['scale'] and
            lhs_params['zero_point'] == rhs_params['zero_point'])
def convert_conv2d(self, op): def convert_conv2d(self, op):
"""Convert TFLite conv2d""" """Convert TFLite conv2d"""
return self.convert_conv(op, "conv2d") return self.convert_conv(op, "conv2d")
...@@ -244,8 +262,15 @@ class OperatorConverter(object): ...@@ -244,8 +262,15 @@ class OperatorConverter(object):
target_shape = reshape_options.NewShapeAsNumpy() target_shape = reshape_options.NewShapeAsNumpy()
in_expr = self.get_expr(input_tensor_idx) in_expr = self.get_expr(input_tensor_idx)
out = _op.reshape(in_expr, newshape=tuple(target_shape))
# If the tensors are quantized, ensure that input/output qnn params are same.
if input_tensor.qnn_params:
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "There should be only 1 output tensor"
output_tensor = output_tensors[0]
assert self.has_same_qnn_params(input_tensor, output_tensor), \
"TFLite reshape requires input and output scale and zero points to be equal"
out = _op.reshape(in_expr, newshape=tuple(target_shape))
return out return out
def _convert_resize(self, method, op): def _convert_resize(self, method, op):
...@@ -330,10 +355,33 @@ class OperatorConverter(object): ...@@ -330,10 +355,33 @@ class OperatorConverter(object):
input_tensor = input_tensors[0] input_tensor = input_tensors[0]
input_tensor_idx = input_tensor.tensor_idx input_tensor_idx = input_tensor.tensor_idx
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors length should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
params = {'axis': 1} # 1 is channel params = {'axis': 1} # 1 is channel
in_expr = self.get_expr(input_tensor_idx) in_expr = self.get_expr(input_tensor_idx)
# TODO - Naive softmax int8 implementation leads to bad accuracy. Currently, we can
# dequantize to FP32 and perform softmax on FP32. We can investigate an integer only softmax
# implementation in future.
if input_tensor.qnn_params:
in_expr = _qnn.op.dequantize(data=in_expr,
input_scale=input_tensor.qnn_params['scale'],
input_zero_point=input_tensor.qnn_params['zero_point'])
out = _op.nn.softmax(in_expr, **params) out = _op.nn.softmax(in_expr, **params)
# Go back to integer datatype if the original operator was quantized.
if output_tensor.qnn_params:
out = _qnn.op.quantize(data=out,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
out_dtype=output_tensor_type_str)
return out return out
def convert_tanh(self, op): def convert_tanh(self, op):
...@@ -386,7 +434,8 @@ class OperatorConverter(object): ...@@ -386,7 +434,8 @@ class OperatorConverter(object):
in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors] in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
output_tensors = self.get_output_tensors(op) output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1" assert len(output_tensors) == 1, "output tensors length should be 1"
output_tensor = output_tensors[0]
assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions
op_options = op.BuiltinOptions() op_options = op.BuiltinOptions()
...@@ -395,12 +444,27 @@ class OperatorConverter(object): ...@@ -395,12 +444,27 @@ class OperatorConverter(object):
concatenation_axis = concatenation_options.Axis() concatenation_axis = concatenation_options.Axis()
fused_activation_fn = concatenation_options.FusedActivationFunction() fused_activation_fn = concatenation_options.FusedActivationFunction()
# with axis in N H W C if not input_tensors[0].qnn_params:
out = _op.concatenate(in_exprs, axis=concatenation_axis) out = _op.concatenate(in_exprs, axis=concatenation_axis)
else:
input_scales = [input_tensor.qnn_params['scale'] for input_tensor in input_tensors]
input_zero_points = \
[input_tensor.qnn_params['zero_point'] for input_tensor in input_tensors]
out = _qnn.op.concatenate(in_exprs,
input_scales=input_scales,
input_zero_points=input_zero_points,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
axis=concatenation_axis)
# if we have activation fn # if we have activation fn
if fused_activation_fn != ActivationFunctionType.NONE: if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn) if not output_tensor.qnn_params:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.concatenate'))
return out return out
def _convert_elemwise(self, relay_op, op): def _convert_elemwise(self, relay_op, op):
...@@ -563,6 +627,12 @@ class OperatorConverter(object): ...@@ -563,6 +627,12 @@ class OperatorConverter(object):
input_tensor_idx = input_tensor.tensor_idx input_tensor_idx = input_tensor.tensor_idx
weight_tensor = input_tensors[1] weight_tensor = input_tensors[1]
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors length should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
input_tensor_shape = input_tensor.tensor.ShapeAsNumpy() input_tensor_shape = input_tensor.tensor.ShapeAsNumpy()
weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy() weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy()
...@@ -590,7 +660,13 @@ class OperatorConverter(object): ...@@ -590,7 +660,13 @@ class OperatorConverter(object):
weight_value = self.get_tensor_value(weight_tensor) weight_value = self.get_tensor_value(weight_tensor)
weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str) weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str)
out = _op.nn.dense(in_expr, weight_expr) if input_tensor.qnn_params:
out = _qnn.op.dense(in_expr, weight_expr,
input_zero_point=input_tensor.qnn_params['zero_point'],
kernel_zero_point=weight_tensor.qnn_params['zero_point'],
out_dtype='int32')
else:
out = _op.nn.dense(in_expr, weight_expr)
# if we have bias # if we have bias
if len(input_tensors) == 3: if len(input_tensors) == 3:
...@@ -605,7 +681,23 @@ class OperatorConverter(object): ...@@ -605,7 +681,23 @@ class OperatorConverter(object):
# If we have fused activations # If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE: if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn) if not output_tensor.qnn_params:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.dense'))
# Finally if the dense is quantized. Add a requantize at the end.
if output_tensor.qnn_params:
input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
input_zero_point = 0
out = _qnn.op.requantize(out,
input_scale=input_scale,
input_zero_point=input_zero_point,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
out_dtype=output_tensor_type_str)
return out return out
...@@ -677,6 +769,12 @@ class OperatorConverter(object): ...@@ -677,6 +769,12 @@ class OperatorConverter(object):
input_tensor_idx = input_tensor.tensor_idx input_tensor_idx = input_tensor.tensor_idx
weight_tensor = input_tensors[1] weight_tensor = input_tensors[1]
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors length should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
is_depthwise_conv = False is_depthwise_conv = False
if conv_type == 'conv2d': if conv_type == 'conv2d':
assert op.BuiltinOptionsType() == BuiltinOptions.Conv2DOptions assert op.BuiltinOptionsType() == BuiltinOptions.Conv2DOptions
...@@ -764,7 +862,14 @@ class OperatorConverter(object): ...@@ -764,7 +862,14 @@ class OperatorConverter(object):
raise tvm.error.OpAttributeUnImplemented( raise tvm.error.OpAttributeUnImplemented(
'Padding format {} is not supported for operator Conv.'.format(padding)) 'Padding format {} is not supported for operator Conv.'.format(padding))
out = _op.nn.conv2d(data=in_expr, weight=weight_expr, **params) if input_tensor.qnn_params:
qnn_conv2d_params = dict(params)
qnn_conv2d_params['input_zero_point'] = input_tensor.qnn_params['zero_point']
qnn_conv2d_params['kernel_zero_point'] = weight_tensor.qnn_params['zero_point']
qnn_conv2d_params['out_dtype'] = 'int32'
out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params)
else:
out = _op.nn.conv2d(in_expr, weight_expr, **params)
# if we have bias # if we have bias
if len(input_tensors) == 3: if len(input_tensors) == 3:
...@@ -780,7 +885,23 @@ class OperatorConverter(object): ...@@ -780,7 +885,23 @@ class OperatorConverter(object):
# If we have fused activations # If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE: if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn) if not output_tensor.qnn_params:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.conv2d'))
# Finally if the conv is quantized. Add a requantize at the end.
if output_tensor.qnn_params:
input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
input_zero_point = 0
out = _qnn.op.requantize(out,
input_scale=input_scale,
input_zero_point=input_zero_point,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
out_dtype=output_tensor_type_str)
return out return out
...@@ -910,6 +1031,12 @@ class OperatorConverter(object): ...@@ -910,6 +1031,12 @@ class OperatorConverter(object):
input_tensor = input_tensors[0] input_tensor = input_tensors[0]
input_tensor_idx = input_tensor.tensor_idx input_tensor_idx = input_tensor.tensor_idx
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
assert op.BuiltinOptionsType() == BuiltinOptions.Pool2DOptions assert op.BuiltinOptionsType() == BuiltinOptions.Pool2DOptions
op_options = op.BuiltinOptions() op_options = op.BuiltinOptions()
pool2d_options = Pool2DOptions() pool2d_options = Pool2DOptions()
...@@ -940,8 +1067,19 @@ class OperatorConverter(object): ...@@ -940,8 +1067,19 @@ class OperatorConverter(object):
'Padding format {} for operator Pool2D is not supported.'.format(padding)) 'Padding format {} for operator Pool2D is not supported.'.format(padding))
if pool_type == "average": if pool_type == "average":
out = _op.nn.avg_pool2d(in_expr, **params) if input_tensor.qnn_params:
assert self.has_same_qnn_params(input_tensor, output_tensor), \
'TFLite avg_pool2dreshape requires input and output scale' \
'and zero points to be equal'
out = _op.cast(in_expr, dtype="int32")
out = _op.nn.avg_pool2d(out, **params)
out = _op.cast(out, dtype=output_tensor_type_str)
else:
out = _op.nn.avg_pool2d(in_expr, **params)
elif pool_type == "max": elif pool_type == "max":
if input_tensor.qnn_params:
assert self.has_same_qnn_params(input_tensor, output_tensor), \
"qnn.op.max_pool2d requires input and output qnn params to be same"
out = _op.nn.max_pool2d(in_expr, **params) out = _op.nn.max_pool2d(in_expr, **params)
else: else:
raise tvm.error.OpNotImplemented( raise tvm.error.OpNotImplemented(
...@@ -949,8 +1087,12 @@ class OperatorConverter(object): ...@@ -949,8 +1087,12 @@ class OperatorConverter(object):
# If we have fused activations # If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE: if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn) if input_tensor.qnn_params:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.pool2d'))
else:
out = self.convert_fused_activation_function(out, fused_activation_fn)
return out return out
def convert_pad(self, op): def convert_pad(self, op):
...@@ -993,7 +1135,7 @@ class OperatorConverter(object): ...@@ -993,7 +1135,7 @@ class OperatorConverter(object):
in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors] in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
output_tensors = self.get_output_tensors(op) output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1" assert len(output_tensors) == 1, "output tensors length should be 1"
assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions
op_options = op.BuiltinOptions() op_options = op.BuiltinOptions()
...@@ -1241,4 +1383,5 @@ def from_tflite(model, shape_dict, dtype_dict): ...@@ -1241,4 +1383,5 @@ def from_tflite(model, shape_dict, dtype_dict):
outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs] outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs]
outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
func = _expr.Function(analysis.free_vars(outputs), outputs) func = _expr.Function(analysis.free_vars(outputs), outputs)
return _module.Module.from_expr(func), params mod = _module.Module.from_expr(func)
return mod, params
...@@ -997,6 +997,46 @@ def test_forward_inception_v4_net(): ...@@ -997,6 +997,46 @@ def test_forward_inception_v4_net():
tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]),
rtol=1e-5, atol=1e-5) rtol=1e-5, atol=1e-5)
def test_forward_qnn_inception_v1_net():
    """Test the Quantized TFLite Inception model."""
    # InceptionV1
    tflite_model_file = tf_testing.get_workload_official(
        "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz",
        "inception_v1_224_quant.tflite")
    with open(tflite_model_file, "rb") as f:
        tflite_model_buf = f.read()

    # Compare the top-3 labels instead of the raw outputs: the requantize
    # implementation differs between TFLite and Relay, which causes the final
    # output numbers to mismatch.
    np.random.seed(0)
    # np.random.random_integers is deprecated. randint excludes its upper
    # bound, so high=129 keeps the same inclusive [0, 128] range.
    data = np.random.randint(low=0, high=129, size=(1, 224, 224, 3)).astype('uint8')

    tflite_output = run_tflite_graph(tflite_model_buf, data)
    tflite_predictions = np.squeeze(tflite_output)
    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]

    tvm_output = run_tvm_graph(tflite_model_buf, data, 'input')
    tvm_predictions = np.squeeze(tvm_output)
    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]

    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
def test_forward_qnn_mobilenet_v1_net():
    """Test the Quantized TFLite Mobilenet V1 model."""
    # MobilenetV1
    tflite_model_file = tf_testing.get_workload_official(
        "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
        "mobilenet_v1_1.0_224_quant.tflite")
    with open(tflite_model_file, "rb") as f:
        tflite_model_buf = f.read()

    # Compare the top-3 labels instead of the raw outputs: the requantize
    # implementation differs between TFLite and Relay, which causes the final
    # output numbers to mismatch.
    np.random.seed(0)
    # np.random.random_integers is deprecated. randint excludes its upper
    # bound, so high=129 keeps the same inclusive [0, 128] range.
    data = np.random.randint(low=0, high=129, size=(1, 224, 224, 3)).astype('uint8')

    tflite_output = run_tflite_graph(tflite_model_buf, data)
    tflite_predictions = np.squeeze(tflite_output)
    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]

    tvm_output = run_tvm_graph(tflite_model_buf, data, 'input')
    tvm_predictions = np.squeeze(tvm_output)
    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]

    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
####################################################################### #######################################################################
# SSD Mobilenet # SSD Mobilenet
# ------------- # -------------
...@@ -1069,3 +1109,8 @@ if __name__ == '__main__': ...@@ -1069,3 +1109,8 @@ if __name__ == '__main__':
test_forward_inception_v3_net() test_forward_inception_v3_net()
test_forward_inception_v4_net() test_forward_inception_v4_net()
test_forward_ssd_mobilenet_v1() test_forward_ssd_mobilenet_v1()
# End to End quantized
# TODO - MobilenetV2 fails for now. Remove when fixed.
test_forward_qnn_inception_v1_net()
test_forward_qnn_mobilenet_v1_net()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment