Commit f4b96743 by Klin
parents 38c41268 91d53d31
# -*- coding: utf-8 -*-
import numpy
import numpy as np
import torch
import sys
from mmd_loss import *
from collections import OrderedDict
# Compare per-layer gradient distributions of a quantized LeNet against the
# FP32 baseline, aggregated over training epochs, via a FLOPs-weighted MK-MMD.
d1 = sys.argv[1]  # bit width of the quantized run
d2 = sys.argv[2]  # number of epochs to load and compare
total_quan_list = list()  # per-epoch checkpoints of the quantized model
total_base_list = list()  # per-epoch checkpoints of the FP32 baseline
# CNN FLOPs = Cout * Hout * Wout * (2 * Cin * K * K) when counting bias (otherwise -1)
# FCN FLOPs = Cout * Cin when counting bias; relu/pool ops folded in
weight_f0 = np.array([357504 + 4704 + 4704, 241600 + 1600 + 1600, 48000 + 120, 10080 + 84, 840])
weight_f1 = np.array([357504, 241600, 48000, 10080, 840])
summary_quan_dict = OrderedDict()  # layer key -> grads stacked over epochs (quantized)
summary_base_dict = OrderedDict()  # layer key -> grads stacked over epochs (baseline)
# Outer loop: epochs. Inner loop: each network layer's gradients.
for i in range(int(d2)):
    total_quan_list.append(torch.load('./project/p/checkpoint/cifar-10_lenet_bn_quant/' + str(d1) + '/ckpt_cifar-10_lenet_bn_quant_' + str(i + 1) + '.pth'))
    total_base_list.append(torch.load('./project/p/checkpoint/cifar-10_lenet_bn/full' + '/ckpt_cifar-10_lenet_bn_' + str(i + 1) + '.pth'))
    for k, _ in total_base_list[i]['grads'].items():
        quan_row = total_quan_list[i]['grads'][k].reshape(1, -1)
        base_row = total_base_list[i]['grads'][k].reshape(1, -1)
        if i == 0:
            summary_quan_dict[k] = quan_row
            summary_base_dict[k] = base_row
        else:
            # dict entries are rebound to a freshly stacked array, not mutated in place
            summary_quan_dict[k] = np.vstack((summary_quan_dict[k], quan_row))
            summary_base_dict[k] = np.vstack((summary_base_dict[k], base_row))
# Weighted sum of MK-MMD per layer. Keys are assumed to alternate
# weight, bias per layer -- TODO confirm against the checkpoint layout.
total_sim = 0  # renamed from `sum` to stop shadowing the builtin
cnt = 0        # index into the per-layer FLOPs weights
is_bias = False
for k in summary_quan_dict:
    coef = 0.01 if is_bias else 0.99  # bias terms get a small weight
    total_sim += coef * weight_f1[cnt] * MK_MMD(source=summary_base_dict[k], target=summary_quan_dict[k])
    if is_bias:
        cnt = cnt + 1  # advance to the next layer after consuming its bias
    is_bias = not is_bias
# NOTE(review): normalization uses weight_f0 although weight_f1 was applied
# above -- confirm this asymmetry is intended.
total_sim = total_sim / (weight_f0.sum() * 2)
print(total_sim)
with open('./project/p/lenet_ptq_similarity.txt', 'a') as f:
    f.write('bit:' + str(d1) + ' epoch_num:' + str(d2) + ': ' + str(total_sim) + '\n')
\ No newline at end of file
import sys
import os
# 从get_param.py输出重定向文件val.txt中提取参数量和计算量
def extract_ratio(md='ResNet18'):
    """Extract per-layer parameter and FLOPs percentages from a get_param.py dump.

    Reads 'param_flops_<md>.txt' (redirected ptflops output) and, for every
    line describing a Conv/BatchNorm2d/Linear layer, records the layer name
    and the two percentage figures on that line.

    Args:
        md: model name used to build the dump filename.
    Returns:
        (layer, par_ratio, flop_ratio): layer name strings, parameter
        percentages (floats), FLOPs percentages (floats), index-aligned.
    """
    layer = []
    par_ratio = []
    flop_ratio = []
    # `with` fixes the original's leaked file handle (opened, never closed)
    with open('param_flops_' + md + '.txt', 'r') as fr:
        for line in fr:
            if 'Conv' in line or 'BatchNorm2d' in line or 'Linear' in line:
                # name sits between the first ':' and the following '('
                layer.append(line.split(':')[1].split('(')[0])
                # first '%' is preceded by the parameter percentage
                par_ratio.append(float(line.split('%')[0].split(',')[-1]))
                # second-to-last '%' chunk ends with the FLOPs percentage
                flop_ratio.append(float(line.split('%')[-2].split(',')[-1]))
    return layer, par_ratio, flop_ratio
if __name__ == "__main__":
    # Smoke test: parse the default model's dump and report list sizes.
    layer, par_ratio, flop_ratio = extract_ratio()
    for seq in (layer, par_ratio, flop_ratio):
        print(len(seq))
\ No newline at end of file
from torch.autograd import Function
class FakeQuantize(Function):
    """Straight-through fake quantization.

    Forward: quantize then immediately dequantize with the given QParam, so the
    tensor carries quantization error while staying in float domain.
    Backward: straight-through estimator -- gradients pass through unchanged.
    """
    @staticmethod
    def forward(ctx, x, qparam):
        # qparam supplies scale/zero_point; round-trip introduces quant error
        x = qparam.quantize_tensor(x)
        x = qparam.dequantize_tensor(x)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # identity gradient for x, no gradient for qparam
        return grad_output, None
\ No newline at end of file
from model import *
import torch
from ptflops import get_model_complexity_info
import argparse
def get_children(model: torch.nn.Module):
    """Recursively flatten a model into its leaf submodules.

    Returns a flat list of leaf modules for container modules; for a leaf
    module it returns the module itself (not a list). The recursion relies on
    that asymmetry: `extend` raises TypeError on a bare module, which is
    caught and turned into `append`.
    """
    # to keep parameters updatable downstream, a nn.ModuleList could be used
    # to hold these instead (see commented alternative below)
    children = list(model.children())
    flatt_children = []
    if len(children) == 0:
        # no children: model is itself a leaf -- returned bare on purpose
        return model
    else:
        # walk down until the leaves
        for child in children:
            try:
                # child was a container -> got a list back
                flatt_children.extend(get_children(child))
            except TypeError:
                # child was a leaf -> got a module back; append it
                flatt_children.append(get_children(child))
    return flatt_children
# Yield child modules while skipping wrapper containers.
def get_all_child_modules(module):
    for name, child in module.named_children():
        if isinstance(child, nn.Sequential):
            # unwrap Sequential containers recursively
            yield from get_all_child_modules(child)
        elif len(list(child.children())) > 0:
            # NOTE(review): this branch yields the direct children without
            # recursing, unlike the Sequential branch -- confirm grandchildren
            # of non-Sequential containers are intentionally not flattened
            yield from child.children()
        else:
            yield child
def filter_fn(module, n_inp, outp_shape):
    """Select layers whose stats should be kept (conv/bn/fc/pool/relu).

    Accepts either a layer-name string (original substring behavior) or an
    nn.Module instance. The original code applied `'conv' in module` to a
    module object, which raises TypeError since nn.Module is not iterable;
    the isinstance branch restores the intent of the commented-out check.

    Args:
        module: layer name string or torch.nn.Module instance.
        n_inp: unused, kept for the callback signature.
        outp_shape: unused, kept for the callback signature.
    Returns:
        True if the layer should be included, else False.
    """
    if isinstance(module, str):
        return any(tag in module for tag in ('conv', 'bn', 'fc', 'avg', 'relu'))
    return isinstance(module, (torch.nn.Conv2d, torch.nn.BatchNorm2d,
                               torch.nn.Linear, torch.nn.ReLU,
                               torch.nn.AdaptiveAvgPool2d))
if __name__ == "__main__":
    # Report parameter count and FLOPs for the selected model architecture.
    parser = argparse.ArgumentParser(description='Model Analysis --- params & flops')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='MobileNetV2')
    args = parser.parse_args()
    if args.model == 'MobileNetV2':
        model = MobileNetV2()
    full_file = f'ckpt/cifar10_{args.model}.pt'
    model.load_state_dict(torch.load(full_file))
    # ptflops walks the module tree and prints a per-layer breakdown
    flops, params = get_model_complexity_info(
        model, (3, 32, 32), as_strings=True, print_per_layer_stat=True)
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import OrderedDict
def get_model_histogram(model):
    """
    Description:
        - collect a 20-bin histogram and the raw numpy array of every
          parameter gradient in the model
    Args:
        - model: (torch.nn.Module), torch model
    Returns:
        - (histograms, grads): two OrderedDicts keyed by parameter name;
          histograms maps to {'histogram': [...], 'bins': [...]}
    """
    histograms = OrderedDict()
    raw_grads = OrderedDict()
    for name, param in model.named_parameters():
        if param.grad is None:
            # parameters without gradients are skipped entirely
            continue
        grad_np = param.grad.cpu().numpy()
        counts, edges = np.histogram(grad_np.flatten(), bins=20)
        histograms[name] = {'histogram': list(counts), 'bins': list(edges)}
        raw_grads[name] = grad_np
    return histograms, raw_grads
def get_model_norm_gradient(model):
    """
    Description:
        - collect the L2 norm of every parameter gradient in the model
    Args:
        - model: (torch.nn.Module), torch model
    Returns:
        - OrderedDict mapping parameter name -> gradient norm (float)
    """
    norms = OrderedDict()
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        norms[name] = param.grad.norm().item()
    return norms
def get_grad_histogram(grads_sum):
    """Build a 20-bin histogram for each accumulated gradient array.

    Args:
        grads_sum: mapping of name -> numpy gradient array (or None).
    Returns:
        OrderedDict of name -> {'histogram': [...], 'bins': [...]},
        one histogram description per layer.
    """
    histograms = OrderedDict()
    for name, grad in grads_sum.items():
        if grad is None:
            continue
        counts, edges = np.histogram(grad.flatten(), bins=20)
        histograms[name] = {'histogram': list(counts), 'bins': list(edges)}
    return histograms
\ No newline at end of file
class GlobalVariables:
    """Namespace for module-spanning globals."""
    # running in-planes counter shared across model-building code
    SELF_INPLANES = 0
\ No newline at end of file
# -*- coding: utf-8 -*-
# 用于多个module之间共享全局变量
def _init(): # 初始化
global _global_dict
_global_dict = {}
def set_value(value, is_bias=False):
    """Store a shared value; bias values live under key 0, others under key 1."""
    _global_dict[0 if is_bias else 1] = value
def get_value(is_bias=False):  # bias keeps a precision independent of the rest
    """Fetch the shared value stored by set_value for the same is_bias flag."""
    return _global_dict[0] if is_bias else _global_dict[1]
from model import *
from extract_ratio import *
from utils import *
import argparse
import openpyxl
import os
import os.path as osp
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
def js_div_norm(a, b):
    """JS divergence between the L2-normalized (last dim) versions of a and b."""
    normed_a = F.normalize(a.data, p=2, dim=-1)
    normed_b = F.normalize(b.data, p=2, dim=-1)
    return js_div(normed_a, normed_b)
if __name__ == "__main__":
    # Compare QAT runs against the FP32 baseline: per-layer JS divergence of
    # averaged gradients, weighted by FLOPs ratio, written to Excel +
    # TensorBoard histograms. One worksheet per epoch checkpoint.
    parser = argparse.ArgumentParser(description='Loss-Grad Analysis')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    args = parser.parse_args()
    wb = openpyxl.Workbook()
    ws = wb.active
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat_loss_grad')
    # layer, par_ratio, flop_ratio = extract_ratio()
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # NOTE(review): default 'resnet18' matches none of the capitalized
    # branches below, leaving `model` unbound -- confirm -m is always passed
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    # rebuild `layer` from parameter names (discards the extract_ratio list):
    # each weight parameter contributes its module path prefix
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            layer.append(pre)
    # dir_prefix = 'ckpt/qat/epoch_'
    dir_prefix = 'ckpt/qat/' + args.model + '/'
    quant_type_list = ['INT', 'POT', 'FLOAT']
    for epoch in [5, 10, 15, 20]:
        ws_epoch = wb.create_sheet('epoch_%d' % epoch)
        # FP32 reference checkpoint for this epoch
        full_state = torch.load(dir_prefix + '%d/' % epoch + 'full.pt')
        ws_epoch.cell(row=1, column=2, value='loss')
        ws_epoch.cell(row=1, column=3, value='loss_sum')
        ws_epoch.cell(row=1, column=4, value='loss_avg')
        ws_epoch.cell(row=2, column=1, value='FP32')
        ws_epoch.cell(row=2, column=2, value=full_state['loss'].cpu().item())
        ws_epoch.cell(row=2, column=3, value=full_state['loss_sum'].cpu().item())
        ws_epoch.cell(row=2, column=4, value=full_state['loss_avg'].cpu().item())
        full_grad_avg = full_state['grad_dict_avg']
        for name, tmpgrad in full_grad_avg.items():
            writer.add_histogram('FULL: ' + name, tmpgrad, global_step=epoch)
        # header row for the per-quantization-config table
        ws_epoch.cell(row=4, column=1, value='title')
        ws_epoch.cell(row=4, column=2, value='loss')
        ws_epoch.cell(row=4, column=3, value='loss_sum')
        ws_epoch.cell(row=4, column=4, value='loss_avg')
        ws_epoch.cell(row=4, column=5, value='js_grad_avg_norm')
        # one extra column per layer, filled below
        cnt = 5
        for n in layer:
            cnt = cnt + 1
            ws_epoch.cell(row=4, column=cnt, value=n)
        currow = 4
        # sweep every (quant_type, num_bits, e_bits) configuration
        for quant_type in quant_type_list:
            num_bit_list = numbit_list(quant_type)
            for num_bits in num_bit_list:
                e_bit_list = ebit_list(quant_type, num_bits)
                for e_bits in e_bit_list:
                    if quant_type == 'FLOAT':
                        title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                    else:
                        title = '%s_%d' % (quant_type, num_bits)
                    print('\nAnalyse: ' + title)
                    currow += 1
                    qat_state = torch.load(dir_prefix + '%d/' % epoch + title + '.pt')
                    js_grad_avg_norm = 0.
                    grad_avg = qat_state['grad_dict_avg']
                    for name, tmpgrad in grad_avg.items():
                        writer.add_histogram(title + ': ' + name, tmpgrad, global_step=epoch)
                    colidx = 5
                    for name, _ in full_grad_avg.items():
                        # NOTE(review): prefix keeps only the first dotted
                        # component while `layer` holds full module paths --
                        # confirm layer.index(prefix) matches for this model
                        prefix = name.split('.')[0]
                        colidx += 1
                        layer_idx = layer.index(prefix)
                        js_norm = js_div_norm(full_grad_avg[name], grad_avg[name])
                        ws_epoch.cell(row=currow, column=colidx, value=js_norm.cpu().item())
                        # FLOPs-weighted accumulation of the divergences
                        js_grad_avg_norm += flop_ratio[layer_idx] * js_norm
                    ws_epoch.cell(row=currow, column=1, value=title)
                    ws_epoch.cell(row=currow, column=2, value=qat_state['loss'].cpu().item())
                    ws_epoch.cell(row=currow, column=3, value=qat_state['loss_sum'].cpu().item())
                    ws_epoch.cell(row=currow, column=4, value=qat_state['loss_avg'].cpu().item())
                    ws_epoch.cell(row=currow, column=5, value=js_grad_avg_norm.cpu().item())
    wb.save('loss_grad.xlsx')
    writer.close()
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
from module import *
import module
from global_var import GlobalVariables
# MobileNet V2 model definition
# adapted for CIFAR-10 input sizes
class MobileNetV2(nn.Module):
    """MobileNetV2 for CIFAR-10 with quantization hooks.

    Float path: forward(). Quantization lifecycle: quantize() builds the
    Q-wrappers, quantize_forward() calibrates/trains them, freeze() fixes
    quantized weights, quantize_inference() runs integer-domain inference.
    """
    def __init__(self, num_classes=10):
        super(MobileNetV2, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU6(inplace=True)
        # Bottleneck stages; t is the channel expansion factor
        self.layer1 = MakeLayer(32, 16, 1, t=1, stride=1)
        self.layer2 = MakeLayer(16, 24, 2, t=6, stride=2)
        self.layer3 = MakeLayer(24, 32, 3, t=6, stride=2)
        # stage depths adjusted for the small CIFAR-10 image size
        self.layer4 = MakeLayer(32, 96, 3, t=6, stride=1)
        self.layer5 = MakeLayer(96, 160, 3, t=6, stride=2)
        self.layer6 = MakeLayer(160, 320, 1, t=6, stride=1)
        self.conv2 = nn.Conv2d(320, 1280, 1)
        self.avg1 = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(1280, num_classes)

    def forward(self, x):
        """Standard float forward pass; returns raw logits."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.conv2(x)
        x = self.avg1(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def quantize(self, quant_type, num_bits=8, e_bits=3):
        """Attach quantization wrapper modules mirroring the float layers."""
        self.qconvbnrelu1 = QConvBNReLU6(quant_type, self.conv1, self.bn1, qi=True, qo=True, num_bits=num_bits, e_bits=e_bits)
        # num_bits was previously not forwarded here -- kept explicit now
        self.layer1.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.layer2.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.layer3.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.layer4.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.layer5.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.layer6.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)
        self.qconv1 = QConv2d(quant_type, self.conv2, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)
        self.qavgpool1 = QAdaptiveAvgPool2d(quant_type, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)
        self.qfc1 = QLinear(quant_type, self.fc, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)

    def quantize_forward(self, x):
        """Fake-quantized forward used for calibration/QAT; returns softmax probs."""
        x = self.qconvbnrelu1(x)
        x = self.layer1.quantize_forward(x)
        x = self.layer2.quantize_forward(x)
        x = self.layer3.quantize_forward(x)
        x = self.layer4.quantize_forward(x)
        x = self.layer5.quantize_forward(x)
        x = self.layer6.quantize_forward(x)
        x = self.qconv1(x)
        x = self.qavgpool1(x)
        x = x.view(x.size(0), -1)
        x = self.qfc1(x)
        out = F.softmax(x, dim=1)  # softmax optional here; has little effect
        return out

    def freeze(self):
        """Freeze quantized weights, chaining each layer's qo into the next qi."""
        self.qconvbnrelu1.freeze()  # first layer owns its qi, so none is passed
        qo = self.layer1.freeze(qinput=self.qconvbnrelu1.qo)
        qo = self.layer2.freeze(qinput=qo)
        qo = self.layer3.freeze(qinput=qo)
        qo = self.layer4.freeze(qinput=qo)
        qo = self.layer5.freeze(qinput=qo)
        qo = self.layer6.freeze(qinput=qo)
        self.qconv1.freeze(qi=qo)
        self.qavgpool1.freeze(qi=self.qconv1.qo)
        self.qfc1.freeze(qi=self.qavgpool1.qo)

    def quantize_inference(self, x):
        """Integer-domain inference: quantize input, run frozen layers, dequantize."""
        qx = self.qconvbnrelu1.qi.quantize_tensor(x)
        qx = self.qconvbnrelu1.quantize_inference(qx)
        qx = self.layer1.quantize_inference(qx)
        qx = self.layer2.quantize_inference(qx)
        qx = self.layer3.quantize_inference(qx)
        qx = self.layer4.quantize_inference(qx)
        qx = self.layer5.quantize_inference(qx)
        qx = self.layer6.quantize_inference(qx)
        qx = self.qconv1.quantize_inference(qx)
        qx = self.qavgpool1.quantize_inference(qx)
        qx = qx.view(qx.size(0), -1)
        qx = self.qfc1.quantize_inference(qx)
        qx = self.qfc1.qo.dequantize_tensor(qx)
        out = F.softmax(qx, dim=1)  # softmax optional here; has little effect
        return out
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted-residual block.

    Pointwise expansion -> depthwise conv -> linear pointwise projection,
    with a residual add when stride == 1 and channel counts match.
    Carries the same quantize/freeze/inference lifecycle as the parent model.
    """
    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        hidden_dims = int(in_channels * expand_ratio)
        # residual add is only shape-valid when stride is 1 and channels match
        self.identity_flag = stride == 1 and in_channels == out_channels
        # (layers kept as named attributes rather than one nn.Sequential so
        # each can be wrapped individually during quantization)
        # Pointwise (expansion) convolution
        self.conv1 = nn.Conv2d(in_channels, hidden_dims, 1)
        self.bn1 = nn.BatchNorm2d(hidden_dims)
        self.relu1 = nn.ReLU6(inplace=True)
        # Depthwise convolution
        self.conv2 = nn.Conv2d(hidden_dims, hidden_dims, 3, stride=stride, padding=1, groups=hidden_dims)
        self.bn2 = nn.BatchNorm2d(hidden_dims)
        self.relu2 = nn.ReLU6(inplace=True)
        # Pointwise & linear (no activation) projection convolution
        self.conv3 = nn.Conv2d(hidden_dims, out_channels, 1)
        self.bn3 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Float forward with optional residual connection."""
        identity = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        x = self.bn3(x)
        if self.identity_flag:
            return identity + x
        else:
            return x

    def quantize(self, quant_type, num_bits=8, e_bits=3):
        """Build fused quantization wrappers for the three conv stages."""
        self.qconvbnrelu1 = QConvBNReLU6(quant_type, self.conv1, self.bn1, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)
        self.qconvbnrelu2 = QConvBNReLU6(quant_type, self.conv2, self.bn2, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)
        self.qconvbn1 = QConvBN(quant_type, self.conv3, self.bn3, qi=False, qo=True, num_bits=num_bits, e_bits=e_bits)
        # elementwise add needs its own quantizer to reconcile the two scales
        self.qelementadd = QElementwiseAdd(quant_type, qi0=False, qi1=False, qo=True, num_bits=num_bits, e_bits=e_bits)

    def quantize_forward(self, x):
        """Fake-quantized forward mirroring forward()."""
        identity = x
        out = self.qconvbnrelu1(x)
        out = self.qconvbnrelu2(out)
        out = self.qconvbn1(out)
        if self.identity_flag:
            out = self.qelementadd(out, identity)
        return out

    def freeze(self, qinput):
        # qconvbnrelu1 reuses the previous module's output quantizer, passed
        # in as qinput (needs careful re-checking)
        self.qconvbnrelu1.freeze(qi=qinput)  # chain from the previous module's qo
        self.qconvbnrelu2.freeze(qi=self.qconvbnrelu1.qo)
        self.qconvbn1.freeze(qi=self.qconvbnrelu2.qo)
        if self.identity_flag:
            # residual add mixes the block output scale with the input scale
            self.qelementadd.freeze(qi0=self.qconvbn1.qo, qi1=qinput)
            return self.qelementadd.qo
        else:
            return self.qconvbn1.qo

    def quantize_inference(self, x):
        # no quantize/dequantize round-trip needed here: this is an interior
        # block, so tensors already live in the quantized domain
        identity = x
        out = self.qconvbnrelu1.quantize_inference(x)
        out = self.qconvbnrelu2.quantize_inference(out)
        out = self.qconvbn1.quantize_inference(out)
        if self.identity_flag:
            out = self.qelementadd.quantize_inference(out, identity)
        return out
class MakeLayer(nn.Module):
    """A stage of n_repeat InvertedResidual blocks.

    Only the first block applies the requested stride (and channel change);
    subsequent blocks run stride-1 at the output width. Forwards the
    quantize/freeze/inference lifecycle to each contained block.
    """
    def __init__(self, in_channels, out_channels, n_repeat, t, stride):
        super(MakeLayer, self).__init__()
        # nn.ModuleList so the blocks register as submodules (params tracked)
        self.layers = nn.ModuleList()
        for i in range(n_repeat):
            if i == 0:
                self.layers.append(InvertedResidual(in_channels, out_channels, stride, t))
            else:
                self.layers.append(InvertedResidual(in_channels, out_channels, 1, t))
            in_channels = out_channels

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def quantize(self, quant_type, num_bits=8, e_bits=3):
        # each block carries its own quantization strategy internally
        for layer in self.layers:
            layer.quantize(quant_type=quant_type, num_bits=num_bits, e_bits=e_bits)

    def quantize_forward(self, x):
        for layer in self.layers:
            x = layer.quantize_forward(x)  # blocks implement quantize_forward
        return x

    def freeze(self, qinput):  # qinput comes from the enclosing model's freeze chain
        # the first block consumes qinput; later blocks chain each other's qo
        cnt = 0
        for layer in self.layers:
            if cnt == 0:
                qo = layer.freeze(qinput=qinput)
                cnt = 1
            else:
                qo = layer.freeze(qinput=qo)  # blocks implement freeze
        return qo  # handed to the next stage

    def quantize_inference(self, x):
        # interior stage: tensors are already quantized, no round-trip needed
        for layer in self.layers:
            x = layer.quantize_inference(x)
        return x
import math
import numpy as np
import gol
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from function import FakeQuantize
# 获取最近的量化值
# def get_nearest_val(quant_type,x,is_bias=False):
# if quant_type=='INT':
# return x.round_()
# plist = gol.get_value(is_bias)
# # print('get')
# # print(plist)
# # x = x / 64
# shape = x.shape
# xhard = x.view(-1)
# plist = plist.type_as(x)
# # 取最近幂次作为索引
# idx = (xhard.unsqueeze(0) - plist.unsqueeze(1)).abs().min(dim=0)[1]
# xhard = plist[idx].view(shape)
# xout = (xhard - x).detach() + x
# # xout = xout * 64
# return xout
def get_nearest_val(quant_type, x, is_bias=False, block_size=1000000):
    """Snap each element of x to the nearest representable quantized value.

    INT: plain in-place rounding. POT/FLOAT: nearest entry of the value list
    registered in gol, processed in blocks to bound the (len(plist) x N)
    distance matrix. Uses a straight-through form so gradients pass to x.
    """
    if quant_type == 'INT':
        return x.round_()
    plist = gol.get_value(is_bias)
    flat = x.view(-1)
    snapped = torch.zeros_like(flat)
    plist = plist.type_as(x)
    total = flat.numel()
    for start in range(0, total, block_size):
        end = min(start + block_size, total)
        seg = flat[start:end]
        # index of the closest plist entry for every element of the segment
        nearest_idx = (seg.unsqueeze(0) - plist.unsqueeze(1)).abs().min(dim=0)[1]
        hard = plist[nearest_idx].view(seg.shape)
        # straight-through: forward = hard value, backward = identity on seg
        snapped[start:end] = (hard - seg).detach() + seg
    return snapped.view(x.shape)
# Symmetric signed quantization: largest representable magnitude per scheme.
def get_qmax(quant_type, num_bits=None, e_bits=None):
    """Return qmax for 'INT' (2^(n-1)-1), 'POT' (1), or 'FLOAT' formats."""
    if quant_type == 'INT':
        return 2. ** (num_bits - 1) - 1
    if quant_type == 'POT':
        return 1
    # FLOAT: max fraction times the largest exponent value
    m_bits = num_bits - 1 - e_bits
    frac = 1. + (2 ** m_bits - 1) * 2 ** (-m_bits)
    expo = 2 ** (2 ** (e_bits - 1))
    return frac * expo
# Signed symmetric quantization everywhere: zero point is pinned to 0.
def calcScaleZeroPoint(min_val, max_val, qmax):
    """Scale maps the larger of |min|,|max| onto qmax; zero point is 0."""
    bound = torch.max(max_val.abs(), min_val.abs())
    return bound / qmax, torch.tensor(0.)
# Quantize a float tensor; both input and output are tensors.
def quantize_tensor(quant_type, x, scale, zero_point, qmax, is_bias=False):
    """Affine-map x into [-qmax, qmax] and snap to the nearest grid value."""
    # the quantized range follows directly from the bit width
    shifted = zero_point + x / scale
    shifted.clamp_(-qmax, qmax)
    return get_nearest_val(quant_type, shifted, is_bias)
# Bias uses higher precision; num_bits/e_bits chosen per quantization type.
def bias_qmax(quant_type):
    """qmax used for bias quantization (64-bit INT, 16/7 FLOAT, POT unchanged)."""
    if quant_type == 'INT':
        return get_qmax(quant_type, 64)
    if quant_type == 'POT':
        return get_qmax(quant_type)
    return get_qmax(quant_type, 16, 7)
# Back to FP32; no clamping required on this side.
def dequantize_tensor(q_x, scale, zero_point):
    """Invert the affine quantization map."""
    return (q_x - zero_point) * scale
class QParam(nn.Module):
    """Quantization parameters for one tensor: running min/max plus derived
    scale and zero_point, stored as buffers so they persist in state_dict."""
    def __init__(self, quant_type, num_bits=8, e_bits=3):
        super(QParam, self).__init__()
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.qmax = get_qmax(quant_type, num_bits, e_bits)
        scale = torch.tensor([], requires_grad=False)
        zero_point = torch.tensor([], requires_grad=False)
        min = torch.tensor([], requires_grad=False)
        max = torch.tensor([], requires_grad=False)
        # registering as buffers records them in state_dict (not parameters)
        self.register_buffer('scale', scale)
        self.register_buffer('zero_point', zero_point)
        self.register_buffer('min', min)
        self.register_buffer('max', max)

    # update running statistics and recompute scale/zero_point
    def update(self, tensor):
        if self.max.nelement() == 0 or self.max.data < tensor.max().data:
            self.max.data = tensor.max().data
        self.max.clamp_(min=0)  # keep max >= 0 so the range always spans zero
        if self.min.nelement() == 0 or self.min.data > tensor.min().data:
            self.min.data = tensor.min().data
        self.min.clamp_(max=0)  # keep min <= 0 for the same reason
        self.scale, self.zero_point = calcScaleZeroPoint(self.min, self.max, self.qmax)

    def quantize_tensor(self, tensor):
        return quantize_tensor(self.quant_type, tensor, self.scale, self.zero_point, self.qmax)

    def dequantize_tensor(self, q_x):
        return dequantize_tensor(q_x, self.scale, self.zero_point)

    # ensures the buffers are restored when loading from a state_dict
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                              error_msgs):
        key_names = ['scale', 'zero_point', 'min', 'max']
        for key in key_names:
            value = getattr(self, key)
            value.data = state_dict[prefix + key].data
            state_dict.pop(prefix + key)

    # human-readable summary used when printing the object
    def __str__(self):
        info = 'scale: %.10f ' % self.scale
        info += 'zp: %.6f ' % self.zero_point
        info += 'min: %.6f ' % self.min
        info += 'max: %.6f' % self.max
        return info
# Parent class for concrete quantized layers; qi/qo quantize input/output.
class QModule(nn.Module):
    """Base quantized-layer wrapper holding optional input/output QParams."""
    def __init__(self, quant_type, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QModule, self).__init__()
        # qi/qo are created only when this layer owns its own statistics;
        # otherwise they are injected later via freeze()
        if qi:
            self.qi = QParam(quant_type, num_bits, e_bits)
        if qo:
            self.qo = QParam(quant_type, num_bits, e_bits)
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.bias_qmax = bias_qmax(quant_type)

    def freeze(self):
        pass  # no-op here; subclasses fix their quantized weights

    def quantize_inference(self, x):
        raise NotImplementedError('quantize_inference should be implemented.')
"""
QModule 量化卷积
:quant_type: 量化类型
:conv_module: 卷积模块
:qi: 是否量化输入特征图
:qo: 是否量化输出特征图
:num_bits: 8位bit数
"""
class QConv2d(QModule):
    """Quantized Conv2d wrapper.

    forward() runs fake-quantized training/calibration; freeze() converts the
    wrapped conv's weights/bias to the quantized domain; quantize_inference()
    computes in integer domain using the precomputed rescale factor M.
    """
    def __init__(self, quant_type, conv_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConv2d, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.qw = QParam(quant_type, num_bits, e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False))  # M registered as buffer

    # freeze() fixes the truly-quantized weights and writes them back onto the
    # wrapped float layer, which also enables divergence analysis later
    def freeze(self, qi=None, qo=None):
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        # layers feeding pooling/activation share the producer's quantizer, so
        # no extra min/max statistics are needed for them
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        # per https://zhuanlan.zhihu.com/p/156835141, this is the Eq.3 factor
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        self.conv_module.weight.data = self.qw.quantize_tensor(self.conv_module.weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        # bias is quantized at higher precision with the combined input*weight scale
        self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                     self.conv_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                     zero_point=0., qmax=self.bias_qmax, is_bias=True)

    def forward(self, x):  # forward pass over float input x
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)  # fake-quantize the input
        # update qw before the conv so the weight scale is current
        self.qw.update(self.conv_module.weight.data)
        # this pass only gathers x/weight statistics; bias stays unquantized
        tmp_wgt = FakeQuantize.apply(self.conv_module.weight, self.qw)
        x = F.conv2d(x, tmp_wgt, self.conv_module.bias,
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                     groups=self.conv_module.groups)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x

    # implements q_a = M(sum((q_w-Z_w)(q_x-Z_x)) + q_b)
    def quantize_inference(self, x):  # x is already quantized here
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type, x)
        x = x + self.qo.zero_point
        return x
class QLinear(QModule):
    """Quantized Linear wrapper; mirrors QConv2d's lifecycle for fc layers."""
    def __init__(self, quant_type, fc_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QLinear, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.fc_module = fc_module
        self.qw = QParam(quant_type, num_bits, e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False))  # M registered as buffer

    def freeze(self, qi=None, qo=None):
        # exactly one source for qi/qo: either built in __init__ or passed here
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        # rescale factor mapping input*weight scale onto the output scale
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        self.fc_module.weight.data = self.qw.quantize_tensor(self.fc_module.weight.data)
        self.fc_module.weight.data = self.fc_module.weight.data - self.qw.zero_point
        # bias quantized at higher precision with the combined scale
        self.fc_module.bias.data = quantize_tensor(self.quant_type,
                                                   self.fc_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                   zero_point=0., qmax=self.bias_qmax, is_bias=True)

    def forward(self, x):
        # fake-quantized pass: gathers input/weight/output statistics
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        self.qw.update(self.fc_module.weight.data)
        tmp_wgt = FakeQuantize.apply(self.fc_module.weight, self.qw)
        x = F.linear(x, tmp_wgt, self.fc_module.bias)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x

    def quantize_inference(self, x):
        # integer-domain matmul followed by M rescale and re-snap
        x = x - self.qi.zero_point
        x = self.fc_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type, x)
        x = x + self.qo.zero_point
        return x
class QReLU(QModule):
    """Quantized ReLU; reuses the producer's quantizer (no new statistics)."""
    def __init__(self, quant_type, qi=False, qo=True, num_bits=8, e_bits=3):
        super(QReLU, self).__init__(quant_type, qi, qo, num_bits, e_bits)

    def freeze(self, qi=None):
        # exactly one source of qi: either from __init__ or passed here
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is not None:
            self.qi = qi

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        x = F.relu(x)
        return x

    def quantize_inference(self, x):
        # in the quantized domain, ReLU becomes a clamp at the zero point
        x = x.clone()
        # x[x < self.qi.zero_point] = self.qi.zero_point
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(review): `a` is moved to the default CUDA device when one is
        # available, while x keeps its own device -- confirm x is always on
        # that device or this masked assignment will raise
        a = self.qi.zero_point.float().to(device)
        x[x < a] = a
        return x
class QMaxPooling2d(QModule):
    """Quantized max pooling; selection-only, so quantized values pass straight
    through and the producer's quantizer is reused."""
    def __init__(self, quant_type, kernel_size=3, stride=1, padding=0, qi=False, qo=True, num_bits=8, e_bits=3):
        super(QMaxPooling2d, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def freeze(self, qi=None):
        # exactly one source of qi: either built in __init__ or passed here
        if qi is not None:
            if hasattr(self, 'qi'):
                raise ValueError('qi has been provided in init function.')
            self.qi = qi
        elif not hasattr(self, 'qi'):
            raise ValueError('qi is not existed, should be provided.')

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)

    def quantize_inference(self, x):
        # max-pool only selects existing values, so it is scale-invariant
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)
class QConvBNReLU(QModule):
    """Fused Conv+BN+ReLU quantization wrapper; BN folds into the conv at freeze."""
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBNReLU, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        self.qw = QParam(quant_type, num_bits, e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False))  # M registered as buffer
    def fold_bn(self, mean, std):
        """Fold BN statistics (mean, std) into the conv's weight and bias.

        Returns the folded (weight, bias) pair without mutating the modules.
        """
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            # scale each output channel's filters by gamma/std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            # NOTE(review): no per-channel view() here unlike the affine
            # branch -- confirm broadcasting over the last weight dim is intended
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
def freeze(self, qi=None, qo=None):
if hasattr(self, 'qi') and qi is not None:
raise ValueError('qi has been provided in init function.')
if not hasattr(self, 'qi') and qi is None:
raise ValueError('qi is not existed, should be provided.')
if hasattr(self, 'qo') and qo is not None:
raise ValueError('qo has been provided in init function.')
if not hasattr(self, 'qo') and qo is None:
raise ValueError('qo is not existed, should be provided.')
if qi is not None:
self.qi = qi
if qo is not None:
self.qo = qo
self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
weight, bias = self.fold_bn(self.bn_module.running_mean, std)
self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
if self.conv_module.bias is None:
self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
bias, scale=self.qi.scale * self.qw.scale,
zero_point=0., qmax=self.bias_qmax,is_bias=True))
else:
self.conv_module.bias.data = quantize_tensor(self.quant_type,
bias, scale=self.qi.scale * self.qw.scale,
zero_point=0., qmax=self.bias_qmax,is_bias=True)
def forward(self, x):
if hasattr(self, 'qi'):
self.qi.update(x)
x = FakeQuantize.apply(x, self.qi)
if self.training:
y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
stride=self.conv_module.stride,
padding=self.conv_module.padding,
dilation=self.conv_module.dilation,
groups=self.conv_module.groups)
y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
# mean = y.mean(1)
# var = y.var(1)
mean = y.mean(1).detach()
var = y.var(1).detach()
self.bn_module.running_mean = \
(1 - self.bn_module.momentum) * self.bn_module.running_mean + \
self.bn_module.momentum * mean
self.bn_module.running_var = \
(1 - self.bn_module.momentum) * self.bn_module.running_var + \
self.bn_module.momentum * var
else:
mean = Variable(self.bn_module.running_mean)
var = Variable(self.bn_module.running_var)
std = torch.sqrt(var + self.bn_module.eps)
weight, bias = self.fold_bn(mean, std)
self.qw.update(weight.data)
x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
stride=self.conv_module.stride,
padding=self.conv_module.padding, dilation=self.conv_module.dilation,
groups=self.conv_module.groups)
x = F.relu(x)
if hasattr(self, 'qo'):
self.qo.update(x)
x = FakeQuantize.apply(x, self.qo)
return x
def quantize_inference(self, x):
x = x - self.qi.zero_point
x = self.conv_module(x)
x = self.M * x
# if self.quant_type is 'INT':
x = get_nearest_val(self.quant_type,x)
x = x + self.qo.zero_point
x.clamp_(min=0)
return x
class QConvBN(QModule):
    """Fused Conv2d + BatchNorm2d block (no activation) with quantization support.

    Same as QConvBNReLU but the ReLU is intentionally absent, so the
    quantized output is not clamped at zero.
    """
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBN, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        # Quantizer for the folded convolution weights.
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register the rescale factor M as a buffer
    def fold_bn(self, mean, std):
        """Fold BN (mean, std) into the conv parameters; returns (weight, bias)."""
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
    def freeze(self, qi=None, qo=None):
        """Fix quantization params and convert conv weight/bias to the integer domain."""
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        # Requantization multiplier M = S_w * S_in / S_out.
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
        weight, bias = self.fold_bn(self.bn_module.running_mean, std)
        self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
        # Store zero-centred integer weights so inference needs no weight zero-point.
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        if self.conv_module.bias is None:
            self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
                                                                 bias, scale=self.qi.scale * self.qw.scale,
                                                                 zero_point=0., qmax=self.bias_qmax,is_bias=True))
        else:
            self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                         bias, scale=self.qi.scale * self.qw.scale,
                                                         zero_point=0., qmax=self.bias_qmax,is_bias=True)
    def forward(self, x):
        """QAT path: fake-quantize input, fold BN, conv with fake-quantized weight, fake-quantize output."""
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        if self.training:
            # Float convolution first, only to obtain batch statistics for folding.
            y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
                            stride=self.conv_module.stride,
                            padding=self.conv_module.padding,
                            dilation=self.conv_module.dilation,
                            groups=self.conv_module.groups)
            y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
            y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
            # mean = y.mean(1)
            # var = y.var(1)
            mean = y.mean(1).detach()
            var = y.var(1).detach()
            # Standard exponential moving average of the BN running statistics.
            self.bn_module.running_mean = \
                (1 - self.bn_module.momentum) * self.bn_module.running_mean + \
                self.bn_module.momentum * mean
            self.bn_module.running_var = \
                (1 - self.bn_module.momentum) * self.bn_module.running_var + \
                self.bn_module.momentum * var
        else:
            mean = Variable(self.bn_module.running_mean)
            var = Variable(self.bn_module.running_var)
        std = torch.sqrt(var + self.bn_module.eps)
        weight, bias = self.fold_bn(mean, std)
        self.qw.update(weight.data)
        x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
                stride=self.conv_module.stride,
                padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                groups=self.conv_module.groups)
        # x = F.relu(x)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        """Integer inference path (valid only after freeze()); no ReLU clamp here."""
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        # x.clamp_(min=0)
        return x
# TODO: revisit -- this block probably needs a qo (output quantizer) as well
class QAdaptiveAvgPool2d(QModule):
    """Quantization wrapper for global average pooling to output size (1, 1)."""

    def __init__(self, quant_type, qi=False, qo=True, num_bits=8, e_bits=3):
        super(QAdaptiveAvgPool2d, self).__init__(quant_type,qi,qo,num_bits,e_bits)
        # Requantization factor S_in / S_out, kept as a buffer so it persists in state_dict.
        self.register_buffer('M', torch.tensor([], requires_grad=False))

    def freeze(self, qi=None, qo=None):
        """Adopt external quantizers (exclusive with ones built in __init__) and fix M."""
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qi.scale / self.qo.scale).data

    def forward(self, x):
        """Calibration/QAT path: fake-quantize around the pooling op."""
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        x = F.adaptive_avg_pool2d(x, (1, 1))
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x

    def quantize_inference(self, x):
        """Integer inference: pool, rescale by M, then snap to the quantization grid."""
        pooled = F.adaptive_avg_pool2d(x, (1, 1))
        rescaled = self.M * pooled
        return get_nearest_val(self.quant_type, rescaled)
class QConvBNReLU6(QModule):
    """Fused Conv2d + BatchNorm2d + ReLU6 block with quantization support.

    Identical to QConvBNReLU except the activation clamps to [0, 6]; at
    integer inference time the upper bound 6 is quantized into the output
    scale before clamping.
    """
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBNReLU6, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        # Quantizer for the folded convolution weights.
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register the rescale factor M as a buffer
    def fold_bn(self, mean, std):
        """Fold BN (mean, std) into the conv parameters; returns (weight, bias)."""
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
    def freeze(self, qi=None, qo=None):
        """Fix quantization params and convert conv weight/bias to the integer domain."""
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        # Requantization multiplier M = S_w * S_in / S_out.
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
        weight, bias = self.fold_bn(self.bn_module.running_mean, std)
        self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        # Fix: handle conv layers created without a bias, consistent with
        # QConvBNReLU/QConvBN. Previously this unconditionally wrote to
        # self.conv_module.bias.data and crashed with AttributeError when bias is None.
        if self.conv_module.bias is None:
            self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
                                                                 bias, scale=self.qi.scale * self.qw.scale,
                                                                 zero_point=0., qmax=self.bias_qmax,is_bias=True))
        else:
            self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                         bias, scale=self.qi.scale * self.qw.scale,
                                                         zero_point=0., qmax=self.bias_qmax,is_bias=True)
    def forward(self, x):
        """QAT path: fake-quantize input, fold BN, conv with fake-quantized weight, ReLU6, fake-quantize output."""
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        if self.training:
            # Float convolution first, only to obtain batch statistics for folding.
            y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
                            stride=self.conv_module.stride,
                            padding=self.conv_module.padding,
                            dilation=self.conv_module.dilation,
                            groups=self.conv_module.groups)
            y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
            y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
            mean = y.mean(1).detach()
            var = y.var(1).detach()
            # Standard exponential moving average of the BN running statistics.
            self.bn_module.running_mean = \
                (1 - self.bn_module.momentum) * self.bn_module.running_mean + \
                self.bn_module.momentum * mean
            self.bn_module.running_var = \
                (1 - self.bn_module.momentum) * self.bn_module.running_var + \
                self.bn_module.momentum * var
        else:
            mean = Variable(self.bn_module.running_mean)
            var = Variable(self.bn_module.running_var)
        std = torch.sqrt(var + self.bn_module.eps)
        weight, bias = self.fold_bn(mean, std)
        self.qw.update(weight.data)
        x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
                stride=self.conv_module.stride,
                padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                groups=self.conv_module.groups)
        x = F.relu6(x)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        """Integer inference path (valid only after freeze())."""
        # Quantize the clamp value 6 into the output scale before clamping.
        a = torch.tensor(6)
        a = self.qo.quantize_tensor(a)
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point  # now in the qo integer range
        x.clamp_(min=0, max=a.item())  # fused ReLU6 in the quantized domain
        return x
class QModule_2(nn.Module):
    """Base class for quantized operators that take two inputs.

    Holds up to two input quantizers (qi0, qi1) and one output quantizer (qo),
    each created only when the corresponding flag is set.
    """
    def __init__(self,quant_type, qi0=True, qi1=True, qo=True, num_bits=8, e_bits=3):
        super(QModule_2, self).__init__()
        # Build only the quantizers the caller asked for.
        for wanted, attr in ((qi0, 'qi0'), (qi1, 'qi1'), (qo, 'qo')):
            if wanted:
                setattr(self, attr, QParam(quant_type, num_bits, e_bits))
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.bias_qmax = bias_qmax(quant_type)
    def freeze(self):
        """Hook for subclasses; nothing to freeze by default."""
        pass
    def fakefreeze(self):
        """Hook for subclasses; nothing to fake-freeze by default."""
        pass
    def quantize_inference(self, x):
        raise NotImplementedError('quantize_inference should be implemented.')
class QElementwiseAdd(QModule_2):
    """Quantized elementwise addition of two tensors with separate input quantizers."""
    def __init__(self, quant_type, qi0=True, qi1=True, qo=True, num_bits=8, e_bits=3):
        super(QElementwiseAdd, self).__init__(quant_type, qi0, qi1, qo, num_bits, e_bits)
        # Rescale factors, registered as buffers so they persist in state_dict.
        self.register_buffer('M0', torch.tensor([], requires_grad=False))
        self.register_buffer('M1', torch.tensor([], requires_grad=False))
    def freeze(self, qi0=None, qi1=None ,qo=None):
        """Fix the three quantizers and precompute the rescale factors.

        Bug fix: the original guard clauses tested hasattr(self, 'qi') -- an
        attribute this class never defines, so the checks were dead -- instead
        of 'qi0', and tested `qi0` where `qi1` was meant.
        """
        if hasattr(self, 'qi0') and qi0 is not None:
            raise ValueError('qi0 has been provided in init function.')
        if not hasattr(self, 'qi0') and qi0 is None:
            raise ValueError('qi0 is not existed, should be provided.')
        if hasattr(self, 'qi1') and qi1 is not None:
            raise ValueError('qi1 has been provided in init function.')
        if not hasattr(self, 'qi1') and qi1 is None:
            raise ValueError('qi1 is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi0 is not None:
            self.qi0 = qi0
        if qi1 is not None:
            self.qi1 = qi1
        if qo is not None:
            self.qo = qo
        # Per https://zhuanlan.zhihu.com/p/156835141 (eq. 3):
        # result = M0 * (x0 + M1 * x1), with M0 = S_in0 / S_out, M1 = S_in1 / S_in0.
        self.M0.data = self.qi0.scale / self.qo.scale
        self.M1.data = self.qi1.scale / self.qi0.scale
    def forward(self, x0, x1):
        """Calibration/QAT path: fake-quantize both inputs and the sum."""
        if hasattr(self, 'qi0'):
            self.qi0.update(x0)
            x0 = FakeQuantize.apply(x0, self.qi0)
        if hasattr(self, 'qi1'):
            self.qi1.update(x1)
            x1 = FakeQuantize.apply(x1, self.qi1)
        x = x0 + x1
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x0, x1):
        """Integer inference on already-quantized inputs."""
        x0 = x0 - self.qi0.zero_point
        x1 = x1 - self.qi1.zero_point
        x = self.M0 * (x0 + x1*self.M1)
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        return x
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from get_weight import *
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
from torchvision.datasets import CIFAR10
from torch.optim.lr_scheduler import CosineAnnealingLR
from model import *
from torchvision.transforms import transforms
# import models
import time
import os
import argparse
# training and evaluation routines
def train(model, optimizer, criterion, train_loader, device):
    """Run one training epoch and accumulate per-layer gradient statistics.

    Args:
        model: the network to train (set to train mode here).
        optimizer: optimizer stepping on model.parameters().
        criterion: loss function taking (outputs, labels).
        train_loader: iterable of (inputs, labels) batches; must be non-empty.
        device: target device for inputs/labels.

    Returns:
        (average training loss over batches,
         dict of per-layer gradients averaged over batches).
    """
    model.train()
    running_loss = 0.0
    grads_sum = None  # replaces the old flag/cnt first-batch pattern
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # get_model_histogram is a project helper; assumed to return
        # (histogram, per-layer gradient snapshot) -- TODO confirm.
        histo, grads = get_model_histogram(model)
        if grads_sum is None:
            grads_sum = grads
        else:
            for k in grads_sum:
                grads_sum[k] += grads[k]
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_loader)
    for k, v in grads_sum.items():
        grads_sum[k] = v / len(train_loader)
    return train_loss, grads_sum
def evaluate(model, criterion, test_loader, device):
    """Return top-1 accuracy (%) of `model` on `test_loader`.

    `criterion` is accepted for interface compatibility but is not used.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            logits = model(images)
            _, predicted = torch.max(logits.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total
# def get_children(model: torch.nn.Module):
# # get children form model!
# # 为了后续也能够更新参数,需要用nn.ModuleList来承载
# children = nn.ModuleList(model.children())
# # print(children)
# # 方便对其中的module进行后续的更新
# flatt_children = nn.ModuleList()
# # children = list(model.children())
# # flatt_children = nn.ModuleList()
# # flatt_children = []
# if len(children) == 0:
# # if model has no children; model is last child! :O
# return model
# else:
# # look for children from children... to the last child!
# for child in children:
# try:
# flatt_children.extend(get_children(child))
# except TypeError:
# flatt_children.append(get_children(child))
# # print(flatt_children)
# return flatt_children
if __name__ == "__main__":
    # Entry point: FP32 CIFAR-10 training with TensorBoard logging,
    # best-model checkpointing and early stopping.
    # torch.cuda.empty_cache()
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    parser.add_argument('-e','--epochs', default=100, type=int, metavar='EPOCHS', help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('-wd','--weight_decay',default=0.0001,type=float,metavar='WD',help='lr schduler weight decay',dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    # training hyper-parameters
    args = parser.parse_args()
    num_epochs = args.epochs
    print(num_epochs)
    batch_size = args.batch_size
    print(batch_size)
    num_workers = args.workers
    lr = args.lr
    weight_decay = args.wd
    best_acc = float("-inf")
    start_time = time.time()
    # model, loss function and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device selection
    print(device)
    # NOTE(review): only MobileNetV2 is constructed; any other --model value
    # leaves `model` undefined and crashes below -- confirm intended.
    if args.model == 'MobileNetV2' :
        model = MobileNetV2().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # optimizer = optim.AdaBound(model.parameters(), lr=lr,
    #                            weight_decay=weight_decay, final_lr=0.001*lr)
    # print("ok!")
    # data parallelism across all visible GPUs
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)
    # load data
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                         transform=transforms.Compose([
                             transforms.RandomCrop(32, padding=2),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    # learning-rate scheduler
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
    # TensorBoard
    # WARN
    # writer = SummaryWriter(log_dir='./project/p/models_log/trail/full_log')
    writer = SummaryWriter(log_dir='log/' + args.model + '/full_log')
    # early-stopping parameters
    patience = 20
    count = 0
    # WARN
    # save_dir = './project/p/ckpt/trail'
    save_dir = 'ckpt'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir, mode=0o777)
        os.chmod(save_dir, mode=0o777)
    # checkpoint_dir = './project/p/checkpoint/cifar-10_trail_model'
    checkpoint_dir = 'checkpoint'
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir, mode=0o777)
        os.chmod(checkpoint_dir, mode=0o777)
    # training loop (or one-shot evaluation with --test)
    if args.test == True:
        model.load_state_dict(torch.load(save_dir + '/cifar10_' +args.model + '.pt'))
        acc = evaluate(model, criterion, test_loader, device=device)
        print(f"test accuracy: {acc:.2f}%")
        # for name, module in model.named_modules():
        #     print(f"{name}: {module}\n")
        # print('========================================================')
        # print('========================================================')
        # model.quantize()
        # for name , layer in model.quantize_layers.items():
        #     print(f"Layer {name}: {layer} ")  # enough to traverse the layers
    else:
        for epoch in range(num_epochs):
            # train one epoch and log the loss
            train_loss,grads_sum = train(model, optimizer, criterion,
                                         train_loader, device=device)
            writer.add_scalar("Training Loss", train_loss, epoch + 1)
            # evaluate and log accuracy every 5 epochs
            if (epoch + 1) % 5 == 0:
                acc = evaluate(model, criterion, test_loader, device=device)
                writer.add_scalar("Validation Accuracy", acc, epoch + 1)
                # checkpoint = {
                #     'epoch': epoch,
                #     'grads': grads_sum,
                #     'accuracy':acc
                # }
                # for name, param in grads_sum.items():
                #     # NOTE: these grads look like accumulated sums, not averages -- verify
                #     writer.add_histogram(tag=name + '_grad', values=param, global_step=epoch)
                # # weights after the last batch of this epoch
                # for name, param in model.named_parameters():
                #     writer.add_histogram(tag=name + '_data', values=param.data, global_step=epoch)
                # torch.save(checkpoint, checkpoint_dir + '/cifar10_' + args.model + '_%s.pt' % (str(epoch+1)))
                # save the best model so far
                if acc > best_acc:
                    best_acc = acc
                    count = 0
                    # WARN
                    # torch.save(model.state_dict(), save_dir+'/model_trail.pt')
                    torch.save(model.state_dict(), save_dir + '/cifar10_' +args.model + '.pt')
                else:
                    count += 1
                print(
                    f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.5f}, Val Acc: {acc:.2f}%")
                # early-stopping check
                if count == patience:
                    print(f"No improvement after {patience} epochs. Early stop!")
                    break
            # update learning rate
            lr_scheduler.step()
    # training time and best validation accuracy
    print(f"Training took {(time.time() - start_time) / 60:.2f} minutes")
    print(f"Best validation accuracy: {best_acc:.2f}%")
    # load and test the best model
    # model.load_state_dict(torch.load("best_model.pth"))
    # model.to(device)
    # test_acc = evaluate(model, criterion, test_loader, device="cuda")
    # print(f"Test Accuracy: {test_acc:.2f}%")
    # close the TensorBoard writer
    writer.close()
MobileNetV2(
2.03 M, 100.000% Params, 59.12 MMac, 100.000% MACs,
(conv1): Conv2d(896, 0.044% Params, 917.5 KMac, 1.552% MACs, 3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(64, 0.003% Params, 65.54 KMac, 0.111% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 32.77 KMac, 0.055% MACs, inplace=True)
(layer1): MakeLayer(
2.06 k, 0.102% Params, 2.18 MMac, 3.686% MACs,
(layers): ModuleList(
2.06 k, 0.102% Params, 2.18 MMac, 3.686% MACs,
(0): InvertedResidual(
2.06 k, 0.102% Params, 2.18 MMac, 3.686% MACs,
(conv1): Conv2d(1.06 k, 0.052% Params, 1.08 MMac, 1.829% MACs, 32, 32, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(64, 0.003% Params, 65.54 KMac, 0.111% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 32.77 KMac, 0.055% MACs, inplace=True)
(conv2): Conv2d(320, 0.016% Params, 327.68 KMac, 0.554% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
(bn2): BatchNorm2d(64, 0.003% Params, 65.54 KMac, 0.111% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 32.77 KMac, 0.055% MACs, inplace=True)
(conv3): Conv2d(528, 0.026% Params, 540.67 KMac, 0.915% MACs, 32, 16, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.055% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer2): MakeLayer(
14.5 k, 0.716% Params, 5.31 MMac, 8.979% MACs,
(layers): ModuleList(
14.5 k, 0.716% Params, 5.31 MMac, 8.979% MACs,
(0): InvertedResidual(
5.35 k, 0.264% Params, 2.89 MMac, 4.895% MACs,
(conv1): Conv2d(1.63 k, 0.081% Params, 1.67 MMac, 2.827% MACs, 16, 96, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(192, 0.009% Params, 196.61 KMac, 0.333% MACs, 96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 98.3 KMac, 0.166% MACs, inplace=True)
(conv2): Conv2d(960, 0.047% Params, 245.76 KMac, 0.416% MACs, 96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96)
(bn2): BatchNorm2d(192, 0.009% Params, 49.15 KMac, 0.083% MACs, 96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 24.58 KMac, 0.042% MACs, inplace=True)
(conv3): Conv2d(2.33 k, 0.115% Params, 595.97 KMac, 1.008% MACs, 96, 24, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(48, 0.002% Params, 12.29 KMac, 0.021% MACs, 24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): InvertedResidual(
9.14 k, 0.451% Params, 2.41 MMac, 4.084% MACs,
(conv1): Conv2d(3.6 k, 0.178% Params, 921.6 KMac, 1.559% MACs, 24, 144, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(288, 0.014% Params, 73.73 KMac, 0.125% MACs, 144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv2): Conv2d(1.44 k, 0.071% Params, 368.64 KMac, 0.624% MACs, 144, 144, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=144)
(bn2): BatchNorm2d(288, 0.014% Params, 73.73 KMac, 0.125% MACs, 144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv3): Conv2d(3.48 k, 0.172% Params, 890.88 KMac, 1.507% MACs, 144, 24, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(48, 0.002% Params, 12.29 KMac, 0.021% MACs, 24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer3): MakeLayer(
40.85 k, 2.016% Params, 3.46 MMac, 5.846% MACs,
(layers): ModuleList(
40.85 k, 2.016% Params, 3.46 MMac, 5.846% MACs,
(0): InvertedResidual(
10.32 k, 0.509% Params, 1.45 MMac, 2.458% MACs,
(conv1): Conv2d(3.6 k, 0.178% Params, 921.6 KMac, 1.559% MACs, 24, 144, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(288, 0.014% Params, 73.73 KMac, 0.125% MACs, 144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv2): Conv2d(1.44 k, 0.071% Params, 92.16 KMac, 0.156% MACs, 144, 144, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=144)
(bn2): BatchNorm2d(288, 0.014% Params, 18.43 KMac, 0.031% MACs, 144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 9.22 KMac, 0.016% MACs, inplace=True)
(conv3): Conv2d(4.64 k, 0.229% Params, 296.96 KMac, 0.502% MACs, 144, 32, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(64, 0.003% Params, 4.1 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): InvertedResidual(
15.26 k, 0.754% Params, 1.0 MMac, 1.694% MACs,
(conv1): Conv2d(6.34 k, 0.313% Params, 405.5 KMac, 0.686% MACs, 32, 192, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv2): Conv2d(1.92 k, 0.095% Params, 122.88 KMac, 0.208% MACs, 192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192)
(bn2): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv3): Conv2d(6.18 k, 0.305% Params, 395.26 KMac, 0.669% MACs, 192, 32, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(64, 0.003% Params, 4.1 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(2): InvertedResidual(
15.26 k, 0.754% Params, 1.0 MMac, 1.694% MACs,
(conv1): Conv2d(6.34 k, 0.313% Params, 405.5 KMac, 0.686% MACs, 32, 192, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv2): Conv2d(1.92 k, 0.095% Params, 122.88 KMac, 0.208% MACs, 192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192)
(bn2): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv3): Conv2d(6.18 k, 0.305% Params, 395.26 KMac, 0.669% MACs, 192, 32, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(64, 0.003% Params, 4.1 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer4): MakeLayer(
266.78 k, 13.170% Params, 17.25 MMac, 29.172% MACs,
(layers): ModuleList(
266.78 k, 13.170% Params, 17.25 MMac, 29.172% MACs,
(0): InvertedResidual(
27.74 k, 1.370% Params, 1.8 MMac, 3.045% MACs,
(conv1): Conv2d(6.34 k, 0.313% Params, 405.5 KMac, 0.686% MACs, 32, 192, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv2): Conv2d(1.92 k, 0.095% Params, 122.88 KMac, 0.208% MACs, 192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192)
(bn2): BatchNorm2d(384, 0.019% Params, 24.58 KMac, 0.042% MACs, 192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 12.29 KMac, 0.021% MACs, inplace=True)
(conv3): Conv2d(18.53 k, 0.915% Params, 1.19 MMac, 2.006% MACs, 192, 96, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(192, 0.009% Params, 12.29 KMac, 0.021% MACs, 96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): InvertedResidual(
119.52 k, 5.900% Params, 7.72 MMac, 13.064% MACs,
(conv1): Conv2d(55.87 k, 2.758% Params, 3.58 MMac, 6.049% MACs, 96, 576, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.15 k, 0.057% Params, 73.73 KMac, 0.125% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv2): Conv2d(5.76 k, 0.284% Params, 368.64 KMac, 0.624% MACs, 576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576)
(bn2): BatchNorm2d(1.15 k, 0.057% Params, 73.73 KMac, 0.125% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv3): Conv2d(55.39 k, 2.734% Params, 3.55 MMac, 5.997% MACs, 576, 96, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(192, 0.009% Params, 12.29 KMac, 0.021% MACs, 96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(2): InvertedResidual(
119.52 k, 5.900% Params, 7.72 MMac, 13.064% MACs,
(conv1): Conv2d(55.87 k, 2.758% Params, 3.58 MMac, 6.049% MACs, 96, 576, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.15 k, 0.057% Params, 73.73 KMac, 0.125% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv2): Conv2d(5.76 k, 0.284% Params, 368.64 KMac, 0.624% MACs, 576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576)
(bn2): BatchNorm2d(1.15 k, 0.057% Params, 73.73 KMac, 0.125% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv3): Conv2d(55.39 k, 2.734% Params, 3.55 MMac, 5.997% MACs, 576, 96, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(192, 0.009% Params, 12.29 KMac, 0.021% MACs, 96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer5): MakeLayer(
800.74 k, 39.528% Params, 15.66 MMac, 26.483% MACs,
(layers): ModuleList(
800.74 k, 39.528% Params, 15.66 MMac, 26.483% MACs,
(0): InvertedResidual(
156.58 k, 7.729% Params, 5.29 MMac, 8.945% MACs,
(conv1): Conv2d(55.87 k, 2.758% Params, 3.58 MMac, 6.049% MACs, 96, 576, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.15 k, 0.057% Params, 73.73 KMac, 0.125% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 36.86 KMac, 0.062% MACs, inplace=True)
(conv2): Conv2d(5.76 k, 0.284% Params, 92.16 KMac, 0.156% MACs, 576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=576)
(bn2): BatchNorm2d(1.15 k, 0.057% Params, 18.43 KMac, 0.031% MACs, 576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 9.22 KMac, 0.016% MACs, inplace=True)
(conv3): Conv2d(92.32 k, 4.557% Params, 1.48 MMac, 2.499% MACs, 576, 160, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(320, 0.016% Params, 5.12 KMac, 0.009% MACs, 160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): InvertedResidual(
322.08 k, 15.899% Params, 5.18 MMac, 8.769% MACs,
(conv1): Conv2d(154.56 k, 7.630% Params, 2.47 MMac, 4.183% MACs, 160, 960, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv2): Conv2d(9.6 k, 0.474% Params, 153.6 KMac, 0.260% MACs, 960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960)
(bn2): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv3): Conv2d(153.76 k, 7.590% Params, 2.46 MMac, 4.161% MACs, 960, 160, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(320, 0.016% Params, 5.12 KMac, 0.009% MACs, 160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(2): InvertedResidual(
322.08 k, 15.899% Params, 5.18 MMac, 8.769% MACs,
(conv1): Conv2d(154.56 k, 7.630% Params, 2.47 MMac, 4.183% MACs, 160, 960, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv2): Conv2d(9.6 k, 0.474% Params, 153.6 KMac, 0.260% MACs, 960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960)
(bn2): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv3): Conv2d(153.76 k, 7.590% Params, 2.46 MMac, 4.161% MACs, 960, 160, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(320, 0.016% Params, 5.12 KMac, 0.009% MACs, 160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer6): MakeLayer(
476.16 k, 23.506% Params, 7.65 MMac, 12.939% MACs,
(layers): ModuleList(
476.16 k, 23.506% Params, 7.65 MMac, 12.939% MACs,
(0): InvertedResidual(
476.16 k, 23.506% Params, 7.65 MMac, 12.939% MACs,
(conv1): Conv2d(154.56 k, 7.630% Params, 2.47 MMac, 4.183% MACs, 160, 960, kernel_size=(1, 1), stride=(1, 1))
(bn1): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu1): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv2): Conv2d(9.6 k, 0.474% Params, 153.6 KMac, 0.260% MACs, 960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960)
(bn2): BatchNorm2d(1.92 k, 0.095% Params, 30.72 KMac, 0.052% MACs, 960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu2): ReLU6(0, 0.000% Params, 15.36 KMac, 0.026% MACs, inplace=True)
(conv3): Conv2d(307.52 k, 15.181% Params, 4.92 MMac, 8.323% MACs, 960, 320, kernel_size=(1, 1), stride=(1, 1))
(bn3): BatchNorm2d(640, 0.032% Params, 10.24 KMac, 0.017% MACs, 320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(conv2): Conv2d(410.88 k, 20.283% Params, 6.57 MMac, 11.120% MACs, 320, 1280, kernel_size=(1, 1), stride=(1, 1))
(avg1): AdaptiveAvgPool2d(0, 0.000% Params, 20.48 KMac, 0.035% MACs, output_size=1)
(fc): Linear(12.81 k, 0.632% Params, 12.81 KMac, 0.022% MACs, in_features=1280, out_features=10, bias=True)
)
\ No newline at end of file
# -*- coding: utf-8 -*-
from torch.serialization import load
from model import *
from extract_ratio import *
from utils import *
import gol
import openpyxl
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import torch.utils.bottleneck as bn
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def direct_quantize(model, test_loader, device):
    """Calibrate the quantized model by running forward passes.

    Feeds batches through model.quantize_forward so the quantization
    statistics can be collected; stops after 500 batches.
    """
    for batch_idx, (inputs, _labels) in enumerate(test_loader, 1):
        inputs = inputs.to(device)
        _ = model.quantize_forward(inputs).cpu()
        if batch_idx % 500 == 0:
            # enough batches for calibration
            break
    print('direct quantization finish')
def full_inference(model, test_loader, device):
    """Evaluate the full-precision model.

    Returns the top-1 accuracy on test_loader, in percent.
    """
    n_correct = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        logits = model(inputs).cpu()
        preds = logits.argmax(dim=1, keepdim=True)
        # print(preds)
        n_correct += preds.eq(labels.view_as(preds)).sum().item()
    accuracy = 100. * n_correct / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def quantize_inference(model, test_loader, device):
    """Evaluate the frozen quantized model.

    Returns the top-1 accuracy on test_loader, in percent.
    """
    n_correct = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        logits = model.quantize_inference(inputs).cpu()
        preds = logits.argmax(dim=1, keepdim=True)
        n_correct += preds.eq(labels.view_as(preds)).sum().item()
    accuracy = 100. * n_correct / len(test_loader.dataset)
    print('Test set: Quant Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def js_div(p_output, q_output, get_softmax=True):
    """Jensen-Shannon divergence between two logit/probability tensors.

    Args:
        p_output, q_output: tensors of identical shape. When get_softmax is
            True they are treated as logits and softmax-normalized first;
            otherwise they must already be probability distributions.

    Returns:
        A scalar tensor with the (sum-reduced) JS divergence.
    """
    KLDivLoss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # F.softmax without an explicit dim is deprecated and ambiguous;
        # reproduce the legacy implicit-dim rule explicitly so behavior is
        # unchanged (dim 0 for 0/1/3-D inputs, dim 1 otherwise).
        dim = 0 if p_output.dim() in (0, 1, 3) else 1
        p_output = F.softmax(p_output, dim=dim)
        q_output = F.softmax(q_output, dim=dim)
    log_mean_output = ((p_output + q_output)/2).log()
    return (KLDivLoss(log_mean_output, p_output) + KLDivLoss(log_mean_output, q_output))/2
if __name__ == "__main__":
    # PTQ sweep: quantize a trained model at every (type, bit-width, e-bits)
    # combination, measure accuracy loss and weighted JS divergence of the
    # weights, and dump the results to xlsx/txt.
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='ResNet18')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    # NOTE(review): argparse type=bool treats any non-empty string (including
    # "False") as True — confirm the intended CLI usage
    parser.add_argument('-s', '--save', default=False, type=bool)
    # parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    # training parameters
    args = parser.parse_args()
    batch_size = args.batch_size
    num_workers = args.workers
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    # CIFAR-10 loaders; the train set (used only for calibration here) gets
    # light augmentation (random crop + horizontal flip)
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                         transform=transforms.Compose([
                             transforms.RandomCrop(32, padding=2),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    # model = AlexNet_BN()
    # if args.model == 'ResNet18':
    #     model = resnet18()
    # elif args.model == 'ResNet50':
    #     model = resnet50()
    # elif args.model == 'ResNet152':
    #     model = resnet152()
    # NOTE(review): no fallback branch — any other --model value leaves
    # `model` undefined and the script crashes below; confirm intended
    if args.model == 'MobileNetV2':
        model = MobileNetV2()
    writer = SummaryWriter(log_dir='log/' + args.model + '/ptq')
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    model.to(device)
    load_ptq = False
    ptq_file_prefix = 'ckpt/cifar10_' + args.model + '_ptq_'
    model.eval()
    full_acc = full_inference(model, test_loader, device)
    model_fold = fold_model(model)  # yields the conv/bn/relu/fc layers with BN folded into conv
    full_params = []
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # print(layer)
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            # take the name prefix before 'weight' (i.e. the layer name);
            # filtering on 'weight' avoids appending the same layer a second
            # time for its bias
            layer.append(pre)
            # print(name)
    print('===================')
    # print(layer)
    par_ratio, flop_ratio = fold_ratio(layer, par_ratio, flop_ratio)
    # sys.exit()
    for name, param in model_fold.named_parameters():
        if 'bn' in name or 'sample.1' in name:
            continue
        param_norm = F.normalize(param.data.cpu(),p=2,dim=-1)
        full_params.append(param_norm)  # only the folded conv params are collected; bn params are skipped
        writer.add_histogram(tag='Full_' + name + '_data', values=param.data)
    gol._init()
    quant_type_list = ['INT','POT','FLOAT']
    title_list = []
    js_flops_list = []
    js_param_list = []
    ptq_acc_list = []
    acc_loss_list = []
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # the bias quantization table only needs to be set once per quant type;
        # INT bit-widths are large, so a lookup table would be too costly —
        # plain rounding (_round) is used for INT instead
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                # model_ptq = resnet18()
                # if args.model == 'ResNet18':
                #     model_ptq = resnet18()
                # elif args.model == 'ResNet50':
                #     model_ptq = resnet50()
                # elif args.model == 'ResNet152':
                #     model_ptq = resnet152()
                if args.model == 'MobileNetV2':
                    model_ptq = MobileNetV2()
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                print('\nPTQ: '+title)
                title_list.append(title)
                # set the quantization value table for this configuration
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # decide whether a cached PTQ checkpoint can be loaded
                if load_ptq is True and osp.exists(ptq_file_prefix + title + '.pt'):
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.load_state_dict(torch.load(ptq_file_prefix + title + '.pt'))
                    model_ptq.to(device)
                    print('Successfully load ptq model: ' + title)
                else:
                    model_ptq.load_state_dict(torch.load(full_file))
                    model_ptq.to(device)
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.eval()
                    direct_quantize(model_ptq, train_loader, device)
                    if args.save == True:
                        torch.save(model_ptq.state_dict(), ptq_file_prefix + title + '.pt')
                model_ptq.freeze()
                ptq_acc = quantize_inference(model_ptq, test_loader, device)
                ptq_acc_list.append(ptq_acc)
                acc_loss = (full_acc - ptq_acc) / full_acc
                acc_loss_list.append(acc_loss)
                idx = -1
                # JS divergence of the weights, weighted by FLOPs / param-count ratios
                js_flops = 0.
                js_param = 0.
                for name, param in model_ptq.named_parameters():
                    # if '.' not in name or 'bn' in name:
                    if 'bn' in name or 'sample.1' in name:
                        continue
                    writer.add_histogram(tag=title +':'+ name + '_data', values=param.data)
                    idx = idx + 1
                    # resnet names contain several '.'; rebuild the layer prefix
                    # prefix = name.split('.')[0]
                    n = name.split('.')
                    prefix = '.'.join(n[:len(n) - 1])
                    # weight and bias currently share the layer's ratio 1:1;
                    # a separate weighting (e.g. 8:2) could be assigned here
                    if prefix in layer:
                        layer_idx = layer.index(prefix)
                        ptq_param = param.data.cpu()
                        # L2-normalize before the divergence computation
                        ptq_norm = F.normalize(ptq_param,p=2,dim=-1)
                        writer.add_histogram(tag=title +':'+ name + '_data', values=ptq_param)
                        # print(name)
                        # print('=========')
                        # print(ptq_norm)
                        # print('=========')
                        # print(full_params[idx])
                        js = js_div(ptq_norm,full_params[idx])  # JS distance between folded pre-/post-quantization params
                        js = js.item()
                        if js < 0.:
                            js = 0.
                        js_flops = js_flops + js * flop_ratio[layer_idx]
                        js_param = js_param + js * par_ratio[layer_idx]
                js_flops_list.append(js_flops)
                js_param_list.append(js_param)
                print(title + ': js_flops: %f js_param: %f acc_loss: %f' % (js_flops, js_param, acc_loss))
    # write the results to xlsx
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.cell(row=1,column=1,value='FP32-acc')
    worksheet.cell(row=1,column=2,value=full_acc)
    worksheet.cell(row=3,column=1,value='title')
    worksheet.cell(row=3,column=2,value='js_flops')
    worksheet.cell(row=3,column=3,value='js_param')
    worksheet.cell(row=3,column=4,value='ptq_acc')
    worksheet.cell(row=3,column=5,value='acc_loss')
    for i in range(len(title_list)):
        worksheet.cell(row=i+4, column=1, value=title_list[i])
        worksheet.cell(row=i+4, column=2, value=js_flops_list[i])
        worksheet.cell(row=i+4, column=3, value=js_param_list[i])
        worksheet.cell(row=i+4, column=4, value=ptq_acc_list[i])
        worksheet.cell(row=i+4, column=5, value=acc_loss_list[i])
    workbook.save('ptq_result_' + args.model + '.xlsx')
    writer.close()
    # plain-text dump of the same result lists
    ft = open('ptq_result_' + args.model + '.txt','w')
    print('title_list:',file=ft)
    print(" ".join(title_list),file=ft)
    print('js_flops_list:',file=ft)
    print(" ".join(str(i) for i in js_flops_list), file=ft)
    print('js_param_list:',file=ft)
    print(" ".join(str(i) for i in js_param_list), file=ft)
    print('ptq_acc_list:',file=ft)
    print(" ".join(str(i) for i in ptq_acc_list), file=ft)
    print('acc_loss_list:',file=ft)
    print(" ".join(str(i) for i in acc_loss_list), file=ft)
    ft.close()
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
3937.841102576776 3363.8190205631026 2484.282190565725 1666.2525440043232 1004.491228033656 619.2894405795665 418.1602230520327 312.24372234629664 249.45408998160573 193.16718609999484 138.595522944474 91.58500300086834 56.08617073690407 31.952073152774126 17.65229708562858 3937.814777378682 3068.5487659326095 613.4307511092446 32.126779884075525 0.1766064000339766 0.17675544226821238 0.17669672446190357 2644.0738554237805 1741.4374062424974 1207.0395601708549 1028.5434609756649 668.1556835852132 273.1682252338685 625.3801636402046 430.06294426631223 203.6193231190563 6.546919210669541 419.88446951149035 315.44088340877573 143.55644125961152 2.884048071197845 0.05101264315330961 312.7100494425904 250.59377938308924 93.48144767534725 1.3453977569799638 0.013982248527794 0.10109822916201198
js_param_list:
5523.5133138619085 4892.5106928859905 3857.644315893276 2707.0379699110163 1671.0773218634195 1050.057190366859 723.4234749265271 550.470391030846 445.310410225667 347.5339012995234 250.31388514236687 165.8204601056395 101.56282095527794 57.86236490894071 31.945254415161216 5523.499968355952 4553.489563739368 1040.3277683793247 57.8656798626289 0.006984618427447457 0.006992355997222603 0.007052672537742183 4053.9039848113193 2820.767118173691 1991.9978400778782 1709.3926174557196 1129.0122760496113 485.4325026388394 1059.7935852519893 742.5316942830176 365.9667180995057 11.782087473714286 726.0475165732927 555.5680608520271 259.1510147521308 5.1897931543416576 0.0014493051074338339 551.1504206498132 447.20405741295923 169.2517013866318 2.4155090345624393 0.000397355594218542 0.03827793516377293
ptq_acc_list:
10.0 10.0 10.44 37.82 81.77 88.99 91.31 91.84 91.96 91.9 91.93 91.88 91.87 91.88 91.89 10.0 10.0 14.09 19.53 11.56 11.34 17.05 10.01 10.36 20.89 25.1 62.06 79.8 45.72 81.59 89.76 79.92 71.56 85.92 91.67 90.2 81.46 73.66 87.45 91.65 91.51 90.24 81.11
acc_loss_list:
0.8914812805208898 0.8914812805208898 0.886706456863809 0.5895822029300054 0.11264243081931644 0.034291915355398925 0.009115572436245289 0.003364080303852439 0.002061855670103222 0.0027129679869777536 0.0023874118285404106 0.002930005425936085 0.003038524145415096 0.002930005425936085 0.0028214867064569192 0.8914812805208898 0.8914812805208898 0.8470971242539338 0.7880629408572979 0.8745523602821487 0.8769397721106891 0.8149755832881173 0.8913727618014107 0.8875746066196419 0.773304395008139 0.7276180141074337 0.32653282691264246 0.1340206185567011 0.5038524145415084 0.11459576776994033 0.02593597395550733 0.13271839392295173 0.22344004340748783 0.06760716223548566 0.00520889853499733 0.02116115029842651 0.11600651112316887 0.20065111231687474 0.051003798155181794 0.005425935973955507 0.006945198046663055 0.020727075420510156 0.11980466630493766
from model import *
from utils import *
import gol
import sys
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def quantize_aware_training(model, device, train_loader, optimizer, epoch):
    """Run one QAT epoch through the model's quantize_forward path.

    Accumulates the batch losses and every parameter's gradient over the
    whole epoch.

    Args:
        model: network exposing quantize_forward and named_parameters.
        device: device the batches are moved to.
        train_loader: DataLoader yielding (data, target); must expose
            .dataset and .batch_sampler.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only for logging.

    Returns:
        (loss_avg, grad_dict): mean batch loss (detached scalar tensor) and
        a dict mapping parameter name -> gradient averaged over batches.
    """
    lossLayer = torch.nn.CrossEntropyLoss()
    loss_sum = 0.
    # one zero tensor per parameter; param.grad has the same shape as param
    grad_dict = {}
    for name, param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param)
    for batch_idx, (data, target) in enumerate(train_loader, 1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model.quantize_forward(data)
        # the loss of a batch is already the mean over its samples
        loss = lossLayer(output, target)
        loss.backward()
        # detach before accumulating so the autograd graphs of every batch
        # are not kept alive for the whole epoch (memory leak otherwise)
        loss_sum += loss.detach()
        for name, param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Quantize Aware Training Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    # len(batch_sampler) is the number of batches, not the batch size
    num_batches = len(train_loader.batch_sampler)
    # average the per-batch accumulations
    for name, grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict
def full_inference(model, test_loader, device=None):
    """Evaluate the full-precision model on the test set.

    Args:
        model: network to evaluate (called directly, no quantization path).
        test_loader: iterable of (data, target) batches exposing .dataset.
        device: target device; defaults to cuda when available. Previously
            this was read from a module-level global, which made the
            function unusable in isolation.

    Returns:
        Top-1 accuracy in percent (previously nothing was returned).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    print('\nTest set: Full Model Accuracy: {:.2f}%\n'.format(100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)
def train(model, device, train_loader, optimizer, epoch):
    """Train the full-precision model for one epoch.

    Accumulates the batch losses and every parameter's gradient over the
    whole epoch (used later to compare gradient distributions against QAT).

    Args:
        model: network to train.
        device: device the batches are moved to.
        train_loader: DataLoader yielding (data, target); must expose
            .dataset and .batch_sampler.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only for logging.

    Returns:
        (loss_avg, grad_dict): mean batch loss (detached scalar tensor) and
        a dict mapping parameter name -> gradient averaged over batches.
    """
    model.train()
    lossLayer = torch.nn.CrossEntropyLoss()
    loss_sum = 0.
    # one zero tensor per parameter; param.grad has the same shape as param
    grad_dict = {}
    for name, param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = lossLayer(output, target)
        loss.backward()
        # detach before accumulating so the autograd graphs of every batch
        # are not kept alive for the whole epoch (memory leak otherwise)
        loss_sum += loss.detach()
        for name, param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad  # accumulate over batches
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    # len(batch_sampler) is the number of batches, not the batch size
    num_batches = len(train_loader.batch_sampler)
    # average the per-batch accumulations
    for name, grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict
def quantize_inference(model, test_loader, device=None):
    """Evaluate the frozen quantized model on the test set.

    Args:
        model: network exposing quantize_inference.
        test_loader: iterable of (data, target) batches exposing .dataset.
        device: target device; defaults to cuda when available. Previously
            this was read from a module-level global, which made the
            function unusable in isolation.

    Returns:
        Top-1 accuracy in percent (previously nothing was returned).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        data, target = data.to(device), target.to(device)
        output = model.quantize_inference(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    print('\nTest set: Quant Model Accuracy: {:.2f}%\n'.format(100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)
if __name__ == "__main__":
    # QAT experiment driver: first train a full-precision model from scratch
    # (collecting its gradient statistics), then run QAT from scratch for
    # every (type, bit-width, e-bits) combination, checkpointing every 5 epochs.
    parser = argparse.ArgumentParser(description='QAT Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    parser.add_argument('-e','--epochs', default=20, type=int, metavar='EPOCHS', help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=1, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('-wd','--weight_decay',default=0.0001,type=float,metavar='WD',help='lr schduler weight decay',dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    args = parser.parse_args()
    batch_size = args.batch_size
    seed = 1
    epochs = args.epochs
    lr = args.lr
    # momentum = 0.5
    weight_decay = args.wd
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    # NOTE(review): the default '-m resnet18' (lower-case) matches none of
    # these branches, leaving `model` undefined — confirm the intended default
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    # NOTE(review): second SummaryWriter created with the same log_dir; the
    # first instance above is never closed
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    # full_file = 'ckpt/cifar10_' + args.model + '.pt'
    # model.load_state_dict(torch.load(full_file))
    model.to(device)
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    load_qat = False
    ckpt_prefix = 'ckpt/qat/'+ args.model + '/'
    loss_sum = 0.
    grad_dict_sum = {}
    grad_dict_avg = {}
    for name,param in model.named_parameters():
        grad_dict_sum[name] = torch.zeros_like(param)
        grad_dict_avg[name] = torch.zeros_like(param)
    # full precision from scratch
    for epoch in range(1, epochs+1):
        # train the original model and collect its gradient distribution
        loss,grad_dict = train(model, device, train_loader, optimizer, epoch)
        # print('loss:%f' % loss_avg)
        writer.add_scalar('Full.loss',loss,epoch)
        for name,grad in grad_dict.items():
            writer.add_histogram('Full.'+name+'_grad',grad,global_step=epoch)
        loss_sum += loss
        loss_avg = loss_sum / epoch
        for name,grad in grad_dict.items():
            grad_dict_sum[name] += grad_dict[name]  # accumulate over epochs
            grad_dict_avg[name] = grad_dict_sum[name] / epoch  # epoch-wise average
        ckpt = {
            'epoch' : epoch,
            'loss' : loss,
            'loss_sum' : loss_sum,
            'loss_avg' : loss_avg,
            'grad_dict_avg' : grad_dict_avg
        }
        # checkpoint every 5 epochs
        if epoch % 5 == 0:
            subdir = 'epoch_%d/' % epoch
            if not os.path.isdir(ckpt_prefix+ subdir):
                os.makedirs(ckpt_prefix+ subdir, mode=0o777)
                os.chmod(ckpt_prefix+ subdir, mode=0o777)
            torch.save(ckpt,ckpt_prefix+ subdir +'full.pt')
        lr_scheduler.step()
    # loss_avg,grad_dict = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
    # print('qat_loss:%f' % loss_avg)
    # for name,grad in grad_dict.items():
    #     writer.add_histogram('qat_'+name+'_grad',grad,global_step=epoch)
    # QAT from scratch
    quant_type_list = ['INT','POT','FLOAT']
    gol._init()
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # the bias quantization table only needs to be set once per quant type;
        # INT bit-widths are large, so a lookup table would be too costly —
        # plain rounding (_round) is used for INT instead
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                # skip configurations that already finished all 20 epochs
                if load_qat is True and osp.exists(ckpt_prefix+'epoch_20/'+title+'.pt'):
                    continue
                print('\nQAT: '+title)
                # model_ptq = AlexNet()
                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()
                # set the quantization value table
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # model_ptq.load_state_dict(torch.load(full_file))
                model_ptq.to(device)
                model_ptq.quantize(quant_type,num_bits,e_bits)
                model_ptq.train()
                loss_sum = 0.
                grad_dict_sum = {}
                grad_dict_avg = {}
                # NOTE(review): accumulators are sized from `model`, not
                # `model_ptq`, and `optimizer`/`lr_scheduler` still drive
                # `model`'s parameters — confirm this is intentional
                for name,param in model.named_parameters():
                    grad_dict_sum[name] = torch.zeros_like(param)
                    grad_dict_avg[name] = torch.zeros_like(param)
                for epoch in range(1, epochs+1):
                    loss,grad_dict = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
                    # print('loss:%f' % loss_avg)
                    writer.add_scalar(title+'.loss',loss,epoch)
                    for name,grad in grad_dict.items():
                        writer.add_histogram(title+'.'+name+'_grad',grad,global_step=epoch)
                    loss_sum += loss
                    loss_avg = loss_sum / epoch
                    for name,param in model.named_parameters():
                        grad_dict_sum[name] += grad_dict[name]  # accumulate over epochs
                        grad_dict_avg[name] = grad_dict_sum[name] / epoch  # epoch-wise average
                    ckpt = {
                        'epoch' : epoch,
                        'loss' : loss,
                        'loss_sum' : loss_sum,
                        'loss_avg' : loss_avg,
                        # 'grad_dict' : grad_dict,
                        # 'grad_dict_sum' : grad_dict_sum,
                        'grad_dict_avg' : grad_dict_avg
                    }
                    # checkpoint every 5 epochs
                    if epoch % 5 == 0:
                        subdir = 'epoch_%d/' % epoch
                        if not os.path.isdir(ckpt_prefix+ subdir):
                            os.makedirs(ckpt_prefix+ subdir, mode=0o777)
                            os.chmod(ckpt_prefix+ subdir, mode=0o777)
                        torch.save(ckpt,ckpt_prefix+subdir + title+'.pt')
                    lr_scheduler.step()
    writer.close()
    # # model.eval()
    # # full_inference(model, test_loader)
    # num_bits = 8
    # e_bits = 0
    # gol._init()
    # print("qat: INT8")
    # model.quantize('INT',num_bits,e_bits)
    # print('Quantization bit: %d' % num_bits)
    # if load_quant_model_file is not None:
    #     model.load_state_dict(torch.load(load_quant_model_file))
    #     print("Successfully load quantized model %s" % load_quant_model_file)
    # else:
    #     model.train()
    #     for epoch in range(1, epochs+1):
    #         quantize_aware_training(model, device, train_loader, optimizer, epoch)
    #     # for epoch in range(epochs1 + 1, epochs2 + 1):
    #     #     quantize_aware_training(model, device, train_loader, optimizer2, epoch)
    #     model.eval()
    #     # torch.save(model.state_dict(), save_file)
    #     model.freeze()
    # # for name, param in model.named_parameters():
    # #     print(name)
    # #     print(param.data)
    # #     print('----------')
    # # for param_tensor, param_value in model.state_dict().items():
    # #     print(param_tensor, "\t", param_value)
    # quantize_inference(model, test_loader)
## update: <br>2023.4.24<br>
- 解决的问题
1. 解决了权值参数相似度不正常,无法拟合曲线的问题。
2. 修改了一些小bug
- 思路记录 <br>
在ResNet中遇到的权值参数相似度不太合理的问题在MobileNetV2中更加显著,使得MobileNet无法拟合曲线。考虑到PTQ后推理精度数据比较正常,则问题可能出在了权值参数的相似度计算上。<br>
我仔细检查了是否成功fold了BN,以及相应的ratio,计算js散度时是否使用的是匹配的两个权值参数,发现都没有问题。但在具体查看权值参数的数据时,发现了一些层的权值参数数据异常,也是这些异常层的js非常大干扰了整体的js计算。<br>
从tensorboard来观察,这些层的数据分布相似度应该与全精度模型很像,但js计算结果没能反应出这一点,对这些层的数据重点观察,他们有很多1或-1的值,因此我想到可能是对量化前后的模型权值参数先进行的normalize操作导致了数据分布变得不合理,进而导致了问题。<br>
考虑到normalize操作的本意应该是为了将量化前后的模型权值参数归一到同一个scale进而方便使用js散度计算距离,则可以考虑将量化后的模型的权值参数通过dequantize来恢复到与全精度模型相近的scale,而后再使用js散度计算距离。我将上述过程命名为fakefreeze. 经过实践,效果很好,重新计算的js散度反映的权值参数相似程度与tensorboard直接对数据分布的观察比较一致。
<img src = "fig/defreeze.png" class="h-90 auto">
## update: <br>2023.4.23<br>
1. 实现了MobileNetV2的PTQ量化
2. 目前存在一些问题:<br>
虽然PTQ量化后的acc比较合理,但权值参数相似度比较不正常。无法拟合出合理的曲线。<br>
对数据分析并通过tensorboard观察,一些计算量/参数量大的层在图中观察数据分布相似度较高,但计算出的js散度较大,又因为其加权权重较大,导致了整体的js距离很大。
3. 后续将检查出现问题的原因是代码设计问题还是加权系数不适用于MobileNetV2
from model import *
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import os
import os.path as osp
def train(model, device, train_loader, optimizer, epoch):
    """Run one epoch of cross-entropy training; logs every 50 batches."""
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        if step % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, step * len(inputs), len(train_loader.dataset), batch_loss.item()
            ))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
lossLayer = torch.nn.CrossEntropyLoss(reduction='sum')
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += lossLayer(output, target).item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
test_loss, 100. * correct / len(test_loader.dataset)
))
if __name__ == "__main__":
    # Full-precision CIFAR-10 training with a three-stage schedule of
    # stepped-down learning rates (0.01 -> 0.001 -> 0.0001).
    batch_size = 128
    test_batch_size = 128
    seed = 1
    epochs1 = 15
    epochs2 = epochs1+10
    epochs3 = epochs2+10
    lr1 = 0.01
    lr2 = 0.001
    lr3 = 0.0001
    momentum = 0.5
    save_model = True
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=test_batch_size, shuffle=True, num_workers=1, pin_memory=True
    )
    # model = AlexNet_BN().to(device)
    model = resnet18().to(device)
    # one optimizer per learning-rate stage, sharing the same momentum
    optimizer1 = optim.SGD(model.parameters(), lr=lr1, momentum=momentum)
    optimizer2 = optim.SGD(model.parameters(), lr=lr2, momentum=momentum)
    optimizer3 = optim.SGD(model.parameters(), lr=lr3, momentum=momentum)
    for epoch in range(1, epochs1 + 1):
        train(model, device, train_loader, optimizer1, epoch)
        test(model, device, test_loader)
    for epoch in range(epochs1 + 1, epochs2 + 1):
        train(model, device, train_loader, optimizer2, epoch)
        test(model, device, test_loader)
    for epoch in range(epochs2 + 1, epochs3 + 1):
        train(model, device, train_loader, optimizer3, epoch)
        test(model, device, test_loader)
    if save_model:
        if not osp.exists('ckpt'):
            os.makedirs('ckpt')
        torch.save(model.state_dict(), 'ckpt/cifar10_ResNet18.pt')
\ No newline at end of file
from model import *
# Smoke test: build MobileNetV2 and attach the INT-8 quantization wrappers.
model = MobileNetV2()
model.quantize('INT',8,0)
# for name, module in model.named_modules():
#     print(name)
# print('==============================')
# for name, param in model.named_parameters():
#     print(name)
import torch
import torch.nn as nn
import torch.nn.functional as F
def js_div(p_output, q_output, get_softmax=True):
    """Jensen-Shannon divergence between two logit/probability tensors.

    Args:
        p_output, q_output: tensors of identical shape. When get_softmax is
            True they are treated as logits and softmax-normalized first;
            otherwise they must already be probability distributions.

    Returns:
        A scalar tensor with the (sum-reduced) JS divergence.
    """
    KLDivLoss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # F.softmax without an explicit dim is deprecated and ambiguous;
        # reproduce the legacy implicit-dim rule explicitly so behavior is
        # unchanged (dim 0 for 0/1/3-D inputs, dim 1 otherwise).
        dim = 0 if p_output.dim() in (0, 1, 3) else 1
        p_output = F.softmax(p_output, dim=dim)
        q_output = F.softmax(q_output, dim=dim)
    log_mean_output = ((p_output + q_output)/2).log()
    return (KLDivLoss(log_mean_output, p_output) + KLDivLoss(log_mean_output, q_output))/2
def ebit_list(quant_type, num_bits):
    """Exponent-width options: 1..num_bits-2 for FLOAT, a dummy [0] otherwise."""
    if quant_type != 'FLOAT':
        return [0]
    return list(range(1, num_bits - 1))
def numbit_list(quant_type):
    """Bit-width sweep for each quantization type."""
    if quant_type == 'INT':
        return list(range(2, 17))
    # POT and FLOAT both sweep 2..8 bits
    # num_bit_list = [8]
    return list(range(2, 9))
def build_bias_list(quant_type):
    """High-precision quantization table used for biases."""
    if quant_type == 'POT':
        return build_pot_list(8)
    return build_float_list(16, 7)
def build_list(quant_type, num_bits, e_bits):
    """Quantization value table for the requested format."""
    return build_pot_list(num_bits) if quant_type == 'POT' else build_float_list(num_bits, e_bits)
def build_pot_list(num_bits):
    """Power-of-two value table: 0 and ±2**i, with the maximum capped at 1."""
    values = {0.}
    # the exponent runs up to 0, so the largest POT value is 1
    for expo in range(-2 ** (num_bits - 1) + 2, 1):
        values.add(2. ** expo)
        values.add(-2. ** expo)
    # plist = plist.mul(1.0 / torch.max(plist))
    return torch.Tensor(list(values))
def build_float_list(num_bits, e_bits):
    """Enumerate every representable value of a custom sign/exponent/mantissa float."""
    m_bits = num_bits - 1 - e_bits
    step = 2 ** (-m_bits)  # gap between adjacent mantissa values
    min_e = -2 ** (e_bits - 1) + 1
    values = {0.}
    # subnormal range: smallest exponent, no implicit leading 1
    for m in range(1, 2 ** m_bits):
        flt = (m * step) * (2 ** min_e)
        values.add(flt)
        values.add(-flt)
    # normal range: implicit leading 1 in the mantissa
    for e in range(min_e + 1, 2 ** (e_bits - 1) + 1):
        scale = 2 ** e
        for m in range(2 ** m_bits):
            flt = (1. + m * step) * scale
            values.add(flt)
            values.add(-flt)
    return torch.Tensor(list(values))
def fold_ratio(layer, par_ratio, flop_ratio):
    """Fold each bn/downsample entry's ratios into the preceding layer.

    Mutates par_ratio and flop_ratio in place and returns them. `layer`
    comes from model.named_parameters(), so downsample entries are present.
    """
    for pos, name in enumerate(layer):
        if 'bn' in name or 'sample.1' in name:
            par_ratio[pos - 1] += par_ratio[pos]
            flop_ratio[pos - 1] += flop_ratio[pos]
    return par_ratio, flop_ratio
def fold_model(model):
    """Fold every BN (including downsample BN) into its preceding conv, in place.

    Relies on named_modules() traversal order: the module visited immediately
    before a 'bn'/'sample.1' module is the conv it normalizes.
    """
    seen = []
    for name, module in model.named_modules():
        seen.append(module)
        if 'bn' in name or 'sample.1' in name:
            # downsample convs are included here too (previously missed)
            seen[-2] = fold_bn(seen[-2], module)
    return model
# def fold_model(model):
# last_conv = None
# last_bn = None
# for name, module in model.named_modules():
# if isinstance(module, nn.Conv2d):
# # 如果当前模块是卷积层,则将其 "fold" 到上一个 BN 层中
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# last_conv = module
# elif isinstance(module, nn.BatchNorm2d):
# # 如果当前模块是 BN 层,则将其 "fold" 到上一个卷积层中
# last_bn = module
# if last_conv is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# # 处理最后一个 BN 层
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# return model
def fold_bn(conv, bn):
    """Fuse a BatchNorm2d into the preceding Conv2d, in place.

    Rewrites conv.weight / conv.bias so that conv(x) equals bn(conv_orig(x))
    in eval mode, and returns the modified conv.

    Fixes vs. the previous version:
    - no longer dereferences bn.weight / bn.bias when bn.affine is False
      (they are None in that case, which raised AttributeError);
    - the affine=False path now reshapes the per-channel scale to
      (C, 1, 1, 1) so it scales output channels, matching the affine path.
    """
    std = torch.sqrt(bn.running_var + bn.eps)
    feat = bn.num_features
    weight = conv.weight.data
    bias = conv.bias.data if conv.bias is not None else None
    if bn.affine:
        gamma_ = bn.weight.data / std
        beta = bn.bias.data
    else:
        gamma_ = 1 / std
        beta = 0.
    # scale each output channel of the conv kernel
    weight = weight * gamma_.view(feat, 1, 1, 1)
    if bias is not None:
        bias = gamma_ * bias - gamma_ * bn.running_mean + beta
    else:
        bias = beta - gamma_ * bn.running_mean
    conv.weight.data = weight
    if conv.bias is None:
        # conv was created with bias=False; attach the folded bias
        conv.bias = nn.Parameter(bias)
    else:
        conv.bias.data = bias
    return conv
\ No newline at end of file
......@@ -30,9 +30,21 @@ def get_children(model: torch.nn.Module):
# print(flatt_children)
return flatt_children
# Yield the leaf submodules of `module`, flattening wrapper containers.
def get_all_child_modules(module):
    for name, child in module.named_children():
        if isinstance(child, nn.Sequential):
            # recurse through Sequential wrappers
            yield from get_all_child_modules(child)
        elif len(list(child.children())) > 0:
            # other containers: yield their direct children only (one level,
            # no further recursion into grandchildren)
            yield from child.children()
        else:
            yield child
def filter_fn(module, n_inp, outp_shape):
    """Keep only conv/bn/fc/avg/relu entries for a stats/hook tool."""
    # if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.ReLU,torch.nn.BatchNorm2d,torch.nn.Linear,torch.nn.AdaptiveAvgPool2d)):
    # NOTE(review): the `in` checks below require `module` to be a string
    # (a layer name); an actual nn.Module argument would raise TypeError —
    # confirm what the caller passes
    if 'conv' in module or 'bn' in module or 'fc' in module or 'avg' in module or 'relu' in module:
        return True
    return False
if __name__ == "__main__":
......
......@@ -108,16 +108,16 @@ class ResNet(nn.Module):
def fakefreeze(self):
pass
def quantize_inference(self, x):
def quantize_inference(self, x, quant_type):
qx = self.qconvbnrelu1.qi.quantize_tensor(x)
qx = self.qconvbnrelu1.quantize_inference(qx)
qx = self.layer1.quantize_inference(qx)
qx = self.layer2.quantize_inference(qx)
qx = self.layer3.quantize_inference(qx)
qx = self.layer4.quantize_inference(qx)
qx = self.qavgpool1.quantize_inference(qx)
qx = self.qconvbnrelu1.quantize_inference(qx, quant_type)
qx = self.layer1.quantize_inference(qx, quant_type)
qx = self.layer2.quantize_inference(qx, quant_type)
qx = self.layer3.quantize_inference(qx, quant_type)
qx = self.layer4.quantize_inference(qx, quant_type)
qx = self.qavgpool1.quantize_inference(qx, quant_type)
qx = qx.view(qx.size(0), -1)
qx = self.qfc1.quantize_inference(qx)
qx = self.qfc1.quantize_inference(qx, quant_type)
qx = self.qfc1.qo.dequantize_tensor(qx)
......@@ -209,18 +209,18 @@ class BasicBlock(nn.Module):
self.qrelu1.freeze(qi = self.qelementadd.qo)
return self.qrelu1.qi # relu后的qo可用relu统计的qi
def quantize_inference(self, x, quant_type):
    """Integer-domain forward pass for a quantized BasicBlock.

    Resolves unmerged diff residue: the original span contained both the
    old `(self, x)` and the new `(self, x, quant_type)` versions of every
    line; this keeps the post-change version that threads quant_type.

    No quantize/dequantize at the block boundary: this is an interior
    block, so x is assumed to already be in the quantized domain and
    every sub-layer keeps it there.
    """
    identity = x
    out = self.qconvbnrelu1.quantize_inference(x, quant_type)
    out = self.qconvbn1.quantize_inference(out, quant_type)
    if self.downsample is not None:
        # Align the residual branch before the element-wise add.
        identity = self.qconvbn2.quantize_inference(identity, quant_type)
    # The residual add is itself a quantized two-input op.
    out = self.qelementadd.quantize_inference(out, identity, quant_type)
    out = self.qrelu1.quantize_inference(out, quant_type)
    return out
......@@ -318,19 +318,19 @@ class Bottleneck(nn.Module):
self.qrelu1.freeze(qi = self.qelementadd.qo) # 需要自己统计qi
return self.qrelu1.qi # relu后的qo可用relu统计的qi
def quantize_inference(self, x, quant_type):
    """Integer-domain forward pass for a quantized Bottleneck block.

    Resolves unmerged diff residue (old and new call lines were both
    present); keeps the post-change version threading quant_type.
    Interior block: x stays in the quantized domain throughout.
    """
    identity = x
    out = self.qconvbnrelu1.quantize_inference(x, quant_type)
    out = self.qconvbnrelu2.quantize_inference(out, quant_type)
    out = self.qconvbn1.quantize_inference(out, quant_type)
    if self.downsample is not None:
        # Align the residual branch before the element-wise add.
        identity = self.qconvbn2.quantize_inference(identity, quant_type)
    out = self.qelementadd.quantize_inference(out, identity, quant_type)
    out = self.qrelu1.quantize_inference(out, quant_type)
    return out
......@@ -408,10 +408,10 @@ class MakeLayer(nn.Module):
return qo # 供后续的层用
def quantize_inference(self, x, quant_type):
    """Run each residual block of this layer group, in insertion order,
    entirely in the quantized domain.

    Resolves unmerged diff residue (old/new def lines both present);
    keeps the post-change version threading quant_type to every block.
    """
    for _, block in self.blockdict.items():
        x = block.quantize_inference(x, quant_type)  # each block implements its own quantize_inference
    return x
......
......@@ -105,10 +105,13 @@ class QParam(nn.Module):
self.scale, self.zero_point = calcScaleZeroPoint(self.min, self.max, self.qmax)
def quantize_tensor(self, tensor, quant_type=None):
    """Quantize *tensor* with this QParam's (scale, zero_point, qmax).

    quant_type: optional override of the numeric format recorded at
    calibration time (self.quant_type), for re-running inference under a
    different format without re-freezing. Resolves unmerged diff residue
    (the old no-override signature was still present above the new one).
    """
    # `is None` instead of `== None`: identity is the correct idiom and
    # avoids invoking __eq__ on tensor-like arguments.
    if quant_type is None:
        quant_type = self.quant_type
    # Calls the module-level quantize_tensor() helper (same name as this method).
    return quantize_tensor(quant_type, tensor, self.scale, self.zero_point, self.qmax)
def dequantize_tensor(self, q_x, quant_type=None):
    """Map quantized q_x back to float using this QParam's scale/zero_point.

    quant_type is accepted for signature symmetry with quantize_tensor()
    but is unused: dequantization only needs scale and zero_point.
    Resolves unmerged diff residue (both old and new def lines present).
    """
    # Calls the module-level dequantize_tensor() helper.
    return dequantize_tensor(q_x, self.scale, self.zero_point)
# 该方法保证了可以从state_dict里恢复
......@@ -146,7 +149,7 @@ class QModule(nn.Module):
def freeze(self):
    # Default no-op hook; concrete QModule subclasses override it.
    pass  # empty statement
def quantize_inference(self, x, quant_type):
    """Abstract integer-domain forward; every QModule subclass must override.

    Resolves unmerged diff residue (old and new def lines both present);
    keeps the post-change signature that carries quant_type.
    """
    raise NotImplementedError('quantize_inference should be implemented.')
......@@ -219,13 +222,16 @@ class QConv2d(QModule):
return x
# 利用公式 q_a = M(\sigma(q_w-Z_w)(q_x-Z_x) + q_b)
def quantize_inference(self, x): # 此处input为已经量化的qx
def quantize_inference(self, x, quant_type): # 此处input为已经量化的qx
x = x - self.qi.zero_point
x = self.conv_module(x)
x = self.M * x
if self.quant_type is not 'POT':
x = get_nearest_val(self.quant_type,x)
# if self.quant_type is not 'POT':
# x = get_nearest_val(self.quant_type,x)
x = get_nearest_val(quant_type,x)
x = x + self.qo.zero_point
return x
......@@ -279,14 +285,15 @@ class QLinear(QModule):
return x
def quantize_inference(self, x):
def quantize_inference(self, x, quant_type):
x = x - self.qi.zero_point
x = self.fc_module(x)
x = self.M * x
if self.quant_type is not 'POT':
x = get_nearest_val(self.quant_type,x)
# if self.quant_type is not 'POT':
# x = get_nearest_val(self.quant_type,x)
x = get_nearest_val(quant_type,x)
x = x + self.qo.zero_point
......@@ -317,7 +324,7 @@ class QReLU(QModule):
return x
def quantize_inference(self, x):
def quantize_inference(self, x, quant_type):
x = x.clone()
# x[x < self.qi.zero_point] = self.qi.zero_point
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
......@@ -351,7 +358,7 @@ class QMaxPooling2d(QModule):
return x
def quantize_inference(self, x, quant_type):
    """Max-pool in the quantized domain.

    Max pooling only selects existing values, so no rescaling or
    re-rounding is needed; quant_type is accepted for interface
    uniformity only. Resolves unmerged diff residue (old and new def
    lines both present).
    """
    return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)
class QConvBNReLU(QModule):
......@@ -457,13 +464,15 @@ class QConvBNReLU(QModule):
return x
def quantize_inference(self, x):
def quantize_inference(self, x, quant_type):
x = x - self.qi.zero_point
x = self.conv_module(x)
x = self.M * x
if self.quant_type is not 'POT':
x = get_nearest_val(self.quant_type,x)
# if self.quant_type is not 'POT':
# x = get_nearest_val(self.quant_type,x)
x = get_nearest_val(quant_type,x)
x = x + self.qo.zero_point
......@@ -575,14 +584,17 @@ class QConvBN(QModule):
return x
def quantize_inference(self, x):
def quantize_inference(self, x, quant_type):
x = x - self.qi.zero_point
x = self.conv_module(x)
x = self.M * x
if self.quant_type is not 'POT':
x = get_nearest_val(self.quant_type,x)
# print(self.quant_type)
# if self.quant_type is not 'POT':
# x = get_nearest_val(self.quant_type,x)
x = get_nearest_val(quant_type,x)
x = x + self.qo.zero_point
# x.clamp_(min=0)
......@@ -626,13 +638,14 @@ class QAdaptiveAvgPool2d(QModule):
return x
def quantize_inference(self, x, quant_type):
    """Global average pool in the quantized domain.

    Averaging mixes values, so the result is rescaled by M and re-rounded
    onto the quant_type grid. Resolves unmerged diff residue (old
    self.quant_type rounding kept commented out in the diff).
    NOTE(review): no qi/qo zero-point shift is applied here, unlike the
    conv path — that matches the original code, but confirm it is
    intentional for the pooling layer.
    """
    x = F.adaptive_avg_pool2d(x, (1, 1))  # pool directly on quantized values
    x = self.M * x
    x = get_nearest_val(quant_type, x)
    return x
......@@ -662,7 +675,7 @@ class QModule_2(nn.Module):
def fakefreeze(self):
    # Intentionally empty: default no-op hook for two-input quantized
    # modules (subclasses may override).
    pass
def quantize_inference(self, x, quant_type):
    """Abstract quantized forward for QModule_2 subclasses; must be overridden.

    Resolves unmerged diff residue (old and new def lines both present);
    keeps the post-change signature that carries quant_type.
    """
    raise NotImplementedError('quantize_inference should be implemented.')
......@@ -718,15 +731,16 @@ class QElementwiseAdd(QModule_2):
return x
def quantize_inference(self, x0, x1): # 此处input为已经量化的qx
def quantize_inference(self, x0, x1, quant_type): # 此处input为已经量化的qx
x0 = x0 - self.qi0.zero_point
x1 = x1 - self.qi1.zero_point
x = self.M0 * (x0 + x1*self.M1)
if self.quant_type is not 'POT':
x = get_nearest_val(self.quant_type,x)
# if self.quant_type is not 'POT':
# x = get_nearest_val(self.quant_type,x)
x = get_nearest_val(quant_type,x)
x = x + self.qo.zero_point
......
# -*- coding: utf-8 -*-
from torch.serialization import load
from model import *
from extract_ratio import *
from utils import *
import gol
import openpyxl
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import torch.utils.bottleneck as bn
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def direct_quantize(model, test_loader, device):
    """Calibration pass: feed at most 500 batches through quantize_forward
    so the quantization parameters can collect activation statistics."""
    for batch_no, (data, _target) in enumerate(test_loader, 1):
        model.quantize_forward(data.to(device)).cpu()
        # enumerate starts at 1, so this first fires at batch 500.
        if batch_no % 500 == 0:
            break
    print('direct quantization finish')
def full_inference(model, test_loader, device):
    """Evaluate the full-precision model over test_loader.

    Returns top-1 accuracy as a percentage of the whole dataset.
    """
    n_correct = 0
    for data, target in test_loader:
        logits = model(data.to(device)).cpu()
        predicted = logits.argmax(dim=1, keepdim=True)
        n_correct += predicted.eq(target.view_as(predicted)).sum().item()
    accuracy = 100. * n_correct / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def quantize_inference(model, test_loader, device, quant_type):
    """Evaluate the quantized model (integer-domain inference) under the
    numeric format *quant_type*.

    Returns top-1 accuracy as a percentage of the whole dataset.
    """
    n_correct = 0
    for data, target in test_loader:
        logits = model.quantize_inference(data.to(device), quant_type).cpu()
        predicted = logits.argmax(dim=1, keepdim=True)
        n_correct += predicted.eq(target.view_as(predicted)).sum().item()
    accuracy = 100. * n_correct / len(test_loader.dataset)
    print('Test set: Quant Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def js_div(p_output, q_output, get_softmax=True):
    """
    Jensen-Shannon divergence between two sets of logits, summed over all
    elements: JS(p, q) = (KL(p||m) + KL(q||m)) / 2 with m = (p + q) / 2.
    With get_softmax=False the inputs are used as distributions directly.
    """
    kl_loss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # Convert raw logits into probability distributions first.
        p_output = F.softmax(p_output)
        q_output = F.softmax(q_output)
    log_mean = ((p_output + q_output) / 2).log()
    return (kl_loss(log_mean, p_output) + kl_loss(log_mean, q_output)) / 2
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='ResNet18')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    # parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')

    # Runtime parameters.
    args = parser.parse_args()
    batch_size = args.batch_size
    num_workers = args.workers
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                         transform=transforms.Compose([
                             transforms.RandomCrop(32, padding=2),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )

    # model = AlexNet_BN()
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()

    writer = SummaryWriter(log_dir='log/' + args.model + '/ptq')

    # Load the pretrained FP32 checkpoint and record its accuracy baseline.
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    model.to(device)

    load_ptq = True
    ptq_file_prefix = 'ckpt/cifar10_' + args.model + '_ptq_'

    model.eval()
    full_acc = full_inference(model, test_loader, device)
    model_fold = fold_model(model)

    full_params = []

    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # print(layer)
    # Rebuild `layer` from the model itself: one entry per parameterized
    # module (the prefix of each '.weight' parameter name).
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            layer.append(pre)
            # print(name)
    print('===================')
    # print(layer)
    par_ratio, flop_ratio = fold_ratio(layer, par_ratio, flop_ratio)
    # sys.exit()

    # Collect L2-normalized parameters of the folded model; bn and
    # downsample-bn ('sample.1') entries are skipped.
    for name, param in model_fold.named_parameters():
        if 'bn' in name or 'sample.1' in name:
            continue
        param_norm = F.normalize(param.data.cpu(),p=2,dim=-1)
        full_params.append(param_norm)
        writer.add_histogram(tag='Full_' + name + '_data', values=param.data)

    gol._init()

    quant_type_list = ['INT','POT','FLOAT']
    title_list = []
    js_flops_list = []
    js_param_list = []
    ptq_acc_list = []
    acc_loss_list = []

    if args.model == 'ResNet18':
        model_ptq = resnet18()
    elif args.model == 'ResNet50':
        model_ptq = resnet50()
    elif args.model == 'ResNet152':
        model_ptq = resnet152()

    if load_ptq is True and osp.exists(ptq_file_prefix + 'POT_6' + '.pt'):
        model_ptq.quantize('POT',6,0)
        model_ptq.load_state_dict(torch.load(ptq_file_prefix + 'POT_6' + '.pt'))
        model_ptq.to(device)
        print('Successfully load ptq model: ' + 'POT_6')
        # The POT lookup tables must exist before freeze() can run.
        # NOTE(review): indentation reconstructed from a whitespace-mangled
        # source — confirm these five lines belong inside this if-block.
        bias_list = build_bias_list('POT')
        gol.set_value(bias_list, is_bias=True)
        plist = build_list('POT', 6, 0)
        gol.set_value(plist)
        model_ptq.freeze()

    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # The bias quantization table only needs to be set once per category.
        # INT bit-widths are large, so a lookup table would be too costly;
        # plain rounding is used for INT instead.
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                print('\nPTQ: '+title)
                title_list.append(title)

                # Install the value lookup table for this format.
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)

                ptq_acc = quantize_inference(model_ptq, test_loader, device, quant_type)
                ptq_acc_list.append(ptq_acc)
                acc_loss = (full_acc - ptq_acc) / full_acc
                acc_loss_list.append(acc_loss)

                idx = -1
                # Accumulate JS divergence weighted by FLOPs / parameter ratios.
                js_flops = 0.
                js_param = 0.
                for name, param in model_ptq.named_parameters():
                    # if '.' not in name or 'bn' in name:
                    if 'bn' in name or 'sample.1' in name:
                        continue
                    writer.add_histogram(tag=title +':'+ name + '_data', values=param.data)
                    idx = idx + 1
                    # resnet names contain several '.'; rebuild the module prefix.
                    # prefix = name.split('.')[0]
                    n = name.split('.')
                    prefix = '.'.join(n[:len(n) - 1])
                    # weight and bias currently share the layer ratio 1:1; they
                    # could be weighted differently here (e.g. 8:2).
                    if prefix in layer:
                        layer_idx = layer.index(prefix)
                        ptq_param = param.data.cpu()
                        # L2-normalize before comparing distributions.
                        ptq_norm = F.normalize(ptq_param,p=2,dim=-1)
                        writer.add_histogram(tag=title +':'+ name + '_data', values=ptq_param)
                        js = js_div(ptq_norm,full_params[idx])  # JS distance between folded pre/post-quant params
                        js = js.item()
                        if js < 0.:
                            js = 0.
                        js_flops = js_flops + js * flop_ratio[layer_idx]
                        js_param = js_param + js * par_ratio[layer_idx]
                js_flops_list.append(js_flops)
                js_param_list.append(js_param)

                print(title + ': js_flops: %f js_param: %f acc_loss: %f' % (js_flops, js_param, acc_loss))

    # Dump results to xlsx.
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.cell(row=1,column=1,value='FP32-acc')
    worksheet.cell(row=1,column=2,value=full_acc)
    worksheet.cell(row=3,column=1,value='title')
    worksheet.cell(row=3,column=2,value='js_flops')
    worksheet.cell(row=3,column=3,value='js_param')
    worksheet.cell(row=3,column=4,value='ptq_acc')
    worksheet.cell(row=3,column=5,value='acc_loss')
    for i in range(len(title_list)):
        worksheet.cell(row=i+4, column=1, value=title_list[i])
        worksheet.cell(row=i+4, column=2, value=js_flops_list[i])
        worksheet.cell(row=i+4, column=3, value=js_param_list[i])
        worksheet.cell(row=i+4, column=4, value=ptq_acc_list[i])
        worksheet.cell(row=i+4, column=5, value=acc_loss_list[i])
    workbook.save('POT_ptq_result_' + args.model + '.xlsx')
    writer.close()

    # Mirror the same results into a plain-text file.
    ft = open('POT_ptq_result_' + args.model + '.txt','w')
    print('title_list:',file=ft)
    print(" ".join(title_list),file=ft)
    print('js_flops_list:',file=ft)
    print(" ".join(str(i) for i in js_flops_list), file=ft)
    print('js_param_list:',file=ft)
    print(" ".join(str(i) for i in js_param_list), file=ft)
    print('ptq_acc_list:',file=ft)
    print(" ".join(str(i) for i in ptq_acc_list), file=ft)
    print('acc_loss_list:',file=ft)
    print(" ".join(str(i) for i in acc_loss_list), file=ft)
    ft.close()
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
1575.126077030527 980.8324825038856 447.4871705577316 203.8177281153719 94.1658153206219 44.73944284292641 21.730716696253086 10.687903335080755 5.2935009924434775 2.6865031426677675 1.345978185346981 0.6738058971124082 0.34590930672785625 0.16620132379306904 0.09185943251823848 1575.0264663456858 767.5068295225365 59.80415491853343 17.32189175246257 17.160386413787755 17.15972613238827 17.160655554562823 547.0296470821636 228.09197712606053 153.9307141697144 102.8744121697856 63.04910506966272 11.893784458090247 49.68929151890493 30.72369295281706 4.336553462330601 4.810517948583543 25.62475856077897 16.963161148931942 1.7730239215421446 1.2492962287085048 4.844787354857122 14.21240714817728 10.605240065475499 0.7963437572573967 0.32131797583853794 1.3061700599586734 4.844787523330232
js_param_list:
2231.9475377209037 1458.7430817370525 656.866021106162 290.661557510572 132.0211812900384 62.06574209045005 29.96287022906031 14.768791159744465 7.344364349715033 3.757019554618513 1.896182903527843 0.9241808205303167 0.45857306080932436 0.2269121111102425 0.12261352661167306 2231.901673608193 1143.359470049635 82.82637961696304 24.06635574752677 23.843136397545287 23.842358607630732 23.84306741528584 799.9775130544906 323.8336430582792 218.61973701520765 143.18120884416584 88.72081224892759 16.52024912262558 68.08470436272326 43.20128678260041 6.041579655336327 6.686327875421352 34.6238061335222 24.064747116161215 2.491426419987749 1.7403336017725606 6.690842031928857 18.94797143083834 15.257619881935225 1.0957373786589855 0.44768947355373956 1.7705741794826835 6.690842738428997
ptq_acc_list:
10.0 10.0 10.0 78.52 86.7 89.95 90.73 90.96 90.64 87.4 74.21 52.1 40.65 30.51 20.3 10.0 10.0 10.0 39.21 40.15 44.33 34.83 10.0 19.98 10.0 34.59 85.82 80.56 57.06 88.62 90.17 81.06 68.03 89.75 90.85 88.77 10.0 72.61 90.02 91.08 89.55 10.0 10.0
acc_loss_list:
0.8900978129464776 0.8900978129464776 0.8900978129464776 0.1370480272557424 0.04714803824596101 0.011429827453566238 0.0028574568633914815 0.0003297065611605796 0.0038465765468732203 0.03945488515221441 0.18441586987581055 0.4274096054511484 0.5532476096274316 0.6646884272997032 0.7768985602813496 0.8900978129464776 0.8900978129464776 0.8900978129464776 0.5690735245631388 0.5587427189801077 0.5128036047917354 0.6172106824925816 0.8900978129464776 0.7804154302670623 0.8900978129464776 0.6198483349818661 0.05681943070667109 0.11462798109682375 0.3728981206726013 0.026046818331684696 0.00901197933838876 0.10913287174414764 0.2523354214748873 0.013627871194636718 0.0015386306187493194 0.024398285525881955 0.8900978129464776 0.20200021980437408 0.010660512144191657 -0.0009891196834817388 0.015825914935707196 0.8900978129464776 0.8900978129464776
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
1833.4576454973073 814.7863891368864 217.7654229387627 54.07616924802023 13.731802945455469 3.5847427020530582 0.9118541432904458 0.2622900218848318 0.07627003915874074 0.027745791769400664 0.015915006254486226 0.012409352705166696 0.0077479353538904274 0.0062617873011873975 0.005917287498327866 1833.2003417254284 544.2136113656462 35.21026365121499 33.83804856891729 33.83703344984572 33.83750169488491 33.84147193704756 342.096219925368 82.6043808610444 75.92517125989443 27.82235802343243 26.574672151466128 9.6049355988981 14.044291246668882 14.55114135659603 2.4864347446515884 9.426150750133262 10.07193874315086 10.701781541507756 0.6597191298214471 2.4197650833272104 9.550487563237345 8.849135643873504 9.216705123201871 0.1929881628940372 0.6207588325434388 2.5428780026080493 9.550487563237345
js_param_list:
3613.037168160796 1825.7907466758202 512.0932785883192 129.26071365337654 33.314456921282606 8.673843570791789 2.1826018682118424 0.6138186833325912 0.1691841503982388 0.05180905191755439 0.02266508641878177 0.014530378356803484 0.00975786055068809 0.005063431812688739 0.00398069855228542 3612.992302272399 1246.9340617899438 71.14710558688047 67.61964317269017 67.6172664356203 67.61753548832318 67.6175100773394 755.379587970111 181.41267691267066 170.89087380459807 56.989989927129535 59.371069176236894 19.274735775346528 26.031672719261728 32.363778392002544 5.0043194398511135 18.814548222792805 17.309141148134536 23.84953967534161 1.332034978863292 4.83191046013193 18.864051408815957 14.787650268158211 20.519388091926267 0.3942680972083926 1.231435885110694 4.879394902995963 18.864051408815957
ptq_acc_list:
10.0 10.0 31.15 81.89 84.93 85.69 85.78 85.63 82.63 74.8 51.56 29.34 13.78 11.57 10.17 10.0 10.0 44.45 44.64 46.43 44.18 38.58 9.92 38.85 70.91 65.34 82.3 80.82 73.99 84.14 85.05 76.68 75.95 84.95 85.55 81.54 10.0 77.73 85.18 85.98 81.93 10.01 13.34
acc_loss_list:
0.8835991153532767 0.8835991153532767 0.6374112443254569 0.04679315562798273 0.011407286695378766 0.0025608194622279 0.0015132115004073503 0.003259224770108266 0.038179490164125265 0.1293213828425096 0.3998370387614945 0.6584798044465138 0.8395995809568153 0.8653241764637412 0.8816203003142824 0.8835991153532767 0.8835991153532767 0.48259806774531483 0.4803864509370271 0.45955069258526365 0.4857408916307764 0.5509253870329415 0.8845303224304505 0.5477825631474799 0.17460132697008499 0.2394366197183098 0.04202071935746711 0.05924805028518221 0.1387498544988942 0.02060295658246998 0.010010476079618198 0.10743801652892551 0.11593528110813635 0.011174484926085367 0.004190431847282033 0.05086718659061798 0.8835991153532767 0.09521592364101959 0.008497264579210684 -0.0008148061925271492 0.04632755208939576 0.8834827144686299 0.844721219881271
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
1489.6432790793892 858.47390911721 350.38842997977486 146.66108726257698 65.51871772345022 30.802447738403625 15.015633081763848 7.372939759214539 3.602748170145869 1.7596017349024324 0.9023980469489912 0.42604559053986407 0.2086617904343124 0.11696138076612213 0.06650877266410397 1489.4950585919782 648.6726890563766 44.0313761461945 14.813184296979202 14.70886411284525 14.708637223793453 14.708329981851291 442.110054514285 167.03961080744105 110.912352356486 73.14257321252117 44.75826464643717 8.918392355710786 35.41728607793805 22.00069249787684 3.0807357022322006 4.133106679411769 18.786975210869198 12.291142909976228 1.2420341267864268 1.0780820385160967 4.196963771246701 10.816659177228358 7.780715811154463 0.513961854917128 0.2801788104206083 1.150574461896198 4.196963771246701
js_param_list:
2988.8747567488617 1887.9793935628438 794.5371505720092 330.3960680775245 145.92231495119452 67.9559314292448 33.03981244952361 16.124047122726786 8.021401990398326 3.943098007875918 1.9811299823118427 0.9460539051395199 0.44709418282093033 0.22449034273754867 0.12425914862692854 2988.8363531204886 1451.7681143260804 94.67273844326954 30.460878266197444 30.244231409403923 30.2446749589304 30.244134610251493 984.9086948427197 371.60971497639866 248.5749360354289 159.90777702378026 99.54631101875773 19.048673214252524 75.87671359764475 48.95576239520067 6.683113070521427 8.485231215526596 39.31778320380456 27.44412247810391 2.6854627255413566 2.207580403630901 8.479439151405776 21.80574465505866 17.614834435129385 1.148945392883737 0.5553705895013917 2.2254689905601692 8.479439151405776
ptq_acc_list:
10.0 10.0 10.01 72.69 87.21 89.67 90.45 90.33 89.37 79.82 61.97 35.21 22.84 21.47 13.47 10.0 10.0 12.81 17.49 27.49 30.18 34.97 10.0 15.78 21.89 33.3 82.29 82.49 58.04 87.21 88.9 82.42 67.65 88.34 90.33 87.15 10.05 70.35 89.06 90.52 88.78 9.99 10.0
acc_loss_list:
0.8896369054188279 0.8896369054188279 0.8895265423242467 0.19777066548946035 0.037523452157598565 0.010374130890630148 0.0017658095132987153 0.00309016664827283 0.01368502372806528 0.11908177905308472 0.31607990288047677 0.6114115439796932 0.747930691976603 0.7630504359342236 0.8513409115991613 0.8896369054188279 0.8896369054188279 0.8586248758415186 0.8069749475775301 0.6966118529963581 0.6669241805540227 0.6140602582496413 0.8896369054188279 0.8258470367509105 0.7584151859618143 0.6324908950446971 0.09182209469153507 0.08961483279991177 0.3594525990508774 0.037523452157598565 0.018872089173380353 0.09038737446197989 0.253393665158371 0.025052422469926013 0.00309016664827283 0.03818563072508546 0.8890850899459222 0.22359562962145466 0.017106279660081637 0.0009932678512305862 0.020196446308354467 0.8897472685134091 0.8896369054188279
## update: <br>2023.4.24<br>
在ResNet_nobias中采用了fakefreeze,无需再对PoT进行单独考虑。
## update: <br>2023.4.23<br>
对PoT量化进行单独的考虑,先只是量化权值,看一下权值分布相似度跟acc的关系。然后选一个acc比较高的PoT权值保持不变,再用不同的数据表示去量化激活,这时候看激活的位宽和acc的关系。<br>
因为在PTQ的freeze后,qb.scale已经固定(对bias的量化有直接影响),所有的scale都不方便重新修改,因此INT量化无法被用作激活<br>
FP量化因为受scale影响相对较小,因此可以被试着作为激活量化。具体数据在POT_ptq_result_ResNet50.xlsx和POT_ptq_result_ResNet152.xlsx中。<br>
以FP作为激活的ResNet50,152中的数据有相似的规律,不过目前还没能解释为什么E1经常acc较高,E2,E3降低,E4又回升。
## update: <br>2023.4.17<br>
- 已针对4.12中的问题进行了修正和补充:
......
from model import *
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import os
import os.path as osp
def train(model, device, train_loader, optimizer, epoch):
    """Train *model* for one epoch, logging the loss every 50 batches."""
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
lossLayer = torch.nn.CrossEntropyLoss(reduction='sum')
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += lossLayer(output, target).item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
test_loss, 100. * correct / len(test_loader.dataset)
))
if __name__ == "__main__":
    # Hyper-parameters for a three-stage LR schedule: 15 epochs at lr1,
    # then 10 at lr2, then 10 at lr3.
    batch_size = 128
    test_batch_size = 128
    seed = 1
    epochs1 = 15
    epochs2 = epochs1+10
    epochs3 = epochs2+10
    lr1 = 0.01
    lr2 = 0.001
    lr3 = 0.0001
    momentum = 0.5
    save_model = True

    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=test_batch_size, shuffle=True, num_workers=1, pin_memory=True
    )

    # model = AlexNet_BN().to(device)
    model = resnet18().to(device)
    # One optimizer per learning-rate stage, all sharing the same momentum.
    optimizer1 = optim.SGD(model.parameters(), lr=lr1, momentum=momentum)
    optimizer2 = optim.SGD(model.parameters(), lr=lr2, momentum=momentum)
    optimizer3 = optim.SGD(model.parameters(), lr=lr3, momentum=momentum)

    # Stage 1: epochs 1..epochs1 at lr1.
    for epoch in range(1, epochs1 + 1):
        train(model, device, train_loader, optimizer1, epoch)
        test(model, device, test_loader)
    # Stage 2: epochs1+1..epochs2 at lr2.
    for epoch in range(epochs1 + 1, epochs2 + 1):
        train(model, device, train_loader, optimizer2, epoch)
        test(model, device, test_loader)
    # Stage 3: epochs2+1..epochs3 at lr3.
    for epoch in range(epochs2 + 1, epochs3 + 1):
        train(model, device, train_loader, optimizer3, epoch)
        test(model, device, test_loader)

    if save_model:
        if not osp.exists('ckpt'):
            os.makedirs('ckpt')
        torch.save(model.state_dict(), 'ckpt/cifar10_ResNet18.pt')
\ No newline at end of file
import re

# Match a parenthesized dotted module path, e.g. "(layer1.0.conv1)".
# Bug fix: the repetition group must be NON-capturing ("(?:...)"). With a
# capturing group, re.findall returns only the last captured sub-group
# (e.g. ".conv1") instead of the whole parenthesized match.
pattern = r"\(\w+(?:\.\w+)*\)"

text = ""  # the input text to scan (was commented out, causing a NameError)

# Find every parenthesized submodule name in the text.
matches = re.findall(pattern, text)

# Strip the surrounding parentheses to get the bare dotted paths.
submodule_paths = [match.strip("()") for match in matches]

# Print all submodule paths.
print(submodule_paths)
\ No newline at end of file
import sys
import os
# 从get_param.py输出重定向文件val.txt中提取参数量和计算量
def extract_ratio(md='ResNet18'):
    """Parse 'param_flops_<md>.txt' (redirected output of get_param.py) and
    return (layer, par_ratio, flop_ratio).

    layer      -- layer name fragments (text between ':' and '(')
    par_ratio  -- parameter percentage per layer (number before first '%')
    flop_ratio -- FLOPs percentage per layer (number before last-but-one '%')

    Fix: the original leaked the file handle; the with-statement closes it
    even if a line fails to parse.
    """
    layer = []
    par_ratio = []
    flop_ratio = []
    with open('param_flops_' + md + '.txt', 'r') as report:
        for line in report:
            # Only Conv/BatchNorm2d/Linear rows carry the ratios of interest.
            if 'Conv' in line or 'BatchNorm2d' in line or 'Linear' in line:
                layer.append(line.split(':')[1].split('(')[0])
                par_ratio.append(float(line.split('%')[0].split(',')[-1]))
                flop_ratio.append(float(line.split('%')[-2].split(',')[-1]))
    return layer, par_ratio, flop_ratio
if __name__ == "__main__":
    # Quick sanity check: the three parsed lists must have equal lengths.
    layer, par_ratio, flop_ratio = extract_ratio()
    for parsed in (layer, par_ratio, flop_ratio):
        print(len(parsed))
\ No newline at end of file
# -*- coding: utf-8 -*-
from torch.serialization import load
from model import *
from extract_ratio import *
from utils import *
import gol
import openpyxl
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import torch.utils.bottleneck as bn
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def direct_quantize(model, test_loader,device):
    """Calibration pass: run at most 500 batches through quantize_forward
    so the quantization parameters can gather activation statistics."""
    for i, (data, target) in enumerate(test_loader, 1):
        data = data.to(device)
        output = model.quantize_forward(data).cpu()
        # enumerate starts at 1, so this first fires at batch 500; a full
        # epoch is unnecessary for calibration.
        if i % 500 == 0:
            break
    print('direct quantization finish')
def full_inference(model, test_loader, device):
    """Evaluate the full-precision model on test_loader; return top-1
    accuracy as a percentage of the whole dataset."""
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        data = data.to(device)
        output = model(data).cpu()
        pred = output.argmax(dim=1, keepdim=True)
        # print(pred)
        correct += pred.eq(target.view_as(pred)).sum().item()
    print('\nTest set: Full Model Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)
def quantize_inference(model, test_loader, device):
    """Evaluate the quantized model via model.quantize_inference; return
    top-1 accuracy as a percentage of the whole dataset.

    NOTE(review): this variant predates the quant_type parameter used by
    the sibling ptq script — confirm which one callers expect.
    """
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        data = data.to(device)
        output = model.quantize_inference(data).cpu()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    print('Test set: Quant Model Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)
def js_div(p_output, q_output, get_softmax=True):
    """Measure the Jensen-Shannon divergence between two logit tensors.

    Args:
        p_output: first tensor (logits, or probabilities if get_softmax=False).
        q_output: second tensor with the same shape as p_output.
        get_softmax: when True, apply softmax to both inputs first.

    Returns:
        A scalar tensor: JS(p || q) with 'sum' reduction (always >= 0 up to
        floating-point error).
    """
    kl_div_loss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # F.softmax(x) with no dim is deprecated and picks a dim implicitly
        # (dim 0 for 0/1/3-d inputs, dim 1 otherwise). Make that rule explicit
        # to keep behavior identical while silencing the deprecation warning.
        dim = 0 if p_output.dim() in (0, 1, 3) else 1
        p_output = F.softmax(p_output, dim=dim)
        q_output = F.softmax(q_output, dim=dim)
    log_mean_output = ((p_output + q_output) / 2).log()
    return (kl_div_loss(log_mean_output, p_output) + kl_div_loss(log_mean_output, q_output)) / 2
if __name__ == "__main__":
    # Post-training-quantization driver: evaluates an FP32 ResNet checkpoint,
    # then sweeps quantization types / bit widths, measuring accuracy loss and
    # the JS divergence between FP32 and quantized parameters, weighted by
    # per-layer FLOPs / parameter ratios. Results go to xlsx, txt, TensorBoard.
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='ResNet18')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-s', '--save', help='Save the output', action='store_true')
    # parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    # training parameters
    args = parser.parse_args()
    batch_size = args.batch_size
    num_workers = args.workers
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    # CIFAR-10 train loader (used only for PTQ calibration below)
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                         transform=transforms.Compose([
                             transforms.RandomCrop(32, padding=2),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    # model = AlexNet_BN()
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    writer = SummaryWriter(log_dir='log/' + args.model + '/ptq')
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    model.to(device)
    load_ptq = False
    ptq_file_prefix = 'ckpt/cifar10_' + args.model + '_ptq_'
    model.eval()
    full_acc = full_inference(model, test_loader, device)
    model_fold = fold_model(model)  # fold BN into the preceding conv
    full_params = []
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # print(layer)
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            # take the name before '.weight' (the layer name; filtering on
            # 'weight' avoids extracting the same name again for the bias)
            layer.append(pre)
            # print(name)
    print('===================')
    # print(layer)
    par_ratio, flop_ratio = fold_ratio(layer, par_ratio, flop_ratio)
    # sys.exit()
    for name, param in model_fold.named_parameters():
        if 'bn' in name or 'sample.1' in name:
            continue
        # param_norm = F.normalize(param.data.cpu(),p=2,dim=-1)
        param_norm = param.data.cpu()
        # conv parameters only (post-fold); BN parameters are excluded above
        full_params.append(param_norm)
        writer.add_histogram(tag='Full_' + name + '_data', values=param.data)
    gol._init()
    quant_type_list = ['INT','POT','FLOAT']
    title_list = []
    js_flops_list = []
    js_param_list = []
    ptq_acc_list = []
    acc_loss_list = []
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # The bias quantization table is set once per quantization category.
        # INT uses wide bit widths, so a lookup table would be too costly —
        # plain rounding is used for it instead.
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                # model_ptq = resnet18()
                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                print('\nPTQ: '+title)
                title_list.append(title)
                # set the quantization lookup table for this bit configuration
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # load a cached PTQ checkpoint if allowed, otherwise calibrate
                if load_ptq is True and osp.exists(ptq_file_prefix + title + '.pt'):
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.load_state_dict(torch.load(ptq_file_prefix + title + '.pt'))
                    model_ptq.to(device)
                    print('Successfully load ptq model: ' + title)
                else:
                    model_ptq.load_state_dict(torch.load(full_file))
                    model_ptq.to(device)
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.eval()
                    direct_quantize(model_ptq, train_loader, device)
                    if args.save:
                        torch.save(model_ptq.state_dict(), ptq_file_prefix + title + '.pt')
                model_ptq.freeze()
                ptq_acc = quantize_inference(model_ptq, test_loader, device)
                ptq_acc_list.append(ptq_acc)
                acc_loss = (full_acc - ptq_acc) / full_acc
                acc_loss_list.append(acc_loss)
                idx = -1
                model_ptq.fakefreeze()
                # JS divergence aggregated under FLOPs / parameter-count weights
                js_flops = 0.
                js_param = 0.
                for name, param in model_ptq.named_parameters():
                    # if '.' not in name or 'bn' in name:
                    if 'bn' in name or 'sample.1' in name:
                        continue
                    writer.add_histogram(tag=title +':'+ name + '_data', values=param.data)
                    idx = idx + 1
                    # resnet names contain several '.'; rebuild the layer prefix
                    # prefix = name.split('.')[0]
                    n = name.split('.')
                    prefix = '.'.join(n[:len(n) - 1])
                    # weight and bias currently share the layer ratio 1:1; they
                    # could be given separate weights (e.g. 8:2) if desired
                    if prefix in layer:
                        layer_idx = layer.index(prefix)
                        ptq_param = param.data.cpu()
                        # L2 normalization was considered but left disabled
                        # ptq_norm = F.normalize(ptq_param,p=2,dim=-1)
                        ptq_norm = ptq_param
                        writer.add_histogram(tag=title +':'+ name + '_data', values=ptq_param)
                        # print(name)
                        # print('=========')
                        # print(ptq_norm)
                        # print('=========')
                        # print(full_params[idx])
                        # JS distance between the folded params before/after quantization
                        js = js_div(ptq_norm,full_params[idx])
                        js = js.item()
                        if js < 0.:
                            js = 0.
                        js_flops = js_flops + js * flop_ratio[layer_idx]
                        js_param = js_param + js * par_ratio[layer_idx]
                js_flops_list.append(js_flops)
                js_param_list.append(js_param)
                print(title + ': js_flops: %f js_param: %f acc_loss: %f' % (js_flops, js_param, acc_loss))
    # write the results to xlsx
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.cell(row=1,column=1,value='FP32-acc')
    worksheet.cell(row=1,column=2,value=full_acc)
    worksheet.cell(row=3,column=1,value='title')
    worksheet.cell(row=3,column=2,value='js_flops')
    worksheet.cell(row=3,column=3,value='js_param')
    worksheet.cell(row=3,column=4,value='ptq_acc')
    worksheet.cell(row=3,column=5,value='acc_loss')
    for i in range(len(title_list)):
        worksheet.cell(row=i+4, column=1, value=title_list[i])
        worksheet.cell(row=i+4, column=2, value=js_flops_list[i])
        worksheet.cell(row=i+4, column=3, value=js_param_list[i])
        worksheet.cell(row=i+4, column=4, value=ptq_acc_list[i])
        worksheet.cell(row=i+4, column=5, value=acc_loss_list[i])
    workbook.save('ptq_result_' + args.model + '.xlsx')
    writer.close()
    # plain-text dump of the same result lists
    ft = open('ptq_result_' + args.model + '.txt','w')
    print('title_list:',file=ft)
    print(" ".join(title_list),file=ft)
    print('js_flops_list:',file=ft)
    print(" ".join(str(i) for i in js_flops_list), file=ft)
    print('js_param_list:',file=ft)
    print(" ".join(str(i) for i in js_param_list), file=ft)
    print('ptq_acc_list:',file=ft)
    print(" ".join(str(i) for i in ptq_acc_list), file=ft)
    print('acc_loss_list:',file=ft)
    print(" ".join(str(i) for i in acc_loss_list), file=ft)
    ft.close()
from torch.autograd import Function
class FakeQuantize(Function):
    """Straight-through fake quantization.

    Forward round-trips the input through the quantized domain so the tensor
    carries quantization error; backward passes the gradient through
    unchanged (straight-through estimator).
    """

    @staticmethod
    def forward(ctx, x, qparam):
        # quantize then immediately dequantize using qparam's scale/zero-point
        return qparam.dequantize_tensor(qparam.quantize_tensor(x))

    @staticmethod
    def backward(ctx, grad_output):
        # identity gradient for x; qparam receives no gradient
        return grad_output, None
\ No newline at end of file
from model import *
import torch
from ptflops import get_model_complexity_info
import argparse
def get_children(model: torch.nn.Module):
    """Recursively flatten a module tree into its leaf modules.

    Returns the module itself when it has no children, otherwise a flat
    list of all leaf descendants.
    """
    # get children form model!
    # An nn.ModuleList would be needed if the flattened modules had to stay
    # registered for parameter updates; a plain list suffices for inspection.
    # children = nn.ModuleList(model.children())
    # print(children)
    # flatt_children = nn.ModuleList()
    children = list(model.children())
    # flatt_children = nn.ModuleList()
    flatt_children = []
    if len(children) == 0:
        # if model has no children; model is last child! :O
        return model
    else:
        # look for children from children... to the last child!
        for child in children:
            try:
                # EAFP: extend() iterates the recursive result; when the
                # recursion returns a single leaf Module (not iterable),
                # iterating it raises TypeError and the leaf is appended.
                flatt_children.extend(get_children(child))
            except TypeError:
                flatt_children.append(get_children(child))
    # print(flatt_children)
    return flatt_children
# Helper that yields child modules without wrapper containers.
def get_all_child_modules(module):
    """Yield child modules, descending through nn.Sequential containers.

    NOTE(review): for non-Sequential containers that have children, this
    yields only their direct children without recursing further — confirm
    that a single extra level is the intended behavior.
    """
    for name, child in module.named_children():
        if isinstance(child, nn.Sequential):
            yield from get_all_child_modules(child)
        elif len(list(child.children())) > 0:
            yield from child.children()
        else:
            yield child
def filter_fn(module, n_inp, outp_shape):
    """Return True when the module description contains one of the layer
    keywords of interest (conv / bn / fc / avg-pool / relu).

    NOTE(review): the membership test treats `module` as a string-like
    value — confirm the caller passes names rather than module objects.
    """
    # if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.ReLU,torch.nn.BatchNorm2d,torch.nn.Linear,torch.nn.AdaptiveAvgPool2d)):
    keywords = ('conv', 'bn', 'fc', 'avg', 'relu')
    return any(key in module for key in keywords)
if __name__ == "__main__":
    # Report per-layer parameter counts and FLOPs for a trained checkpoint.
    parser = argparse.ArgumentParser(description='Model Analysis --- params & flops')
    # BUGFIX: the default was 'resnet18', which matches none of the
    # capitalized branches below and left `model` undefined (NameError).
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='ResNet18')
    args = parser.parse_args()
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    else:
        # fail loudly instead of crashing later with NameError
        raise ValueError('unsupported model arch: ' + args.model)
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    # flat = get_children(model)
    # print(flat)
    # flat = get_children(model)
    # new_model = nn.Sequential(*flat)
    # (3, 32, 32) is the CIFAR-10 input shape
    flops, params = get_model_complexity_info(model, (3, 32, 32), as_strings=True, print_per_layer_stat=True)
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import OrderedDict
def get_model_histogram(model):
    """Collect per-parameter gradient histograms and raw gradient arrays.

    Args:
        model: (torch.nn.Module) whose parameters may hold .grad tensors.

    Returns:
        (histograms, grads): two OrderedDicts keyed by parameter name.
        histograms maps to {'histogram': [...], 'bins': [...]} from a
        20-bin numpy histogram; grads maps to the gradient as a numpy array.
        Parameters without gradients are skipped.
    """
    histograms = OrderedDict()
    raw_grads = OrderedDict()
    for name, parameter in model.named_parameters():
        if parameter.grad is None:
            continue
        grad_np = parameter.grad.cpu().numpy()
        counts, edges = np.histogram(grad_np.flatten(), bins=20)
        histograms[name] = {'histogram': list(counts), 'bins': list(edges)}
        raw_grads[name] = grad_np
    return histograms, raw_grads
def get_model_norm_gradient(model):
    """Collect the L2 norm of each parameter's gradient.

    Args:
        model: (torch.nn.Module) whose parameters may hold .grad tensors.

    Returns:
        OrderedDict mapping parameter name -> float gradient norm;
        parameters without gradients are skipped.
    """
    norms = OrderedDict()
    for name, parameter in model.named_parameters():
        if parameter.grad is not None:
            norms[name] = parameter.grad.norm().item()
    return norms
def get_grad_histogram(grads_sum):
    """Build 20-bin histograms for each entry of an accumulated-gradient dict.

    Args:
        grads_sum: mapping of name -> numpy array (entries may be None).

    Returns:
        OrderedDict of name -> {'histogram': [...], 'bins': [...]}, one
        histogram description per layer; None entries are skipped.
    """
    histograms = OrderedDict()
    for name, grad in grads_sum.items():
        if grad is None:
            continue
        counts, edges = np.histogram(grad.flatten(), bins=20)
        histograms[name] = {'histogram': list(counts), 'bins': list(edges)}
    return histograms
\ No newline at end of file
class GlobalVariables:
    """Cross-module mutable state used while stacking ResNet stages.

    SELF_INPLANES is the running input-channel count: ResNet.__init__ seeds
    it and MakeLayer reads/updates it as successive stages are built.
    """
    SELF_INPLANES = 0
\ No newline at end of file
# -*- coding: utf-8 -*-
# 用于多个module之间共享全局变量
def _init(): # 初始化
global _global_dict
_global_dict = {}
def set_value(value,is_bias=False):
# 定义一个全局变量
if is_bias:
_global_dict[0] = value
else:
_global_dict[1] = value
def get_value(is_bias=False): # 给bias独立于各变量外的精度
if is_bias:
return _global_dict[0]
else:
return _global_dict[1]
from model import *
from extract_ratio import *
from utils import *
import argparse
import openpyxl
import os
import os.path as osp
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
def js_div_norm(a, b):
    """JS divergence between L2-normalized copies of two tensors."""
    normalized_a = F.normalize(a.data, p=2, dim=-1)
    normalized_b = F.normalize(b.data, p=2, dim=-1)
    return js_div(normalized_a, normalized_b)
if __name__ == "__main__":
    # QAT loss/gradient analysis: for several training epochs, compares the
    # saved FP32 state against each quantized configuration, writing losses
    # and FLOPs-weighted JS divergences of average gradients to an xlsx sheet
    # per epoch and gradient histograms to TensorBoard.
    parser = argparse.ArgumentParser(description='Loss-Grad Analysis')
    # NOTE(review): default 'resnet18' matches none of the capitalized
    # branches below, which would leave `model` undefined when run with
    # defaults — confirm the intended default is 'ResNet18'.
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    args = parser.parse_args()
    wb = openpyxl.Workbook()
    ws = wb.active
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat_loss_grad')
    # layer, par_ratio, flop_ratio = extract_ratio()
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    # collect one layer name per parameterized layer (via its weight entry)
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            layer.append(pre)
    # dir_prefix = 'ckpt/qat/epoch_'
    dir_prefix = 'ckpt/qat/'+ args.model + '/'
    quant_type_list = ['INT','POT','FLOAT']
    for epoch in [5,10,15,20]:
        ws_epoch = wb.create_sheet('epoch_%d'%epoch)
        full_state = torch.load(dir_prefix+'%d/'%epoch + 'full.pt')
        # FP32 baseline row
        ws_epoch.cell(row=1,column=2,value='loss')
        ws_epoch.cell(row=1,column=3,value='loss_sum')
        ws_epoch.cell(row=1,column=4,value='loss_avg')
        ws_epoch.cell(row=2,column=1,value='FP32')
        ws_epoch.cell(row=2,column=2,value=full_state['loss'].cpu().item())
        ws_epoch.cell(row=2,column=3,value=full_state['loss_sum'].cpu().item())
        ws_epoch.cell(row=2,column=4,value=full_state['loss_avg'].cpu().item())
        # full_grad = full_state['grad_dict']
        # full_grad_sum = full_state['grad_dict_sum']
        full_grad_avg = full_state['grad_dict_avg']
        for name,tmpgrad in full_grad_avg.items():
            writer.add_histogram('FULL: '+name,tmpgrad,global_step=epoch)
        # header row for the per-quantization results
        ws_epoch.cell(row=4,column=1,value='title')
        ws_epoch.cell(row=4,column=2,value='loss')
        ws_epoch.cell(row=4,column=3,value='loss_sum')
        ws_epoch.cell(row=4,column=4,value='loss_avg')
        ws_epoch.cell(row=4,column=5,value='js_grad_avg_norm')
        # (hand-written per-layer headers, superseded by the loop below)
        # ws_epoch.cell(row=4,column=6,value='conv1.weight')
        # ws_epoch.cell(row=4,column=7,value='conv1.bias')
        # ws_epoch.cell(row=4,column=8,value='conv2.weight')
        # ws_epoch.cell(row=4,column=9,value='conv2.bias')
        # ws_epoch.cell(row=4,column=10,value='conv3.weight')
        # ws_epoch.cell(row=4,column=11,value='conv3.bias')
        # ws_epoch.cell(row=4,column=12,value='conv4.weight')
        # ws_epoch.cell(row=4,column=13,value='conv4.bias')
        # ws_epoch.cell(row=4,column=14,value='conv5.weight')
        # ws_epoch.cell(row=4,column=15,value='conv5.bias')
        # ws_epoch.cell(row=4,column=16,value='fc1.weight')
        # ws_epoch.cell(row=4,column=17,value='fc1.bias')
        # ws_epoch.cell(row=4,column=18,value='fc2.weight')
        # ws_epoch.cell(row=4,column=19,value='fc2.bias')
        # ws_epoch.cell(row=4,column=20,value='fc3.weight')
        # ws_epoch.cell(row=4,column=21,value='fc3.bias')
        cnt = 5
        for n in layer:
            cnt = cnt + 1
            ws_epoch.cell(row=4,column=cnt,value=n)
        currow=4
        for quant_type in quant_type_list:
            num_bit_list = numbit_list(quant_type)
            for num_bits in num_bit_list:
                e_bit_list = ebit_list(quant_type,num_bits)
                for e_bits in e_bit_list:
                    if quant_type == 'FLOAT':
                        title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                    else:
                        title = '%s_%d' % (quant_type, num_bits)
                    print('\nAnalyse: '+title)
                    currow += 1
                    qat_state=torch.load(dir_prefix+'%d/'%epoch+title+'.pt')
                    js_grad_avg_norm=0.
                    grad_avg = qat_state['grad_dict_avg']
                    for name,tmpgrad in grad_avg.items():
                        writer.add_histogram(title+': '+name,tmpgrad,global_step=epoch)
                    colidx=5
                    for name,_ in full_grad_avg.items():
                        prefix = name.split('.')[0]
                        colidx += 1
                        layer_idx = layer.index(prefix)
                        # JS divergence between normalized average gradients
                        js_norm = js_div_norm(full_grad_avg[name],grad_avg[name])
                        ws_epoch.cell(row=currow,column=colidx,value=js_norm.cpu().item())
                        # FLOPs-weighted aggregate over all layers
                        js_grad_avg_norm += flop_ratio[layer_idx] * js_norm
                    ws_epoch.cell(row=currow,column=1,value=title)
                    ws_epoch.cell(row=currow,column=2,value=qat_state['loss'].cpu().item())
                    ws_epoch.cell(row=currow,column=3,value=qat_state['loss_sum'].cpu().item())
                    ws_epoch.cell(row=currow,column=4,value=qat_state['loss_avg'].cpu().item())
                    ws_epoch.cell(row=currow,column=5,value=js_grad_avg_norm.cpu().item())
    wb.save('loss_grad.xlsx')
    writer.close()
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
from module import *
import module
from global_var import GlobalVariables
# ResNet definition adapted for CIFAR-10 (small stem, no max-pool).
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):  # 10 classes for CIFAR-10
        """Build the network.

        Args:
            block: residual block class (BasicBlock or Bottleneck).
            layers: number of blocks in each of the 4 stages.
            num_classes: size of the final classification layer.
        """
        super(ResNet, self).__init__()
        self.inplanes = 16  # CIFAR-10 images are small, so start with fewer channels
        GlobalVariables.SELF_INPLANES = self.inplanes
        # print('resnet init:'+ str(GlobalVariables.SELF_INPLANES))
        # stem
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU()
        # residual stages (4 stages of stacked residual blocks)
        self.layer1 = MakeLayer(block, 16, layers[0])
        self.layer2 = MakeLayer(block, 32, layers[1], stride=2)
        self.layer3 = MakeLayer(block, 64, layers[2], stride=2)
        self.layer4 = MakeLayer(block, 128, layers[3], stride=2)
        # classifier
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128 * block.expansion, num_classes)
        # parameter initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """FP32 forward pass; returns softmax probabilities."""
        # stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        # Unlike the ImageNet variant there is no max-pool here: CIFAR-10
        # inputs are already small and pooling would shrink them too much.
        # residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # classifier
        x = self.avgpool(x)  # output shape B,C,1,1
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        out = F.softmax(x,dim = 1)  # softmax here is optional; has little effect
        return out

    def quantize(self, quant_type, num_bits=8, e_bits=3):
        """Attach quantized wrapper layers mirroring the FP32 graph."""
        self.qconvbnrelu1 = QConvBNReLU(quant_type,self.conv1,self.bn1,qi=True,qo=True,num_bits=num_bits,e_bits=e_bits)
        # num_bits was not forwarded originally; needs revision
        self.layer1.quantize(quant_type=quant_type,num_bits=num_bits, e_bits=e_bits)
        self.layer2.quantize(quant_type=quant_type,num_bits=num_bits, e_bits=e_bits)
        self.layer3.quantize(quant_type=quant_type,num_bits=num_bits, e_bits=e_bits)
        self.layer4.quantize(quant_type=quant_type,num_bits=num_bits, e_bits=e_bits)
        self.qavgpool1 = QAdaptiveAvgPool2d(quant_type,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qfc1 = QLinear(quant_type, self.fc,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        # self.qfc1 = QLinear(quant_type, self.fc,qi=True,qo=True,num_bits=num_bits,e_bits=e_bits)

    def quantize_forward(self, x):
        """Calibration forward pass through the fake-quantized layers."""
        # for _, layer in self.quantize_layers.items():
        #     x = layer(x)
        # out = F.softmax(x, dim=1)
        # return out
        x = self.qconvbnrelu1(x)
        x = self.layer1.quantize_forward(x)
        x = self.layer2.quantize_forward(x)
        x = self.layer3.quantize_forward(x)
        x = self.layer4.quantize_forward(x)
        x = self.qavgpool1(x)
        x = x.view(x.size(0), -1)
        x = self.qfc1(x)
        out = F.softmax(x,dim = 1)  # softmax here is optional; has little effect
        return out

    def freeze(self):
        """Freeze quantization params, chaining each layer's qo into the next qi."""
        self.qconvbnrelu1.freeze()  # the first layer owns its qi, so none needs to be supplied
        qo = self.layer1.freeze(qinput = self.qconvbnrelu1.qo)
        qo = self.layer2.freeze(qinput = qo)
        qo = self.layer3.freeze(qinput = qo)
        qo = self.layer4.freeze(qinput = qo)
        self.qavgpool1.freeze(qi=qo)
        self.qfc1.freeze(qi=self.qavgpool1.qo)
        # self.qfc1.freeze()

    def fakefreeze(self):
        """Fake-freeze all weighted quantized layers (for parameter comparison).

        NOTE(review): qavgpool1 is not fake-frozen here — presumably because
        the pooling layer has no weights; confirm.
        """
        self.qconvbnrelu1.fakefreeze()
        self.layer1.fakefreeze()
        self.layer2.fakefreeze()
        self.layer3.fakefreeze()
        self.layer4.fakefreeze()
        self.qfc1.fakefreeze()

    def quantize_inference(self, x):
        """Inference fully inside the quantized domain; only the ends convert."""
        qx = self.qconvbnrelu1.qi.quantize_tensor(x)
        qx = self.qconvbnrelu1.quantize_inference(qx)
        qx = self.layer1.quantize_inference(qx)
        qx = self.layer2.quantize_inference(qx)
        qx = self.layer3.quantize_inference(qx)
        qx = self.layer4.quantize_inference(qx)
        qx = self.qavgpool1.quantize_inference(qx)
        qx = qx.view(qx.size(0), -1)
        qx = self.qfc1.quantize_inference(qx)
        qx = self.qfc1.qo.dequantize_tensor(qx)
        out = F.softmax(qx,dim = 1)  # softmax here is optional; has little effect
        return out
# BasicBlock: the 2-conv residual block (ResNet-18/34 style)
class BasicBlock(nn.Module):
    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Two 3x3 convs with BN plus an (optional downsampled) shortcut."""
        super(BasicBlock, self).__init__()
        # first conv
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # second conv
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # shortcut
        self.relu = nn.ReLU()
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(identity)
        out += identity
        out = self.relu(out)
        return out

    def quantize(self, quant_type ,num_bits=8, e_bits=3):
        """Attach quantized wrappers for both convs, the shortcut, add and relu."""
        self.qconvbnrelu1 = QConvBNReLU(quant_type,self.conv1,self.bn1,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qconvbn1 = QConvBN(quant_type,self.conv2,self.bn2,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        if self.downsample is not None:
            self.qconvbn2 = QConvBN(quant_type,self.downsample[0],self.downsample[1],qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qelementadd = QElementwiseAdd(quant_type,qi0=False, qi1=False, qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qrelu1 = QReLU(quant_type,qi= False,num_bits=num_bits,e_bits=e_bits)  # needs a qi

    def quantize_forward(self, x):
        identity = x
        out = self.qconvbnrelu1(x)
        out = self.qconvbn1(out)
        if self.downsample is not None:
            identity = self.qconvbn2(identity)
        # residual add
        # out = identity + out  # handled by a dedicated elementwise-add layer
        out = self.qelementadd(out,identity)
        out = self.qrelu1(out)
        return out

    def freeze(self, qinput):
        """Freeze sub-layers, wiring the previous module's qo in as qi."""
        # qconvbnrelu1 could reuse the previous layer's qo directly, but
        # passing it explicitly was simpler; still needs careful review
        self.qconvbnrelu1.freeze(qi= qinput)  # takes the previous module's last qo
        self.qconvbn1.freeze(qi = self.qconvbnrelu1.qo)
        if self.downsample is not None:
            self.qconvbn2.freeze(qi = qinput)  # shortcut branch
            self.qelementadd.freeze(qi0 = self.qconvbn1.qo, qi1 = self.qconvbn2.qo)
        else:
            self.qelementadd.freeze(qi0 = self.qconvbn1.qo, qi1 = qinput)
        # an extra layer handles the elementwise add
        self.qrelu1.freeze(qi = self.qelementadd.qo)
        return self.qrelu1.qi  # post-relu qo can reuse the relu's observed qi

    def fakefreeze(self):
        """Fake-freeze the weighted sub-layers only."""
        self.qconvbnrelu1.fakefreeze()
        self.qconvbn1.fakefreeze()
        if self.downsample is not None:
            self.qconvbn2.fakefreeze()  # shortcut branch

    def quantize_inference(self, x):
        # No initial quantize/dequantize is needed: this is an interior block,
        # so values already live in the quantized domain throughout.
        identity = x
        out = self.qconvbnrelu1.quantize_inference(x)
        out = self.qconvbn1.quantize_inference(out)
        if self.downsample is not None:
            identity = self.qconvbn2.quantize_inference(identity)
        # out = identity + out  # handled by the elementwise-add layer
        out = self.qelementadd.quantize_inference(out,identity)
        out = self.qrelu1.quantize_inference(out)
        return out
# Bottleneck: the 1x1-3x3-1x1 residual block (ResNet-50/152 style)
class Bottleneck(nn.Module):
    expansion = 4  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """1x1 reduce, 3x3 conv, 1x1 expand, plus an optional shortcut."""
        super(Bottleneck, self).__init__()
        # 1x1 conv (reduce)
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # 3x3 conv
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 conv (expand)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # shortcut
        self.relu = nn.ReLU()
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity  # the residual addition happens here
        out = self.relu(out)
        return out

    def quantize(self, quant_type ,num_bits=8, e_bits=3):
        """Attach quantized wrappers for the three convs, shortcut, add, relu."""
        self.qconvbnrelu1 = QConvBNReLU(quant_type,self.conv1,self.bn1,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qconvbnrelu2 = QConvBNReLU(quant_type,self.conv2,self.bn2,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qconvbn1 = QConvBN(quant_type,self.conv3,self.bn3,qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        if self.downsample is not None:
            self.qconvbn2 = QConvBN(quant_type,self.downsample[0],self.downsample[1],qi=False,qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qelementadd = QElementwiseAdd(quant_type,qi0=False, qi1=False, qo=True,num_bits=num_bits,e_bits=e_bits)
        self.qrelu1 = QReLU(quant_type,qi= False,num_bits=num_bits,e_bits=e_bits)  # needs a qi

    def quantize_forward(self, x):
        identity = x
        out = self.qconvbnrelu1(x)
        out = self.qconvbnrelu2(out)
        out = self.qconvbn1(out)
        if self.downsample is not None:
            identity = self.qconvbn2(identity)
        # residual add
        # out = identity + out  # handled by a dedicated elementwise-add layer
        out = self.qelementadd(out,identity)
        out = self.qrelu1(out)
        return out

    def freeze(self, qinput):
        """Freeze sub-layers, wiring the previous module's qo in as qi."""
        # qconvbnrelu1 could reuse the previous layer's qo directly, but
        # passing it explicitly was simpler; still needs careful review
        self.qconvbnrelu1.freeze(qi= qinput)  # takes the previous module's last qo
        self.qconvbnrelu2.freeze(qi=self.qconvbnrelu1.qo)
        self.qconvbn1.freeze(qi = self.qconvbnrelu2.qo)
        if self.downsample is not None:
            self.qconvbn2.freeze(qi = qinput)  # shortcut branch
            self.qelementadd.freeze(qi0 = self.qconvbn1.qo, qi1 = self.qconvbn2.qo)
        else:
            self.qelementadd.freeze(qi0 = self.qconvbn1.qo, qi1 = qinput)
        # an extra layer handles the elementwise add
        self.qrelu1.freeze(qi = self.qelementadd.qo)  # statistics collected by the relu itself
        return self.qrelu1.qi  # post-relu qo can reuse the relu's observed qi

    def fakefreeze(self):
        """Fake-freeze the weighted sub-layers only."""
        self.qconvbnrelu1.fakefreeze()
        self.qconvbnrelu2.fakefreeze()
        self.qconvbn1.fakefreeze()
        if self.downsample is not None:
            self.qconvbn2.fakefreeze()  # shortcut branch

    def quantize_inference(self, x):
        # No initial quantize/dequantize is needed: this is an interior block,
        # so values already live in the quantized domain throughout.
        identity = x
        out = self.qconvbnrelu1.quantize_inference(x)
        out = self.qconvbnrelu2.quantize_inference(out)
        out = self.qconvbn1.quantize_inference(out)
        if self.downsample is not None:
            identity = self.qconvbn2.quantize_inference(identity)
        # out = identity + out  # handled by the elementwise-add layer
        out = self.qelementadd.quantize_inference(out,identity)
        out = self.qrelu1.quantize_inference(out)
        return out
class MakeLayer(nn.Module):
    """One ResNet stage: `blocks` residual blocks of the given type.

    Uses GlobalVariables.SELF_INPLANES as the running input-channel count,
    mirroring torchvision's _make_layer helper (kept commented below).
    """
    def __init__(self, block, planes, blocks, stride=1):
        super(MakeLayer, self).__init__()
        # print('makelayer init:'+ str(GlobalVariables.SELF_INPLANES))
        self.downsample = None
        # a downsample branch is needed when stride or channel count changes
        if stride != 1 or GlobalVariables.SELF_INPLANES != planes * block.expansion:
            self.downsample = nn.Sequential(
                nn.Conv2d(GlobalVariables.SELF_INPLANES, planes * block.expansion,kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion)
            )
        self.blockdict = nn.ModuleDict()
        self.blockdict['block1'] = block(inplanes=GlobalVariables.SELF_INPLANES, planes=planes, stride=stride, downsample=self.downsample)
        GlobalVariables.SELF_INPLANES = planes * block.expansion
        for i in range(1, blocks):  # number of blocks; a ModuleDict lets them be addressed by name
            self.blockdict['block' + str(i+1)] = block(inplanes=GlobalVariables.SELF_INPLANES, planes=planes)  # instantiated here

    # Reference (torchvision-style) implementation, kept for comparison:
    # def _make_layer(self, block, planes, blocks, stride=1):
    #     downsample = None
    #     # `stride` is the conv stride and self.inplanes the current input
    #     # channel count; planes * block.expansion is the output channel
    #     # count, so downsampling is needed when stride != 1 or they differ.
    #     # Within a stage, every block after the first has equal input and
    #     # output channels and the same stride (1 or 2); spatial size shrinks
    #     # gradually as blocks are stacked.
    #     if stride != 1 or SELF_INPLANES != planes * block.expansion:
    #         downsample = nn.Sequential(
    #             nn.Conv2d(SELF_INPLANES, planes * block.expansion,
    #                       kernel_size=1, stride=stride, bias=False),
    #             nn.BatchNorm2d(planes * block.expansion),
    #         )
    #     layers = []
    #     layers.append(block(SELF_INPLANES, planes, stride, downsample))
    #     SELF_INPLANES = planes * block.expansion
    #     for _ in range(1, blocks):  # number of blocks
    #         layers.append(block(SELF_INPLANES, planes))
    #     return nn.Sequential(*layers)

    def forward(self,x):
        for _, layer in self.blockdict.items():
            x = layer(x)
        return x

    def quantize(self, quant_type, num_bits=8, e_bits=3):
        # needs review
        for _, layer in self.blockdict.items():
            layer.quantize(quant_type=quant_type,num_bits=num_bits,e_bits=e_bits)  # each entry is a block with its own quantize strategy

    def quantize_forward(self, x):
        for _, layer in self.blockdict.items():
            x = layer.quantize_forward(x)  # each block implements quantize_forward
        return x

    def freeze(self, qinput):  # qinput must be threaded through from ResNet.freeze
        """Freeze each block in order, chaining qo of one into qi of the next."""
        cnt = 0
        for _, layer in self.blockdict.items():
            if cnt == 0:
                qo = layer.freeze(qinput = qinput)
                cnt = 1
            else:
                qo = layer.freeze(qinput = qo)  # each block implements freeze
        return qo  # for use by subsequent layers

    def fakefreeze(self):
        for _, layer in self.blockdict.items():
            layer.fakefreeze()

    def quantize_inference(self, x):
        # Interior stage: values already live in the quantized domain, so no
        # quantize/dequantize at the boundaries is needed.
        for _, layer in self.blockdict.items():
            x = layer.quantize_inference(x)  # each block implements quantize_inference
        return x
# ResNet-18 factory
def resnet18(**kwargs):
    """ResNet-18 for CIFAR-10: BasicBlock with [2, 2, 2, 2] stage sizes."""
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
# ResNet-50 factory
def resnet50(**kwargs):
    """ResNet-50 for CIFAR-10: Bottleneck with [3, 4, 6, 3] stage sizes."""
    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
# ResNet-152 factory
def resnet152(**kwargs):
    """ResNet-152 for CIFAR-10: Bottleneck with [3, 8, 36, 3] stage sizes."""
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
import math
import numpy as np
import gol
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from function import FakeQuantize
# 获取最近的量化值
# def get_nearest_val(quant_type,x,is_bias=False):
# if quant_type=='INT':
# return x.round_()
# plist = gol.get_value(is_bias)
# # print('get')
# # print(plist)
# # x = x / 64
# shape = x.shape
# xhard = x.view(-1)
# plist = plist.type_as(x)
# # 取最近幂次作为索引
# idx = (xhard.unsqueeze(0) - plist.unsqueeze(1)).abs().min(dim=0)[1]
# xhard = plist[idx].view(shape)
# xout = (xhard - x).detach() + x
# # xout = xout * 64
# return xout
def get_nearest_val(quant_type, x, is_bias=False, block_size=1000000):
    """Snap every element of x to the nearest entry of the active
    quantization table (INT simply rounds in place).

    The lookup is chunked into block_size elements to bound the memory used
    by the pairwise |x - table| distance matrix.

    Args:
        quant_type: 'INT', 'POT' or 'FLOAT'.
        x: tensor to quantize.
        is_bias: select the bias table instead of the weight table.
        block_size: chunk length for the distance computation.

    Returns:
        Tensor shaped like x with values drawn from the table; gradients
        pass straight through via the detach trick.
    """
    if quant_type == 'INT':
        return x.round_()
    table = gol.get_value(is_bias).type_as(x)
    flat = x.view(-1)
    snapped_flat = torch.zeros_like(flat)
    total = flat.numel()
    for start in range(0, total, block_size):
        end = min(start + block_size, total)
        chunk = flat[start:end]
        # index of the nearest table entry for each element of the chunk
        nearest_idx = (chunk.unsqueeze(0) - table.unsqueeze(1)).abs().min(dim=0)[1]
        snapped = table[nearest_idx].view(chunk.shape)
        # straight-through estimator: forward uses snapped, backward sees chunk
        snapped_flat[start:end] = (snapped - chunk).detach() + chunk
    return snapped_flat.view(x.shape)
# Largest representable magnitude for symmetric signed quantization.
def get_qmax(quant_type, num_bits=None, e_bits=None):
    """Return the quantization-range maximum for the given scheme.

    INT: 2^(num_bits-1) - 1.  POT: 1 (power-of-two values are normalized).
    FLOAT: the largest value of a sign/exponent/mantissa layout with e_bits
    exponent bits and (num_bits - 1 - e_bits) mantissa bits.
    """
    if quant_type == 'INT':
        return 2. ** (num_bits - 1) - 1
    if quant_type == 'POT':
        return 1
    # FLOAT: 1 sign bit, e_bits exponent bits, the rest mantissa
    mantissa_bits = num_bits - 1 - e_bits
    mantissa_step = 2 ** (-mantissa_bits)
    exponent_max = 2 ** (2 ** (e_bits - 1))
    mantissa_max = 1. + (2 ** mantissa_bits - 1) * mantissa_step
    return mantissa_max * exponent_max
# Signed symmetric quantization everywhere, so the zero point is fixed at 0.
def calcScaleZeroPoint(min_val, max_val, qmax):
    """Compute (scale, zero_point) for symmetric signed quantization.

    The scale maps the larger of |min_val| and |max_val| onto qmax; the
    zero point is always 0.
    """
    largest_magnitude = torch.max(max_val.abs(), min_val.abs())
    return largest_magnitude / qmax, torch.tensor(0.)
# Quantize an input tensor; input and output are both tensors.
def quantize_tensor(quant_type, x, scale, zero_point, qmax, is_bias=False):
    """Map x into the quantized domain and snap to representable values.

    The quantized range [-qmax, qmax] follows directly from the bit width;
    values are clamped to it and then moved to the nearest representable
    point for the given quantization type.
    """
    q_x = zero_point + x / scale
    q_x.clamp_(-qmax, qmax)
    return get_nearest_val(quant_type, q_x, is_bias)
# Biases use a different precision, so num_bits/e_bits depend on the type.
def bias_qmax(quant_type):
    """Quantization range for biases, which get higher precision than
    weights/activations: 64-bit for INT, the POT default for POT, and a
    16-bit layout with 7 exponent bits for FLOAT."""
    if quant_type == 'INT':
        return get_qmax(quant_type, 64)
    if quant_type == 'POT':
        return get_qmax(quant_type)
    return get_qmax(quant_type, 16, 7)
# Convert back to FP32; no clamping is required on this path.
def dequantize_tensor(q_x, scale, zero_point):
    """Map quantized values back to real values via (q_x - zero_point) * scale."""
    return (q_x - zero_point) * scale
class QParam(nn.Module):
    """Holds the quantization parameters (scale, zero point, observed
    min/max) for one tensor stream, updating them from observed data."""

    def __init__(self,quant_type, num_bits=8, e_bits=3):
        super(QParam, self).__init__()
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.qmax = get_qmax(quant_type, num_bits, e_bits)
        scale = torch.tensor([], requires_grad=False)
        zero_point = torch.tensor([], requires_grad=False)
        min = torch.tensor([], requires_grad=False)
        max = torch.tensor([], requires_grad=False)
        # registering as buffers makes them part of the state_dict
        self.register_buffer('scale', scale)
        self.register_buffer('zero_point', zero_point)
        self.register_buffer('min', min)
        self.register_buffer('max', max)

    # update the observed range and recompute the quantization parameters
    def update(self, tensor):
        """Fold the tensor's min/max into the running range (max clamped
        to >= 0, min to <= 0) and refresh scale/zero_point."""
        if self.max.nelement() == 0 or self.max.data < tensor.max().data:
            self.max.data = tensor.max().data
        self.max.clamp_(min=0)
        if self.min.nelement() == 0 or self.min.data > tensor.min().data:
            self.min.data = tensor.min().data
        self.min.clamp_(max=0)
        self.scale, self.zero_point = calcScaleZeroPoint(self.min, self.max, self.qmax)

    def quantize_tensor(self, tensor):
        """Quantize using this object's current scale/zero_point."""
        return quantize_tensor(self.quant_type, tensor, self.scale, self.zero_point, self.qmax)

    def dequantize_tensor(self, q_x):
        """Dequantize using this object's current scale/zero_point."""
        return dequantize_tensor(q_x, self.scale, self.zero_point)

    # ensures the buffers can be restored from a state_dict
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                              error_msgs):
        key_names = ['scale', 'zero_point', 'min', 'max']
        for key in key_names:
            value = getattr(self, key)
            value.data = state_dict[prefix + key].data
            state_dict.pop(prefix + key)

    # the value returned here is what printing this object shows
    def __str__(self):
        info = 'scale: %.10f ' % self.scale
        info += 'zp: %.6f ' % self.zero_point
        info += 'min: %.6f ' % self.min
        info += 'max: %.6f' % self.max
        return info
# Parent class for concrete quantized layers; qi/qo observe input/output.
class QModule(nn.Module):
    """Common base for quantized layers.

    When the corresponding flag is set, a QParam observer is attached for
    the input (`qi`) and/or output (`qo`). Subclasses override freeze()
    and quantize_inference().
    """
    def __init__(self,quant_type, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QModule, self).__init__()
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.bias_qmax = bias_qmax(quant_type)
        if qi:
            self.qi = QParam(quant_type, num_bits, e_bits)
        if qo:
            self.qo = QParam(quant_type, num_bits, e_bits)
    def freeze(self):
        # No-op by default; subclasses bake quantized parameters here.
        pass
    def fakefreeze(self):
        # No-op by default; subclasses restore FP32 parameters here.
        pass
    def quantize_inference(self, x):
        raise NotImplementedError('quantize_inference should be implemented.')
"""
QModule 量化卷积
:quant_type: 量化类型
:conv_module: 卷积模块
:qi: 是否量化输入特征图
:qo: 是否量化输出特征图
:num_bits: 8位bit数
"""
class QConv2d(QModule):
    """Quantization-aware Conv2d wrapper.

    qi/qo observe the input/output feature-map ranges; qw observes the
    weights. M is the combined rescale factor used at integer inference.
    """
    def __init__(self, quant_type, conv_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConv2d, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    # freeze() bakes the truly-quantized weights back into the original
    # full-precision layer, which makes divergence computation convenient.
    def freeze(self, qi=None, qo=None):
        # Exactly one source (constructor flag or argument) must supply qi/qo.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        # Pooling/activation inputs need no extra min/max statistics; they
        # share the previous layer's output quantizer.
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        # Coefficient of Eq. 3 per https://zhuanlan.zhihu.com/p/156835141
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        self.conv_module.weight.data = self.qw.quantize_tensor(self.conv_module.weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                     self.conv_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                     zero_point=0.,qmax=self.bias_qmax, is_bias=True)
    def fakefreeze(self):
        # Restore FP32 weights/bias from the quantized values.
        self.conv_module.weight.data = self.qw.dequantize_tensor(self.conv_module.weight.data)
        self.conv_module.bias.data = dequantize_tensor(self.conv_module.bias.data,scale=self.qi.scale * self.qw.scale, zero_point=0.)
    def forward(self, x): # forward pass; x is a float tensor
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi) # fake-quantize the input tensor
        # Update qw before the conv so the weight scale is current.
        self.qw.update(self.conv_module.weight.data)
        # Note: this mainly collects per-layer x/weight ranges; bias is not
        # quantized here.
        tmp_wgt = FakeQuantize.apply(self.conv_module.weight, self.qw)
        x = F.conv2d(x, tmp_wgt, self.conv_module.bias,
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                     groups=self.conv_module.groups)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    # Uses q_a = M(\sigma(q_w-Z_w)(q_x-Z_x) + q_b)
    def quantize_inference(self, x): # input x is already quantized
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        return x
class QLinear(QModule):
    """Quantization-aware fully-connected layer; qw observes the weights."""
    def __init__(self, quant_type, fc_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QLinear, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.fc_module = fc_module
        self.qw = QParam(quant_type, num_bits, e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    def freeze(self, qi=None, qo=None):
        # Bake quantized weights/bias into fc_module; exactly one source
        # (constructor flag or argument) must supply each of qi/qo.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        self.fc_module.weight.data = self.qw.quantize_tensor(self.fc_module.weight.data)
        self.fc_module.weight.data = self.fc_module.weight.data - self.qw.zero_point
        self.fc_module.bias.data = quantize_tensor(self.quant_type,
                                                   self.fc_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                   zero_point=0., qmax=self.bias_qmax, is_bias=True)
    def fakefreeze(self):
        # Restore FP32 weights/bias from the quantized values.
        self.fc_module.weight.data = self.qw.dequantize_tensor(self.fc_module.weight.data)
        self.fc_module.bias.data = dequantize_tensor(self.fc_module.bias.data, scale=self.qi.scale * self.qw.scale, zero_point=0.)
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        # Update qw before the matmul so the weight scale is current.
        self.qw.update(self.fc_module.weight.data)
        tmp_wgt = FakeQuantize.apply(self.fc_module.weight, self.qw)
        x = F.linear(x, tmp_wgt, self.fc_module.bias)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        # q_out = M * (W_q @ (q_x - Z_x)) + Z_o in the quantized domain
        x = x - self.qi.zero_point
        x = self.fc_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        return x
class QReLU(QModule):
    """Quantized ReLU.

    Shares the input quantizer: in the quantized domain ReLU reduces to a
    clamp at the zero point, so no weight/output statistics are needed.
    """
    def __init__(self,quant_type, qi=False, qo=True, num_bits=8, e_bits=3):
        super(QReLU, self).__init__(quant_type, qi, qo, num_bits, e_bits)
    def freeze(self, qi=None):
        # Exactly one source (constructor flag or argument) must supply qi.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        x = F.relu(x)
        return x
    def quantize_inference(self, x):
        x = x.clone()
        # Fix: the threshold must live on the same device as x. The old code
        # moved it to CUDA whenever CUDA was available, which broke (or
        # silently transferred) CPU inputs.
        a = self.qi.zero_point.float().to(x.device)
        x[x < a] = a
        return x
class QMaxPooling2d(QModule):
    """Quantized max pooling.

    Max pooling is order-preserving, so pooling quantized values directly
    is equivalent to quantizing the pooled result; only qi is needed.
    """
    def __init__(self, quant_type, kernel_size=3, stride=1, padding=0, qi=False, qo=True, num_bits=8,e_bits=3):
        super(QMaxPooling2d, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
    def freeze(self, qi=None):
        # Exactly one source (constructor flag or argument) must supply qi.
        has_qi = hasattr(self, 'qi')
        if has_qi and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not has_qi and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)
    def quantize_inference(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)
class QConvBNReLU(QModule):
    """Quantization-aware Conv2d + folded BatchNorm + ReLU.

    BN statistics are folded into the conv weight/bias (fold_bn) before
    quantization; qw observes the folded weights.
    """
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBNReLU, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    def fold_bn(self, mean, std):
        # Fold BN statistics (mean, std) into the conv weight/bias.
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
    def freeze(self, qi=None, qo=None):
        # Bake quantized folded parameters into conv_module; exactly one
        # source (constructor flag or argument) must supply each of qi/qo.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
        weight, bias = self.fold_bn(self.bn_module.running_mean, std)
        self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        # The folded bias may not exist on the conv; create the parameter then.
        if self.conv_module.bias is None:
            self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
                                                 bias, scale=self.qi.scale * self.qw.scale,
                                                 zero_point=0., qmax=self.bias_qmax,is_bias=True))
        else:
            self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                         bias, scale=self.qi.scale * self.qw.scale,
                                                         zero_point=0., qmax=self.bias_qmax,is_bias=True)
    def fakefreeze(self):
        # Restore FP32 weights/bias from the quantized values.
        self.conv_module.weight.data = self.qw.dequantize_tensor(self.conv_module.weight.data)
        self.conv_module.bias.data = dequantize_tensor(self.conv_module.bias.data,scale=self.qi.scale * self.qw.scale, zero_point=0.)
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        if self.training:
            # In training, recompute batch statistics from the raw conv output
            # and update the BN running estimates manually.
            y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
                         stride=self.conv_module.stride,
                         padding=self.conv_module.padding,
                         dilation=self.conv_module.dilation,
                         groups=self.conv_module.groups)
            y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
            y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
            # mean = y.mean(1)
            # var = y.var(1)
            mean = y.mean(1).detach()
            var = y.var(1).detach()
            self.bn_module.running_mean = \
                (1 - self.bn_module.momentum) * self.bn_module.running_mean + \
                self.bn_module.momentum * mean
            self.bn_module.running_var = \
                (1 - self.bn_module.momentum) * self.bn_module.running_var + \
                self.bn_module.momentum * var
        else:
            mean = Variable(self.bn_module.running_mean)
            var = Variable(self.bn_module.running_var)
        std = torch.sqrt(var + self.bn_module.eps)
        weight, bias = self.fold_bn(mean, std)
        self.qw.update(weight.data)
        x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                     groups=self.conv_module.groups)
        x = F.relu(x)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        # fused ReLU in the quantized domain
        x.clamp_(min=0)
        return x
class QConvBN(QModule):
    """Quantization-aware Conv2d + folded BatchNorm (no activation).

    Identical to QConvBNReLU except that no ReLU is applied.
    """
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBN, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    def fold_bn(self, mean, std):
        # Fold BN statistics (mean, std) into the conv weight/bias.
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
    def freeze(self, qi=None, qo=None):
        # Bake quantized folded parameters into conv_module; exactly one
        # source (constructor flag or argument) must supply each of qi/qo.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
        weight, bias = self.fold_bn(self.bn_module.running_mean, std)
        self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        # The folded bias may not exist on the conv; create the parameter then.
        if self.conv_module.bias is None:
            self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
                                                 bias, scale=self.qi.scale * self.qw.scale,
                                                 zero_point=0., qmax=self.bias_qmax,is_bias=True))
        else:
            self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                         bias, scale=self.qi.scale * self.qw.scale,
                                                         zero_point=0., qmax=self.bias_qmax,is_bias=True)
    def fakefreeze(self):
        # Restore FP32 weights/bias from the quantized values.
        self.conv_module.weight.data = self.qw.dequantize_tensor(self.conv_module.weight.data)
        self.conv_module.bias.data = dequantize_tensor(self.conv_module.bias.data,scale=self.qi.scale * self.qw.scale, zero_point=0.)
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        if self.training:
            # In training, recompute batch statistics from the raw conv output
            # and update the BN running estimates manually.
            y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
                         stride=self.conv_module.stride,
                         padding=self.conv_module.padding,
                         dilation=self.conv_module.dilation,
                         groups=self.conv_module.groups)
            y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
            y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
            # mean = y.mean(1)
            # var = y.var(1)
            mean = y.mean(1).detach()
            var = y.var(1).detach()
            self.bn_module.running_mean = \
                (1 - self.bn_module.momentum) * self.bn_module.running_mean + \
                self.bn_module.momentum * mean
            self.bn_module.running_var = \
                (1 - self.bn_module.momentum) * self.bn_module.running_var + \
                self.bn_module.momentum * var
        else:
            mean = Variable(self.bn_module.running_mean)
            var = Variable(self.bn_module.running_var)
        std = torch.sqrt(var + self.bn_module.eps)
        weight, bias = self.fold_bn(mean, std)
        self.qw.update(weight.data)
        x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                     groups=self.conv_module.groups)
        # x = F.relu(x)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        # if self.quant_type is 'INT':
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        # x.clamp_(min=0)
        return x
# TODO(review): confirm the qo handling here is complete, per the original note.
class QAdaptiveAvgPool2d(QModule):
    """Quantized global average pooling to a 1x1 output.

    Averaging is linear, so quantized inference only needs the rescale
    factor M = qi.scale / qo.scale.
    """
    def __init__(self, quant_type, qi=False, qo=True, num_bits=8, e_bits=3):
        super(QAdaptiveAvgPool2d, self).__init__(quant_type,qi,qo,num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    def freeze(self, qi=None, qo=None):
        # Exactly one source (constructor flag or argument) must supply qi/qo.
        if hasattr(self, 'qi'):
            if qi is not None:
                raise ValueError('qi has been provided in init function.')
        elif qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if hasattr(self, 'qo'):
            if qo is not None:
                raise ValueError('qo has been provided in init function.')
        elif qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qi.scale / self.qo.scale).data
    def forward(self, x):
        if hasattr(self, 'qi'):
            # Update qi's scale first, then fake-quantize x (usually the
            # previous layer's qo already quantized it).
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        # Quantizing input and output is considered sufficient for pooling.
        x = F.adaptive_avg_pool2d(x, (1, 1))
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        pooled = F.adaptive_avg_pool2d(x, (1, 1))
        rescaled = self.M * pooled
        return get_nearest_val(self.quant_type, rescaled)
class QConvBNReLU6(QModule):
    """Quantization-aware Conv2d + folded BatchNorm + ReLU6.

    BN statistics are folded into the conv weight/bias (fold_bn) before
    quantization; qw observes the folded weights.
    """
    def __init__(self, quant_type, conv_module, bn_module, qi=True, qo=True, num_bits=8, e_bits=3):
        super(QConvBNReLU6, self).__init__(quant_type, qi, qo, num_bits, e_bits)
        self.conv_module = conv_module
        self.bn_module = bn_module
        self.qw = QParam(quant_type, num_bits,e_bits)
        self.register_buffer('M', torch.tensor([], requires_grad=False)) # register M as a buffer
    def fold_bn(self, mean, std):
        # Fold BN statistics (mean, std) into the conv weight/bias.
        if self.bn_module.affine:
            gamma_ = self.bn_module.weight / std
            weight = self.conv_module.weight * gamma_.view(self.conv_module.out_channels, 1, 1, 1)
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean + self.bn_module.bias
            else:
                bias = self.bn_module.bias - gamma_ * mean
        else:
            gamma_ = 1 / std
            weight = self.conv_module.weight * gamma_
            if self.conv_module.bias is not None:
                bias = gamma_ * self.conv_module.bias - gamma_ * mean
            else:
                bias = -gamma_ * mean
        return weight, bias
    def freeze(self, qi=None, qo=None):
        # Bake quantized folded parameters into conv_module; exactly one
        # source (constructor flag or argument) must supply each of qi/qo.
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M.data = (self.qw.scale * self.qi.scale / self.qo.scale).data
        std = torch.sqrt(self.bn_module.running_var + self.bn_module.eps)
        weight, bias = self.fold_bn(self.bn_module.running_mean, std)
        self.conv_module.weight.data = self.qw.quantize_tensor(weight.data)
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point
        # Fix: convs fused with BN commonly have bias=None; create the
        # parameter in that case (consistent with QConvBNReLU / QConvBN,
        # which already guard this — the old code crashed on bias=None).
        if self.conv_module.bias is None:
            self.conv_module.bias = nn.Parameter(quantize_tensor(self.quant_type,
                                                 bias, scale=self.qi.scale * self.qw.scale,
                                                 zero_point=0., qmax=self.bias_qmax, is_bias=True))
        else:
            self.conv_module.bias.data = quantize_tensor(self.quant_type,
                                                         bias, scale=self.qi.scale * self.qw.scale,
                                                         zero_point=0., qmax=self.bias_qmax, is_bias=True)
    def fakefreeze(self):
        # Restore FP32 weights/bias from the quantized values.
        self.conv_module.weight.data = self.qw.dequantize_tensor(self.conv_module.weight.data)
        self.conv_module.bias.data = dequantize_tensor(self.conv_module.bias.data,scale=self.qi.scale * self.qw.scale, zero_point=0.)
    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)
        if self.training:
            # In training, recompute batch statistics from the raw conv output
            # and update the BN running estimates manually.
            y = F.conv2d(x, self.conv_module.weight, self.conv_module.bias,
                         stride=self.conv_module.stride,
                         padding=self.conv_module.padding,
                         dilation=self.conv_module.dilation,
                         groups=self.conv_module.groups)
            y = y.permute(1, 0, 2, 3) # NCHW -> CNHW
            y = y.contiguous().view(self.conv_module.out_channels, -1) # CNHW -> C,NHW
            mean = y.mean(1).detach()
            var = y.var(1).detach()
            self.bn_module.running_mean = \
                (1 - self.bn_module.momentum) * self.bn_module.running_mean + \
                self.bn_module.momentum * mean
            self.bn_module.running_var = \
                (1 - self.bn_module.momentum) * self.bn_module.running_var + \
                self.bn_module.momentum * var
        else:
            mean = Variable(self.bn_module.running_mean)
            var = Variable(self.bn_module.running_var)
        std = torch.sqrt(var + self.bn_module.eps)
        weight, bias = self.fold_bn(mean, std)
        self.qw.update(weight.data)
        x = F.conv2d(x, FakeQuantize.apply(weight, self.qw), bias,
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation,
                     groups=self.conv_module.groups)
        x = F.relu6(x)
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x):
        # Express the ReLU6 ceiling (6) in the output quantized domain.
        a = torch.tensor(6)
        a = self.qo.quantize_tensor(a)
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point # now in qo's domain
        x.clamp_(min=0, max=a.item())
        return x
class QModule_2(nn.Module):
    """Base for two-input quantized ops (e.g. elementwise add).

    qi0/qi1 observe the two input ranges and qo the output range; each
    observer already carries num_bits and the quantization mode.
    """
    def __init__(self,quant_type, qi0=True, qi1=True, qo=True, num_bits=8, e_bits=3):
        super(QModule_2, self).__init__()
        self.quant_type = quant_type
        self.num_bits = num_bits
        self.e_bits = e_bits
        self.bias_qmax = bias_qmax(quant_type)
        if qi0:
            self.qi0 = QParam(quant_type, num_bits, e_bits)
        if qi1:
            self.qi1 = QParam(quant_type, num_bits, e_bits)
        if qo:
            self.qo = QParam(quant_type, num_bits, e_bits)
    def freeze(self):
        # No-op by default; subclasses bake quantized parameters here.
        pass
    def fakefreeze(self):
        # No-op by default; subclasses restore FP32 parameters here.
        pass
    def quantize_inference(self, x):
        raise NotImplementedError('quantize_inference should be implemented.')
class QElementwiseAdd(QModule_2):
    """Quantized elementwise addition of two tensors.

    Integer inference uses x = M0 * ((x0 - Z0) + M1 * (x1 - Z1)) + Zo with
    M0 = qi0.scale / qo.scale and M1 = qi1.scale / qi0.scale.
    """
    def __init__(self, quant_type, qi0=True, qi1=True, qo=True, num_bits=8, e_bits=3):
        super(QElementwiseAdd, self).__init__(quant_type, qi0, qi1, qo, num_bits, e_bits)
        # rescale factors between input and output quantized domains
        self.register_buffer('M0', torch.tensor([], requires_grad=False))
        self.register_buffer('M1', torch.tensor([], requires_grad=False))
    def freeze(self, qi0=None, qi1=None, qo=None):
        # Exactly one source (constructor flag or argument) must supply each
        # observer. Fix: the original validated hasattr(self, 'qi') — an
        # attribute this class never has, so the qi0 checks were dead — and
        # checked the qi0 argument when validating qi1.
        if hasattr(self, 'qi0') and qi0 is not None:
            raise ValueError('qi0 has been provided in init function.')
        if not hasattr(self, 'qi0') and qi0 is None:
            raise ValueError('qi0 is not existed, should be provided.')
        if hasattr(self, 'qi1') and qi1 is not None:
            raise ValueError('qi1 has been provided in init function.')
        if not hasattr(self, 'qi1') and qi1 is None:
            raise ValueError('qi1 is not existed, should be provided.')
        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')
        if qi0 is not None:
            self.qi0 = qi0
        if qi1 is not None:
            self.qi1 = qi1
        if qo is not None:
            self.qo = qo
        # Coefficients per Eq. 3 of https://zhuanlan.zhihu.com/p/156835141
        self.M0.data = self.qi0.scale / self.qo.scale
        self.M1.data = self.qi1.scale / self.qi0.scale
    def forward(self, x0, x1): # forward pass; inputs are float tensors
        if hasattr(self, 'qi0'):
            self.qi0.update(x0)
            x0 = FakeQuantize.apply(x0, self.qi0) # fake-quantize input 0
        if hasattr(self, 'qi1'):
            self.qi1.update(x1)
            x1 = FakeQuantize.apply(x1, self.qi1) # fake-quantize input 1
        x = x0 + x1
        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)
        return x
    def quantize_inference(self, x0, x1): # inputs are already quantized
        x0 = x0 - self.qi0.zero_point
        x1 = x1 - self.qi1.zero_point
        x = self.M0 * (x0 + x1*self.M1)
        x = get_nearest_val(self.quant_type,x)
        x = x + self.qo.zero_point
        return x
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from get_weight import *
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
from torchvision.datasets import CIFAR10
from torch.optim.lr_scheduler import CosineAnnealingLR
from model import *
from torchvision.transforms import transforms
# import models
import time
import os
import argparse
# One full training pass over the loader, tracking gradient statistics.
def train(model, optimizer, criterion, train_loader, device):
    """Train `model` for one epoch.

    Returns:
        train_loss: mean loss over all batches.
        grads_sum: per-parameter gradient statistics (as produced by
            get_model_histogram) averaged over the batches.
    """
    model.train()
    running_loss = 0.0
    # None sentinel replaces the old integer `flag`; the old `cnt`, the
    # enumerate index and `histo` were unused and have been dropped.
    grads_sum = None
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        _histo, grads = get_model_histogram(model)
        if grads_sum is None:
            grads_sum = grads
        else:
            # accumulate per-parameter stats key by key (dict entries cannot
            # be updated by rebinding the original dict)
            for k in grads_sum:
                grads_sum[k] += grads[k]
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(train_loader)
    # convert accumulated totals into per-batch means
    for k, v in grads_sum.items():
        grads_sum[k] = v / len(train_loader)
    return train_loss, grads_sum
def evaluate(model, criterion, test_loader, device):
    """Compute top-1 accuracy (%) of `model` over `test_loader`.

    `criterion` is accepted for interface symmetry with train() but is not
    used here.
    """
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            logits = model(images)
            predictions = logits.data.max(1)[1]
            total += labels.size(0)
            correct += (predictions == labels).sum().item()
    return 100 * correct / total
# def get_children(model: torch.nn.Module):
# # get children form model!
# # 为了后续也能够更新参数,需要用nn.ModuleList来承载
# children = nn.ModuleList(model.children())
# # print(children)
# # 方便对其中的module进行后续的更新
# flatt_children = nn.ModuleList()
# # children = list(model.children())
# # flatt_children = nn.ModuleList()
# # flatt_children = []
# if len(children) == 0:
# # if model has no children; model is last child! :O
# return model
# else:
# # look for children from children... to the last child!
# for child in children:
# try:
# flatt_children.extend(get_children(child))
# except TypeError:
# flatt_children.append(get_children(child))
# # print(flatt_children)
# return flatt_children
if __name__ == "__main__":
    # torch.cuda.empty_cache()
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    parser.add_argument('-e','--epochs', default=100, type=int, metavar='EPOCHS', help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('-wd','--weight_decay',default=0.0001,type=float,metavar='WD',help='lr schduler weight decay',dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    # Training hyper-parameters
    args = parser.parse_args()
    num_epochs = args.epochs
    print(num_epochs)
    batch_size = args.batch_size
    print(batch_size)
    num_workers = args.workers
    lr = args.lr
    weight_decay = args.wd
    best_acc = float("-inf")
    start_time = time.time()
    # Model, loss function and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device selection
    print(device)
    # NOTE(review): the default '-m resnet18' (lowercase) matches none of the
    # branches below, leaving `model` unbound — confirm the intended values.
    if args.model == 'ResNet18' :
        model = resnet18().to(device)
    elif args.model == 'ResNet50' :
        model = resnet50().to(device)
    elif args.model == 'ResNet152' :
        model = resnet152().to(device)
    # elif args.model == 'LeNet' :
    #     model = LeNet().to(device)
    # elif args.model == 'NetBN' :
    #     model = NetBN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # optimizer = optim.AdaBound(model.parameters(), lr=lr,
    #                            weight_decay=weight_decay, final_lr=0.001*lr)
    # print("ok!")
    # Data parallelism across available GPUs
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs")
        model = nn.DataParallel(model)
    # Load data
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                         transform=transforms.Compose([
                             transforms.RandomCrop(32, padding=2),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(
                                 (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    # Learning-rate scheduler
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
    # TensorBoard
    # WARN
    # writer = SummaryWriter(log_dir='./project/p/models_log/trail/full_log')
    writer = SummaryWriter(log_dir='log/' + args.model + '/full_log')
    # Early-stopping parameters
    patience = 30
    count = 0
    # WARN
    # save_dir = './project/p/ckpt/trail'
    save_dir = 'ckpt'
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir, mode=0o777)
        os.chmod(save_dir, mode=0o777)
    # checkpoint_dir = './project/p/checkpoint/cifar-10_trail_model'
    checkpoint_dir = 'checkpoint'
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir, mode=0o777)
        os.chmod(checkpoint_dir, mode=0o777)
    # Training loop (or one-shot evaluation with -t)
    if args.test == True:
        model.load_state_dict(torch.load(save_dir + '/cifar10_' +args.model + '.pt'))
        acc = evaluate(model, criterion, test_loader, device=device)
        print(f"test accuracy: {acc:.2f}%")
        # for name, module in model.named_modules():
        #     print(f"{name}: {module}\n")
        # print('========================================================')
        # print('========================================================')
        # model.quantize()
        # for name , layer in model.quantize_layers.items():
        #     print(f"Layer {name}: {layer} ") # enough for traversal
    else:
        for epoch in range(num_epochs):
            # Train the model and record the loss
            train_loss,grads_sum = train(model, optimizer, criterion,
                                         train_loader, device=device)
            writer.add_scalar("Training Loss", train_loss, epoch + 1)
            # Evaluate the model and record the accuracy every 5 epochs
            if (epoch + 1) % 5 == 0:
                acc = evaluate(model, criterion, test_loader, device=device)
                writer.add_scalar("Validation Accuracy", acc, epoch + 1)
                checkpoint = {
                    # 'model': model.state_dict(),
                    # 'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'grads': grads_sum,
                    'accuracy':acc
                }
                # for name, param in model.named_parameters():
                #     writer.add_histogram(tag=name + '_grad', values=param.grad, global_step=epoch)
                #     writer.add_histogram(tag=name + '_data', values=param.data, global_step=epoch)
                for name, param in grads_sum.items():
                    # NOTE(review): is this grad an accumulated total rather
                    # than a mean? confirm against train()
                    writer.add_histogram(tag=name + '_grad', values=param, global_step=epoch)
                # Weights as they stand after the last batch of this epoch
                for name, param in model.named_parameters():
                    writer.add_histogram(tag=name + '_data', values=param.data, global_step=epoch)
                # WARN
                # torch.save(checkpoint, checkpoint_dir + '/ckpt_cifar-10_trail_model%s.pt' % (str(epoch+1)))
                torch.save(checkpoint, checkpoint_dir + '/cifar10_' + args.model + '_%s.pt' % (str(epoch+1)))
                # Keep the best model so far
                if acc > best_acc:
                    best_acc = acc
                    count = 0
                    # WARN
                    # torch.save(model.state_dict(), save_dir+'/model_trail.pt')
                    torch.save(model.state_dict(), save_dir + '/cifar10_' +args.model + '.pt')
                else:
                    count += 1
                print(
                    f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.5f}, Val Acc: {acc:.2f}%")
                # Decide whether early stopping should trigger
                if count == patience:
                    print(f"No improvement after {patience} epochs. Early stop!")
                    break
            # Update the learning rate
            lr_scheduler.step()
        # Total training time and best validation accuracy
        print(f"Training took {(time.time() - start_time) / 60:.2f} minutes")
        print(f"Best validation accuracy: {best_acc:.2f}%")
    # Load and test the best model
    # model.load_state_dict(torch.load("best_model.pth"))
    # model.to(device)
    # test_acc = evaluate(model, criterion, test_loader, device="cuda")
    # print(f"Test Accuracy: {test_acc:.2f}%")
    # Close the TensorBoard writer
    writer.close()
ResNet(
3.84 M, 104.771% Params, 245.82 MMac, 100.000% MACs,
(conv1): Conv2d(432, 0.012% Params, 442.37 KMac, 0.180% MACs, 3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 16.38 KMac, 0.007% MACs, )
(layer1): MakeLayer(
15.17 k, 0.414% Params, 15.83 MMac, 6.439% MACs,
(downsample): Sequential(
1.15 k, 0.031% Params, 1.18 MMac, 0.480% MACs,
(0): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(128, 0.003% Params, 131.07 KMac, 0.053% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
14.02 k, 0.382% Params, 14.65 MMac, 5.959% MACs,
(block1): Bottleneck(
4.93 k, 0.134% Params, 5.14 MMac, 2.093% MACs,
(conv1): Conv2d(256, 0.007% Params, 262.14 KMac, 0.107% MACs, 16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.063% Params, 2.36 MMac, 0.960% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.003% Params, 131.07 KMac, 0.053% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.040% MACs, )
(downsample): Sequential(
1.15 k, 0.031% Params, 1.18 MMac, 0.480% MACs,
(0): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(128, 0.003% Params, 131.07 KMac, 0.053% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
4.54 k, 0.124% Params, 4.75 MMac, 1.933% MACs,
(conv1): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 64, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.063% Params, 2.36 MMac, 0.960% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.003% Params, 131.07 KMac, 0.053% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.040% MACs, )
)
(block3): Bottleneck(
4.54 k, 0.124% Params, 4.75 MMac, 1.933% MACs,
(conv1): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 64, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.063% Params, 2.36 MMac, 0.960% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.001% Params, 32.77 KMac, 0.013% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.028% Params, 1.05 MMac, 0.427% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.003% Params, 131.07 KMac, 0.053% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.040% MACs, )
)
)
)
(layer2): MakeLayer(
157.18 k, 4.286% Params, 42.28 MMac, 17.199% MACs,
(downsample): Sequential(
8.45 k, 0.230% Params, 2.16 MMac, 0.880% MACs,
(0): Conv2d(8.19 k, 0.223% Params, 2.1 MMac, 0.853% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
148.74 k, 4.056% Params, 40.12 MMac, 16.320% MACs,
(block1): Bottleneck(
24.19 k, 0.660% Params, 7.89 MMac, 3.209% MACs,
(conv1): Conv2d(2.05 k, 0.056% Params, 2.1 MMac, 0.853% MACs, 64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 65.54 KMac, 0.027% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 73.73 KMac, 0.030% MACs, )
(downsample): Sequential(
8.45 k, 0.230% Params, 2.16 MMac, 0.880% MACs,
(0): Conv2d(8.19 k, 0.223% Params, 2.1 MMac, 0.853% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block3): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block4): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block5): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block6): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block7): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
(block8): Bottleneck(
17.79 k, 0.485% Params, 4.6 MMac, 1.873% MACs,
(conv1): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.251% Params, 2.36 MMac, 0.960% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.002% Params, 16.38 KMac, 0.007% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.112% Params, 1.05 MMac, 0.427% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.007% Params, 65.54 KMac, 0.027% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.020% MACs, )
)
)
)
(layer3): MakeLayer(
2.59 M, 70.699% Params, 168.43 MMac, 68.519% MACs,
(downsample): Sequential(
33.28 k, 0.907% Params, 2.13 MMac, 0.866% MACs,
(0): Conv2d(32.77 k, 0.894% Params, 2.1 MMac, 0.853% MACs, 128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
2.56 M, 69.791% Params, 166.3 MMac, 67.653% MACs,
(block1): Bottleneck(
95.49 k, 2.604% Params, 7.75 MMac, 3.151% MACs,
(conv1): Conv2d(8.19 k, 0.223% Params, 2.1 MMac, 0.853% MACs, 128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 32.77 KMac, 0.013% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 36.86 KMac, 0.015% MACs, )
(downsample): Sequential(
33.28 k, 0.907% Params, 2.13 MMac, 0.866% MACs,
(0): Conv2d(32.77 k, 0.894% Params, 2.1 MMac, 0.853% MACs, 128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block3): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block4): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block5): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block6): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block7): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block8): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block9): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block10): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block11): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block12): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block13): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block14): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block15): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block16): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block17): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block18): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block19): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block20): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block21): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block22): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block23): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block24): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block25): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block26): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block27): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block28): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block29): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block30): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block31): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block32): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block33): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block34): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block35): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
(block36): Bottleneck(
70.4 k, 1.920% Params, 4.53 MMac, 1.843% MACs,
(conv1): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 1.005% Params, 2.36 MMac, 0.960% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.003% Params, 8.19 KMac, 0.003% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 0.447% Params, 1.05 MMac, 0.427% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.014% Params, 32.77 KMac, 0.013% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.010% MACs, )
)
)
)
(layer4): MakeLayer(
1.07 M, 29.220% Params, 18.77 MMac, 7.637% MACs,
(downsample): Sequential(
132.1 k, 3.602% Params, 2.11 MMac, 0.860% MACs,
(0): Conv2d(131.07 k, 3.574% Params, 2.1 MMac, 0.853% MACs, 256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1.02 k, 0.028% Params, 16.38 KMac, 0.007% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
939.52 k, 25.618% Params, 16.66 MMac, 6.778% MACs,
(block1): Bottleneck(
379.39 k, 10.345% Params, 7.67 MMac, 3.122% MACs,
(conv1): Conv2d(32.77 k, 0.894% Params, 2.1 MMac, 0.853% MACs, 256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.007% Params, 16.38 KMac, 0.007% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 4.021% Params, 2.36 MMac, 0.960% MACs, 128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.007% Params, 4.1 KMac, 0.002% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 1.787% Params, 1.05 MMac, 0.427% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.028% Params, 16.38 KMac, 0.007% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 18.43 KMac, 0.007% MACs, )
(downsample): Sequential(
132.1 k, 3.602% Params, 2.11 MMac, 0.860% MACs,
(0): Conv2d(131.07 k, 3.574% Params, 2.1 MMac, 0.853% MACs, 256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1.02 k, 0.028% Params, 16.38 KMac, 0.007% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
280.06 k, 7.637% Params, 4.49 MMac, 1.828% MACs,
(conv1): Conv2d(65.54 k, 1.787% Params, 1.05 MMac, 0.427% MACs, 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.007% Params, 4.1 KMac, 0.002% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 4.021% Params, 2.36 MMac, 0.960% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.007% Params, 4.1 KMac, 0.002% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 1.787% Params, 1.05 MMac, 0.427% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.028% Params, 16.38 KMac, 0.007% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 12.29 KMac, 0.005% MACs, )
)
(block3): Bottleneck(
280.06 k, 7.637% Params, 4.49 MMac, 1.828% MACs,
(conv1): Conv2d(65.54 k, 1.787% Params, 1.05 MMac, 0.427% MACs, 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.007% Params, 4.1 KMac, 0.002% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 4.021% Params, 2.36 MMac, 0.960% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.007% Params, 4.1 KMac, 0.002% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 1.787% Params, 1.05 MMac, 0.427% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.028% Params, 16.38 KMac, 0.007% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 12.29 KMac, 0.005% MACs, )
)
)
)
(avgpool): AdaptiveAvgPool2d(0, 0.000% Params, 8.19 KMac, 0.003% MACs, output_size=(1, 1))
(fc): Linear(5.13 k, 0.140% Params, 5.13 KMac, 0.002% MACs, in_features=512, out_features=10, bias=True)
)
\ No newline at end of file
ResNet(
712.67 k, 101.597% Params, 35.92 MMac, 100.000% MACs,
(conv1): Conv2d(432, 0.062% Params, 442.37 KMac, 1.232% MACs, 3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.005% Params, 32.77 KMac, 0.091% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 16.38 KMac, 0.046% MACs, )
(layer1): MakeLayer(
9.34 k, 1.332% Params, 9.63 MMac, 26.822% MACs,
(blockdict): ModuleDict(
9.34 k, 1.332% Params, 9.63 MMac, 26.822% MACs,
(block1): BasicBlock(
4.67 k, 0.666% Params, 4.82 MMac, 13.411% MACs,
(conv1): Conv2d(2.3 k, 0.328% Params, 2.36 MMac, 6.569% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.005% Params, 32.77 KMac, 0.091% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.328% Params, 2.36 MMac, 6.569% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.005% Params, 32.77 KMac, 0.091% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 32.77 KMac, 0.091% MACs, )
)
(block2): BasicBlock(
4.67 k, 0.666% Params, 4.82 MMac, 13.411% MACs,
(conv1): Conv2d(2.3 k, 0.328% Params, 2.36 MMac, 6.569% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.005% Params, 32.77 KMac, 0.091% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.328% Params, 2.36 MMac, 6.569% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.005% Params, 32.77 KMac, 0.091% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 32.77 KMac, 0.091% MACs, )
)
)
)
(layer2): MakeLayer(
33.66 k, 4.799% Params, 8.65 MMac, 24.085% MACs,
(downsample): Sequential(
576, 0.082% Params, 147.46 KMac, 0.411% MACs,
(0): Conv2d(512, 0.073% Params, 131.07 KMac, 0.365% MACs, 16, 32, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
33.09 k, 4.717% Params, 8.5 MMac, 23.675% MACs,
(block1): BasicBlock(
14.53 k, 2.071% Params, 3.74 MMac, 10.400% MACs,
(conv1): Conv2d(4.61 k, 0.657% Params, 1.18 MMac, 3.284% MACs, 16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 1.314% Params, 2.36 MMac, 6.569% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 16.38 KMac, 0.046% MACs, )
(downsample): Sequential(
576, 0.082% Params, 147.46 KMac, 0.411% MACs,
(0): Conv2d(512, 0.073% Params, 131.07 KMac, 0.365% MACs, 16, 32, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): BasicBlock(
18.56 k, 2.646% Params, 4.77 MMac, 13.274% MACs,
(conv1): Conv2d(9.22 k, 1.314% Params, 2.36 MMac, 6.569% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 1.314% Params, 2.36 MMac, 6.569% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.009% Params, 16.38 KMac, 0.046% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 16.38 KMac, 0.046% MACs, )
)
)
)
(layer3): MakeLayer(
133.89 k, 19.087% Params, 8.59 MMac, 23.903% MACs,
(downsample): Sequential(
2.18 k, 0.310% Params, 139.26 KMac, 0.388% MACs,
(0): Conv2d(2.05 k, 0.292% Params, 131.07 KMac, 0.365% MACs, 32, 64, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
131.71 k, 18.777% Params, 8.45 MMac, 23.515% MACs,
(block1): BasicBlock(
57.73 k, 8.230% Params, 3.7 MMac, 10.309% MACs,
(conv1): Conv2d(18.43 k, 2.628% Params, 1.18 MMac, 3.284% MACs, 32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 5.255% Params, 2.36 MMac, 6.569% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 8.19 KMac, 0.023% MACs, )
(downsample): Sequential(
2.18 k, 0.310% Params, 139.26 KMac, 0.388% MACs,
(0): Conv2d(2.05 k, 0.292% Params, 131.07 KMac, 0.365% MACs, 32, 64, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): BasicBlock(
73.98 k, 10.547% Params, 4.74 MMac, 13.206% MACs,
(conv1): Conv2d(36.86 k, 5.255% Params, 2.36 MMac, 6.569% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 5.255% Params, 2.36 MMac, 6.569% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.018% Params, 8.19 KMac, 0.023% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 8.19 KMac, 0.023% MACs, )
)
)
)
(layer4): MakeLayer(
534.02 k, 76.129% Params, 8.55 MMac, 23.812% MACs,
(downsample): Sequential(
8.45 k, 1.204% Params, 135.17 KMac, 0.376% MACs,
(0): Conv2d(8.19 k, 1.168% Params, 131.07 KMac, 0.365% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
525.57 k, 74.924% Params, 8.42 MMac, 23.435% MACs,
(block1): BasicBlock(
230.14 k, 32.809% Params, 3.69 MMac, 10.264% MACs,
(conv1): Conv2d(73.73 k, 10.511% Params, 1.18 MMac, 3.284% MACs, 64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 21.021% Params, 2.36 MMac, 6.569% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 4.1 KMac, 0.011% MACs, )
(downsample): Sequential(
8.45 k, 1.204% Params, 135.17 KMac, 0.376% MACs,
(0): Conv2d(8.19 k, 1.168% Params, 131.07 KMac, 0.365% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): BasicBlock(
295.42 k, 42.115% Params, 4.73 MMac, 13.172% MACs,
(conv1): Conv2d(147.46 k, 21.021% Params, 2.36 MMac, 6.569% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 21.021% Params, 2.36 MMac, 6.569% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.036% Params, 4.1 KMac, 0.011% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 4.1 KMac, 0.011% MACs, )
)
)
)
(avgpool): AdaptiveAvgPool2d(0, 0.000% Params, 2.05 KMac, 0.006% MACs, output_size=(1, 1))
(fc): Linear(1.29 k, 0.184% Params, 1.29 KMac, 0.004% MACs, in_features=128, out_features=10, bias=True)
)
\ No newline at end of file
ResNet(
1.66 M, 111.789% Params, 91.5 MMac, 100.000% MACs,
(conv1): Conv2d(432, 0.029% Params, 442.37 KMac, 0.483% MACs, 3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 16.38 KMac, 0.018% MACs, )
(layer1): MakeLayer(
15.17 k, 1.022% Params, 15.83 MMac, 17.298% MACs,
(downsample): Sequential(
1.15 k, 0.078% Params, 1.18 MMac, 1.289% MACs,
(0): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(128, 0.009% Params, 131.07 KMac, 0.143% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
14.02 k, 0.944% Params, 14.65 MMac, 16.009% MACs,
(block1): Bottleneck(
4.93 k, 0.332% Params, 5.14 MMac, 5.623% MACs,
(conv1): Conv2d(256, 0.017% Params, 262.14 KMac, 0.287% MACs, 16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.155% Params, 2.36 MMac, 2.579% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.009% Params, 131.07 KMac, 0.143% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.107% MACs, )
(downsample): Sequential(
1.15 k, 0.078% Params, 1.18 MMac, 1.289% MACs,
(0): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(128, 0.009% Params, 131.07 KMac, 0.143% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
4.54 k, 0.306% Params, 4.75 MMac, 5.193% MACs,
(conv1): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 64, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.155% Params, 2.36 MMac, 2.579% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.009% Params, 131.07 KMac, 0.143% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.107% MACs, )
)
(block3): Bottleneck(
4.54 k, 0.306% Params, 4.75 MMac, 5.193% MACs,
(conv1): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 64, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(2.3 k, 0.155% Params, 2.36 MMac, 2.579% MACs, 16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(32, 0.002% Params, 32.77 KMac, 0.036% MACs, 16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1.02 k, 0.069% Params, 1.05 MMac, 1.146% MACs, 16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(128, 0.009% Params, 131.07 KMac, 0.143% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 98.3 KMac, 0.107% MACs, )
)
)
)
(layer2): MakeLayer(
86.02 k, 5.796% Params, 23.86 MMac, 26.081% MACs,
(downsample): Sequential(
8.45 k, 0.569% Params, 2.16 MMac, 2.364% MACs,
(0): Conv2d(8.19 k, 0.552% Params, 2.1 MMac, 2.292% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
77.57 k, 5.226% Params, 21.7 MMac, 23.718% MACs,
(block1): Bottleneck(
24.19 k, 1.630% Params, 7.89 MMac, 8.622% MACs,
(conv1): Conv2d(2.05 k, 0.138% Params, 2.1 MMac, 2.292% MACs, 64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.004% Params, 65.54 KMac, 0.072% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.621% Params, 2.36 MMac, 2.579% MACs, 32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 73.73 KMac, 0.081% MACs, )
(downsample): Sequential(
8.45 k, 0.569% Params, 2.16 MMac, 2.364% MACs,
(0): Conv2d(8.19 k, 0.552% Params, 2.1 MMac, 2.292% MACs, 64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
17.79 k, 1.199% Params, 4.6 MMac, 5.032% MACs,
(conv1): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.621% Params, 2.36 MMac, 2.579% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.054% MACs, )
)
(block3): Bottleneck(
17.79 k, 1.199% Params, 4.6 MMac, 5.032% MACs,
(conv1): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.621% Params, 2.36 MMac, 2.579% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.054% MACs, )
)
(block4): Bottleneck(
17.79 k, 1.199% Params, 4.6 MMac, 5.032% MACs,
(conv1): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(9.22 k, 0.621% Params, 2.36 MMac, 2.579% MACs, 32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, 0.004% Params, 16.38 KMac, 0.018% MACs, 32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(4.1 k, 0.276% Params, 1.05 MMac, 1.146% MACs, 32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, 0.017% Params, 65.54 KMac, 0.072% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 49.15 KMac, 0.054% MACs, )
)
)
)
(layer3): MakeLayer(
480.77 k, 32.393% Params, 32.53 MMac, 35.550% MACs,
(downsample): Sequential(
33.28 k, 2.242% Params, 2.13 MMac, 2.328% MACs,
(0): Conv2d(32.77 k, 2.208% Params, 2.1 MMac, 2.292% MACs, 128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
447.49 k, 30.150% Params, 30.4 MMac, 33.222% MACs,
(block1): Bottleneck(
95.49 k, 6.434% Params, 7.75 MMac, 8.465% MACs,
(conv1): Conv2d(8.19 k, 0.552% Params, 2.1 MMac, 2.292% MACs, 128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 32.77 KMac, 0.036% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 36.86 KMac, 0.040% MACs, )
(downsample): Sequential(
33.28 k, 2.242% Params, 2.13 MMac, 2.328% MACs,
(0): Conv2d(32.77 k, 2.208% Params, 2.1 MMac, 2.292% MACs, 128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
70.4 k, 4.743% Params, 4.53 MMac, 4.951% MACs,
(conv1): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.027% MACs, )
)
(block3): Bottleneck(
70.4 k, 4.743% Params, 4.53 MMac, 4.951% MACs,
(conv1): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.027% MACs, )
)
(block4): Bottleneck(
70.4 k, 4.743% Params, 4.53 MMac, 4.951% MACs,
(conv1): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.027% MACs, )
)
(block5): Bottleneck(
70.4 k, 4.743% Params, 4.53 MMac, 4.951% MACs,
(conv1): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.027% MACs, )
)
(block6): Bottleneck(
70.4 k, 4.743% Params, 4.53 MMac, 4.951% MACs,
(conv1): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(36.86 k, 2.484% Params, 2.36 MMac, 2.579% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, 0.009% Params, 8.19 KMac, 0.009% MACs, 64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(16.38 k, 1.104% Params, 1.05 MMac, 1.146% MACs, 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, 0.034% Params, 32.77 KMac, 0.036% MACs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 24.58 KMac, 0.027% MACs, )
)
)
)
(layer4): MakeLayer(
1.07 M, 72.202% Params, 18.77 MMac, 20.519% MACs,
(downsample): Sequential(
132.1 k, 8.900% Params, 2.11 MMac, 2.310% MACs,
(0): Conv2d(131.07 k, 8.831% Params, 2.1 MMac, 2.292% MACs, 256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1.02 k, 0.069% Params, 16.38 KMac, 0.018% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(blockdict): ModuleDict(
939.52 k, 63.302% Params, 16.66 MMac, 18.209% MACs,
(block1): Bottleneck(
379.39 k, 25.562% Params, 7.67 MMac, 8.387% MACs,
(conv1): Conv2d(32.77 k, 2.208% Params, 2.1 MMac, 2.292% MACs, 256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.017% Params, 16.38 KMac, 0.018% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 9.935% Params, 2.36 MMac, 2.579% MACs, 128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.017% Params, 4.1 KMac, 0.004% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 4.416% Params, 1.05 MMac, 1.146% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.069% Params, 16.38 KMac, 0.018% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 18.43 KMac, 0.020% MACs, )
(downsample): Sequential(
132.1 k, 8.900% Params, 2.11 MMac, 2.310% MACs,
(0): Conv2d(131.07 k, 8.831% Params, 2.1 MMac, 2.292% MACs, 256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1.02 k, 0.069% Params, 16.38 KMac, 0.018% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(block2): Bottleneck(
280.06 k, 18.870% Params, 4.49 MMac, 4.911% MACs,
(conv1): Conv2d(65.54 k, 4.416% Params, 1.05 MMac, 1.146% MACs, 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.017% Params, 4.1 KMac, 0.004% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 9.935% Params, 2.36 MMac, 2.579% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.017% Params, 4.1 KMac, 0.004% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 4.416% Params, 1.05 MMac, 1.146% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.069% Params, 16.38 KMac, 0.018% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 12.29 KMac, 0.013% MACs, )
)
(block3): Bottleneck(
280.06 k, 18.870% Params, 4.49 MMac, 4.911% MACs,
(conv1): Conv2d(65.54 k, 4.416% Params, 1.05 MMac, 1.146% MACs, 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, 0.017% Params, 4.1 KMac, 0.004% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(147.46 k, 9.935% Params, 2.36 MMac, 2.579% MACs, 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, 0.017% Params, 4.1 KMac, 0.004% MACs, 128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(65.54 k, 4.416% Params, 1.05 MMac, 1.146% MACs, 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1.02 k, 0.069% Params, 16.38 KMac, 0.018% MACs, 512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(0, 0.000% Params, 12.29 KMac, 0.013% MACs, )
)
)
)
(avgpool): AdaptiveAvgPool2d(0, 0.000% Params, 8.19 KMac, 0.009% MACs, output_size=(1, 1))
(fc): Linear(5.13 k, 0.346% Params, 5.13 KMac, 0.006% MACs, in_features=512, out_features=10, bias=True)
)
\ No newline at end of file
# -*- coding: utf-8 -*-
from torch.serialization import load
from model import *
from extract_ratio import *
from utils import *
import gol
import openpyxl
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import torch.utils.bottleneck as bn
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def direct_quantize(model, test_loader,device):
    """Run calibration forward passes through the quantized graph.

    Feeds at most 500 batches from ``test_loader`` into
    ``model.quantize_forward`` so the quantization parameters
    (scales / zero points) can be observed. Results are discarded.
    """
    batches_done = 0
    for data, _ in test_loader:
        batches_done += 1
        # forward pass is only needed for its statistics side effects
        model.quantize_forward(data.to(device)).cpu()
        if batches_done % 500 == 0:
            # cap calibration at 500 batches
            break
    print('direct quantization finish')
def full_inference(model, test_loader, device):
    """Evaluate the full-precision model on ``test_loader``.

    Returns the top-1 accuracy as a percentage of the whole dataset.
    """
    hits = 0
    for data, target in test_loader:
        logits = model(data.to(device)).cpu()
        prediction = logits.argmax(dim=1, keepdim=True)
        hits += prediction.eq(target.view_as(prediction)).sum().item()
    accuracy = 100. * hits / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def quantize_inference(model, test_loader, device):
    """Evaluate the frozen quantized model on ``test_loader``.

    Returns the top-1 accuracy as a percentage of the whole dataset.
    """
    hits = 0
    for data, target in test_loader:
        logits = model.quantize_inference(data.to(device)).cpu()
        prediction = logits.argmax(dim=1, keepdim=True)
        hits += prediction.eq(target.view_as(prediction)).sum().item()
    accuracy = 100. * hits / len(test_loader.dataset)
    print('Test set: Quant Model Accuracy: {:.2f}%'.format(accuracy))
    return accuracy
def js_div(p_output, q_output, get_softmax=True):
    """
    Function that measures JS divergence between target and output logits.

    JS(p, q) = (KL(p || m) + KL(q || m)) / 2 with m = (p + q) / 2.
    When ``get_softmax`` is True the inputs are treated as logits and
    converted to probabilities first. Returns a scalar tensor >= 0
    (0 iff the two distributions are identical).
    """
    KLDivLoss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # explicit dim: calling F.softmax without `dim` is deprecated and
        # relies on ambiguous implicit-dimension selection
        p_output = F.softmax(p_output, dim=-1)
        q_output = F.softmax(q_output, dim=-1)
    log_mean_output = ((p_output + q_output)/2).log()
    # KLDivLoss expects (log-probabilities, probabilities)
    return (KLDivLoss(log_mean_output, p_output) + KLDivLoss(log_mean_output, q_output))/2
if __name__ == "__main__":
    # PTQ sweep driver: evaluates a trained FP32 ResNet on CIFAR-10, then
    # post-training-quantizes it for every (quant_type, num_bits, e_bits)
    # combination and records accuracy loss plus JS-divergence metrics.
    parser = argparse.ArgumentParser(description='PyTorch FP32 Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='ResNet18')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=4, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-s', '--save', default=False, type=bool)
    # parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    # run parameters
    args = parser.parse_args()
    batch_size = args.batch_size
    num_workers = args.workers
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    # CIFAR-10 loaders; the train split is only used here for quantization calibration
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=False,
                        transform=transforms.Compose([
                            transforms.RandomCrop(32, padding=2),
                            transforms.RandomHorizontalFlip(),
                            transforms.ToTensor(),
                            transforms.Normalize(
                                (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                        ])),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, download=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )
    # model = AlexNet_BN()
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    writer = SummaryWriter(log_dir='log/' + args.model + '/ptq')
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    model.to(device)
    load_ptq = False
    ptq_file_prefix = 'ckpt/cifar10_' + args.model + '_ptq_'
    model.eval()
    full_acc = full_inference(model, test_loader, device)
    model_fold = fold_model(model) #
    full_params = []
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # print(layer)
    # NOTE(review): the `layer` list from extract_ratio is discarded and rebuilt
    # from the model's parameter names below — presumably intentional; confirm
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            # extract the layer name preceding 'weight' (guarding on 'weight'
            # avoids extracting the same layer name a second time for bias)
            layer.append(pre)
            # print(name)
    print('===================')
    # print(layer)
    par_ratio, flop_ratio = fold_ratio(layer, par_ratio, flop_ratio)
    # sys.exit()
    for name, param in model_fold.named_parameters():
        if 'bn' in name or 'sample.1' in name:
            continue
        # param_norm = F.normalize(param.data.cpu(),p=2,dim=-1)
        param_norm = param.data.cpu()
        full_params.append(param_norm) # only folded conv params are collected; BN params are excluded
        writer.add_histogram(tag='Full_' + name + '_data', values=param.data)
    gol._init()
    quant_type_list = ['INT','POT','FLOAT']
    title_list = []
    js_flops_list = []
    js_param_list = []
    ptq_acc_list = []
    acc_loss_list = []
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # for one quantization category the bias quantization table only needs to be set once
        # for INT the bit width is large, a lookup table would be too expensive, so it uses _round directly
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                # model_ptq = resnet18()
                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                print('\nPTQ: '+title)
                title_list.append(title)
                # set the quantization value table
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # decide whether a previously saved PTQ checkpoint can be loaded
                if load_ptq is True and osp.exists(ptq_file_prefix + title + '.pt'):
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.load_state_dict(torch.load(ptq_file_prefix + title + '.pt'))
                    model_ptq.to(device)
                    print('Successfully load ptq model: ' + title)
                else:
                    model_ptq.load_state_dict(torch.load(full_file))
                    model_ptq.to(device)
                    model_ptq.quantize(quant_type,num_bits,e_bits)
                    model_ptq.eval()
                    direct_quantize(model_ptq, train_loader, device)
                    # if args.save == True:
                    # torch.save(model_ptq.state_dict(), ptq_file_prefix + title + '.pt')
                model_ptq.freeze()
                ptq_acc = quantize_inference(model_ptq, test_loader, device)
                ptq_acc_list.append(ptq_acc)
                acc_loss = (full_acc - ptq_acc) / full_acc
                acc_loss_list.append(acc_loss)
                idx = -1
                # JS divergence weighted by per-layer FLOPs / parameter-count ratios
                js_flops = 0.
                js_param = 0.
                for name, param in model_ptq.named_parameters():
                    # if '.' not in name or 'bn' in name:
                    if 'bn' in name or 'sample.1' in name:
                        continue
                    writer.add_histogram(tag=title +':'+ name + '_data', values=param.data)
                    idx = idx + 1
                    # resnet names contain several '.'; re-join all but the last component
                    # prefix = name.split('.')[0]
                    n = name.split('.')
                    prefix = '.'.join(n[:len(n) - 1])
                    # weight and bias are weighted 1:1 here; the ratio is assigned per layer,
                    # one could weight weight/bias differently (e.g. 8:2)
                    if prefix in layer:
                        layer_idx = layer.index(prefix)
                        ptq_param = param.data.cpu()
                        # L2 normalization (disabled)
                        # ptq_norm = F.normalize(ptq_param,p=2,dim=-1)
                        ptq_norm = ptq_param
                        writer.add_histogram(tag=title +':'+ name + '_data', values=ptq_param)
                        # print(name)
                        # print('=========')
                        # print(ptq_norm)
                        # print('=========')
                        # print(full_params[idx])
                        js = js_div(ptq_norm,full_params[idx]) # JS distance between the folded model before/after quantization
                        js = js.item()
                        if js < 0.:
                            js = 0.
                        js_flops = js_flops + js * flop_ratio[layer_idx]
                        js_param = js_param + js * par_ratio[layer_idx]
                js_flops_list.append(js_flops)
                js_param_list.append(js_param)
                print(title + ': js_flops: %f js_param: %f acc_loss: %f' % (js_flops, js_param, acc_loss))
    # write results to xlsx
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.cell(row=1,column=1,value='FP32-acc')
    worksheet.cell(row=1,column=2,value=full_acc)
    worksheet.cell(row=3,column=1,value='title')
    worksheet.cell(row=3,column=2,value='js_flops')
    worksheet.cell(row=3,column=3,value='js_param')
    worksheet.cell(row=3,column=4,value='ptq_acc')
    worksheet.cell(row=3,column=5,value='acc_loss')
    for i in range(len(title_list)):
        worksheet.cell(row=i+4, column=1, value=title_list[i])
        worksheet.cell(row=i+4, column=2, value=js_flops_list[i])
        worksheet.cell(row=i+4, column=3, value=js_param_list[i])
        worksheet.cell(row=i+4, column=4, value=ptq_acc_list[i])
        worksheet.cell(row=i+4, column=5, value=acc_loss_list[i])
    workbook.save('ptq_result_' + args.model + '.xlsx')
    writer.close()
    # plain-text dump of the same result lists
    ft = open('ptq_result_' + args.model + '.txt','w')
    print('title_list:',file=ft)
    print(" ".join(title_list),file=ft)
    print('js_flops_list:',file=ft)
    print(" ".join(str(i) for i in js_flops_list), file=ft)
    print('js_param_list:',file=ft)
    print(" ".join(str(i) for i in js_param_list), file=ft)
    print('ptq_acc_list:',file=ft)
    print(" ".join(str(i) for i in ptq_acc_list), file=ft)
    print('acc_loss_list:',file=ft)
    print(" ".join(str(i) for i in acc_loss_list), file=ft)
    ft.close()
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
23.751628696034675 9.970130122904623 1.27369342087768 0.28313082786825217 0.0661330799143482 0.015909788749053112 0.003919561637024714 0.000976824876513856 0.0002478207988959643 6.729750617685151e-05 2.465999254564852e-05 1.4437640662014585e-05 1.753492282816915e-05 1.6118921091019203e-05 1.0942083041102445e-05 19.360958236249306 3.720049772121422 1.203861698176446 1.205551302740704 1.1875784274865653 1.197261990471644 1.187304920096399 3.9054602854075413 1.359654720924604 0.6455689626848736 0.738021827869755 0.21746836151979718 0.24024395039789773 0.5416934888086916 0.1090641843494709 0.062104454894076684 0.24001038871414732 0.4679728318331829 0.07558863061735592 0.015767804836310184 0.062021724817765404 0.24001037755982702 0.43631459469295775 0.06362924454198643 0.003988977252631458 0.015714696763899187 0.06202157326324543 0.24307858155352763
js_param_list:
20.85553941977874 9.981209988545288 1.3539305429512496 0.30139434372520046 0.07038894495212882 0.016983314589243823 0.004172975453783345 0.0010420360112590532 0.00026273648166581423 7.17926315777753e-05 2.4605053810581976e-05 1.317531321555632e-05 1.9623210180810698e-05 1.747944047326041e-05 1.170199482974471e-05 17.390004975267495 3.8124967157643033 1.0178092545455621 1.0125712143846217 0.9928878407387453 1.0125745208871095 0.995760463684502 3.5930026564803716 1.1991610585566574 0.6596486829871696 0.6274086902313186 0.2277369707300968 0.20674715542055963 0.45375203785887974 0.11819179575270293 0.05360085504661807 0.20648789904506443 0.3899066809035698 0.0837931514468961 0.013553482490574238 0.05351237083520507 0.206488208407236 0.3632293653938247 0.07115836515515826 0.003443783278898041 0.01349690316709514 0.0535122559122496 0.20908980476129443
ptq_acc_list:
10.0 10.0 11.86 46.5 75.15 88.16 90.38 90.67 90.81 90.78 90.81 90.82 90.79 90.77 90.77 10.0 10.32 16.21 18.59 17.49 17.34 18.47 13.0 10.63 33.64 21.79 46.89 72.72 36.71 68.56 88.48 76.4 52.7 58.52 90.32 88.99 75.24 51.23 57.34 90.59 90.41 88.78 70.99
acc_loss_list:
0.8898314421064228 0.8898314421064228 0.8693400903382175 0.48771620579486613 0.17208328742976745 0.028753993610223638 0.004296573757849516 0.001101685578935709 -0.0004406742315743776 -0.00011016855789363355 -0.0004406742315743776 -0.0005508427894678545 -0.0002203371157872671 0.0 0.0 0.8898314421064228 0.8863060482538283 0.8214167676545114 0.79519665087584 0.8073151922441336 0.8089677206125372 0.7965186735705629 0.8567808747383496 0.8828908229591275 0.6293929712460063 0.7599427123498953 0.4834196320370166 0.19885424699790677 0.5955712239726781 0.24468436708163485 0.025228599757629085 0.1583122176930703 0.41941169990084826 0.3552935992067863 0.0049575851052110044 0.01961000330505675 0.17109177040872536 0.43560647791120416 0.36829348903822845 0.0019830340420843077 0.003966068084168772 0.021923543020821803 0.21791340751349567
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
17.39068197353568 4.992727157836896 0.7082449333226905 0.15278217922691148 0.035902245013520316 0.008569443717644162 0.0021334381003386507 0.000530096122199641 0.00013202739139047992 4.320540852718224e-05 1.330992682049459e-05 2.0368878546145547e-05 8.560127359019233e-06 1.5414435348480747e-05 1.034017508399343e-05 13.592069723232997 2.603415308985623 1.0163834751275276 1.0004914009898496 1.0267242270943244 1.0377851066533914 1.0093247382340054 2.4882911516734794 0.8719669278759606 0.39824198801738997 0.48712327345618334 0.13437272845045903 0.15283618430614965 0.36366539038391377 0.067001336425203 0.04023162172835145 0.15278158008432788 0.3153947829963702 0.04619076816233325 0.010272439305968415 0.040191841741053155 0.15277940050779587 0.2944708799554367 0.03873975046512612 0.002562897125930476 0.010259629159672047 0.040194168479249566 0.16940471118115202
js_param_list:
13.75589548965823 6.139934297648725 0.9983266223973442 0.21685218331277975 0.05073819961183949 0.012236724314191861 0.0030479968659331298 0.0007642922990751943 0.00018892417520173234 6.914265082028416e-05 2.0654949731546248e-05 1.2782403100869643e-05 9.44194520770214e-06 1.7287066209449194e-05 1.2478257919601581e-05 13.5254986055193 3.463308629604464 1.160512064451701 1.1084529894485604 1.193382464700844 1.2041072939829902 1.2033002903326413 2.4474367498054836 0.7570219992909055 0.5246367584038191 0.3844942455969616 0.18715129322472332 0.13583544042939869 0.2740889994593467 0.10076411331478825 0.03570359182314611 0.13571819932622736 0.2344246202009843 0.07290873555706959 0.008973915201922977 0.03566098435431149 0.13571299092732977 0.21809584818526528 0.0623769948948551 0.00226774688205569 0.00895695770659421 0.03566482115654845 0.18569380791656256
ptq_acc_list:
10.0 9.97 23.88 76.77 87.61 88.95 89.11 89.49 89.46 89.51 89.44 89.49 89.5 89.49 89.48 10.0 10.2 41.62 46.35 41.57 33.14 18.97 11.7 27.9 58.44 63.17 81.54 80.64 71.73 85.63 88.05 80.84 76.92 86.23 88.79 87.8 81.29 79.62 86.98 89.18 89.08 88.03 82.63
acc_loss_list:
0.8882431828341529 0.8885784532856504 0.7331247206079572 0.14204291461779175 0.02089852481001346 0.00592311130978991 0.004135002235136394 -0.00011175681716574548 0.00022351363433180857 -0.00033527045149755406 0.0004470272686634583 -0.00011175681716574548 -0.00022351363433164976 -0.00011175681716574548 0.0 0.8882431828341529 0.8860080464908359 0.5348681269557444 0.4820071524362986 0.5354269110415736 0.6296379079123826 0.787997317836388 0.8692445239159589 0.6881984801072866 0.3468931604827895 0.2940321859633438 0.08873491282968259 0.09879302637460889 0.19836835046937862 0.04302637460885123 0.015981224854716213 0.09655789003129191 0.140366562360304 0.03632096557890031 0.007711220384443425 0.018775145283862392 0.09152883325882875 0.11019222172552524 0.027939204291461777 0.0033527045149753815 0.004470272686633948 0.016204738489047864 0.07655341975860537
title_list:
INT_2 INT_3 INT_4 INT_5 INT_6 INT_7 INT_8 INT_9 INT_10 INT_11 INT_12 INT_13 INT_14 INT_15 INT_16 POT_2 POT_3 POT_4 POT_5 POT_6 POT_7 POT_8 FLOAT_3_E1 FLOAT_4_E1 FLOAT_4_E2 FLOAT_5_E1 FLOAT_5_E2 FLOAT_5_E3 FLOAT_6_E1 FLOAT_6_E2 FLOAT_6_E3 FLOAT_6_E4 FLOAT_7_E1 FLOAT_7_E2 FLOAT_7_E3 FLOAT_7_E4 FLOAT_7_E5 FLOAT_8_E1 FLOAT_8_E2 FLOAT_8_E3 FLOAT_8_E4 FLOAT_8_E5 FLOAT_8_E6
js_flops_list:
33.51452234870289 11.491148760988082 1.443232369683574 0.30895904466985846 0.07212736726486271 0.017521603310707724 0.004322850749958015 0.0010654208648101923 0.00027006499577500253 7.161059028231077e-05 2.538806909059987e-05 2.055073568547306e-05 1.620682115453542e-05 1.1986353787221355e-05 9.067183894872267e-06 30.863645293169988 4.784097662511332 1.6646070424398351 1.6585560614298085 1.6573939189141602 1.678128465984322 1.696152544154087 5.78339268163358 2.1114062743040702 0.8425751347317033 1.2052541601543698 0.2767788178149355 0.3748657360498163 0.9081736813090305 0.13299755106486685 0.09626535561783886 0.37471558677670597 0.7918321592327399 0.08953626957132739 0.024398799135561436 0.0962213077547234 0.3747139603715501 0.7410242796157969 0.07442835219519522 0.0061065013202944015 0.024361557223410733 0.09622132719129213 0.3932453051939647
js_param_list:
33.721554105756816 13.269154166033433 1.877153395798168 0.4076554546710941 0.09554313376446293 0.023148392970637945 0.005675039691374743 0.001426682819835179 0.00036345209532231146 9.38985449811094e-05 2.837714142247516e-05 3.191036192167962e-05 2.277355467755515e-05 1.0829856218760036e-05 8.09611746277672e-06 33.383130206500184 5.912517637356709 1.725167775811526 1.6936075456198305 1.7079693950603978 1.745066474153093 1.7771297088256834 6.363561522947833 2.233996327924317 1.0358764351056118 1.245546534684258 0.3518593191648724 0.3809527745802683 0.9287923064091098 0.17743315049465594 0.09900998360709243 0.38070531858108136 0.8063356331707637 0.12348247277252783 0.024949418674407576 0.09893290024757509 0.3807042031089577 0.7535293303127147 0.1040074579869313 0.006337410559870987 0.02488997718210982 0.09893348151517127 0.4220865848581757
ptq_acc_list:
10.0 10.03 14.16 73.84 88.41 89.87 90.3 90.48 90.45 90.54 90.53 90.54 90.58 90.57 90.56 10.0 10.93 22.69 23.64 17.97 24.65 23.01 15.05 15.76 45.01 26.09 75.66 75.82 41.03 84.68 87.95 78.47 49.61 86.32 90.14 88.01 77.2 54.96 87.52 90.17 90.02 88.25 78.85
acc_loss_list:
0.8895759717314488 0.8892446996466431 0.8436395759717314 0.18462897526501765 0.023741166077738577 0.00761925795053001 0.0028710247349823886 0.000883392226148391 0.0012146643109540573 0.00022084805653705853 0.00033127208480566626 0.00022084805653705853 -0.00022084805653705853 -0.0001104240282684508 0.0 0.8895759717314488 0.8793065371024734 0.7494478798586572 0.7389575971731449 0.8015680212014135 0.7278047703180212 0.7459143109540636 0.8338118374558304 0.8259717314487632 0.5029814487632509 0.7119037102473498 0.1645318021201414 0.16276501766784462 0.5469302120141343 0.06492932862190808 0.028820671378091866 0.13350265017667848 0.4521863957597173 0.04681978798586582 0.004637809187279171 0.028158127208480533 0.14752650176678445 0.3931095406360424 0.03356890459363964 0.0043065371024735045 0.005962897526501836 0.02550795053003536 0.12930653710247358
from model import *
from utils import *
import gol
import sys
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
def quantize_aware_training(model, device, train_loader, optimizer, epoch):
    """Run one quantization-aware-training epoch.

    Accumulates the loss and every parameter's gradient across batches
    and returns their per-batch averages.

    Returns:
        (loss_avg, grad_dict): mean loss tensor over all batches, and a
        dict mapping parameter name -> mean gradient tensor.
    """
    lossLayer = torch.nn.CrossEntropyLoss()
    # accumulators for the loss and every parameter's gradient
    loss_sum = 0.
    grad_dict = {}
    # iterate over all weights and biases; param.grad has the same shape as param
    for name,param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param)
    for batch_idx, (data, target) in enumerate(train_loader, 1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model.quantize_forward(data)
        # the loss for one batch is already a mean over its samples
        loss = lossLayer(output, target)
        loss.backward()
        # detach: accumulating the live loss tensor would keep every batch's
        # autograd graph alive and grow memory over the epoch
        loss_sum += loss.detach()
        for name,param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Quantize Aware Training Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    # number of batches in the epoch (not the per-batch sample count)
    num_batches = len(train_loader.batch_sampler)
    # average the per-batch accumulations
    for name,grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict
def full_inference(model, test_loader, device=None):
    """Evaluate the full-precision model and return top-1 accuracy (%).

    Bug fix: the original read a module-level ``device`` global that only
    exists when this file runs as a script, making the function fail when
    imported. ``device`` is now an optional parameter that defaults to the
    device of the model's own parameters, so existing two-argument callers
    keep working.
    """
    if device is None:
        device = next(model.parameters()).device
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%\n'.format(acc))
    # returning the accuracy is new but backward-compatible (was None)
    return acc
def train(model, device, train_loader, optimizer, epoch):
    """Run one FP32 training epoch.

    Accumulates the loss and every parameter's gradient across batches
    and returns their per-batch averages.

    Returns:
        (loss_avg, grad_dict): mean loss tensor over all batches, and a
        dict mapping parameter name -> mean gradient tensor.
    """
    model.train()
    lossLayer = torch.nn.CrossEntropyLoss()
    # accumulators for the loss and every parameter's gradient
    loss_sum = 0.
    grad_dict = {}
    for name,param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param) # param.grad has the same shape as param
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = lossLayer(output, target)
        loss.backward()
        # detach: accumulating the live loss tensor would keep every batch's
        # autograd graph alive and grow memory over the epoch
        loss_sum += loss.detach()
        for name,param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad # per-batch accumulation
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    num_batches = len(train_loader.batch_sampler) # number of batches, not samples per batch
    # average the per-batch accumulations
    for name,grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict
def quantize_inference(model, test_loader):
    """Run the model's quantized inference path and print test-set accuracy.

    Uses the module-level ``device`` for tensor placement; returns None.
    """
    hits = 0
    for _, (inputs, labels) in enumerate(test_loader, 1):
        inputs, labels = inputs.to(device), labels.to(device)
        scores = model.quantize_inference(inputs)
        guess = scores.argmax(dim=1, keepdim=True)
        hits += guess.eq(labels.view_as(guess)).sum().item()
    print('\nTest set: Quant Model Accuracy: {:.2f}%\n'.format(100. * hits / len(test_loader.dataset)))
if __name__ == "__main__":
    # QAT experiment driver: trains a full-precision ResNet from scratch while
    # logging per-epoch loss and gradient statistics, then repeats training
    # with quantization-aware training for every (quant_type, num_bits, e_bits)
    # configuration, checkpointing every 5 epochs.
    parser = argparse.ArgumentParser(description='QAT Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    parser.add_argument('-e','--epochs', default=20, type=int, metavar='EPOCHS', help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=1, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('-wd','--weight_decay',default=0.0001,type=float,metavar='WD',help='lr schduler weight decay',dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    args = parser.parse_args()
    batch_size = args.batch_size
    seed = 1
    epochs = args.epochs
    lr = args.lr
    # momentum = 0.5
    weight_decay = args.wd
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    # CIFAR-10 loaders with per-channel normalization.
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    # NOTE(review): the default '-m resnet18' (lowercase) matches none of the
    # branches below, which would leave `model` undefined -- confirm the
    # intended architecture names.
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    # NOTE(review): `writer` was already created above with the same log_dir;
    # this second SummaryWriter looks redundant.
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    # full_file = 'ckpt/cifar10_' + args.model + '.pt'
    # model.load_state_dict(torch.load(full_file))
    model.to(device)
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    load_qat = False
    ckpt_prefix = 'ckpt/qat/'+ args.model + '/'
    # Epoch-level accumulators for loss and per-parameter gradients.
    loss_sum = 0.
    grad_dict_sum = {}
    grad_dict_avg = {}
    for name,param in model.named_parameters():
        grad_dict_sum[name] = torch.zeros_like(param)
        grad_dict_avg[name] = torch.zeros_like(param)
    # Full-precision training from scratch.
    for epoch in range(1, epochs+1):
        # Train the original model and collect its gradient distribution.
        loss,grad_dict = train(model, device, train_loader, optimizer, epoch)
        # print('loss:%f' % loss_avg)
        writer.add_scalar('Full.loss',loss,epoch)
        for name,grad in grad_dict.items():
            writer.add_histogram('Full.'+name+'_grad',grad,global_step=epoch)
        loss_sum += loss
        loss_avg = loss_sum / epoch
        for name,grad in grad_dict.items():
            grad_dict_sum[name] += grad_dict[name]  # accumulate over epochs
            grad_dict_avg[name] = grad_dict_sum[name] / epoch  # running average over epochs
        ckpt = {
            'epoch' : epoch,
            'loss' : loss,
            'loss_sum' : loss_sum,
            'loss_avg' : loss_avg,
            'grad_dict_avg' : grad_dict_avg
        }
        # Checkpoint every 5 epochs into ckpt/qat/<arch>/epoch_<n>/full.pt.
        if epoch % 5 == 0:
            subdir = 'epoch_%d/' % epoch
            if not os.path.isdir(ckpt_prefix+ subdir):
                os.makedirs(ckpt_prefix+ subdir, mode=0o777)
                os.chmod(ckpt_prefix+ subdir, mode=0o777)
            torch.save(ckpt,ckpt_prefix+ subdir +'full.pt')
        lr_scheduler.step()
    # loss_avg,grad_dict = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
    # print('qat_loss:%f' % loss_avg)
    # for name,grad in grad_dict.items():
    #     writer.add_histogram('qat_'+name+'_grad',grad,global_step=epoch)
    # QAT from scratch
    quant_type_list = ['INT','POT','FLOAT']
    gol._init()
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # For each quantization family the bias quantization table only needs
        # to be set once. INT bit-widths are large enough that a lookup table
        # would be too costly, so INT values are rounded directly instead.
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                # Skip configurations that already have a final checkpoint.
                if load_qat is True and osp.exists(ckpt_prefix+'epoch_20/'+title+'.pt'):
                    continue
                print('\nQAT: '+title)
                # model_ptq = AlexNet()
                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()
                # Install the quantization value table for this configuration.
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # model_ptq.load_state_dict(torch.load(full_file))
                model_ptq.to(device)
                model_ptq.quantize(quant_type,num_bits,e_bits)
                model_ptq.train()
                loss_sum = 0.
                grad_dict_sum = {}
                grad_dict_avg = {}
                # NOTE(review): accumulators are keyed by `model`'s parameter
                # names rather than `model_ptq`'s -- correct only if both
                # models expose identical parameter names; confirm. Also, the
                # Adam optimizer created above holds `model.parameters()`,
                # not `model_ptq`'s -- verify which parameters QAT steps.
                for name,param in model.named_parameters():
                    grad_dict_sum[name] = torch.zeros_like(param)
                    grad_dict_avg[name] = torch.zeros_like(param)
                for epoch in range(1, epochs+1):
                    loss,grad_dict = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
                    # print('loss:%f' % loss_avg)
                    writer.add_scalar(title+'.loss',loss,epoch)
                    for name,grad in grad_dict.items():
                        writer.add_histogram(title+'.'+name+'_grad',grad,global_step=epoch)
                    loss_sum += loss
                    loss_avg = loss_sum / epoch
                    # NOTE(review): iterates `model` (full-precision), not
                    # `model_ptq`, while indexing grads from the QAT run.
                    for name,param in model.named_parameters():
                        grad_dict_sum[name] += grad_dict[name]
                        grad_dict_avg[name] = grad_dict_sum[name] / epoch
                    ckpt = {
                        'epoch' : epoch,
                        'loss' : loss,
                        'loss_sum' : loss_sum,
                        'loss_avg' : loss_avg,
                        # 'grad_dict' : grad_dict,
                        # 'grad_dict_sum' : grad_dict_sum,
                        'grad_dict_avg' : grad_dict_avg
                    }
                    if epoch % 5 == 0:
                        subdir = 'epoch_%d/' % epoch
                        if not os.path.isdir(ckpt_prefix+ subdir):
                            os.makedirs(ckpt_prefix+ subdir, mode=0o777)
                            os.chmod(ckpt_prefix+ subdir, mode=0o777)
                        torch.save(ckpt,ckpt_prefix+subdir + title+'.pt')
                    lr_scheduler.step()
    writer.close()
    # # model.eval()
    # # full_inference(model, test_loader)
    # num_bits = 8
    # e_bits = 0
    # gol._init()
    # print("qat: INT8")
    # model.quantize('INT',num_bits,e_bits)
    # print('Quantization bit: %d' % num_bits)
    # if load_quant_model_file is not None:
    #     model.load_state_dict(torch.load(load_quant_model_file))
    #     print("Successfully load quantized model %s" % load_quant_model_file)
    # else:
    #     model.train()
    #     for epoch in range(1, epochs+1):
    #         quantize_aware_training(model, device, train_loader, optimizer, epoch)
    #     # for epoch in range(epochs1 + 1, epochs2 + 1):
    #     #     quantize_aware_training(model, device, train_loader, optimizer2, epoch)
    #     model.eval()
    #     # torch.save(model.state_dict(), save_file)
    # model.freeze()
    # # for name, param in model.named_parameters():
    # #     print(name)
    # #     print(param.data)
    # #     print('----------')
    # # for param_tensor, param_value in model.state_dict().items():
    # #     print(param_tensor, "\t", param_value)
    # quantize_inference(model, test_loader)
## update: <br>2023.4.24<br>
补充了一些数据和拟合图<br>
尝试将ResNet18,ResNet50,ResNet152,MobileNetV2四个模型的数据点拟合在同一张图上,效果还不错。不过考虑到这四个模型的结构较为相似,暂不确定与其他的结构差异较大的模型的数据点在一起拟合效果如何。
1. ResNet50:
<img src = "fig/50_flops_f.png" class="h-90 auto">
<img src = "fig/50_params_F.png" class="h-90 auto">
2. ResNet152:
<img src = "fig/152_flops_f.png" class="h-90 auto">
<img src = "fig/152_params_f.png" class="h-90 auto">
3. 综合了ResNet18,ResNet50,ResNet152,MobileNetV2的数据点的拟合图
<img src = "fig/total_flops.png" class="h-90 auto">
<img src = "fig/total_params.png" class="h-90 auto">
## update: <br>2023.4.24<br>
- 解决的问题:
1. PoT拟合异常 <br>
2. 曲线拟合效果较差
- 思路记录<br>
在解决MobileNetV2遇到的js散度,曲线拟合问题时,发现了F.normalize对相似度计算的影响很大,其虽然能够帮助把量化后的权值参数调整成与全精度模型相同的scale,但是导致数据分布改变过大,进而导致了出现了反常的关系和拟合结果。具体思路和分析可见MobileNetV2的readme,此处同样也使用了fakefreeze来将量化后的权值参数dequantize,使其与全精度模型的权值参数处于相同的scale,再直接用js散度计算距离。
<br>
在ResNet系列上进行了重新实验,ResNet50,152的数据和曲线待程序运行完后再补充. <br>
ResNet18:
<img src = "fig/18_flops_f.png" class="h-90 auto">
<img src = "fig/18_params_f.png" class="h-90 auto">
可以看到曲线拟合效果有了非常好的提升。
## update: <br>2023.4.23<br>
1. 本文件夹下的ResNet系列模型的Conv层不具有bias,与经典的ResNet一致。但在含有fold BN操作的ConvBN,ConvBNReLU后,Conv会具有bias,在代码中进行了相应修改以适配。
2. ResNet18的无bias的acc比有bias的略高。且无bias的情况下,训练速度明显增快。
3. 修正了一系列小bug
4. 一系列拟合图
(1) 有PoT:
resnet18:
<img src = "fig/18_flops_nobias.png" class="h-90 auto">
<img src = "fig/18_params_nobias.png" class="h-90 auto">
resnet50:
<img src = "fig/50_flops_nobias.png" class="h-90 auto">
<img src = "fig/50_params_nobias.png" class="h-90 auto">
resnet152:
<img src = "fig/152_flops_nobias.png" class="h-90 auto">
<img src = "fig/152_params_nobias.png" class="h-90 auto">
(2) 无PoT:
resnet18:
<img src = "fig/18_flops_nobias_nopot.png" class="h-90 auto">
resnet50:
<img src = "fig/50_flops_nobias_nopot.png" class="h-90 auto">
resnet152:
<img src = "fig/152_flops_nobias_nopot.png" class="h-90 auto">
from model import *
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms.functional import InterpolationMode
import os
import os.path as osp
def train(model, device, train_loader, optimizer, epoch):
    """Run one epoch of cross-entropy training over *train_loader*."""
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    for step, batch in enumerate(train_loader):
        inputs, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        # Log progress every 50th batch.
        if step % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, step * len(inputs), len(train_loader.dataset), batch_loss.item()
            ))
def test(model, device, test_loader):
    """Evaluate *model*, printing average loss and accuracy on the test set."""
    model.eval()
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    total_loss = 0
    n_right = 0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        logits = model(inputs)
        total_loss += criterion(logits, labels).item()
        top1 = logits.argmax(dim=1, keepdim=True)
        n_right += top1.eq(labels.view_as(top1)).sum().item()
    total_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
        total_loss, 100. * n_right / len(test_loader.dataset)
    ))
if __name__ == "__main__":
    # Full-precision training script for ResNet18 on CIFAR-10 using a manual
    # three-stage learning-rate schedule (0.01 -> 0.001 -> 0.0001), saving the
    # final weights to ckpt/cifar10_ResNet18.pt.
    batch_size = 128
    test_batch_size = 128
    seed = 1
    # Cumulative epoch boundaries of the three LR stages: 15 / 25 / 35.
    epochs1 = 15
    epochs2 = epochs1+10
    epochs3 = epochs2+10
    lr1 = 0.01
    lr2 = 0.001
    lr3 = 0.0001
    momentum = 0.5
    save_model = True
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # CIFAR-10 loaders; training adds random horizontal flips for augmentation.
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.Resize((32, 32), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=test_batch_size, shuffle=True, num_workers=1, pin_memory=True
    )
    # model = AlexNet_BN().to(device)
    model = resnet18().to(device)
    # One momentum-SGD optimizer per learning-rate stage.
    optimizer1 = optim.SGD(model.parameters(), lr=lr1, momentum=momentum)
    optimizer2 = optim.SGD(model.parameters(), lr=lr2, momentum=momentum)
    optimizer3 = optim.SGD(model.parameters(), lr=lr3, momentum=momentum)
    for epoch in range(1, epochs1 + 1):
        train(model, device, train_loader, optimizer1, epoch)
        test(model, device, test_loader)
    for epoch in range(epochs1 + 1, epochs2 + 1):
        train(model, device, train_loader, optimizer2, epoch)
        test(model, device, test_loader)
    for epoch in range(epochs2 + 1, epochs3 + 1):
        train(model, device, train_loader, optimizer3, epoch)
        test(model, device, test_loader)
    if save_model:
        if not osp.exists('ckpt'):
            os.makedirs('ckpt')
        torch.save(model.state_dict(), 'ckpt/cifar10_ResNet18.pt')
\ No newline at end of file
import re

# Pattern for a parenthesized, dot-separated module path such as
# "(layer1.conv.weight)". BUGFIX: the inner group must be non-capturing --
# with the original capturing group r"\(\w+(\.\w+)*\)", re.findall returns
# only the last captured group (e.g. ".weight"), not the whole match.
pattern = r"\(\w+(?:\.\w+)*\)"

text = ""  # input text to scan; was commented out upstream, so findall raised NameError

# Find every parenthesized submodule name in the text.
matches = re.findall(pattern, text)

# Strip the surrounding parentheses to obtain the bare submodule paths.
submodule_paths = [match.strip("()") for match in matches]

# Print all submodule paths.
print(submodule_paths)
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
def js_div(p_output, q_output, get_softmax=True):
    """Jensen-Shannon divergence between two sets of logits or distributions.

    Args:
        p_output, q_output: logits when ``get_softmax`` is True, otherwise
            probability distributions over the last dimension.
        get_softmax: whether to apply softmax to both inputs first.

    Returns:
        Scalar tensor: JS(p, q) = (KL(p || m) + KL(q || m)) / 2 with
        m = (p + q) / 2, using summed (unreduced-mean) KL terms.
    """
    KLDivLoss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # BUGFIX: specify the distribution axis explicitly. Calling
        # F.softmax without `dim` is deprecated and its implicit dim choice
        # is ambiguous for inputs with more than two dimensions.
        p_output = F.softmax(p_output, dim=-1)
        q_output = F.softmax(q_output, dim=-1)
    log_mean_output = ((p_output + q_output)/2).log()
    # KLDivLoss(input, target) expects (log-probabilities, probabilities)
    # and computes KL(target || exp(input)).
    return (KLDivLoss(log_mean_output, p_output) + KLDivLoss(log_mean_output, q_output))/2
def ebit_list(quant_type, num_bits):
    """Exponent-bit choices for a format: 1..num_bits-2 for FLOAT, else [0]."""
    if quant_type != 'FLOAT':
        return [0]
    return list(range(1, num_bits - 1))
def numbit_list(quant_type):
    """Bit-widths evaluated per quantization family: INT 2-16, POT/FLOAT 2-8."""
    upper = 17 if quant_type == 'INT' else 9
    return list(range(2, upper))
def build_bias_list(quant_type):
    """Quantization table used for biases: 8-bit POT, otherwise FP16 (E7)."""
    if quant_type == 'POT':
        return build_pot_list(8)
    return build_float_list(16, 7)
def build_list(quant_type, num_bits, e_bits):
    """Dispatch to the POT or FLOAT value-table builder for the given widths."""
    if quant_type == 'POT':
        return build_pot_list(num_bits)
    return build_float_list(num_bits, e_bits)
def build_pot_list(num_bits):
    """Power-of-two levels: {0} plus +/- 2**i for i in [-2**(num_bits-1)+2, 0].

    The exponent tops out at 0, so the largest representable magnitude is 1.
    """
    levels = {0.}
    for exponent in range(-2 ** (num_bits - 1) + 2, 1):
        magnitude = 2. ** exponent
        levels.add(magnitude)
        levels.add(-magnitude)
    # plist = plist.mul(1.0 / torch.max(plist))
    return torch.Tensor(list(levels))
def build_float_list(num_bits, e_bits):
    """Enumerate every value of a signed float format with *e_bits* exponent bits.

    The mantissa gets num_bits - 1 - e_bits bits (one bit is the sign).
    Covers the subnormal range (minimum exponent, no implicit leading 1)
    plus normals for each larger exponent, symmetrically negated, with 0.
    """
    m_bits = num_bits - 1 - e_bits
    step = 2 ** (-m_bits)  # gap between adjacent mantissa values
    values = {0.}
    # Subnormal range: smallest exponent, fractions without the leading 1.
    scale = 2 ** (-2 ** (e_bits - 1) + 1)
    for m in range(1, 2 ** m_bits):
        v = (m * step) * scale
        values.add(v)
        values.add(-v)
    # Normal range: each remaining exponent with an implicit leading 1.
    for e in range(-2 ** (e_bits - 1) + 2, 2 ** (e_bits - 1) + 1):
        scale = 2 ** e
        for m in range(0, 2 ** m_bits):
            v = (1. + m * step) * scale
            values.add(v)
            values.add(-v)
    return torch.Tensor(list(values))
def fold_ratio(layer, par_ratio, flop_ratio):
    """Fold each BN entry's parameter/FLOP ratio into the entry before it.

    *layer* is the ordered list of parameter names; entries whose name
    contains 'bn' or 'sample.1' (downsample BN) have their ratios added to
    the immediately preceding entry. Mutates and returns both ratio arrays.
    """
    for idx, name in enumerate(layer):
        if 'bn' in name or 'sample.1' in name:
            par_ratio[idx - 1] += par_ratio[idx]
            flop_ratio[idx - 1] += flop_ratio[idx]
    return par_ratio, flop_ratio
def fold_model(model):
    """Fold every BN module into the module walked immediately before it.

    Iterates ``named_modules`` in order; whenever a module name contains
    'bn' or 'sample.1' (covers downsample BN layers, which were previously
    missed), the preceding module in the walk is fused via ``fold_bn``.
    ``fold_bn`` mutates the conv in place, so the model itself is updated.
    """
    walked = []
    for name, module in model.named_modules():
        walked.append(module)
        if 'bn' in name or 'sample.1' in name:
            # Fuse this BN into the module that came right before it.
            walked[-2] = fold_bn(walked[-2], module)
    return model
# def fold_model(model):
# last_conv = None
# last_bn = None
# for name, module in model.named_modules():
# if isinstance(module, nn.Conv2d):
# # 如果当前模块是卷积层,则将其 "fold" 到上一个 BN 层中
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# last_conv = module
# elif isinstance(module, nn.BatchNorm2d):
# # 如果当前模块是 BN 层,则将其 "fold" 到上一个卷积层中
# last_bn = module
# if last_conv is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# # 处理最后一个 BN 层
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# return model
def fold_bn(conv, bn):
    """Fuse a BatchNorm layer into the preceding conv layer, in place.

    Rescales conv's weight by gamma/std (or 1/std when BN has no affine
    parameters) and folds the BN shift into conv's bias, registering a bias
    parameter if the conv had none. Returns the mutated conv module.
    """
    # BatchNorm statistics.
    mean = bn.running_mean
    std = torch.sqrt(bn.running_var + bn.eps)
    feat = bn.num_features

    # Current conv parameters.
    weight = conv.weight.data
    bias = conv.bias.data if conv.bias is not None else None

    if bn.affine:
        gamma_ = bn.weight.data / std
        weight = weight * gamma_.view(feat, 1, 1, 1)
        shift = bn.bias.data - gamma_ * mean
    else:
        gamma_ = 1 / std
        # NOTE(review): this broadcasts gamma_ over the weight's *last*
        # axis, unlike the affine branch's per-output-channel view -- confirm
        # whether gamma_.view(feat, 1, 1, 1) was intended here as well.
        weight = weight * gamma_
        shift = -gamma_ * mean
    bias = shift if bias is None else gamma_ * bias + shift

    # Write the fused parameters back.
    conv.weight.data = weight
    if conv.bias is None:
        # Conv originally had no bias: register the folded shift as one.
        conv.bias = nn.Parameter(bias)
    else:
        conv.bias.data = bias
    return conv
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment