Commit 2d918317 by Klin
parents 061582e9 2fe094dd
import torch
import numpy as np
from torch.autograd import Variable
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
# 构建MNIST序列数据集
class seq_mnist(Dataset):
    """Sequential-MNIST dataset for CTC training.

    Each sample concatenates `word_size` MNIST digits into one wide image;
    `self.images` is stored time-major as (time_steps, num_samples, input_size).
    """

    def __init__(self, trainer_params, train_set):
        # suffix selects the cached .npy files for this split
        self.suffix = "_train" if train_set else "_test"
        self.data = datasets.MNIST('../../project/p/data', train=train_set, download=False, transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]))
        self.trainer_params = trainer_params
        self.images = []
        self.labels = []
        # fixed lengths for CTC: 28 time steps per digit, one label per digit
        self.input_lengths = np.ones(1, dtype=np.int32) * (28 * self.trainer_params.word_size)
        self.label_lengths = np.ones(1, dtype=np.int32) * (self.trainer_params.word_size)
        # self.build_dataset()
        self.load_dataset()

    def build_dataset(self):
        """Assemble word images/labels from raw MNIST and cache them to data/*.npy."""
        imgs = []
        labels = []
        for j in range(len(self.data)//self.trainer_params.word_size): # this loop builds dataset
            # allocate input_size (e.g. 32) rows; each 28-row digit is padded below
            img = np.zeros((self.trainer_params.input_size, self.trainer_params.word_size * 28))
            labs = np.zeros(self.trainer_params.word_size, dtype=np.int32)
            for i in range(self.trainer_params.word_size): # this loop builds one example
                ims, labs[i] = self.data[(j*self.trainer_params.word_size)+i]
                labs[i] += 1 # because ctc assumes 0 as blank character
                ims = np.reshape(ims, (28,28))
                # pad two rows above and below -> 32x28 per digit (-1 is the background value)
                ims = np.pad(ims, ((2,2),(0,0)), mode='constant', constant_values=-1)
                img[:, i*28 : (i+1)*28 ] = ims
            # sequence models consume (time_steps, batch_size, input_dim) tensors;
            # transpose one image to (time_steps, input_dim)
            img = np.transpose(img)
            imgs.append(img)
            labels.append(labs)
        # (batch, time_steps, input_dim) -> (time_steps, batch, input_dim)
        self.images = np.asarray(imgs, dtype=np.float32).transpose(1, 0, 2)
        # NOTE(review): labels get wrapped in an extra list level here;
        # __getitem__ compensates by indexing self.labels[0]
        self.labels.append(labels)
        # cache to disk so later runs can use load_dataset()
        np.save('data/images{}.npy'.format(self.suffix), self.images)
        np.save('data/labels{}.npy'.format(self.suffix), np.asarray(self.labels))
        # optionally fake-quantize the input images
        if self.trainer_params.quantize_input:
            self.images = self.quantize_tensor_image(self.images)
            self.images = np.asarray(self.images)

    def load_dataset(self):
        """Load the cached .npy images/labels previously produced by build_dataset()."""
        self.images = np.load('data/images{}.npy'.format(self.suffix))
        self.labels = np.load('data/labels{}.npy'.format(self.suffix))
        print("Successfully load dataset!")
        if self.trainer_params.quantize_input:
            self.images = self.quantize_tensor_image(self.images)
            self.images = np.asarray(self.images)

    # input quantization happens up front here; PTQ adjusts it separately
    def quantize_tensor_image(self, tensor_image):
        """Fake-quantize images to the recurrent activation bit width (clip to [-1, 1))."""
        frac_bits = self.trainer_params.recurrent_activation_bit_width-1
        prescale = 2**frac_bits
        postscale = 2**-frac_bits
        max_val = 1 - postscale
        tensor_image = tensor_image.clip(-1, max_val)
        # round to the nearest representable fixed-point step
        tensor_image = np.round(tensor_image*prescale)*postscale
        return tensor_image

    # number of words in the split (images is time-major: dim 1 is the sample axis)
    def __len__(self):
        return self.images.shape[1]

    def __getitem__(self, index):
        # returns (image sequence, label word, input length, label length)
        return self.images[:,index,:], self.labels[0][index], self.input_lengths, self.label_lengths
class seq_mnist_train(seq_mnist):
    """Training split of the sequential-MNIST dataset."""

    def __init__(self, trainer_params):
        print("Building Training Dataset . . . ")
        super().__init__(trainer_params, train_set=True)
class seq_mnist_val(seq_mnist):
    """Validation/test split of the sequential-MNIST dataset."""

    def __init__(self, trainer_params):
        print("Building Testing Dataset . . . ")
        super().__init__(trainer_params, train_set=False)
import math
import numpy as np
class seq_mnist_decoder():
    """Greedy CTC decoder plus simple accuracy helpers for seq-MNIST."""

    def __init__(self, labels, blank=0):
        # `blank` is the CTC blank index (dataset labels were shifted by +1)
        self.blank_chr = blank
        self.labels = labels

    def decode(self, predictions, output_len, label_len):
        """Greedy-decode per-frame class scores into a label array.

        `predictions` is a (time_steps, num_classes) tensor: one row per time
        step, one column per class score. Repeated symbols are collapsed and
        blanks dropped (an LSTM may emit the same character over several
        frames), then the +1 label shift from dataset construction is undone.
        """
        predictions = predictions.data.cpu().numpy()
        output = []
        prev = None
        for i in range(output_len):
            pred = np.argmax(predictions[i, :])
            # merging repeats and removing blank character (0).
            # Bug fix: the first frame was previously compared against
            # predictions[i-1] == predictions[-1] (Python wrap-around), so a
            # leading symbol equal to the LAST frame's argmax was dropped.
            if (pred != self.blank_chr) and (pred != prev):
                output.append(pred-1)
            prev = pred
        return np.asarray(output)

    def hit(self, pred, target):
        """Return the percentage of positions where pred matches target (0.0 if empty)."""
        res = []
        for idx, word in enumerate(target):
            if idx < len(pred):
                item = pred[idx]
            else:
                # prediction ran short: substitute an impossible class so
                # this position counts as a miss
                item = 10
            res.append(word == item)
        acc = np.mean(np.asarray(res))*100
        if math.isnan(acc):
            # empty target -> mean of empty array is NaN; report 0%
            return 0.00
        else:
            return acc

    def to_string(self, in_str):
        """Concatenate the elements of a 1-D array into a digit string."""
        return ''.join(str(in_str[i]) for i in range(in_str.shape[0]))
\ No newline at end of file
import sys
import os
# 从get_param.py输出重定向文件val.txt中提取参数量和计算量
# Extract parameter and FLOP counts from 'param_flops_<model>.txt', the
# redirected output of get_param.py.
def extract_ratio(md='ResNet18'):
    """Parse per-layer parameter/FLOP percentages for model `md`.

    Returns (layer, par_ratio, flop_ratio): layer names plus the parameter
    and FLOP percentage of each Conv/BatchNorm2d/Linear line in the file.
    """
    layer = []
    par_ratio = []
    flop_ratio = []
    # context manager guarantees the file handle is closed (it leaked before)
    with open('param_flops_' + md + '.txt', 'r') as fr:
        lines = fr.readlines()
    for line in lines:
        # if '(' in line and ')' in line:
        if 'Conv' in line or 'BatchNorm2d' in line or 'Linear' in line:
            # layer name sits between ':' and the first '('
            layer.append(line.split(':')[1].split('(')[0])
            # field before the first '%' -> parameter ratio
            r1 = float(line.split('%')[0].split(',')[-1])
            par_ratio.append(r1)
            # field before the last '%' -> FLOPs ratio
            r2 = float(line.split('%')[-2].split(',')[-1])
            flop_ratio.append(r2)
    return layer, par_ratio, flop_ratio
if __name__ == "__main__":
    # sanity check: the three lists should all have the same length
    names, param_ratios, flop_ratios = extract_ratio()
    for seq in (names, param_ratios, flop_ratios):
        print(len(seq))
\ No newline at end of file
from torch.autograd import Function
class FakeQuantize(Function):
    """Straight-through fake quantization.

    Forward round-trips the tensor through the quantizer to inject
    quantization error; backward passes the gradient through unchanged.
    """

    @staticmethod
    def forward(ctx, x, qparam):
        # quantize then immediately dequantize: value keeps its scale but
        # picks up rounding/clipping error
        return qparam.dequantize_tensor(qparam.quantize_tensor(x))

    @staticmethod
    def backward(ctx, grad_output):
        # straight-through estimator; no gradient for qparam
        return grad_output, None
\ No newline at end of file
from model import *
import torch
from ptflops import get_model_complexity_info
import argparse
def get_children(model: torch.nn.Module):
    """Recursively flatten a model into its leaf modules.

    Returns the module itself when it has no children; otherwise returns a
    flat list of every leaf module found below it.
    """
    kids = list(model.children())
    if not kids:
        # leaf module: hand it back directly; callers collect it into a list
        return model
    leaves = []
    for kid in kids:
        flattened = get_children(kid)
        # a recursive call yields either a list (inner node) or a bare
        # module (leaf); the original distinguished these via TypeError
        if isinstance(flattened, list):
            leaves.extend(flattened)
        else:
            leaves.append(flattened)
    return leaves
# Yield all leaf submodules, skipping wrapper containers: nn.Sequential is
# descended recursively, other containers contribute their direct children.
def get_all_child_modules(module):
    for _, sub in module.named_children():
        if isinstance(sub, nn.Sequential):
            yield from get_all_child_modules(sub)
        elif list(sub.children()):
            # non-Sequential container: yield its direct children only
            yield from sub.children()
        else:
            yield sub
def filter_fn(module, n_inp, outp_shape):
    # NOTE(review): `'conv' in module` performs membership testing on the
    # module object itself; for an nn.Module instance that raises TypeError.
    # The substrings ('conv', 'bn', 'fc', 'avg', 'relu') look like layer
    # *names*, so this callback presumably expects a name string — confirm
    # against the caller (the commented isinstance check was the alternative).
    # if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.ReLU,torch.nn.BatchNorm2d,torch.nn.Linear,torch.nn.AdaptiveAvgPool2d)):
    if 'conv' in module or 'bn' in module or 'fc' in module or 'avg' in module or 'relu' in module:
        return True
    return False
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Model Analysis --- params & flops')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    args = parser.parse_args()
    # NOTE(review): the default 'resnet18' matches none of the branches below,
    # which leaves `model` undefined — pass -m ResNet18/ResNet50/ResNet152
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    full_file = 'ckpt/cifar10_' + args.model + '.pt'
    model.load_state_dict(torch.load(full_file))
    # flat = get_children(model)
    # print(flat)
    # flat = get_children(model)
    # new_model = nn.Sequential(*flat)
    # prints the per-layer parameter/FLOP table consumed by extract_ratio
    flops, params = get_model_complexity_info(model, (3, 32, 32), as_strings=True, print_per_layer_stat=True)
class GlobalVariables:
    # Shared mutable counter — presumably the running in-planes value used
    # while building the ResNet variants; 0 is only the initial value and it
    # is mutated elsewhere in the project (confirm against model.py).
    SELF_INPLANES = 0
\ No newline at end of file
# -*- coding: utf-8 -*-
# 用于多个module之间共享全局变量
def _init():  # initialization
    """Create the module-level registry; call once before set/get."""
    global _global_dict
    _global_dict = {}


def set_value(value, is_bias=False):
    """Store a shared value; bias values live in their own slot (key 0)."""
    _global_dict[0 if is_bias else 1] = value


def get_value(is_bias=False):  # bias gets a precision independent of the other values
    """Fetch the shared value (bias slot when is_bias=True)."""
    return _global_dict[0 if is_bias else 1]
import os
import json
import torch
import argparse
from trainer import Seq_MNIST_Trainer
# cuDNN disabled project-wide — presumably for reproducibility/debugging; confirm
torch.backends.cudnn.enabled = False
# print tensors with 10 decimal digits so small numeric differences are visible
torch.set_printoptions(precision=10)
class objdict(dict):
    """Dict whose keys are also readable/writable/deletable as attributes."""

    def __getattr__(self, name):
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        return self[name]

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        del self[name]
def ascii_encode_dict(data):
    """ASCII-encode key AND value of every pair whose value is a str.

    Pairs with non-str values pass through untouched (Python-3 port of the
    original unicode handling).
    """
    encoded = {}
    for key, value in data.items():
        if isinstance(value, str):
            encoded[key.encode('ascii')] = value.encode('ascii')
        else:
            encoded[key] = value
    return encoded
def non_or_str(value):
    """argparse `type=` helper: pass the value through (None stays None)."""
    return None if value is None else value
if __name__ == '__main__':
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch BiLSTM Sequential MNIST Example')
    parser.add_argument('--params', '-p', type=str, default="default_trainer_params.json", help='Path to params JSON file. Default ignored when resuming.')
    # could be extended — the original version apparently supported multi-machine training
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    # parser.add_argument('--gpus', default=0, help='gpus used for training - e.g 0,1,3')
    parser.add_argument('--epochs', type=int, default=1000, help='Number of epochs')
    parser.add_argument('--init_bn_fc_fusion', default=False, action='store_true', help='Init BN FC fusion.')
    # defaults to None
    parser.add_argument('--resume', type=non_or_str, help='resume from a checkpoint')
    parser.add_argument('--eval', default=False, action='store_true', help='perform evaluation of trained model')
    parser.add_argument('--export', default=False, action='store_true', help='perform weights export as .hpp of trained model')
    parser.add_argument('--export_image', default=False, action='store_true', help='perform test image export as png and txt')
    parser.add_argument('--experiments', default="./experiments", help='Save Path')
    parser.add_argument('--simd_factor', default=1, type=int, help='SIMD factor for export.')
    parser.add_argument('--pe', default=1, type=int, help='Number of PEs for export.')
    # Overrides for values in the params JSON
    parser.add_argument('--random_seed', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--test_batch_size', type=int)
    parser.add_argument('--num_workers', type=int)
    parser.add_argument('--num_units', type=int)
    parser.add_argument('--num_layers', type=int)
    parser.add_argument('--num_classes', type=int)
    parser.add_argument('--word_size', type=int)
    parser.add_argument('--seq_len', type=int)
    parser.add_argument('--neuron_type', type=str)
    parser.add_argument('--input_size', type=int)
    parser.add_argument('--lr', type=float)
    # NOTE(review): argparse `type=bool` is a known pitfall — bool('False') is
    # True, so any non-empty string enables these flags; confirm intended usage
    parser.add_argument('--bidirectional', type=bool)
    parser.add_argument('--reduce_bidirectional', type=str)
    parser.add_argument('--recurrent_bias_enabled', type=bool)
    parser.add_argument('--checkpoint_interval', type=int)
    args = parser.parse_args()
    # hardware export runs on CPU only
    if args.export:
        args.no_cuda = True
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if not os.path.exists(args.experiments):
        os.mkdir(args.experiments)
    # resume directly from a checkpoint:
    # if (args.resume or args.eval) and args.params == "default_trainer_params.json":
    #     package = torch.load(args.resume, map_location=lambda storage, loc: storage)
    #     trainer_params = package['trainer_params']
    # retrain from scratch:
    # else:
    with open(args.params) as d:
        # trainer_params = json.load(d, object_hook=ascii_encode_dict)
        trainer_params = json.load(d)
    trainer_params = objdict(trainer_params)
    # echo the effective hyper-parameters
    for k in trainer_params.keys():
        print(k, trainer_params[k])
        if trainer_params[k] == 'LSTM':
            print("LSTM YES")
        elif trainer_params[k] == 'CONCAT':
            print("CONCAT YES")
    # args still matter: its flags complement the defaults in trainer_params
    trainer = Seq_MNIST_Trainer(trainer_params, args)
    if args.export:
        trainer.export_model(args.simd_factor, args.pe)
        exit(0)
    if args.export_image:
        trainer.export_image()
        exit(0)
    if args.eval:
        trainer.eval_model()
        exit(0)
    else:
        trainer.train_model()
This source diff could not be displayed because it is too large. You can view the blob instead.
## update 2023.5.4
### 对fp32的模型进行了改进,并进行了初步的PTQ实验(只做到了对部分参数伪量化 naive fakequantization,结果在ptq_result.txt中)
1. 对fp32的改进
- 支持多层LSTM,并用nn.ModuleList组织
2. 对PTQ的尝试
- 对LSTM的结构,数据流向,输入输出有了更细致的理解。PTQ遇到的主要问题有:
- BiLSTM涉及到了双向的output,需要用SUM或者CONCAT处理,PTQ时可能需要引入更多module处理。
- LSTM内部进行的运算较为复杂,如下图所示:<img src = "fig/math.png" class="h-90 auto">
首先涉及到了多个门i,f,g,他们内部有Wx+b+Wh+b的结构,他们的scale是否整体考虑(即,是共用一个scale,还是多个scale并在+的时候做rescale)是一个问题。<br>他们外部的sigmoid或tanh也是一个问题(会导致不方便在各个层间通过scale变换保证PTQ结果的正确性)。<br>对c',h'的更新涉及到的*,+也是一个问题。
- LSTM内部各个门的权值被组织在`weight_ih_l[k]`、`weight_hh_l[k]`、`bias_ih_l[k]`和`bias_hh_l[k]`中,分属在不同行中,是否应该把这个权值矩阵拆开,分别量化后再合并。
- 暂时还没想好怎么处理上述问题,于是进行了简单的尝试性实验:
- 先只处理单向的LSTM,不考虑双向所需的SUM和CONCAT
- 对 `weight_ih_l[k]`、`weight_hh_l[k]`、`bias_ih_l[k]`、`bias_hh_l[k]` 各自整体量化,没有再对每个张量切成四块再分别对`weight_ir_l[k]`,`weight_hr_l[k]`,`bias_r_l[k]`,`weight_if_l[k]`,`weight_hf_l[k]`,`bias_f_l[k]`,`weight_ii_l[k]`,`weight_hi_l[k]`进行量化
- 对weight和bias和每层的输出tensor采取伪量化的方法,避免层间的scale变换。<br><br>后果:<br> (1) freeze后的权值不再是量化后的值,而是其量化值经过scale变换后的值
<br>(2) 没有考虑tanh,sigmoid的量化,他们不再是矩阵乘的形式,还没想好怎么消掉scale。只模拟了将权值和输出张量量化产生的rounding,溢出等误差。
<br>(3) 与实际可直接部署到硬件的lstm量化可能有差异。(不过目前我不太确定LSTM实际的量化应该怎么做,在网上没有找到很多lstm量化相关的资料)
<br>(4) 与之前的其他网络的PTQ量化略有差异,与scale相关的的计算顺序有些不同,但运算原理是类似的。
- 有待改进:
- 对LSTM的PTQ量化的更真实模拟,考虑sigmoid,tanh,各种乘加组合
- 补充对BiLSTM的PTQ量化 (如果还按现在的简化版处理方式,BiLSTM很容易实现,因为不需要考虑各种scale方面的问题,直接SUM或者CONCAT即可)
- 可以考虑把`weight_ih_l[k]`、`weight_hh_l[k]`、`bias_ih_l[k]`、`bias_hh_l[k]`拆开,按各自门的权值分别伪量化后再组合
- 使用更复杂的数据集
- 度量相似度
- 找到比较好的指标来度量精度
- fuse BN-Fc的量化
## update 2023.5.2
basic version: FP32版本,只有单个lstm cell,训练数据集采用序列化的MNIST,仅作记录方便后续修改。
\ No newline at end of file
import os
import math
import numpy
import torch
import torch.nn as nn
from model import *
import argparse
import json
# input_size = 32
# num_units = 128
# num_layers = 1
# bidirectional = True
# recurrent_bias_enabled = True
# lstm1 = nn.LSTM(input_size=input_size,
# hidden_size=num_units,
# num_layers=num_layers,
# batch_first=False,
# bidirectional= bidirectional,
# bias= recurrent_bias_enabled)
# lstm2 = nn.LSTM(input_size=input_size,
# hidden_size=num_units,
# num_layers=num_layers + 1,
# batch_first=False,
# bidirectional= bidirectional,
# bias= recurrent_bias_enabled)
# print("LSTM1:")
# for name,params in lstm1.named_parameters():
# print(f"name:{name},params:{params.shape}")
# print("=============================================")
# print("LSTM2:")
# for name,params in lstm2.named_parameters():
# print(f"name:{name},params:{params.shape}")
class objdict(dict):
    """Dict subclass exposing its keys as attributes (EAFP style)."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError("No such attribute: " + name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError("No such attribute: " + name)
parser = argparse.ArgumentParser(description='PyTorch BiLSTM Sequential MNIST Example')
parser.add_argument('--params', '-p', type=str, default="default_trainer_params.json", help='Path to params JSON file. Default ignored when resuming.')
args = parser.parse_args()
# hyper-parameters come from the JSON file, wrapped for attribute access
with open(args.params) as d:
    trainer_params = json.load(d)
    # trainer_params = json.load(d, object_hook=ascii_encode_dict)
trainer_params = objdict(trainer_params)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device selection
model = BiLSTM(trainer_params).to(device)
# smoke-test: quantize the model to INT8 (e_bits = 0)
model.quantize('INT',8,0)
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
# import warpctc_pytorch as wp
from torch.autograd import Variable
from torch.utils.data import DataLoader
from model import BiLSTM
from decoder import seq_mnist_decoder
from data import seq_mnist_train, seq_mnist_val
class Seq_MNIST_Trainer():
    """Train / evaluate / export driver for the BiLSTM sequential-MNIST model."""

    def __init__(self, trainer_params, args):
        self.args = args
        self.trainer_params = trainer_params
        random.seed(trainer_params.random_seed)
        torch.manual_seed(trainer_params.random_seed)
        if args.cuda:
            torch.cuda.manual_seed_all(trainer_params.random_seed)
        self.train_data = seq_mnist_train(trainer_params)
        self.val_data = seq_mnist_val(trainer_params)
        self.train_loader = DataLoader(self.train_data, batch_size=trainer_params.batch_size, \
                shuffle=True, num_workers=trainer_params.num_workers)
        self.val_loader = DataLoader(self.val_data, batch_size=trainer_params.test_batch_size, \
                shuffle=False, num_workers=trainer_params.num_workers)
        self.starting_epoch = 1
        # best (lowest) validation loss so far; used to decide when to save
        self.prev_loss = 10000
        self.model = BiLSTM(trainer_params)
        # blank=0 matches the +1 label shift applied when the dataset was built
        self.criterion = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)
        self.labels = [i for i in range(trainer_params.num_classes-1)]
        self.decoder = seq_mnist_decoder(labels=self.labels)
        self.optimizer = optim.Adam(self.model.parameters(), lr=trainer_params.lr)
        # self.criterion = wp.CTCLoss(size_average=False)
        # defaults to False
        # if args.init_bn_fc_fusion:
        #     # records whether BN-FC has already been fused
        #     if not trainer_params.prefused_bn_fc:
        #         self.model.batch_norm_fc.init_fusion() # fuse bn-fc
        #         self.trainer_params.prefused_bn_fc = True # fused
        #     else:
        #         raise Exception("BN and FC are already fused.")
        # fuse first, then load the fused weights
        if args.eval or args.resume :
            save_dir = 'ckpt'
            full_file = save_dir + '/mnist_' + self.trainer_params.reduce_bidirectional +'_' + str(self.trainer_params.bidirectional) + '.pt'
            self.model.load_state_dict(torch.load(full_file))
            print("load Model from existing file finished!")
        if args.cuda:
            # torch.cuda.set_device(args.gpus)
            # self.model = self.model.cuda()
            # self.criterion = self.criterion.cuda()
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device selection
            self.model = self.model.to(device)
            self.criterion = self.criterion.to(device)

    def serialize(self, model, trainer_params, optimizer, starting_epoch, prev_loss):
        """Bundle everything needed to resume training into one dict."""
        package = {'state_dict': model.state_dict(),
                   'trainer_params': trainer_params,
                   'optim_dict' : optimizer.state_dict(),
                   'starting_epoch' : starting_epoch,
                   'prev_loss': prev_loss
                   }
        return package

    # persist the current weights
    def save_model(self):
        """Save the model state dict under ckpt/, creating the directory if needed."""
        save_dir = 'ckpt'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir, mode=0o777)
            os.chmod(save_dir, mode=0o777)
        # path = self.args.experiments + '/' + name
        torch.save(self.model.state_dict(), save_dir + '/mnist_' + self.trainer_params.reduce_bidirectional +'_' + str(self.trainer_params.bidirectional) + '.pt')
        # print("Model saved at: {}\n".format(path))
        # torch.save(self.serialize(model=self.model, trainer_params=self.trainer_params,
        #     optimizer=self.optimizer, starting_epoch=epoch + 1, prev_loss=self.prev_loss), path)

    def train(self, epoch):
        """Run one CTC training epoch over the train loader."""
        self.model.train()
        # items come from the dataset's __getitem__ override
        for i, (item) in enumerate(self.train_loader):
            data, labels, output_len, lab_len = item
            # DataLoader yields batch-major data; the model expects time-major
            data = Variable(data.transpose(1,0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                # data = data.cuda()
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                data = data.to(device)
            output = self.model(data)
            loss = self.criterion(output, labels, output_len, lab_len)
            # loss_value = loss.data[0]
            loss_value = loss.item()
            print("Loss value for epoch = {}/{} and batch {}/{} is = {:.4f}".format(epoch,
                self.args.epochs, (i+1)*self.trainer_params.batch_size, len(self.train_data) , loss_value))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # synchronize CPU and GPU so multi-GPU results stay consistent
            # if self.args.cuda:
            #     torch.cuda.synchronize()

    def test(self, epoch=0, save_model_flag=False):
        """Validate; print one randomly-sampled decode per batch and save on improvement."""
        self.model.eval()
        loss_value = 0
        for i, (item) in enumerate(self.val_loader):
            data, labels, output_len, lab_len = item
            data = Variable(data.transpose(1,0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                # data = data.cuda()
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                data = data.to(device)
            output = self.model(data)
            # spot-check one random sample from the batch
            index = random.randint(0,self.trainer_params.test_batch_size-1)
            label = labels[index*self.trainer_params.word_size:(index+1)*self.trainer_params.word_size].data.numpy()
            label = label-1  # undo the +1 CTC blank shift
            prediction = self.decoder.decode(output[:,index,:], output_len[index], lab_len[index])
            accuracy = self.decoder.hit(prediction, label)
            print("Sample Label = {}".format(self.decoder.to_string(label)))
            print("Sample Prediction = {}".format(self.decoder.to_string(prediction)))
            print("Accuracy on Sample = {:.2f}%\n\n".format(accuracy))
            loss = self.criterion(output, labels, output_len, lab_len)
            # loss_value += loss.data.numpy()
            loss_value += loss.cpu().data.numpy()
        # average the accumulated loss over the number of validation batches
        loss_value /= (len(self.val_data)//self.trainer_params.test_batch_size)
        # loss_value = loss_value[0]
        loss_value = loss_value.item()
        print("Average Loss Value for Val Data is = {:.4f}\n".format(float(loss_value)))
        if loss_value < self.prev_loss and save_model_flag:
            self.prev_loss = loss_value
            self.save_model()
        # elif save_model_flag:
        #     self.save_model(epoch, "checkpoint.tar")

    def eval_model(self):
        """Single evaluation pass (no checkpointing)."""
        self.test()

    def train_model(self):
        """Full training loop with periodic LR decay and best-loss checkpointing."""
        for epoch in range(self.starting_epoch, self.args.epochs + 1):
            self.train(epoch)
            # NOTE(review): test() returns None, so `acc` is always None; the
            # model is saved inside test() when the validation loss improves
            acc = self.test(epoch=epoch, save_model_flag=True) # save handled during the PTQ experiments
            if epoch%20==0:
                # gentle exponential LR decay every 20 epochs
                self.optimizer.param_groups[0]['lr'] = self.optimizer.param_groups[0]['lr']*0.98

    def export_model(self, simd_factor, pe):
        """Export trained weights as a .hpp for the hardware backend."""
        self.model.eval()
        self.model.export('r_model_fw_bw.hpp', simd_factor, pe)

    def export_image(self):
        """Export one random validation image (png + txt) plus its ground truth."""
        random.seed()
        idx = random.randint(0,self.val_data.images.shape[1]-1)
        # idx = 100
        img, label = self.val_data.images[:,idx,:], self.val_data.labels[0][idx]
        inp = torch.from_numpy(img)
        inp = inp.unsqueeze(1)  # add a batch dimension of 1
        inp = Variable(inp, requires_grad=False)
        out = self.model(inp)
        out = self.decoder.decode(out, self.val_data.input_lengths, self.val_data.label_lengths)
        out = self.decoder.to_string(out)
        img = img.transpose(1, 0)
        label -= 1
        label = self.decoder.to_string(label)
        # only export images the model decodes correctly
        assert label==out
        from PIL import Image, ImageOps
        from matplotlib import cm
        # rescale from [-1, 1] to [0, 1] for rendering
        img1 = (img+1)/2.
        im = Image.fromarray(np.uint8(cm.gist_earth(img1)*255)).convert('L')
        im = ImageOps.invert(im)
        im.save('test_image.png')
        img = img.transpose(1, 0)
        img = np.reshape(img, (-1, 1))
        np.savetxt("test_image.txt", img, fmt='%.10f')
        f = open('test_image_gt.txt','w')
        f.write(label)
        f.close()
        print("Prediction on the image = {}".format(out))
        print("Label of exported image = {}".format(label))
import torch
import torch.nn as nn
import torch.nn.functional as F
def js_div(p_output, q_output, get_softmax=True):
    """
    Function that measures JS divergence between target and output logits:

    JS(p || q) = 0.5*KL(p || m) + 0.5*KL(q || m) with m = (p + q) / 2.
    Inputs are softmaxed over the last dim first unless get_softmax=False.
    """
    KLDivLoss = nn.KLDivLoss(reduction='sum')
    if get_softmax:
        # explicit dim: implicit-dim softmax is deprecated and, for 3-D+
        # inputs, silently normalizes over dim 0/1 instead of the last axis.
        # dim=-1 matches the dim=-1 normalization used by js_div_norm.
        p_output = F.softmax(p_output, dim=-1)
        q_output = F.softmax(q_output, dim=-1)
    # KLDivLoss(log m, p) computes sum p*(log p - log m) = KL(p || m)
    log_mean_output = ((p_output + q_output)/2).log()
    return (KLDivLoss(log_mean_output, p_output) + KLDivLoss(log_mean_output, q_output))/2
def ebit_list(quant_type, num_bits):
    """Candidate exponent widths: FLOAT sweeps 1..num_bits-2, others use [0]."""
    if quant_type != 'FLOAT':
        return [0]
    return list(range(1, num_bits - 1))
def numbit_list(quant_type):
    """Bit-width sweep per quantization family: INT 2..16, POT/FLOAT 2..8."""
    upper = 17 if quant_type == 'INT' else 9
    return list(range(2, upper))
def build_bias_list(quant_type):
    """Value table for quantizing biases (kept at higher precision than weights)."""
    if quant_type == 'POT':
        return build_pot_list(8)
    # FLOAT (and anything else non-POT): 16-bit grid with 7 exponent bits
    return build_float_list(16, 7)
def build_list(quant_type, num_bits, e_bits):
    """Value table for weights/activations under the given format."""
    if quant_type == 'POT':
        return build_pot_list(num_bits)
    return build_float_list(num_bits, e_bits)
def build_pot_list(num_bits):
    """Symmetric power-of-two table: {0} U {±2^i}, largest magnitude 1."""
    values = {0.}
    # exponent runs up to 0, so the maximum POT value is 2^0 = 1
    for exp in range(-2 ** (num_bits - 1) + 2, 1):
        values.add(2. ** exp)
        values.add(-(2. ** exp))
    # plist = plist.mul(1.0 / torch.max(plist))
    return torch.Tensor(list(values))
def build_float_list(num_bits, e_bits):
    """Every representable value of a sign + e_bits exponent + mantissa float.

    num_bits = 1 (sign) + e_bits + m_bits; subnormals included. Returns a
    1-D tensor of the distinct values.
    """
    m_bits = num_bits - 1 - e_bits
    step = 2 ** (-m_bits)  # gap between adjacent mantissa values
    values = {0.}
    # subnormals: no implicit leading 1, minimum exponent
    min_exp = -2 ** (e_bits - 1) + 1
    for m in range(1, 2 ** m_bits):
        magnitude = (m * step) * (2 ** min_exp)
        values.add(magnitude)
        values.add(-magnitude)
    # normals: implicit leading 1 over the remaining exponent range
    for e in range(-2 ** (e_bits - 1) + 2, 2 ** (e_bits - 1) + 1):
        scale = 2 ** e
        for m in range(0, 2 ** m_bits):
            magnitude = (1. + m * step) * scale
            values.add(magnitude)
            values.add(-magnitude)
    return torch.Tensor(list(values))
def fold_ratio(layer, par_ratio, flop_ratio):
    """Merge each BN entry's ratios into the preceding entry (in place).

    `layer` comes from model.named_parameters(), so downsample BNs appear
    as '...sample.1'; each BN's share is added to the conv right before it.
    """
    for idx, name in enumerate(layer):
        if 'bn' in name or 'sample.1' in name:
            par_ratio[idx - 1] += par_ratio[idx]
            flop_ratio[idx - 1] += flop_ratio[idx]
    return par_ratio, flop_ratio
def fold_model(model):
    """Fold every BN module into the module listed immediately before it.

    Relies on named_modules() order: the conv feeding a BN (including
    downsample convs, which appear as '...sample.0'/'...sample.1') comes
    right before that BN in the flat traversal. Mutation happens inside
    fold_bn, which rewrites the conv's weight/bias in place.
    """
    visited = []
    for name, module in model.named_modules():
        visited.append(module)
        if 'bn' in name or 'sample.1' in name:
            # fuse this BN into the previously visited module
            visited[-2] = fold_bn(visited[-2], module)
    return model
# def fold_model(model):
# last_conv = None
# last_bn = None
# for name, module in model.named_modules():
# if isinstance(module, nn.Conv2d):
# # 如果当前模块是卷积层,则将其 "fold" 到上一个 BN 层中
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# last_conv = module
# elif isinstance(module, nn.BatchNorm2d):
# # 如果当前模块是 BN 层,则将其 "fold" 到上一个卷积层中
# last_bn = module
# if last_conv is not None:
# last_conv = fold_bn(last_conv, last_bn)
# last_bn = None
# # 处理最后一个 BN 层
# if last_bn is not None:
# last_conv = fold_bn(last_conv, last_bn)
# return model
def fold_bn(conv, bn):
    """Fold a BatchNorm2d into the preceding conv; returns the fused conv.

    Uses the BN running statistics (inference-time behavior) and mutates
    `conv` in place. Handles convs created with bias=None by attaching a
    new bias parameter.
    """
    # BN parameters and statistics
    gamma, beta = bn.weight.data, bn.bias.data
    mean, var, eps = bn.running_mean, bn.running_var, bn.eps
    std = torch.sqrt(var + eps)
    feat = bn.num_features
    # conv parameters (bias may be absent)
    weight = conv.weight.data
    bias = conv.bias.data if conv.bias is not None else None
    if bn.affine:
        scale = gamma / std
        weight = weight * scale.view(feat, 1, 1, 1)
        if bias is not None:
            bias = scale * bias - scale * mean + beta
        else:
            bias = beta - scale * mean
    else:
        scale = 1 / std
        weight = weight * scale
        if bias is not None:
            bias = scale * bias - scale * mean
        else:
            bias = -scale * mean
    # write the fused parameters back
    conv.weight.data = weight
    # supports convs built with bias=None
    if conv.bias is None:
        conv.bias = nn.Parameter(bias)
    else:
        conv.bias.data = bias
    return conv
\ No newline at end of file
from model import *
from extract_ratio import *
from utils import *
import openpyxl
import gol
import sys
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import os
import os.path as osp
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import CosineAnnealingLR
def js_div_norm(a, b):
    """JS divergence between the two tensors after L2-normalizing each along dim -1."""
    na = F.normalize(a.data, p=2, dim=-1)
    nb = F.normalize(b.data, p=2, dim=-1)
    return js_div(na, nb).cpu().item()
def js_div_0(a, b):
    """JS divergence of the raw tensors, returned as a plain Python float."""
    divergence = js_div(a, b)
    return divergence.cpu().item()
def quantize_aware_training(model, device, train_loader, optimizer, epoch):
    """One QAT epoch through model.quantize_forward.

    Returns (loss_avg, grad_dict): the mean batch loss and each parameter's
    gradient averaged over all batches of the epoch.
    """
    lossLayer = torch.nn.CrossEntropyLoss()
    # accumulate the epoch loss and every parameter's gradient
    # initialization
    loss_sum = 0.
    grad_dict = {}
    for name,param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param) # param.grad has the same shape as param
    for batch_idx, (data, target) in enumerate(train_loader, 1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # fake-quantized forward pass (project-defined)
        output = model.quantize_forward(data)
        # the loss for a batch is already the batch mean
        loss = lossLayer(output, target)
        loss.backward()
        # accumulate loss and grads
        loss_sum += loss
        for name,param in model.named_parameters():
            if param.grad is not None:
                # print('-------'+name+'-------')
                grad_dict[name] += param.grad.detach()
                # print(grad_dict[name])
        # print(grad_dict.items())
        # input()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Quantize Aware Training Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    # NOTE(review): len(train_loader.batch_sampler) is the number of batches,
    # not the batch size — the name is misleading but the average is correct
    batch_size = len(train_loader.batch_sampler)
    # average the accumulated values over batches
    for name,grad in grad_dict.items():
        grad_dict[name] = grad / batch_size
    loss_avg = loss_sum / batch_size
    return loss_avg, grad_dict
def full_inference(model, test_loader, device):
    """Evaluate the full-precision model; print and return top-1 accuracy (%).

    Bug fix: the caller assigns `full_acc = full_inference(...)`, but this
    function previously returned None — it now returns the accuracy.
    """
    correct = 0
    # pure inference: skip autograd bookkeeping
    with torch.no_grad():
        for i, (data, target) in enumerate(test_loader, 1):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%\n'.format(acc))
    return acc
def train(model, device, train_loader, optimizer, epoch):
    """One full-precision training epoch.

    Returns (loss_avg, grad_dict): mean batch loss and each parameter's
    gradient averaged over the epoch's batches.
    """
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    # running loss plus a zeroed gradient accumulator per parameter
    loss_sum = 0.
    grad_dict = {name: torch.zeros_like(p) for name, p in model.named_parameters()}
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        # accumulate the (batch-mean) loss and the raw gradients
        loss_sum += loss
        for name, p in model.named_parameters():
            if p.grad is not None:
                grad_dict[name] += p.grad.detach()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))
    # average the accumulators over the number of batches
    batch_count = len(train_loader.batch_sampler)
    for name, g in grad_dict.items():
        grad_dict[name] = g / batch_count
    loss_avg = loss_sum / batch_count
    return loss_avg, grad_dict
def quantize_inference(model, test_loader, device):
    """Evaluate the frozen quantized model; print and return top-1 accuracy (%).

    Returns the accuracy (previously None) so callers can log it, mirroring
    full_inference.
    """
    correct = 0
    # inference only: no autograd needed for the frozen model
    with torch.no_grad():
        for i, (data, target) in enumerate(test_loader, 1):
            data, target = data.to(device), target.to(device)
            output = model.quantize_inference(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Quant Model Accuracy: {:.2f}%\n'.format(acc))
    return acc
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='QAT Training')
    parser.add_argument('-m', '--model', metavar='MODEL ARCH', default='resnet18')
    parser.add_argument('-e','--epochs', default=15, type=int, metavar='EPOCHS', help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE', help='mini-batch size (default: 128)')
    parser.add_argument('-j','--workers', default=1, type=int, metavar='WORKERS',help='number of data loading workers (default: 4)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('-wd','--weight_decay',default=0.0001,type=float,metavar='WD',help='lr schduler weight decay',dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true', help='test model on test set')
    args = parser.parse_args()
    batch_size = args.batch_size
    seed = 1
    epochs = args.epochs
    lr = args.lr
    # momentum = 0.5
    weight_decay = args.wd
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    wb = openpyxl.Workbook()
    ws = wb.active
    # NOTE(review): the default 'resnet18' matches none of these branches,
    # leaving `model` undefined — pass -m ResNet18/ResNet50/ResNet152
    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()
    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # TODO layer must be re-read
    layer = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            # the names of all parameterized layers (conv/bn/fc) are extractable here
            n = name.split('.')
            pre = '.'.join(n[:len(n)-1])
            # strip the trailing '.weight' to get the layer name (filtering on
            # 'weight' avoids adding each layer a second time via its bias)
            layer.append(pre)
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
        ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    # model.load_state_dict(torch.load(full_file))
    model.to(device)
    # NOTE(review): this optimizer holds `model`'s parameters but is later
    # passed to quantize_aware_training for `model_ptq`, so the quantized
    # model's parameters are never stepped (the 2023.4.28 notes describe
    # switching to two optimizers in new_qat.py) — confirm before relying on it
    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    # no .pt was saved, so there is nothing to load
    quant_type_list = ['INT']
    gol._init()
    currow=4 # first worksheet row to write results into
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # one bias lookup table per quantization family;
        # INT is wide enough that a table would be too costly — it rounds directly
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type,num_bits)
            for e_bits in e_bit_list:
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                currow += 1
                print('\nQAT: '+title)
                # fresh pretrained model for every quantization configuration
                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()
                model_ptq.to(device)
                full_file = 'ckpt/cifar10_' + args.model + '.pt'
                model_ptq.load_state_dict(torch.load(full_file))
                model_ptq.eval()
                full_acc = full_inference(model_ptq, test_loader, device)
                # install the weight/activation lookup table for this config
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)
                # model_ptq.load_state_dict(torch.load(full_file))
                model_ptq.quantize(quant_type,num_bits,e_bits)
                model_ptq.train()
                for epoch in range(1, epochs+1):
                    loss,qat_grad = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
                    # print('loss:%f' % loss_avg)
                    if epoch == 1:
                        loss_start = loss
                    writer.add_scalar(title+'.loss',loss,epoch)
                    lr_scheduler.step()
                print(f"loss:{loss}")
                model_ptq.freeze()
                quantize_inference(model_ptq, test_loader, device)
                # print(f"Final QAT ACC:{qat_acc}")
## update: <br>2023.4.28<br>
### 目标工作:尝试去解决“预测模型收敛速度”方面的问题
- 问题:按照原有思路,通过QAT from scratch获得前5/10/15/20个epoch的loss下降量和训练梯度相似度进行拟合。但根据qat.py得到的数据结果并不太好。<br>主要有两个方面的问题:<br>(1)出现了距离(即 相似度的差异性)过大、且变化过大(出现了显著的数量级差异,且规律与预期不符)的问题。<br>(2) 对不同量化方式的数据,loss的下降量有正有负,换言之,没有一个明显的loss在减小的趋势,数值较为随机。<br>
- 实验:针对上述问题,我进行了一系列观察、思考、实验,修改了qat.py中可能存在的问题,得到new_qat.py,还新增了model_foldbn.py, 修改了module.py.<br>
### 分析与实验:
1. 问题与方案:
- 量化模型中将BN fold进了Conv,因此我尝试仿照量化中的fold过程,在全精度模型训练时也将BN fold进Conv,具体的代码在module.py和model_foldbn.py中。我对fold后的全精度模型进行了训练验证,其可以正常更新权值参数,提升推理精度,但训练的收敛速度明显变慢了(ResNet18_foldbn在80个epoch时acc才40%)。
- qat.py中model和model_ptq都使用了同一个optimizer,在new_qat.py将其改为两个optimizer,分别为两个model的参数进行优化。
- 在实验中发现如果使用Adam优化器,得到的梯度会比较不稳定,我该用了SGD后稳定性提高了,趋势更显著。
- 对full_grad...等字典存储的是epoch上限时的各组梯度数据,如果直接用于与各个epoch节点的量化模型梯度数据去计算相似度,在大部分情况下是没有对应上的。这里我在实验中暂时只训练5个epoch,还没处理该问题。
- 对lr和momentum进行了一系列调整,但效果不明显。
2. 修改后的效果:
- 在INT量化中,随着量化位宽增大,量化模型的训练梯度与全精度模型的训练梯度的数量级逐渐接近至相同(但具体数值上仍有明显差异)。
- 得到明显改善的js_grad, loss_delta...等数据 (不过与预期仍不相符)
3. 还存在的问题与我的猜想:
- 对于INT量化,随着位宽增加,只有最开始出现了训练梯度相似度上升的趋势,后续呈现了波动趋势。且根据我对梯度数据的观察,他们并没有随着量化位宽增加而呈现出一致性,仅仅是数量级接近了,但具体数值仍有很大的差异。我猜想这是因为QAT from scratch较难,他们没有能有效的训练。也有可能是代码中还有一些未发现的bug。
- 在INT量化中,loss_delta(loss的减小量)也没有随着位宽增加而呈显著一致的增大,只有在位宽较小时有一段增大趋势,后续则呈现了较为随机的波动。
- 尝试进行了QAT from scratch训练,80个epoch左右没能看到明显的训练效果,可能这也是为什么出现上述问题的原因之一。
<br>(注:具体数据可见于ResNet18qat_result.xlsx中)
4. 尝试进行的拟合
loss_delta - js_grad (loss的下降量 - 第5个epoch的训练梯度加权相似度)
<img src = "fig/grad_delta.png" class="h-90 auto">
可以看到拟合效果非常差。
loss_avg - js_grad_avg (loss平均值 - 前5epoch的平均训练梯度的加权相似度)
<img src = "fig/qat.png" class="h-90 auto">
可以看到他们存在一个线性趋势,但这仅仅说明训练梯度相似度越大,loss越大,而loss的大小与训练收敛速度间并不能建立合理的逻辑关系。loss很大说明当前模型的效果很差,导致了loss大。
## update: <br>2023.4.24<br>
补充了一些数据和拟合图<br>
尝试将ResNet18,ResNet50,ResNet152,MobileNetV2四个模型的数据点拟合在同一张图上,效果还不错。不过考虑到这四个模型的结构较为相似,暂不确定与其他的结构差异较大的模型的数据点在一起拟合效果如何。
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment