Commit bc233acc by Zhihong Ma

fix: QAT - Predicting the Convergence Speed of Quantized Model

parent f4b96743
from model import *
from extract_ratio import *
from utils import *

import openpyxl
import gol
import sys
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F  # needed by js_div_norm below
import torch.optim as optim
from torchvision import datasets, transforms

import os
import os.path as osp

from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import CosineAnnealingLR

def js_div_norm(a, b):
    # JS divergence between L2-normalized copies of two tensors
    a_norm = F.normalize(a.data, p=2, dim=-1)
    b_norm = F.normalize(b.data, p=2, dim=-1)
    return js_div(a_norm, b_norm).cpu().item()


def js_div_0(a, b):
    # JS divergence of the raw (unnormalized) tensors
    return js_div(a, b).cpu().item()
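
# NOTE: js_div itself comes from utils (via `from utils import *`) and its
# implementation is not part of this commit. As a rough reference only, a
# minimal Jensen-Shannon divergence over two tensors could look like this
# (treating the flattened, softmax-normalized tensors as distributions):
#
#     def js_div(p, q):
#         p = F.softmax(p.flatten(), dim=-1)
#         q = F.softmax(q.flatten(), dim=-1)
#         m = 0.5 * (p + q)
#         # F.kl_div takes log-probabilities as input, probabilities as target
#         return 0.5 * F.kl_div(m.log(), p, reduction='sum') \
#              + 0.5 * F.kl_div(m.log(), q, reduction='sum')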

def quantize_aware_training(model, device, train_loader, optimizer, epoch):
    lossLayer = torch.nn.CrossEntropyLoss()
    # accumulate the loss and the gradient of every parameter over the epoch
    loss_sum = 0.
    grad_dict = {}
    for name, param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param)  # param.grad has the same shape as param

    for batch_idx, (data, target) in enumerate(train_loader, 1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model.quantize_forward(data)
        # the loss of a batch is already averaged over its samples
        loss = lossLayer(output, target)
        loss.backward()
        # accumulate loss and grads (loss.item() so no autograd graph is kept alive)
        loss_sum += loss.item()
        for name, param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad.detach()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Quantize Aware Training Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))

    # average the per-batch sums over the number of batches
    num_batches = len(train_loader.batch_sampler)
    for name, grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict

def full_inference(model, test_loader, device):
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Full Model Accuracy: {:.2f}%\n'.format(acc))
    return acc  # returned so the caller can record it (full_acc below)

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    lossLayer = torch.nn.CrossEntropyLoss()
    # accumulate the loss and the gradient of every parameter over the epoch
    loss_sum = 0.
    grad_dict = {}
    for name, param in model.named_parameters():
        grad_dict[name] = torch.zeros_like(param)  # param.grad has the same shape as param

    for batch_idx, (data, target) in enumerate(train_loader, 1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = lossLayer(output, target)
        loss.backward()
        # accumulate loss and grads
        loss_sum += loss.item()
        for name, param in model.named_parameters():
            if param.grad is not None:
                grad_dict[name] += param.grad.detach()
        optimizer.step()
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))

    # average the per-batch sums over the number of batches
    num_batches = len(train_loader.batch_sampler)
    for name, grad in grad_dict.items():
        grad_dict[name] = grad / num_batches
    loss_avg = loss_sum / num_batches
    return loss_avg, grad_dict

def quantize_inference(model, test_loader, device):
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model.quantize_inference(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Quant Model Accuracy: {:.2f}%\n'.format(acc))
    return acc

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='QAT Training')
    parser.add_argument('-m', '--model', metavar='ARCH', default='ResNet18',
                        choices=['ResNet18', 'ResNet50', 'ResNet152'])
    parser.add_argument('-e', '--epochs', default=15, type=int, metavar='EPOCHS',
                        help='number of total epochs to run')
    parser.add_argument('-b', '--batch_size', default=128, type=int, metavar='BATCH SIZE',
                        help='mini-batch size (default: 128)')
    parser.add_argument('-j', '--workers', default=1, type=int, metavar='WORKERS',
                        help='number of data loading workers (default: 1)')
    parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, metavar='LR',
                        help='initial learning rate', dest='lr')
    parser.add_argument('-wd', '--weight_decay', default=0.0001, type=float, metavar='WD',
                        help='optimizer weight decay', dest='wd')
    parser.add_argument('-t', '--test', dest='test', action='store_true',
                        help='test model on test set')
    args = parser.parse_args()

    batch_size = args.batch_size
    seed = 1
    epochs = args.epochs
    lr = args.lr
    # momentum = 0.5
    weight_decay = args.wd

    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    writer = SummaryWriter(log_dir='log/' + args.model + '/qat')
    wb = openpyxl.Workbook()
    ws = wb.active

    if args.model == 'ResNet18':
        model = resnet18()
    elif args.model == 'ResNet50':
        model = resnet50()
    elif args.model == 'ResNet152':
        model = resnet152()

    layer, par_ratio, flop_ratio = extract_ratio(args.model)
    # TODO the layer list should be re-read
    layer = []
    for name, param in model.named_parameters():
        # keep only 'weight' entries so every conv/bn/fc layer is collected once
        # (its bias would otherwise add the same layer name a second time),
        # e.g. 'layer1.0.conv1.weight' -> 'layer1.0.conv1'
        if 'weight' in name:
            n = name.split('.')
            pre = '.'.join(n[:-1])
            layer.append(pre)

    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=True, num_workers=args.workers, pin_memory=False
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../../project/p/data', train=False,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=batch_size, shuffle=False, num_workers=args.workers, pin_memory=False
    )

    # model.load_state_dict(torch.load(full_file))
    model.to(device)

    # NOTE: the optimizer and scheduler are created per quantized model inside the
    # loop below, so each model_ptq is optimized independently (see the README note
    # about qat.py sharing one optimizer between model and model_ptq)

    # no .pt was saved, so there is nothing to load here
    quant_type_list = ['INT']
    gol._init()  # gol: global key-value store used to share the quantization tables

    currow = 4  # first row of the result sheet to write to
    for quant_type in quant_type_list:
        num_bit_list = numbit_list(quant_type)
        # the bias quantization table only needs to be set once per quantization type;
        # INT bit widths are large, so a lookup table would be too costly and
        # plain rounding (_round) is used instead
        if quant_type != 'INT':
            bias_list = build_bias_list(quant_type)
            gol.set_value(bias_list, is_bias=True)
        for num_bits in num_bit_list:
            e_bit_list = ebit_list(quant_type, num_bits)
            for e_bits in e_bit_list:
                if quant_type == 'FLOAT':
                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
                else:
                    title = '%s_%d' % (quant_type, num_bits)
                currow += 1
                print('\nQAT: ' + title)

                if args.model == 'ResNet18':
                    model_ptq = resnet18()
                elif args.model == 'ResNet50':
                    model_ptq = resnet50()
                elif args.model == 'ResNet152':
                    model_ptq = resnet152()

                model_ptq.to(device)
                full_file = 'ckpt/cifar10_' + args.model + '.pt'
                model_ptq.load_state_dict(torch.load(full_file))
                model_ptq.eval()
                full_acc = full_inference(model_ptq, test_loader, device)

                # set the quantization table
                if quant_type != 'INT':
                    plist = build_list(quant_type, num_bits, e_bits)
                    gol.set_value(plist)

                model_ptq.quantize(quant_type, num_bits, e_bits)
                model_ptq.train()
                # a fresh optimizer/scheduler per quantized model, so runs do not
                # share optimizer state (this was a bug in qat.py, see README)
                optimizer = optim.Adam(model_ptq.parameters(), lr=lr)
                lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

                for epoch in range(1, epochs + 1):
                    loss, qat_grad = quantize_aware_training(model_ptq, device, train_loader, optimizer, epoch)
                    if epoch == 1:
                        loss_start = loss
                    writer.add_scalar(title + '.loss', loss, epoch)
                    lr_scheduler.step()
                    print(f"loss: {loss}")

                model_ptq.freeze()
                qat_acc = quantize_inference(model_ptq, test_loader, device)
                print(f"Final QAT ACC: {qat_acc}")

    writer.close()
    wb.save(args.model + 'qat_result.xlsx')  # e.g. ResNet18qat_result.xlsx (see README)
## update: <br>2023.4.28<br>
### Goal: try to resolve the problems around "predicting model convergence speed"
- Problem: following the original approach, QAT from scratch is used to obtain the loss drop over the first 5/10/15/20 epochs and the training-gradient similarity, and the two are fitted against each other. The data produced by qat.py, however, is not good, with two main issues:<br>(1) The distances (i.e. the differences in similarity) are too large and vary too much (there are clear order-of-magnitude differences, and the pattern does not match expectations).<br>(2) Across the different quantization configurations the loss drop is sometimes positive and sometimes negative; in other words there is no clear downward trend in the loss, and the values look fairly random.<br>
- Experiments: to address the issues above I made a series of observations, analyses and experiments, fixed the likely problems in qat.py to obtain new_qat.py, added model_foldbn.py, and modified module.py (a sketch of the similarity aggregation in question follows below).<br>
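
For reference, the sketch below shows how the per-layer gradient similarity could be aggregated into the js_grad figure discussed here. This is a hypothetical reconstruction, not the committed aggregation code: the FLOPs-weighted sum of js_div_norm values is an assumption based on the names that do appear in new_qat.py (`layer`, `flop_ratio`, `js_div_norm`, and the `grad_dict` returns of `train()` / `quantize_aware_training()`).

```python
# Hypothetical sketch; not the committed aggregation code.
# grad_full / grad_qat are the grad_dict returns of train() and
# quantize_aware_training(); layer and flop_ratio come from extract_ratio().
js_grad = 0.0
for i, name in enumerate(layer):
    w = name + '.weight'
    if w in grad_full and w in grad_qat:
        # js_div_norm = JS divergence of the two L2-normalized gradient tensors,
        # weighted here by the layer's share of the model FLOPs (assumption)
        js_grad += flop_ratio[i] * js_div_norm(grad_full[w], grad_qat[w])
```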
### Analysis and experiments:
1. Problems and fixes:
- The quantized model folds BN into the preceding Conv, so I tried to mimic the fold used during quantization and fold BN into Conv for full-precision training as well; the code is in module.py and model_foldbn.py (a sketch of the fold is given after this list). The folded full-precision model trains correctly, updating its weights and improving inference accuracy, but it converges noticeably more slowly (ResNet18_foldbn only reaches 40% accuracy at epoch 80).
- In qat.py, model and model_ptq shared a single optimizer; new_qat.py uses two optimizers, one per model.
- I found that the Adam optimizer produces rather unstable gradients; after switching to SGD the stability improved and the trends became more pronounced.
- The full_grad... dictionaries store the gradients of the final epoch; comparing them directly with the quantized model's gradients at each epoch checkpoint means the two mostly do not correspond. For now I train only 5 epochs in these experiments and have not yet addressed this.
- I tried a range of lr and momentum settings, without a noticeable effect.
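
As a minimal sketch of the fold described in the first item (the actual code is in module.py and model_foldbn.py, which are not shown in this commit), folding a BatchNorm layer into the preceding convolution rescales the conv weights per output channel and absorbs the BN shift into the bias. The version below uses the BN running statistics, i.e. the inference-time fold; a training-time fold as in model_foldbn.py would use the batch statistics instead.

```python
import torch
import torch.nn as nn

def fold_bn_into_conv(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Inference-time BN fold: y = gamma * (conv(x) - mu) / sqrt(var + eps) + beta."""
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)  # per-channel gamma / std
    fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                      stride=conv.stride, padding=conv.padding,
                      dilation=conv.dilation, groups=conv.groups, bias=True)
    with torch.no_grad():
        fused.weight.copy_(conv.weight * scale.reshape(-1, 1, 1, 1))
        bias = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_mean)
        fused.bias.copy_((bias - bn.running_mean) * scale + bn.bias)
    return fused
```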
2. Effect of the changes:
- For INT quantization, as the bit width grows, the magnitude of the quantized model's training gradients approaches that of the full-precision model's (although the actual values still differ clearly).
- The js_grad, loss_delta, ... figures improved noticeably (though they still do not match expectations).
3. Remaining problems and my guesses:
- For INT quantization, the training-gradient similarity only rises with the bit width at the very beginning and fluctuates afterwards. From inspecting the gradient data, the gradients do not become consistent as the bit width increases; only their magnitudes converge, while the actual values still differ a lot. My guess is that QAT from scratch is hard and the models were not trained effectively; there may also be undiscovered bugs in the code.
- For INT quantization, loss_delta (the decrease in loss) also does not grow significantly and consistently with the bit width; it only shows an increasing trend at small bit widths and fluctuates more or less randomly afterwards.
- I attempted QAT-from-scratch training and saw no clear training progress after about 80 epochs, which may be one of the reasons for the problems above.
<br>(Note: the detailed numbers are in ResNet18qat_result.xlsx)
4. Fitting attempts
loss_delta - js_grad (loss drop vs. the weighted training-gradient similarity at epoch 5)
<img src = "fig/grad_delta.png" class="h-90 auto">
The fit is clearly very poor.
loss_avg - js_grad_avg (average loss vs. the weighted similarity of the average training gradients over the first 5 epochs)
<img src = "fig/qat.png" class="h-90 auto">
There is a visible linear trend, but it only says that a larger training-gradient similarity comes with a larger loss; the magnitude of the loss cannot be logically linked to convergence speed. A very large loss simply means the current model performs badly, which is what drives the loss up.
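
For completeness, fits like the ones above can be reproduced with a simple least-squares line. The snippet below is a hypothetical example: the column layout of ResNet18qat_result.xlsx is assumed, not taken from this commit.

```python
# Hypothetical fit script; the xlsx column layout is an assumption.
import numpy as np
import openpyxl

ws = openpyxl.load_workbook('ResNet18qat_result.xlsx').active
rows = [r for r in ws.iter_rows(min_row=5, values_only=True) if r[1] is not None]
js_grad = np.array([r[1] for r in rows], dtype=float)     # assumed: column B
loss_delta = np.array([r[2] for r in rows], dtype=float)  # assumed: column C

k, b = np.polyfit(js_grad, loss_delta, deg=1)   # least-squares line
r = np.corrcoef(js_grad, loss_delta)[0, 1]      # Pearson correlation
print(f'loss_delta ~ {k:.4f} * js_grad + {b:.4f}  (r = {r:.3f})')
```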
## update: <br>2023.4.24<br>
Added some more data and fit plots.<br>
I tried fitting the data points of the four models ResNet18, ResNet50, ResNet152 and MobileNetV2 on a single plot, with fairly good results. However, since these four architectures are quite similar, it is not yet clear how well data points from a structurally different model would fit in.
......