Commit 62550290 by Zhihong Ma

feat: new version combined. ResNet18 trial

parent 84050d7f
import sys
import os

# Extract the per-layer parameter and FLOP ratios from param_flops.txt,
# the file that the output of get_param.py is redirected into.
def extract_ratio():
    fr = open('param_flops.txt', 'r')
    lines = fr.readlines()
    layer = []
    par_ratio = []
    flop_ratio = []
    for line in lines:
        if '(' in line and ')' in line:
            layer.append(line.split(')')[0].split('(')[1])
            r1 = line.split('%')[0].split(',')[-1]
            r1 = float(r1)
            par_ratio.append(r1)
            r2 = line.split('%')[-2].split(',')[-1]
            r2 = float(r2)
            flop_ratio.append(r2)
    fr.close()
    return layer, par_ratio, flop_ratio

if __name__ == "__main__":
    layer, par_ratio, flop_ratio = extract_ratio()
    print(layer)
    print(par_ratio)
    print(flop_ratio)
from torch.autograd import Function

class FakeQuantize(Function):
    """Fake quantization: quantize then dequantize in the forward pass,
    and pass the gradient straight through in the backward pass (STE)."""

    @staticmethod
    def forward(ctx, x, qparam):
        x = qparam.quantize_tensor(x)
        x = qparam.dequantize_tensor(x)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: identity gradient for x, none for qparam.
        return grad_output, None
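A minimal usage sketch, run next to the class above (the ToyQParam class is an assumption for illustration; any object exposing quantize_tensor/dequantize_tensor works the same way): calling the function through .apply fake-quantizes the tensor while gradients flow straight through.

```
import torch

# Hypothetical stand-in for the project's quantization-parameter object.
class ToyQParam:
    def __init__(self, scale):
        self.scale = scale
    def quantize_tensor(self, x):
        return torch.round(x / self.scale)
    def dequantize_tensor(self, q):
        return q * self.scale

x = torch.randn(4, requires_grad=True)
y = FakeQuantize.apply(x, ToyQParam(scale=0.1))
y.sum().backward()
print(x.grad)  # all ones: the straight-through gradient
```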
from model import *
import torch
from ptflops import get_model_complexity_info

if __name__ == "__main__":
    model = resnet18()
    full_file = 'ckpt/cifar10_ResNet18.pt'
    model.load_state_dict(torch.load(full_file))
    flops, params = get_model_complexity_info(model, (3, 32, 32), as_strings=True, print_per_layer_stat=True)
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import OrderedDict

def get_model_histogram(model):
    """
    Description:
        - get gradient histograms from the model and store them in an OrderedDict
    Args:
        - model: (torch.nn.Module), torch model
    Returns:
        - histograms and raw gradients, each in an OrderedDict
    """
    gradshisto = OrderedDict()
    grads = OrderedDict()
    for name, params in model.named_parameters():
        grad = params.grad
        if grad is not None:
            tmp = {}
            params_np = grad.cpu().numpy()
            histogram, bins = np.histogram(params_np.flatten(), bins=20)
            tmp['histogram'] = list(histogram)
            tmp['bins'] = list(bins)
            gradshisto[name] = tmp
            grads[name] = params_np
    return gradshisto, grads

def get_model_norm_gradient(model):
    """
    Description:
        - get gradient norms from the model and store them in an OrderedDict
    Args:
        - model: (torch.nn.Module), torch model
    Returns:
        - gradient norms in an OrderedDict
    """
    grads = OrderedDict()
    for name, params in model.named_parameters():
        grad = params.grad
        if grad is not None:
            grads[name] = grad.norm().item()
    return grads

def get_grad_histogram(grads_sum):
    gradshisto = OrderedDict()
    # grads = OrderedDict()
    for name, params in grads_sum.items():
        grad = params
        if grad is not None:
            tmp = {}
            # params_np = grad.cpu().numpy()
            params_np = grad
            histogram, bins = np.histogram(params_np.flatten(), bins=20)
            tmp['histogram'] = list(histogram)
            tmp['bins'] = list(bins)
            gradshisto[name] = tmp  # one histogram per layer (tmp holds the data describing the histogram)
            # grads[name] = params_np
    return gradshisto
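A small usage sketch for these helpers, run in the same module (the tiny linear model and random data are placeholders; gradients must already exist, i.e. backward() has been called):

```
model = nn.Linear(10, 2)                     # placeholder model
model(torch.randn(4, 10)).sum().backward()   # populate .grad
histos, raw_grads = get_model_histogram(model)
norms = get_model_norm_gradient(model)
print(histos['weight']['histogram'])         # 20 bin counts for the weight gradient
print(norms)                                 # {'weight': ..., 'bias': ...}
```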
class GlobalVariables:
    SELF_INPLANES = 0
# -*- coding: utf-8 -*-
# Share global variables across multiple modules.
def _init():  # initialization
    global _global_dict
    _global_dict = {}

def set_value(value, is_bias=False):
    # set a global value; the bias keeps its own slot
    if is_bias:
        _global_dict[0] = value
    else:
        _global_dict[1] = value

def get_value(is_bias=False):  # the bias gets a precision independent of the other variables
    if is_bias:
        return _global_dict[0]
    else:
        return _global_dict[1]
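A usage sketch, assuming the file above is saved as gol.py (the module name and the bit widths are illustrative): one shared precision value for weights/activations, plus a separate one for biases.

```
import gol  # hypothetical module name for the file above

gol._init()                      # must run once before set_value/get_value
gol.set_value(8)                 # shared bit width
gol.set_value(16, is_bias=True)  # independent, higher precision for biases
print(gol.get_value())               # 8
print(gol.get_value(is_bias=True))   # 16
```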
2023.4.10

Note: the code in new_mzh now uses the measurement method and quantization conventions agreed upon with You Kunlin, and has been rebuilt on top of his version of the code.
Quantizing the BN layers raised quite a few problems; thanks to You Kunlin for the help :D

Code changes:
To quantize ResNet18, the quantized layers newly added in module.py include QConvBNReLu, QConvBN, QElementwiseAdd, and QAdaptiveAvgPool2d. The quantized ResNet18 architecture is built in model.py; class BasicBlock, class Bottleneck, class MakeLayer, etc. keep the ResNet structure extensible, so it can be extended to ResNet50 and ResNet152 fairly easily, as the configuration sketch below illustrates.
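For reference, these are the standard depth settings such a MakeLayer-style builder needs to switch between (the dictionary is only an illustration, not code from model.py):

```
# Standard ResNet configurations: (block type, blocks per stage).
resnet_cfgs = {
    'resnet18':  ('BasicBlock', [2, 2, 2, 2]),
    'resnet34':  ('BasicBlock', [3, 4, 6, 3]),
    'resnet50':  ('Bottleneck', [3, 4, 6, 3]),
    'resnet152': ('Bottleneck', [3, 8, 36, 3]),
}
```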
To do:
Compared with AlexNet, VGG, etc., the ResNet architecture is much less flat: the MakeLayer and residual structures mean it is not a plain sequential network, so many of the earlier algorithms (e.g. for computing parameter or gradient similarity) cannot be applied to ResNet directly; when traversing the network parameters there are layer, sequential, and block wrappers around the conv and bn layers. The parameter-similarity and gradient-similarity analysis is left for follow-up work.
QAT support is also left for later.

The experiments below are the PTQ results for ResNet18 (js_flops and js_param have not yet been switched to the new computation, so they are reported as 0 for now):
```
PTQ: INT_2
direct quantization finish
Test set: Quant Model Accuracy: 10.00%
INT_2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883599
PTQ: INT_3
direct quantization finish
Test set: Quant Model Accuracy: 10.00%
INT_3: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883599
PTQ: INT_4
direct quantization finish
Test set: Quant Model Accuracy: 49.76%
INT_4: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.420789
PTQ: INT_5
direct quantization finish
Test set: Quant Model Accuracy: 80.86%
INT_5: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.058782
PTQ: INT_6
direct quantization finish
Test set: Quant Model Accuracy: 84.91%
INT_6: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.011640
PTQ: INT_7
direct quantization finish
Test set: Quant Model Accuracy: 85.60%
INT_7: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.003608
PTQ: INT_8
direct quantization finish
Test set: Quant Model Accuracy: 85.85%
INT_8: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.000698
PTQ: INT_9
direct quantization finish
Test set: Quant Model Accuracy: 85.64%
INT_9: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.003143
PTQ: INT_10
direct quantization finish
Test set: Quant Model Accuracy: 82.81%
INT_10: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.036084
PTQ: INT_11
direct quantization finish
Test set: Quant Model Accuracy: 74.91%
INT_11: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.128041
PTQ: INT_12
direct quantization finish
Test set: Quant Model Accuracy: 56.50%
INT_12: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.342335
PTQ: INT_13
direct quantization finish
Test set: Quant Model Accuracy: 26.25%
INT_13: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.694448
PTQ: INT_14
direct quantization finish
Test set: Quant Model Accuracy: 14.16%
INT_14: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.835176
PTQ: INT_15
direct quantization finish
Test set: Quant Model Accuracy: 11.29%
INT_15: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.868583
PTQ: INT_16
direct quantization finish
Test set: Quant Model Accuracy: 10.25%
INT_16: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.880689
PTQ: POT_2
direct quantization finish
Test set: Quant Model Accuracy: 10.00%
POT_2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883599
PTQ: POT_3
direct quantization finish
Test set: Quant Model Accuracy: 10.00%
POT_3: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883599
PTQ: POT_4
direct quantization finish
Test set: Quant Model Accuracy: 44.75%
POT_4: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.479106
PTQ: POT_5
direct quantization finish
Test set: Quant Model Accuracy: 40.29%
POT_5: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.531021
PTQ: POT_6
direct quantization finish
Test set: Quant Model Accuracy: 50.13%
POT_6: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.416482
PTQ: POT_7
direct quantization finish
Test set: Quant Model Accuracy: 45.75%
POT_7: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.467466
PTQ: POT_8
direct quantization finish
Test set: Quant Model Accuracy: 39.79%
POT_8: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.536841
PTQ: FLOAT_3_E1
direct quantization finish
Test set: Quant Model Accuracy: 9.93%
FLOAT_3_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.884414
PTQ: FLOAT_4_E1
direct quantization finish
Test set: Quant Model Accuracy: 39.63%
FLOAT_4_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.538703
PTQ: FLOAT_4_E2
direct quantization finish
Test set: Quant Model Accuracy: 70.74%
FLOAT_4_E2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.176580
PTQ: FLOAT_5_E1
direct quantization finish
Test set: Quant Model Accuracy: 65.04%
FLOAT_5_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.242929
PTQ: FLOAT_5_E2
direct quantization finish
Test set: Quant Model Accuracy: 82.65%
FLOAT_5_E2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.037947
PTQ: FLOAT_5_E3
direct quantization finish
Test set: Quant Model Accuracy: 80.86%
FLOAT_5_E3: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.058782
PTQ: FLOAT_6_E1
direct quantization finish
Test set: Quant Model Accuracy: 74.17%
FLOAT_6_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.136655
PTQ: FLOAT_6_E2
direct quantization finish
Test set: Quant Model Accuracy: 84.28%
FLOAT_6_E2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.018973
PTQ: FLOAT_6_E3
direct quantization finish
Test set: Quant Model Accuracy: 84.81%
FLOAT_6_E3: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.012804
PTQ: FLOAT_6_E4
direct quantization finish
Test set: Quant Model Accuracy: 78.06%
FLOAT_6_E4: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.091375
PTQ: FLOAT_7_E1
direct quantization finish
Test set: Quant Model Accuracy: 76.20%
FLOAT_7_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.113025
PTQ: FLOAT_7_E2
direct quantization finish
Test set: Quant Model Accuracy: 84.83%
FLOAT_7_E2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.012571
PTQ: FLOAT_7_E3
direct quantization finish
Test set: Quant Model Accuracy: 85.55%
FLOAT_7_E3: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.004190
PTQ: FLOAT_7_E4
direct quantization finish
Test set: Quant Model Accuracy: 82.00%
FLOAT_7_E4: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.045513
PTQ: FLOAT_7_E5
direct quantization finish
Test set: Quant Model Accuracy: 10.00%
FLOAT_7_E5: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883599
PTQ: FLOAT_8_E1
direct quantization finish
Test set: Quant Model Accuracy: 77.39%
FLOAT_8_E1: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.099174
PTQ: FLOAT_8_E2
direct quantization finish
Test set: Quant Model Accuracy: 85.21%
FLOAT_8_E2: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.008148
PTQ: FLOAT_8_E3
direct quantization finish
Test set: Quant Model Accuracy: 86.00%
FLOAT_8_E3: js_flops: 0.000000 js_param: 0.000000 acc_loss: -0.001048
PTQ: FLOAT_8_E4
direct quantization finish
Test set: Quant Model Accuracy: 83.26%
FLOAT_8_E4: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.030846
PTQ: FLOAT_8_E5
direct quantization finish
Test set: Quant Model Accuracy: 10.02%
FLOAT_8_E5: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.883366
PTQ: FLOAT_8_E6
direct quantization finish
Test set: Quant Model Accuracy: 13.09%
FLOAT_8_E6: js_flops: 0.000000 js_param: 0.000000 acc_loss: 0.847631
```
Before I added a correct QElementwiseAdd layer, post-PTQ accuracy never exceeded 15%, which shows how important this layer is. It performs the addition of the residual connection: the outputs of the two branches live in different quantization ranges, so they cannot be added directly and must first be rescaled into a common range.
So far, INT quantization accuracy first rises and then falls as the bit width grows. I inspected the post-quantization parameter distributions and their overall shape is quite close to that of the full-precision model, so the problem is not in the ordinary quantized layers such as Conv and BN. My guess is that at larger bit widths the rescaling inside QElementwiseAdd overflows; this still needs to be checked.
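A minimal sketch of that rescale for the uniform (INT-style) case, not the actual QElementwiseAdd implementation; the function and parameter names are made up for illustration. With r = S * (q - Z), adding two quantized tensors means mapping both onto the output scale first:

```
import torch

def q_add_rescale(q1, s1, z1, q2, s2, z2, s_out, z_out, qmin, qmax):
    # q_out = Z_out + (S1/S_out)*(q1 - Z1) + (S2/S_out)*(q2 - Z2)
    acc = (s1 / s_out) * (q1 - z1) + (s2 / s_out) * (q2 - z2)
    return torch.clamp(torch.round(acc + z_out), qmin, qmax)

# e.g. two INT8 feature maps that were quantized with different ranges
q1 = torch.randint(0, 256, (2, 8, 4, 4)).float()
q2 = torch.randint(0, 256, (2, 8, 4, 4)).float()
out = q_add_rescale(q1, 0.02, 128, q2, 0.05, 120, 0.06, 128, 0, 255)
```

If the scale ratios and the accumulator are themselves held in limited precision rather than in floating point, this intermediate sum is where an overflow at large bit widths would show up, which matches the suspicion above.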
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J Resnet18_trial # The job name
#SBATCH -o ./info/ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ./info/ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu # Submit to 'nv-gpu' Partition
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:1 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --qos=gpu-normal # Request QOS Type
###
### The system will allocate 8 or 16 cores per GPU by default.
### If you need more or fewer, use the following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=gpu-v00 # Request a specific list of hosts
### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
#- Log information
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
source pyt1.5/bin/activate
module list # list modules loaded
##- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
module load cmake/3.15.7
module load git/2.17.1
module load vim/8.1.2424
##- language
module load python3/3.6.8
##- CUDA
module load cuda-cudnn/11.1-8.1.1
##- virtualenv
# source xxxxx/activate
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
#- Job step
# [EDIT HERE(TODO)]
sleep 2s
hostname
echo "python ./new_train.py -m ResNet18 -e 60 -b 128 -j 4 -lr 0.001 -wd 0.0001"
python ./new_train.py -m ResNet18 -e 60 -b 128 -j 4 -lr 0.001 -wd 0.0001
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J ResNet18_trial # The job name
#SBATCH -o ./info/ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ./info/ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu # Submit to 'nv-gpu' Partition
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:1 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --qos=gpu-normal # Request QOS Type
###
### The system will allocate 8 or 16 cores per GPU by default.
### If you need more or fewer, use the following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=gpu-v00 # Request a specific list of hosts
### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
#- Log information
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
source pyt1.5/bin/activate
module list # list modules loaded
##- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
module load cmake/3.15.7
module load git/2.17.1
module load vim/8.1.2424
##- language
module load python3/3.6.8
##- CUDA
module load cuda-cudnn/11.1-8.1.1
##- virtualenv
# source xxxxx/activate
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
#- Job step
# [EDIT HERE(TODO)]
sleep 2s
hostname
echo "python ./ptq.py -m ResNet18 -b 128 -j 4"
python ./ptq.py -m ResNet18 -b 128 -j 4
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
import torch
import torch.nn as nn

def ebit_list(quant_type, num_bits):
    if quant_type == 'FLOAT':
        e_bit_list = list(range(1, num_bits - 1))
    else:
        e_bit_list = [0]
    return e_bit_list

def numbit_list(quant_type):
    if quant_type == 'INT':
        num_bit_list = list(range(2, 17))
    elif quant_type == 'POT':
        num_bit_list = list(range(2, 9))
    else:
        num_bit_list = list(range(2, 9))
        # num_bit_list = [8]
    return num_bit_list

def build_bias_list(quant_type):
    if quant_type == 'POT':
        return build_pot_list(8)
    else:
        return build_float_list(16, 7)

def build_list(quant_type, num_bits, e_bits):
    if quant_type == 'POT':
        return build_pot_list(num_bits)
    else:
        return build_float_list(num_bits, e_bits)

def build_pot_list(num_bits):
    plist = [0.]
    for i in range(-2 ** (num_bits - 1) + 2, 1):
        # i goes up to 0, so the largest POT-quantized value is 1
        plist.append(2. ** i)
        plist.append(-2. ** i)
    plist = torch.Tensor(list(set(plist)))
    # plist = plist.mul(1.0 / torch.max(plist))
    return plist

def build_float_list(num_bits, e_bits):
    m_bits = num_bits - 1 - e_bits
    plist = [0.]
    # spacing between adjacent mantissa values
    dist_m = 2 ** (-m_bits)
    e = -2 ** (e_bits - 1) + 1
    for m in range(1, 2 ** m_bits):
        frac = m * dist_m   # mantissa part (no implicit leading 1)
        expo = 2 ** e       # exponent part
        flt = frac * expo
        plist.append(flt)
        plist.append(-flt)
    for e in range(-2 ** (e_bits - 1) + 2, 2 ** (e_bits - 1) + 1):
        expo = 2 ** e
        for m in range(0, 2 ** m_bits):
            frac = 1. + m * dist_m
            flt = frac * expo
            plist.append(flt)
            plist.append(-flt)
    plist = torch.Tensor(list(set(plist)))
    return plist
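The value grids can be inspected directly from this module; in the naming of the PTQ results above, POT_5 presumably corresponds to build_pot_list(5) and FLOAT_8_E3 to build_float_list(8, 3) (an illustrative check only):

```
pot = build_pot_list(5)        # POT_5 grid: 0 and ±2**i, largest magnitude 1.0
flt = build_float_list(8, 3)   # FLOAT_8_E3 grid
print(pot.numel(), pot.abs().max().item())   # ..., 1.0
print(flt.numel(), flt.abs().max().item())
```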
def fold_ratio(layer, par_ratio, flop_ratio):
    idx = -1
    for name in layer:
        idx = idx + 1
        if 'bn' in name:
            # merge each bn layer's share into the preceding conv layer
            par_ratio[idx - 1] += par_ratio[idx]
            flop_ratio[idx - 1] += flop_ratio[idx]
    return par_ratio, flop_ratio

def fold_model(model):
    idx = -1
    module_list = []
    for name, module in model.named_modules():
        idx += 1
        module_list.append(module)
        if 'bn' in name:
            module_list[idx - 1] = fold_bn(module_list[idx - 1], module)  # modified here
    return model

# def fold_model(model):
#     last_conv = None
#     last_bn = None
#     for name, module in model.named_modules():
#         if isinstance(module, nn.Conv2d):
#             # current module is a conv layer: fold any pending BN into the previous conv
#             if last_bn is not None:
#                 last_conv = fold_bn(last_conv, last_bn)
#                 last_bn = None
#             last_conv = module
#         elif isinstance(module, nn.BatchNorm2d):
#             # current module is a BN layer: fold it into the previous conv layer
#             last_bn = module
#             if last_conv is not None:
#                 last_conv = fold_bn(last_conv, last_bn)
#                 last_bn = None
#     # handle the last BN layer
#     if last_bn is not None:
#         last_conv = fold_bn(last_conv, last_bn)
#     return model
def fold_bn(conv, bn):
    # BN layer statistics
    mean = bn.running_mean
    var = bn.running_var
    eps = bn.eps
    std = torch.sqrt(var + eps)
    feat = bn.num_features
    # conv layer parameters (conv.bias may be None)
    weight = conv.weight.data
    bias = conv.bias.data if conv.bias is not None else None
    if bn.affine:
        gamma = bn.weight.data
        beta = bn.bias.data
        gamma_ = gamma / std
        weight = weight * gamma_.view(feat, 1, 1, 1)
        if bias is not None:
            bias = gamma_ * bias - gamma_ * mean + beta
        else:
            bias = beta - gamma_ * mean
    else:
        gamma_ = 1 / std
        weight = weight * gamma_.view(feat, 1, 1, 1)
        if bias is not None:
            bias = gamma_ * bias - gamma_ * mean
        else:
            bias = -gamma_ * mean
    # write the folded weight and bias back into the conv layer
    conv.weight.data = weight
    if conv.bias is not None:
        conv.bias.data = bias
    else:
        conv.bias = nn.Parameter(bias)
    return conv
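A minimal sanity check of fold_bn, run in the same module (a sketch: the conv is created with bias=True, and the BN running statistics are populated by a few forward passes in train mode before switching to eval): the folded conv should reproduce conv followed by bn up to floating-point error.

```
if __name__ == "__main__":
    torch.manual_seed(0)
    conv = nn.Conv2d(3, 8, 3, padding=1, bias=True)
    bn = nn.BatchNorm2d(8)
    for _ in range(5):                      # populate running_mean / running_var
        bn(conv(torch.randn(4, 3, 32, 32)))
    bn.eval()
    x = torch.randn(2, 3, 32, 32)
    ref = bn(conv(x))                       # reference: conv followed by bn
    folded = fold_bn(conv, bn)              # note: modifies conv in place
    print(torch.allclose(folded(x), ref, atol=1e-5))  # expected: True
```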