feat: PTB_LSTM: inference with fake_quantize

061582e9 · Klin · f4b96743 · 061582e9 · 061582e9 · 061582e9
Commit 061582e9 authored May 11, 2023 by Klin
18 changed files
--- a/ykl/PTB_LSTM/PTB_LSTM.txt
+++ b/ykl/PTB_LSTM/PTB_LSTM.txt
+Warning! No positional inputs found for a module, assuming batch size is 1.
+Model(
+  14.86 M, 100.000% Params, 5.2 GMac, 100.000% MACs, 
+  (embed1): Embedding(0, 0.000% Params, 0.0 Mac, 0.000% MACs, 10000, 700)
+  (drop2): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.65, inplace=False)
+  (lstm3): LSTM(3.93 M, 26.415% Params, 1.38 GMac, 26.455% MACs, 700, 700, dropout=0.65)
+  (lstm4): LSTM(3.93 M, 26.415% Params, 1.38 GMac, 26.455% MACs, 700, 700)
+  (drop5): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.65, inplace=False)
+  (fc6): Linear(7.01 M, 47.170% Params, 2.45 GMac, 47.090% MACs, in_features=700, out_features=10000, bias=True)
+)
--- a/ykl/PTB_LSTM/README.md
+++ b/ykl/PTB_LSTM/README.md
+# PTB_LSTM量化说明
+
+## 全精度模型
+
+1. 数据集及预处理：该模型使用PTB数据集，里面包含大量的英文句子，与cifar10数据集以图像为样本不同，该数据集将句子切割作为样本，并构建词表将单词转化为数字索引，便于网络处理。
+2. 模型目标及评价指标：预测某个单词/短句后面出现的单词。由于每个样本在相同单词/短句的后续单词未必一致，不能简单地使用acc进行评判。语言模型使用困惑度ppl作为评价指标，值越小表示后续单词的可能性越集中，表征模型性能越好。数学上ppl等于平均交叉熵loss的指数，即 ppl = exp(loss)。
+3. 模型结构的额外要求：由于PTB LSTM模型中的embedding层具有稀疏矩阵参数，且其中很多值为0，导致Adam优化器学习率调整机制失效。pytorch官方embedding类处说明，当前可支持的CUDA优化器只有SGD、sparseAdam（专为稀疏矩阵准备的Adam优化器）。另外，在训练结束，进行推理时，一般会对lstm层使用flatten_parameters（）方法，将其参数拉平为一维，方便并行计算。
+4. 模型参数量和计算量的获取：由于PTB LSTM的输入并非floattensor，而是int类型的longTensor和hidden。之前所使用的ptflops需要使用其特殊输入构造API。在get_model_complexity_info方法中额外传入一个返回值为dict的输入构造器，从而获得特定的输入。
+
+
+
+## ptq部分
+
+### 量化层说明
+
+#### embedding
+
+ 该层接受输入的每个元素均为int类型的索引，根据索引将embedding矩阵（即weight参数）的对应行取出，放置到对应位置。因此输入不需要进行量化，只需要对weight进行量化即可。
+ 注意到，输出的每个元素都来自于weight，因此只需要对weight进行量化。同时考虑到输出可能只包括了embedding矩阵的部分元素，设置了qo对输出进行统计，并通过`self.M.data = (self.qw.scale / self.qo.scale).data`进行rescale。
+
+
+
+#### LSTM
+
+ 全精度层允许多层，为量化方便，我们将对多层LSTM进行拆分。
+
+ 该层接受输入x和隐层hidden。其中hidden是可选输入，可unpack为上一步隐层h和上一步状态c。如果未指定或输入为None时，nn.LSTM会自动初始化值为0的隐层作为输入。h,c形状为(nlayer, batch_sz, hidden_size)。考虑到第一层可能始终不接受hidden输入，或在第一个batch输入为None、后续输入为非零值，量化时通过参数has_hidden指明是否需要对输入hidden进行统计。
+
+  当has_hidden为true时，表示需要统计，设置相应的统计值qih和qic。考虑到第一个batch输入的hidden可能为None，只在hidden不为None时进行qih和qic的更新以及hidden的反量化，防止因为scale为0导致量化值出现nan。
+
+  当has_hidden为false时，表示该层始终不接受非零的hidden，无需进行统计，并在相应方法中进行检查。
+
+ 对于单层的lstm_module，参数主要有weight_ih_l0，weight_hh_l0，bias_ih_l0，bias_hh_l0。其中后缀为ih的表示输入x和输出hidden之间的关系，后缀为hh表示输入hidden和输出hidden之间的关系。简便起见，我们当前不对每个矩阵进行进一步分拆。
+
+  另外，由于该层运算并非简单的加减、乘法和卷积运算，不能很方便的进行rescale。当前仍然使用伪量化来进行推理。后续的rescale可以考虑用一个相近的线性函数来模拟。
+
+ 在quantize_forward时，与其他层直接调用torch.nn.functional不同，仍然使用nn.LSTM进行。这是因为直接调用函数还涉及到flatten_weight等操作，为了简化调用逻辑，新建一个临时的LSTM层，并修改其参数与量化值一致来进行运算。
\ No newline at end of file
--- a/ykl/PTB_LSTM/extract_ratio.py
+++ b/ykl/PTB_LSTM/extract_ratio.py
+import sys
+import os
+
+
+# Extract parameter-count and MAC statistics from the per-layer report that
+# was redirected to param_flops/<model_name>.txt.
+def extract_ratio(model_name):
+    """Parse a saved per-layer complexity report into totals and ratios.
+
+    Args:
+        model_name: Base name of the report; the file
+            ``param_flops/<model_name>.txt`` is read.
+
+    Returns:
+        A tuple ``(Mac, Param, layer, par_ratio, flop_ratio, weight_ratio)``:
+        ``Mac`` is the model total taken from the second report line
+        (a ``G`` figure is multiplied by 1024 so it is expressed in M; if
+        neither ``M`` nor ``G`` is present the raw text is returned as-is),
+        ``Param`` is the leading number before ``M,`` on that line, ``layer``
+        lists the layer names, ``par_ratio`` / ``flop_ratio`` hold the two
+        percentages printed for each layer, and ``weight_ratio`` is
+        ``w / (1 + w)`` for lines containing ``conv`` or ``fc`` and 0 otherwise.
+    """
+    # Read everything up front; the context manager closes the handle even if
+    # the parsing below raises.
+    with open('param_flops/'+model_name+'.txt','r') as fr:
+        lines = fr.readlines()
+
+    # Second line holds the model-wide summary: "<params> M, ... <macs> GMac, ...".
+    Mac = lines[1].split('Mac,')[0].split(',')[-1]
+    if 'M' in Mac:
+        Mac = Mac.split('M')[0]
+        Mac = float(Mac)
+    elif 'G' in Mac:
+        Mac = Mac.split('G')[0]
+        Mac = float(Mac)
+        Mac *= 1024
+
+    Param = lines[1].split('M,')[0]
+    Param = float(Param)
+
+    layer = []
+    par_ratio = []
+    flop_ratio = []
+    weight_ratio = []
+    for line in lines:
+        # Only per-layer lines contain both an opening and a closing parenthesis.
+        if '(' in line and ')' in line:
+            layer.append(line.split(')')[0].split('(')[1])
+            # First percentage on the line: share of parameters.
+            r1 = line.split('%')[0].split(',')[-1]
+            r1 = float(r1)
+            par_ratio.append(r1)
+            # Last percentage on the line: share of MACs.
+            r2 = line.split('%')[-2].split(',')[-1]
+            r2 = float(r2)
+            flop_ratio.append(r2)
+            if 'conv' in line:
+                # Computed whether or not bias=False; after folding, the conv
+                # approximation is used directly.
+                inch = line.split(',')[4]
+                # outch = line.split(',')[5]
+                klsz = line.split(',')[6].split('(')[-1]
+                inch = float(inch)
+                # outch = float(outch)
+                klsz = float(klsz)
+                wr = inch * klsz * klsz
+                wr = wr / (1+wr)
+                weight_ratio.append(wr)
+            elif 'fc' in line:
+                inch = line.split(',')[4].split('=')[-1]
+                inch = float(inch)
+                wr = inch / (1+inch)
+                weight_ratio.append(wr)
+            else:
+                weight_ratio.append(0)
+
+
+    return Mac, Param, layer, par_ratio, flop_ratio, weight_ratio
+
+
+if __name__ == "__main__":
+    # Smoke run: parse one report and print each returned item on its own
+    # line, in the order extract_ratio returns them.
+    for item in extract_ratio('Inception_BN'):
+        print(item)
\ No newline at end of file
--- a/ykl/PTB_LSTM/function.py
+++ b/ykl/PTB_LSTM/function.py
+from torch.autograd import Function
+
+
+class FakeQuantize(Function):
+    """Quantize-then-dequantize op whose backward passes gradients through.
+
+    ``forward`` round-trips ``x`` through ``qparam.quantize_tensor`` and
+    ``qparam.dequantize_tensor``; ``backward`` returns the incoming gradient
+    unchanged for ``x`` and ``None`` for ``qparam``.
+    """
+
+    @staticmethod
+    def forward(ctx, x, qparam):
+        quantized = qparam.quantize_tensor(x)
+        return qparam.dequantize_tensor(quantized)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_x, grad_qparam = grad_output, None
+        return grad_x, grad_qparam
\ No newline at end of file
--- a/ykl/PTB_LSTM/get_param_flops.py
+++ b/ykl/PTB_LSTM/get_param_flops.py
+from model import *
+from functools import partial
+from lstm_utils import *
+
+import sys
+import torch
+from ptflops import get_model_complexity_info
+
+# Settings used below to rebuild the model before measuring its complexity.
+data_path = '../data/ptb'   # directory passed to Corpus
+embed_size = 700            # embedding width passed to Model
+hidden_size = 700           # LSTM hidden width passed to Model
+eval_batch_size = 10        # batch size used for the initial hidden state
+dropout = 0.65              # dropout probability passed to Model
+tied = True                 # passed as Model's tie_weights flag
+
+def lstm_constructor(shape,hidden):
+    """Build the keyword inputs for one forward call of the model.
+
+    Args:
+        shape: Shape of the all-zero int64 index tensor to create.
+        hidden: Value forwarded unchanged under the ``"hidden"`` key.
+
+    Returns:
+        A dict with keys ``"x"`` and ``"hidden"``.
+    """
+    token_ids = torch.zeros(shape, dtype=torch.int64)
+    inputs = {"x": token_ids}
+    inputs["hidden"] = hidden
+    return inputs
+
+if __name__ == "__main__":
+    # Rebuild the full-precision model and print its per-layer parameter and
+    # MAC statistics.
+
+    corpus = Corpus(data_path)
+    ntokens = len(corpus.dictionary)
+
+    model = Model(ntokens, embed_size, hidden_size, dropout, tied)
+
+    full_file = 'ckpt/ptb_PTB_LSTM.pt'
+    # The model is never moved off the CPU in this script, so map the
+    # checkpoint tensors to the CPU as well; without this, loading a
+    # checkpoint saved from a CUDA model fails on a host without CUDA.
+    model.load_state_dict(torch.load(full_file, map_location='cpu'))
+    hidden = model.init_hidden(eval_batch_size)
+    # The model takes integer indices plus a hidden state, so the inputs are
+    # built by lstm_constructor from the shape given here.
+    flops, params = get_model_complexity_info(model, (35,10), as_strings=True, 
+                                            input_constructor = partial(lstm_constructor,hidden=hidden),
+                                            print_per_layer_stat=True)
--- a/ykl/PTB_LSTM/get_param_flops.slurm
+++ b/ykl/PTB_LSTM/get_param_flops.slurm
+#!/bin/bash
+
+#- Job parameters
+
+# (TODO)
+# Please modify job name
+
+#SBATCH -J PTB_LSTM              # The job name
+#SBATCH -o ret/ret-%j.out        # Write the standard output to file named 'ret-<job_number>.out'
+#SBATCH -e ret/ret-%j.err        # Write the standard error to file named 'ret-<job_number>.err'
+
+
+#- Resources
+
+# (TODO)
+# Please modify your requirements
+
+#SBATCH -p nv-gpu                    # Submit to 'nv-gpu' Partition
+#SBATCH -t 0-01:30:00                # Run for a maximum time of 0 days, 01 hours, 30 mins, 00 secs
+#SBATCH --nodes=1                    # Request N nodes
+#SBATCH --gres=gpu:1                 # Request M GPU per node
+#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
+#SBATCH --qos=gpu-debug             # Request QOS Type
+
+###
+### The system will alloc 8 or 16 cores per gpu by default.
+### If you need more or less, use following:
+### #SBATCH --cpus-per-task=K            # Request K cores
+###
+### 
+### Without specifying the constraint, any available nodes that meet the requirement will be allocated
+### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
+###
+### #SBATCH --nodelist=gpu-v00           # Request a specific list of hosts 
+### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
+###
+
+# set constraint for RTX8000 to meet my cuda
+#SBATCH --constraint="Ampere|RTX8000|T4"
+
+#- Log information
+
+echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
+echo "Job run at:"
+echo "$(hostnamectl)"
+
+#- Load environments
+source /tools/module_env.sh
+module list                       # list modules loaded
+
+##- Tools
+module load cluster-tools/v1.0
+module load slurm-tools/v1.0
+module load cmake/3.15.7
+module load git/2.17.1
+module load vim/8.1.2424
+
+##- language
+module load python3/3.6.8
+
+##- CUDA
+# module load cuda-cudnn/10.2-7.6.5
+# module load cuda-cudnn/11.2-8.2.1
+module load cuda-cudnn/11.1-8.2.1
+
+##- virtualenv
+# source xxxxx/activate
+
+echo $(module list)              # list modules loaded
+echo $(which gcc)
+echo $(which python)
+echo $(which python3)
+
+cluster-quota                    # nas quota
+
+nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
+
+#- Warning! Please not change your CUDA_VISIBLE_DEVICES
+#- in `.bashrc`, `env.sh`, or your job script
+echo "Use GPU ${CUDA_VISIBLE_DEVICES}"                              # which gpus
+#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
+
+#- Job step
+# [EDIT HERE(TODO)]
+python get_param_flops.py > PTB_LSTM.txt
+
+#- End
+echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
--- a/ykl/PTB_LSTM/gol.py
+++ b/ykl/PTB_LSTM/gol.py
+# -*- coding: utf-8 -*-
+
+# Shares values between modules through one module-level dictionary.
+# Key 0 holds the value stored with is_bias=True, key 1 the value stored
+# with is_bias=False.
+# Created at import time so set_value/get_value no longer raise NameError
+# when called before _init().
+_global_dict = {}
+
+
+def _init():
+    """Reset the shared store to an empty dictionary."""
+    global _global_dict
+    _global_dict = {}
+
+
+def set_value(value,is_bias=False):
+    """Store ``value`` in the bias slot (``is_bias=True``) or the general slot."""
+    _global_dict[0 if is_bias else 1] = value
+
+
+def get_value(is_bias=False):
+    """Return the value from the bias slot or the general slot.
+
+    The bias setting is kept separately from the general one.
+
+    Raises:
+        KeyError: If the requested slot has not been set since the last reset.
+    """
+    return _global_dict[0 if is_bias else 1]
+
--- a/ykl/PTB_LSTM/lstm_utils.py
+++ b/ykl/PTB_LSTM/lstm_utils.py
+import os
+from io import open
+import torch
+
+def batchify(data, bsz, device):
+    """Arrange a 1-D sequence into ``bsz`` parallel columns on ``device``.
+
+    Elements that do not fill a complete row across all columns are dropped
+    from the end. The result has one column per batch entry, and consecutive
+    elements of the input run down each column.
+    """
+    usable = (data.size(0) // bsz) * bsz
+    trimmed = data[:usable]
+    columns = trimmed.view(bsz, -1).t().contiguous()
+    return columns.to(device)
+
+def get_batch(source, i, bptt):
+    """Return the input slice starting at row ``i`` and its next-step targets.
+
+    At most ``bptt`` rows are taken, and fewer near the end so that every
+    input row still has a following row to serve as target. Targets are the
+    rows shifted by one, flattened to 1-D.
+    """
+    remaining = len(source) - 1 - i
+    seq_len = bptt if bptt < remaining else remaining
+    stop = i + seq_len
+    data = source[i:stop]
+    target = source[i + 1:stop + 1].view(-1)
+    return data, target
+
+def repackage_hidden(h):
+    """Detach hidden states from their autograd history.
+
+    A tensor is returned detached; any other input is treated as an iterable
+    and rebuilt as a tuple with the same rule applied to every element.
+    """
+    if not isinstance(h, torch.Tensor):
+        return tuple(map(repackage_hidden, h))
+    return h.detach()
+
+# Helper applied to a model before running inference on a loaded checkpoint.
+# NOTE(review): per the PyTorch documentation, flatten_parameters() re-packs
+# the RNN weights so that faster code paths can be used; it does not
+# transpose them or change tensor shapes - confirm against the nn.LSTM docs.
+
+def lstm_flatten(model):
+    """Call ``flatten_parameters()`` on every sub-module whose name contains 'lstm'.
+
+    Matching is done on the qualified name reported by ``named_modules()``,
+    not on the module type.
+    """
+    lstm_layers = [layer for name, layer in model.named_modules() if 'lstm' in name]
+    for layer in lstm_layers:
+        layer.flatten_parameters()
+
+class Dictionary(object):
+    """Two-way mapping between words and consecutive integer ids."""
+
+    def __init__(self):
+        # word -> id lookup and its inverse; a word's id is its position in
+        # idx2word.
+        self.word2idx = {}
+        self.idx2word = []
+
+    def add_word(self, word):
+        """Register ``word`` if it is new and return its id."""
+        index = self.word2idx.get(word)
+        if index is None:
+            index = len(self.idx2word)
+            self.idx2word.append(word)
+            self.word2idx[word] = index
+        return index
+
+    def __len__(self):
+        """Number of distinct words registered so far."""
+        return len(self.idx2word)
+
+
+class Corpus(object):
+    """Tokenized train/valid/test splits that share one vocabulary.
+
+    Args:
+        path: Directory containing ``train.txt``, ``valid.txt`` and
+            ``test.txt``.
+
+    Attributes are ``dictionary`` (the shared vocabulary) and ``train``,
+    ``valid``, ``test`` (1-D int64 tensors of word ids).
+    """
+
+    def __init__(self, path):
+        self.dictionary = Dictionary()
+        # Order matters: ids are assigned on first sight, so the training
+        # split populates the vocabulary first.
+        self.train = self.tokenize(os.path.join(path, 'train.txt'))
+        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+        self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+    def tokenize(self, path):
+        """Tokenize a text file into a 1-D int64 tensor of word ids.
+
+        Every line is split on whitespace and terminated with ``'<eos>'``.
+        Unknown words are added to ``self.dictionary`` as they are met.
+
+        Raises:
+            FileNotFoundError: If ``path`` does not exist. This replaces the
+                previous ``assert``, which is stripped under ``python -O``.
+        """
+        ids = []
+        # Single pass: add_word returns the id of the word whether it was
+        # just added or already known, so no second read is needed.
+        with open(path, 'r', encoding="utf8") as f:
+            for line in f:
+                for word in line.split() + ['<eos>']:
+                    ids.append(self.dictionary.add_word(word))
+
+        return torch.tensor(ids, dtype=torch.long)
--- a/ykl/PTB_LSTM/model.py
+++ b/ykl/PTB_LSTM/model.py
+import torch.nn as nn
+from module import *
+
+class Model(nn.Module):
+    """Word-level language model with a parallel quantized execution path.
+
+    Full-precision path: embedding -> dropout -> LSTM -> LSTM -> dropout ->
+    linear decoder. ``quantize`` adds quantized wrappers around the
+    embedding, both LSTMs and the decoder; ``quantize_forward``, ``freeze``
+    and ``quantize_inference`` drive those wrappers.
+    """
+
+    def __init__(self,ntoken, ninp, nhid, dropout=0.5, tie_weights=False):
+        """Build the layers.
+
+        Args:
+            ntoken: Vocabulary size (embedding rows and decoder outputs).
+            ninp: Embedding width, also the input size of the first LSTM.
+            nhid: Hidden size of both LSTMs.
+            dropout: Probability used by the dropout layers.
+            tie_weights: If true, the decoder shares the embedding weight.
+
+        Raises:
+            ValueError: If ``tie_weights`` is set and ``nhid != ninp``.
+        """
+        super(Model, self).__init__()
+        self.embed1 = nn.Embedding(ntoken, ninp)
+        self.drop2 = nn.Dropout(dropout)
+        # NOTE(review): PyTorch documents LSTM dropout as acting between
+        # stacked layers only, so with num_layers=1 this argument likely has
+        # no effect - confirm against the nn.LSTM docs.
+        self.lstm3 = nn.LSTM(ninp, nhid, 1, dropout=dropout)
+        # self.drop4 = nn.Dropout(dropout)
+        self.lstm4 = nn.LSTM(nhid, nhid, 1)
+        self.drop5 = nn.Dropout(dropout)
+        self.fc6 = nn.Linear(nhid, ntoken)
+
+        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
+        if tie_weights:
+            if nhid != ninp:
+                raise ValueError('the number of hidden unit per layer must be equal to the embedding size')
+            # Decoder and embedding now refer to the same Parameter object.
+            self.fc6.weight = self.embed1.weight
+
+        self.init_weights()
+
+        self.nhid = nhid
+
+    def init_weights(self):
+        """Initialise embedding/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
+        initrange = 0.1
+        self.embed1.weight.data.uniform_(-initrange, initrange)
+        self.fc6.bias.data.zero_()
+        self.fc6.weight.data.uniform_(-initrange, initrange)
+
+    def forward(self, x, hidden=None):
+        """Full-precision forward pass.
+
+        Args:
+            x: Index tensor fed to the embedding; the first two dimensions of
+                the LSTM output are read back as ``t`` and ``n`` below.
+            hidden: Optional state passed to the first LSTM.
+
+        Returns:
+            ``(logits, hidden)`` where ``logits`` is reshaped to ``(t, n, -1)``
+            and ``hidden`` is the state returned by the second LSTM.
+        """
+        x = self.embed1(x)
+        x = self.drop2(x)
+        x, hidden = self.lstm3(x, hidden)
+        # The state produced by lstm3 is used as the initial state of lstm4.
+        x, hidden = self.lstm4(x, hidden)
+        x = self.drop5(x)
+        t, n = x.size(0), x.size(1)
+        # Flatten the first two dimensions for the linear decoder, then restore them.
+        x = x.view(t*n,-1)
+        x = self.fc6(x)
+        x = x.view(t,n,-1)
+        return x, hidden
+
+    def init_hidden(self,bsz):
+        """Return a pair of zero tensors of shape ``(1, bsz, nhid)``."""
+        # Take device and dtype from the model's first parameter.
+        weight = next(self.parameters())
+        return (weight.new_zeros(1, bsz, self.nhid),
+                weight.new_zeros(1, bsz, self.nhid))
+
+    def quantize(self, quant_type, num_bits=8, e_bits=3):
+        """Create the quantized wrappers around the existing layers.
+
+        ``quant_type``, ``num_bits`` and ``e_bits`` are forwarded unchanged
+        to QEmbedding, QLSTM and QLinear, which are defined in another
+        module and not visible here.
+        """
+        # The embedding input is an index tensor, so it is not quantized.
+        self.qembed1 = QEmbedding(quant_type, self.embed1, num_bits=num_bits, e_bits=e_bits)
+        # qix continues from the previous layer and needs no new quantizer;
+        # qih and qic are the first to handle hidden, so they are quantized.
+        # self.qlstm3 = QLSTM(quant_type, self.lstm3, has_hidden=True, qix=False, qih=True, qic=True, num_bits=num_bits, e_bits=e_bits)
+        self.qlstm3 = QLSTM(quant_type, self.lstm3, has_hidden=True, qix=False, qih=True, qic=True, num_bits=num_bits, e_bits=e_bits)
+        self.qlstm4 = QLSTM(quant_type, self.lstm4, has_hidden=True, qix=False, qih=False, qic=False, num_bits=num_bits, e_bits=e_bits)
+        self.qfc6 = QLinear(quant_type, self.fc6, num_bits=num_bits, e_bits=e_bits)
+
+    def quantize_forward(self, x, hidden=None):
+        """Forward pass through the quantized wrappers; requires ``quantize`` first.
+
+        Same layer order as ``forward``, with the original dropout layers
+        kept between the wrappers.
+        """
+        x = self.qembed1(x)
+        x = self.drop2(x)
+        x,hidden = self.qlstm3(x,hidden)
+        x,hidden = self.qlstm4(x,hidden)
+        x = self.drop5(x)
+        t,n = x.size(0), x.size(1)
+        x = x.view(t*n, -1)
+        x = self.qfc6(x)
+        x = x.view(t,n,-1)
+        return x,hidden
+
+    def freeze(self):
+        """Freeze each wrapper, handing it the output quantizers of the layer before it."""
+        self.qembed1.freeze()
+        self.qlstm3.freeze(qix=self.qembed1.qo)
+        self.qlstm4.freeze(qix=self.qlstm3.qox, qih=self.qlstm3.qoh, qic=self.qlstm3.qoc)
+        self.qfc6.freeze(qi=self.qlstm4.qox)
+
+    def quantize_inference(self, x, hidden=None):
+        """Inference through the wrappers' ``quantize_inference`` methods.
+
+        The dropout layers are not called on this path.
+        """
+        x = self.qembed1.quantize_inference(x)
+        x,hidden = self.qlstm3.quantize_inference(x,hidden)
+        x,hidden = self.qlstm4.quantize_inference(x,hidden)
+        t,n = x.size(0), x.size(1)
+        x = x.view(t*n, -1)
+        x = self.qfc6.quantize_inference(x)
+        x = x.view(t,n,-1)
+        return x,hidden
+        
\ No newline at end of file
--- a/ykl/PTB_LSTM/module.py
+++ b/ykl/PTB_LSTM/module.py
--- a/ykl/PTB_LSTM/ptq.py
+++ b/ykl/PTB_LSTM/ptq.py
+from torch.serialization import load
+from model import *
+from extract_ratio import *
+from utils import *
+from lstm_utils import *
+
+import gol
+import openpyxl
+import sys
+import argparse
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+from torchvision.transforms.functional import InterpolationMode
+import os
+import os.path as osp
+from torch.utils.tensorboard import SummaryWriter
+
+import time
+import math
+
+# Settings for the post-training-quantization sweep.
+data_path = '../data/ptb'   # directory passed to Corpus
+embed_size = 700            # embedding width passed to Model
+hidden_size = 700           # LSTM hidden width passed to Model
+lr = 22                     # not referenced in the code shown in this script
+clip = 0.25                 # not referenced in the code shown in this script
+eval_batch_size = 10        # batch size used by batchify for valid/test data
+bptt = 35 # maximum number of time steps taken per slice
+dropout = 0.65              # dropout probability passed to Model
+tied = True                 # passed as Model's tie_weights flag
+seed = 1111                 # CPU RNG seed
+seed_gpu = 1111             # CUDA RNG seed
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+def direct_quantize(model, val_data, eval_batch_size, bptt):
+    """Feed the validation data through ``model.quantize_forward``.
+
+    The data is consumed in slices of at most ``bptt`` steps with the hidden
+    state carried (detached) from slice to slice, starting from ``None``.
+    Outputs are discarded; presumably the pass exists so the quantized
+    layers can collect their statistics - confirm in the wrapper classes.
+    ``eval_batch_size`` is accepted but not used.
+    """
+    state = None
+    stop = val_data.size(0) - 1
+    with torch.no_grad():
+        for offset in range(0, stop, bptt):
+            batch, _ = get_batch(val_data, offset, bptt)
+            _, state = model.quantize_forward(batch, state)
+            state = repackage_hidden(state)
+    print('direct quantization finish')
+
+def full_inference(model, test_data, ntokens, eval_batch_size, bptt):
+    """Evaluate the full-precision model and return its perplexity.
+
+    The per-slice cross-entropy is weighted by the slice length, averaged
+    over ``len(test_data) - 1`` steps and exponentiated. The hidden state
+    starts as ``None`` and is carried (detached) across slices.
+    ``eval_batch_size`` is accepted but not used.
+    """
+    criterion = nn.CrossEntropyLoss()
+    loss_sum = 0.
+    state = None
+    with torch.no_grad():
+        for offset in range(0, test_data.size(0) - 1, bptt):
+            batch, labels = get_batch(test_data, offset, bptt)
+            logits, state = model(batch, state)
+            batch_loss = criterion(logits.view(-1, ntokens), labels).item()
+            loss_sum += len(batch) * batch_loss
+            state = repackage_hidden(state)
+    test_loss = loss_sum / (len(test_data) - 1)
+    ppl = math.exp(test_loss)
+    print('\nTest set: Full Model Perplexity: {:.4f} Loss {:f}'.format(ppl,test_loss))
+    return ppl
+
+def quantize_inference(model, test_data, ntokens, eval_batch_size, bptt):
+    """Evaluate the quantized model and return its perplexity.
+
+    Identical bookkeeping to the full-precision evaluation, except that each
+    slice goes through ``model.quantize_inference``. ``eval_batch_size`` is
+    accepted but not used.
+    """
+    criterion = nn.CrossEntropyLoss()
+    loss_sum = 0.
+    state = None
+    with torch.no_grad():
+        for offset in range(0, test_data.size(0) - 1, bptt):
+            batch, labels = get_batch(test_data, offset, bptt)
+            logits, state = model.quantize_inference(batch, state)
+            batch_loss = criterion(logits.view(-1, ntokens), labels).item()
+            loss_sum += len(batch) * batch_loss
+            state = repackage_hidden(state)
+    test_loss = loss_sum / (len(test_data) - 1)
+    ppl = math.exp(test_loss)
+    print('Test set: Quant Model Perplexity: {:.4f} Loss {:f}\n'.format(ppl,test_loss))
+    return ppl
+
+
+# Sweep driver: measure full-precision perplexity once, then for every
+# quantization configuration build a fresh model, run the validation pass
+# through quantize_forward, measure test perplexity, and finally write all
+# results to an Excel sheet and a text file.
+if __name__ == "__main__":
+    # Re-open stdout line-buffered so progress shows up promptly in job logs.
+    sys.stdout = open(sys.stdout.fileno(), mode='w', buffering=1)
+    
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed_gpu)
+
+    corpus = Corpus(data_path)
+    ntokens = len(corpus.dictionary)
+
+    val_data = batchify(corpus.valid, eval_batch_size, device)
+    test_data = batchify(corpus.test, eval_batch_size, device)
+
+
+    # Not read anywhere in the code shown below.
+    load_ptq = False
+    store_ptq = False
+
+    # Create the shared store used to pass value lists to other modules.
+    gol._init()
+
+    excel_path = 'ptq_result.xlsx'
+    workbook = openpyxl.Workbook()
+    # Drop the default sheet; the result sheet is created after the sweep.
+    if 'Sheet' in workbook.sheetnames:
+        workbook.remove(workbook['Sheet'])
+
+    txt_path = 'ptq_result.txt'
+    # Opened (and truncated) here, written and closed at the very end.
+    ft = open(txt_path,'w')
+    
+    model = Model(ntokens, embed_size, hidden_size, dropout, tied).to(device)
+    full_file = 'ckpt/ptb_PTB_LSTM.pt'
+    model.load_state_dict(torch.load(full_file))
+
+    # Not read anywhere in the code shown below.
+    ptq_file_prefix = 'ckpt/ptb_PTB_LSTM_ptq_'
+
+    # Baseline: full-precision perplexity on the test split.
+    model.eval()
+    lstm_flatten(model)
+    full_ppl = full_inference(model,test_data,ntokens,eval_batch_size,bptt)
+    
+    quant_type_list = ['INT','POT','FLOAT']
+    # quant_type_list = ['FLOAT']
+
+    # Result columns, appended to in lock-step inside the sweep.
+    # js_flops_list and js_param_list are never filled in this script.
+    title_list = []
+    js_flops_list = []
+    js_param_list = []
+    ptq_ppl_list = []
+    ppl_ratio_list = []
+
+    for quant_type in quant_type_list:
+        # numbit_list / build_bias_list / ebit_list / build_list come from the
+        # project's utils module, which is not visible here.
+        num_bit_list = numbit_list(quant_type)
+        if quant_type != 'INT':
+            bias_list = build_bias_list(quant_type)
+            gol.set_value(bias_list, is_bias=True)
+
+        for num_bits in num_bit_list:
+            e_bit_list = ebit_list(quant_type,num_bits)
+            for e_bits in e_bit_list:
+                # A fresh model per configuration, reloaded from the checkpoint below.
+                model_ptq = Model(ntokens, embed_size, hidden_size, dropout, tied).to(device)
+                if quant_type == 'FLOAT':
+                    title = '%s_%d_E%d' % (quant_type, num_bits, e_bits)
+                else:
+                    title = '%s_%d' % (quant_type, num_bits)
+                print('\nPTB_LSTM: PTQ: '+title)
+                title_list.append(title)
+
+                if quant_type != 'INT':
+                    plist = build_list(quant_type, num_bits, e_bits)
+                    gol.set_value(plist)
+                
+                model_ptq.load_state_dict(torch.load(full_file))
+                model_ptq.quantize(quant_type,num_bits,e_bits)
+                model_ptq.eval()
+                direct_quantize(model_ptq, val_data, eval_batch_size, bptt)
+
+                # quantize_inference here still uses fake quantization; compared
+                # with quantize_forward it only skips the update step, so
+                # freeze() is not called.
+                ptq_ppl = quantize_inference(model_ptq, test_data, ntokens, eval_batch_size, bptt)
+                ppl_ratio = ptq_ppl / full_ppl
+                print(title+': ppl_ratio: %f'%ppl_ratio)
+                ptq_ppl_list.append(ptq_ppl)
+                ppl_ratio_list.append(ppl_ratio)
+
+    # Excel layout: row 1 holds the baseline, row 3 the column headers, rows
+    # 4+ one configuration each.
+    worksheet = workbook.create_sheet('PTB_LSTM')
+    worksheet.cell(row=1,column=1,value='FP32-ppl')
+    worksheet.cell(row=1,column=2,value=full_ppl)
+    worksheet.cell(row=3,column=1,value='title')
+    # worksheet.cell(row=3,column=2,value='js_flops')
+    # worksheet.cell(row=3,column=3,value='js_param')
+    worksheet.cell(row=3,column=2,value='ptq_ppl')
+    worksheet.cell(row=3,column=3,value='ppl_ratio')
+
+    for i in range(len(title_list)):
+        worksheet.cell(row=i+4,column=1,value=title_list[i])
+        worksheet.cell(row=i+4,column=2,value=ptq_ppl_list[i])
+        worksheet.cell(row=i+4,column=3,value=ppl_ratio_list[i])
+
+    # Same default-sheet check as above, repeated before saving.
+    if 'Sheet' in workbook.sheetnames:
+        workbook.remove(workbook['Sheet'])
+    workbook.save(excel_path)
+
+    # Plain-text copy of the same results.
+    print('PTB_LSTM',file=ft)
+    print('Full_ppl: %f'%full_ppl,file=ft)
+    print('title_list:',file=ft)
+    print(title_list,file=ft)
+    # print('js_flops_list:',file=ft)
+    # print(js_flops_list, file=ft)
+    # print('js_param_list:',file=ft)
+    # print(js_param_list, file=ft)
+    print('ptq_ppl_list:',file=ft)
+    print(ptq_ppl_list, file=ft)
+    print('ppl_ratio_list:',file=ft)
+    print(ppl_ratio_list, file=ft)
+    print("\n",file=ft)
+    
+    ft.close()
--- a/ykl/PTB_LSTM/ptq.slurm
+++ b/ykl/PTB_LSTM/ptq.slurm
+#!/bin/bash
+
+#- Job parameters
+
+# (TODO)
+# Please modify job name
+
+#SBATCH -J PTB_LSTM            # The job name
+#SBATCH -o ret/ret-%j.out        # Write the standard output to file named 'ret-<job_number>.out'
+#SBATCH -e ret/ret-%j.err        # Write the standard error to file named 'ret-<job_number>.err'
+
+
+#- Resources
+
+# (TODO)
+# Please modify your requirements
+
+#SBATCH -p nv-gpu                    # Submit to 'nv-gpu' Partition
+#SBATCH -t 3-00:00:00                # Run for a maximum time of 3 days, 00 hours, 00 mins, 00 secs
+#SBATCH --nodes=1                    # Request N nodes
+#SBATCH --gres=gpu:1                 # Request M GPU per node
+#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
+#SBATCH --qos=gpu-long             # Request QOS Type
+
+###
+### The system will alloc 8 or 16 cores per gpu by default.
+### If you need more or less, use following:
+### #SBATCH --cpus-per-task=K            # Request K cores
+###
+### 
+### Without specifying the constraint, any available nodes that meet the requirement will be allocated
+### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
+###
+### #SBATCH --nodelist=gpu-v00           # Request a specific list of hosts 
+### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
+###
+
+# set constraint for RTX8000 to meet my cuda
+#SBATCH --constraint="Ampere|RTX8000|T4"
+
+#- Log information
+
+echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
+echo "Job run at:"
+echo "$(hostnamectl)"
+
+#- Load environments
+source /tools/module_env.sh
+module list                       # list modules loaded
+
+##- Tools
+module load cluster-tools/v1.0
+module load slurm-tools/v1.0
+module load cmake/3.15.7
+module load git/2.17.1
+module load vim/8.1.2424
+
+##- language
+module load python3/3.6.8
+
+##- CUDA
+# module load cuda-cudnn/10.2-7.6.5
+# module load cuda-cudnn/11.2-8.2.1
+module load cuda-cudnn/11.1-8.2.1
+
+##- virtualenv
+# source xxxxx/activate
+
+echo $(module list)              # list modules loaded
+echo $(which gcc)
+echo $(which python)
+echo $(which python3)
+
+cluster-quota                    # nas quota
+
+nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
+
+#- Warning! Please not change your CUDA_VISIBLE_DEVICES
+#- in `.bashrc`, `env.sh`, or your job script
+echo "Use GPU ${CUDA_VISIBLE_DEVICES}"                              # which gpus
+#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
+
+#- Job step
+# [EDIT HERE(TODO)]
+python ptq.py
+
+#- End
+echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
--- a/ykl/PTB_LSTM/ptq_result.txt
+++ b/ykl/PTB_LSTM/ptq_result.txt
+PTB_LSTM
+Full_ppl: 80.347527
+title_list:
+['INT_2', 'INT_3', 'INT_4', 'INT_5', 'INT_6', 'INT_7', 'INT_8', 'INT_9', 'INT_10', 'INT_11', 'INT_12', 'INT_13', 'INT_14', 'INT_15', 'INT_16', 'POT_2', 'POT_3', 'POT_4', 'POT_5', 'POT_6', 'POT_7', 'POT_8', 'FLOAT_3_E1', 'FLOAT_4_E1', 'FLOAT_4_E2', 'FLOAT_5_E1', 'FLOAT_5_E2', 'FLOAT_5_E3', 'FLOAT_6_E1', 'FLOAT_6_E2', 'FLOAT_6_E3', 'FLOAT_6_E4', 'FLOAT_7_E1', 'FLOAT_7_E2', 'FLOAT_7_E3', 'FLOAT_7_E4', 'FLOAT_7_E5', 'FLOAT_8_E1', 'FLOAT_8_E2', 'FLOAT_8_E3', 'FLOAT_8_E4', 'FLOAT_8_E5', 'FLOAT_8_E6']
+ptq_ppl_list:
+[3680.054005643725, 1113.1595756444422, 237.175147011631, 118.14850804671768, 85.68659660098655, 82.04448564575729, 80.74354328756583, 80.4501107581829, 80.37609633787858, 80.35521256832457, 80.34711404715937, 80.34729820019561, 80.34810939943782, 80.34762953587571, 80.34751136939029, 3680.054005643725, 1436.4914411630305, 168.02534152231982, 167.8943965460058, 166.00384274940092, 166.00408006630718, 166.00413183236293, 829.8560620309429, 407.67140698415596, 142.51986167702356, 165.40337795565802, 96.49152981578862, 101.44738407817896, 131.7285436670383, 87.54655615370719, 86.09250067555212, 101.35647647974928, 119.5743194917011, 84.69916168997997, 81.94981964738962, 86.0939502797634, 101.40409626471063, 117.6051487311836, 83.61499193954714, 80.77412538207982, 81.96937299585758, 85.96081475056866, 101.39158416596052]
+ppl_ratio_list:
+[45.80170816062014, 13.854310274163526, 2.951866154605171, 1.470468497709415, 1.0664497001338629, 1.0211202286630146, 1.0049287863275775, 1.0012767445219362, 1.0003555659541061, 1.000095647942865, 0.9999948542853881, 0.9999971462418676, 1.0000072423738524, 1.0000012700237741, 0.9999997993315312, 45.80170816062014, 17.87847723497442, 2.091232260230823, 2.0896025277374854, 2.0660728205317613, 2.0660757741622424, 2.06607641843914, 10.32833352721793, 5.073851302041298, 1.7737927491314487, 2.058599475520639, 1.2009271825404801, 1.2626074161089171, 1.6394847206612713, 1.0895986334086523, 1.0715015553337994, 1.2614759861661844, 1.4882140524200391, 1.0541601507000304, 1.0199420219238415, 1.071519597011702, 1.2620686588525436, 1.4637058836938923, 1.0406666458683778, 1.0053094090482557, 1.0201853816024549, 1.0698626010424483, 1.2619129341010724]
+
+
--- a/ykl/PTB_LSTM/ptq_result.xlsx
+++ b/ykl/PTB_LSTM/ptq_result.xlsx
--- a/ykl/PTB_LSTM/train.py
+++ b/ykl/PTB_LSTM/train.py
+# coding: utf-8
+import time
+import math
+import os
+import torch
+import torch.nn as nn
+from lstm_utils import *
+from model import *
+import sys
+
+# Training settings.
+data_path = '../data/ptb'   # directory passed to Corpus
+embed_size = 700            # embedding width passed to Model
+hidden_size = 700           # LSTM hidden width passed to Model
+lr = 22                     # initial step size; divided by 2.5 when validation loss does not improve
+clip = 0.25                 # max gradient norm used for clipping
+epochs = 10                 # upper bound on training epochs
+train_batch_size = 20
+eval_batch_size = 10
+bptt = 35 # maximum number of time steps taken per slice
+dropout = 0.65              # dropout probability passed to Model
+tied = True                 # passed as Model's tie_weights flag
+seed = 1111                 # CPU RNG seed
+seed_gpu = 1111             # CUDA RNG seed
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+log_interval = 200          # defined here, but train() hard-codes 200 instead of reading it
+save_path = 'ckpt/ptb_PTB_LSTM.pt'   # checkpoint written whenever validation loss improves
+
+
+def evaluate(model, eval_data, ntokens, eval_batch_size, bptt):
+    """Return the average per-step cross-entropy of ``model`` on ``eval_data``.
+
+    The model is switched to eval mode, the hidden state starts from
+    ``model.init_hidden(eval_batch_size)`` and is carried (detached) across
+    slices of at most ``bptt`` steps. Each slice's loss is weighted by its
+    length and the sum is divided by ``len(eval_data) - 1``.
+    """
+    model.eval()
+    criterion = nn.CrossEntropyLoss()
+    state = model.init_hidden(eval_batch_size)
+    loss_sum = 0.
+    with torch.no_grad():
+        for offset in range(0, eval_data.size(0) - 1, bptt):
+            batch, labels = get_batch(eval_data, offset, bptt)
+            logits, state = model(batch, state)
+            batch_loss = criterion(logits.view(-1, ntokens), labels).item()
+            loss_sum += len(batch) * batch_loss
+            state = repackage_hidden(state)
+    return loss_sum / (len(eval_data) - 1)
+
+def train(model, train_data, ntokens, train_batch_size, bptt, lr, clip,
+          epoch=None, log_interval=200):
+    """Run one epoch of training with manual SGD updates.
+
+    Args:
+        model: Model to train; must provide ``init_hidden``.
+        train_data: Batchified training data.
+        ntokens: Vocabulary size, used to flatten the logits.
+        train_batch_size: Batch size for the initial hidden state.
+        bptt: Maximum number of time steps per slice.
+        lr: Step size of the plain gradient-descent update.
+        clip: Maximum gradient norm.
+        epoch: Epoch number shown in the progress log. When omitted, the
+            module-level ``epoch`` variable is used if it exists (the
+            behaviour earlier callers relied on), otherwise 0.
+        log_interval: Number of batches between progress lines.
+    """
+    if epoch is None:
+        # Earlier versions read the driver loop's global `epoch` directly,
+        # which raised NameError when train() was called from anywhere else.
+        epoch = globals().get('epoch', 0)
+    model.train()
+    total_loss = 0.
+    lossLayer = nn.CrossEntropyLoss()
+    start_time = time.time()
+    hidden = model.init_hidden(train_batch_size)
+    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
+        data, targets = get_batch(train_data, i, bptt)
+
+        # Clear old gradients directly on the model; the previous code built
+        # a new SGD optimizer every batch only to call zero_grad() on it.
+        model.zero_grad()
+        output, hidden = model(data,hidden)
+        # Detach the carried state so the next slice does not backpropagate
+        # into this one.
+        hidden = repackage_hidden(hidden)
+        loss = lossLayer(output.view(-1, ntokens), targets)
+        loss.backward()
+
+        # Gradient clipping, then a plain SGD step.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
+        for p in model.parameters():
+            # Parameters that took no part in the loss have no gradient.
+            if p.grad is not None:
+                p.data.add_(p.grad.data, alpha=-lr)
+
+        total_loss += loss.item()
+
+        if batch % log_interval == 0 and batch > 0:
+            cur_loss = total_loss / log_interval
+            elapsed = time.time() - start_time
+            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
+                    'loss {:5.2f} | ppl {:8.2f}'.format(
+                epoch, batch, len(train_data) // bptt, lr,
+                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
+            total_loss = 0
+            start_time = time.time()
+
+# Training driver: train for up to `epochs` epochs, checkpoint on every
+# validation improvement, decay the step size otherwise, then reload the
+# best checkpoint and report test loss and perplexity.
+if __name__ == "__main__":
+    # Re-open stdout line-buffered so progress shows up promptly in job logs.
+    sys.stdout = open(sys.stdout.fileno(), mode='w', buffering=1)
+    
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed_gpu)
+
+    corpus = Corpus(data_path)
+    ntokens = len(corpus.dictionary)
+
+    train_data = batchify(corpus.train, train_batch_size, device)
+    val_data = batchify(corpus.valid, eval_batch_size, device)
+    test_data = batchify(corpus.test, eval_batch_size, device)
+
+    model = Model(ntokens, embed_size, hidden_size, dropout, tied).to(device)
+
+    best_val_loss = None
+
+    # `epoch` is a module-level name here; train() uses it in its progress log.
+    for epoch in range(1, epochs+1):
+        epoch_start_time = time.time()
+        train(model, train_data, ntokens, train_batch_size, bptt, lr, clip)
+        val_loss = evaluate(model, val_data, ntokens, eval_batch_size, bptt)
+        print('-' * 89)
+        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
+                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
+                                            val_loss, math.exp(val_loss)))
+        print('-' * 89)
+        # Save on the first epoch and on every improvement.
+        if not best_val_loss or val_loss < best_val_loss:
+            with open(save_path, 'wb') as f:
+                torch.save(model.state_dict(), f)
+            best_val_loss = val_loss
+        else:
+            # No improvement: shrink the step size, and stop once it is negligible.
+            lr /= 2.5
+            if lr < 0.0001:
+                break
+
+    # Rebuild the model from the best checkpoint before the final test run.
+    with open(save_path, 'rb') as f:
+        # model = torch.load(f)
+        model = Model(ntokens, embed_size, hidden_size, dropout, tied).to(device)
+        model.load_state_dict(torch.load(f))
+        # Call flatten_parameters() on the LSTM layers of the reloaded model.
+        lstm_flatten(model)
+        # model.lstm3.flatten_parameters()
+        # model.lstm4.flatten_parameters()
+
+    test_loss = evaluate(model, test_data, ntokens, eval_batch_size, bptt)
+    print('=' * 89)
+    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
+        test_loss, math.exp(test_loss)))
+    print('=' * 89)
+
--- a/ykl/PTB_LSTM/train.slurm
+++ b/ykl/PTB_LSTM/train.slurm
+#!/bin/bash
+# SLURM batch script: requests one node with one GPU on the 'nv-gpu'
+# partition, loads the tool / Python / CUDA modules, and runs train.py.
+
+#- Job parameters
+
+# (TODO)
+# Please modify job name
+
+#SBATCH -J PTB_LSTM              # The job name
+#SBATCH -o ret/ret-%j.out        # Write the standard output to file named 'ret-<job_number>.out'
+#SBATCH -e ret/ret-%j.err        # Write the standard error to file named 'ret-<job_number>.err'
+
+
+#- Resources
+
+# (TODO)
+# Please modify your requirements
+
+#SBATCH -p nv-gpu                    # Submit to 'nv-gpu' partition
+#SBATCH -t 3-00:00:00                # Run for a maximum time of 3 days, 00 hours, 00 mins, 00 secs
+#SBATCH --nodes=1                    # Request N nodes
+#SBATCH --gres=gpu:1                 # Request M GPU per node
+#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
+#SBATCH --qos=gpu-long             # Request QOS Type
+
+###
+### The system will allocate 8 or 16 cores per gpu by default.
+### If you need more or less, use following:
+### #SBATCH --cpus-per-task=K            # Request K cores
+###
+### 
+### Without specifying the constraint, any available nodes that meet the requirement will be allocated
+### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
+###
+### #SBATCH --nodelist=gpu-v00           # Request a specific list of hosts 
+### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
+###
+
+# Restrict the job to Ampere, RTX8000 or T4 nodes
+# (original note: chosen to match the CUDA version in use)
+#SBATCH --constraint="Ampere|RTX8000|T4"
+
+#- Log information
+
+echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
+echo "Job run at:"
+echo "$(hostnamectl)"
+
+#- Load environments
+source /tools/module_env.sh
+module list                       # list modules loaded
+
+##- Tools
+module load cluster-tools/v1.0
+module load slurm-tools/v1.0
+module load cmake/3.15.7
+module load git/2.17.1
+module load vim/8.1.2424
+
+##- language
+module load python3/3.6.8
+
+##- CUDA
+# module load cuda-cudnn/10.2-7.6.5
+# module load cuda-cudnn/11.2-8.2.1
+module load cuda-cudnn/11.1-8.2.1
+
+##- virtualenv
+# source xxxxx/activate
+
+echo $(module list)              # list modules loaded
+echo $(which gcc)
+echo $(which python)
+echo $(which python3)
+
+cluster-quota                    # nas quota
+
+nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
+
+#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
+#- in `.bashrc`, `env.sh`, or your job script
+echo "Use GPU ${CUDA_VISIBLE_DEVICES}"                              # which gpus
+#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
+
+#- Job step
+# [EDIT HERE(TODO)]
+# NOTE(review): only the python3/3.6.8 module is loaded above — confirm that
+# `python` resolves to that interpreter on the compute nodes.
+python train.py
+
+#- End
+echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
--- a/ykl/PTB_LSTM/try.py
+++ b/ykl/PTB_LSTM/try.py
+# import torch
+# from transformers import BertForSequenceClassification, BertTokenizer
+
+# from ptflops import get_model_complexity_info
+
+
+# def bert_input_constructor(input_shape, tokenizer):
+#     inp_seq = ""
+#     for _ in range(input_shape[1] - 2):  # there are two special tokens [CLS] and [SEP]
+#         inp_seq += tokenizer.pad_token  # let's use pad token to form a fake
+#     # sequence for subsequent flops calculation
+
+#     inputs = tokenizer([inp_seq] * input_shape[0], padding=True, truncation=True,
+#                        return_tensors="pt")
+#     labels = torch.tensor([1] * input_shape[0])
+#     # Batch size input_shape[0], sequence length input_shape[128]
+#     inputs = dict(inputs)
+#     inputs.update({"labels": labels})
+#     return inputs
+
+# if __name__ == '__main__':
+#     shape = (35,10)
+#     bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+#     tmp = bert_input_constructor(shape,bert_tokenizer)
+#     print(tmp)
+#     print(**tmp)
+
+from functools import partial
+
+import torch
+from transformers import BertForSequenceClassification, BertTokenizer
+
+from ptflops import get_model_complexity_info
+
+
+def bert_input_constructor(input_shape, tokenizer):
+    inp_seq = ""
+    for _ in range(input_shape[1] - 2):  # there are two special tokens [CLS] and [SEP]
+        inp_seq += tokenizer.pad_token  # let's use pad token to form a fake
+    # sequence for subsequent flops calculation
+
+    inputs = tokenizer([inp_seq] * input_shape[0], padding=True, truncation=True,
+                       return_tensors="pt")
+    labels = torch.tensor([1] * input_shape[0])
+    # Batch size input_shape[0], sequence length input_shape[128]
+    inputs = dict(inputs)
+    inputs.update({"labels": labels})
+    return inputs
+
+
+if __name__ == '__main__':
+    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+    flops_count, params_count = get_model_complexity_info(
+            model, (2, 128), as_strings=True,
+            input_constructor=partial(bert_input_constructor, tokenizer=bert_tokenizer),
+            print_per_layer_stat=True)
+    print('{:<30}  {:<8}'.format('Computational complexity: ', flops_count))
+    print('{:<30}  {:<8}'.format('Number of parameters: ', params_count))
\ No newline at end of file
--- a/ykl/PTB_LSTM/utils.py
+++ b/ykl/PTB_LSTM/utils.py
+import torch
+import torch.nn as nn
+
+def ebit_list(quant_type, num_bits):
+    if quant_type == 'FLOAT':
+        e_bit_list = list(range(1,num_bits-1))
+    else:
+        e_bit_list = [0]
+    return e_bit_list
+
+
+def numbit_list(quant_type):
+    if quant_type == 'INT':
+        num_bit_list = list(range(2,17))
+        # num_bit_list = [4,5]
+    elif quant_type == 'POT':
+        num_bit_list = list(range(2,9))
+        # num_bit_list = [5]
+    else:
+        num_bit_list = list(range(2,9))
+        # num_bit_list = [8]
+    
+    return num_bit_list     
+
+def build_bias_list(quant_type):
+    if quant_type == 'POT':
+        return build_pot_list(8)
+    else:
+        return build_float_list(16,7)
+    
+def build_list(quant_type, num_bits, e_bits):
+    if quant_type == 'POT':
+        return build_pot_list(num_bits)
+    else:
+        return build_float_list(num_bits,e_bits)
+
+def build_pot_list(num_bits):
+    plist = [0.]
+    for i in range(-2 ** (num_bits-1) + 2, 1): 
+        # i最高到0，即pot量化最大值为1
+        plist.append(2. ** i)
+        plist.append(-2. ** i)
+    plist = torch.Tensor(list(set(plist)))
+    # plist = plist.mul(1.0 / torch.max(plist))
+    return plist
+
+def build_float_list(num_bits,e_bits):
+    m_bits = num_bits - 1 - e_bits
+    plist = [0.]
+    # 相邻尾数的差值
+    dist_m = 2 ** (-m_bits)
+    e = -2 ** (e_bits - 1) + 1
+    for m in range(1, 2 ** m_bits):
+        frac = m * dist_m   # 尾数部分
+        expo = 2 ** e       # 指数部分
+        flt = frac * expo
+        plist.append(flt)
+        plist.append(-flt)
+
+    for e in range(-2 ** (e_bits - 1) + 2, 2 ** (e_bits - 1) + 1):
+        expo = 2 ** e
+        for m in range(0, 2 ** m_bits):
+            frac = 1. + m * dist_m
+            flt = frac * expo
+            plist.append(flt)
+            plist.append(-flt)
+    plist = torch.Tensor(list(set(plist)))
+    return plist
+
+#此处不必cfg，直接取同前缀同后缀即可。将relu一起考虑进去
+def fold_ratio(layer, par_ratio, flop_ratio):
+    idx = -1
+    for name in layer:
+        if 'conv' in name:
+            conv_idx = layer.index(name)
+            [prefix,suffix] = name.split('conv')
+            bn_name = prefix+'bn'+suffix
+            relu_name = prefix+'relu'+suffix
+            if bn_name in layer:
+                bn_idx = layer.index(bn_name)
+                par_ratio[conv_idx]+=par_ratio[bn_idx]
+                flop_ratio[conv_idx]+=flop_ratio[bn_idx]
+                if relu_name in layer:
+                    relu_idx = layer.index(relu_name)
+                    par_ratio[conv_idx]+=par_ratio[relu_idx]
+                    flop_ratio[conv_idx]+=flop_ratio[bn_idx]
+    return par_ratio,flop_ratio
+
+def fold_model(model):
+    for name, module in model.named_modules():
+        if 'conv' in name:
+            [prefix,suffix] = name.split('conv')
+            bn_name = prefix+'bn'+suffix
+            if hasattr(model,bn_name):
+                bn_layer = getattr(model,bn_name)
+                fold_bn(module,bn_layer)
+
+def fold_bn(conv, bn):
+    # 获取 BN 层的参数
+    mean = bn.running_mean
+    var = bn.running_var
+    eps = bn.eps
+    std = torch.sqrt(var + eps)
+
+    if bn.affine:
+        gamma_ = bn.weight / std
+        weight = conv.weight * gamma_.view(conv.out_channels, 1, 1, 1)
+        if conv.bias is not None:
+            bias = gamma_ * conv.bias - gamma_ * mean + bn.bias
+        else:
+            bias = bn.bias - gamma_ * mean
+    else:
+        gamma_ = 1 / std
+        weight = conv.weight * gamma_
+        if conv.bias is not None:
+            bias = gamma_ * conv.bias - gamma_ * mean
+        else:
+            bias = -gamma_ * mean
+
+    # 设置新的 weight 和 bias
+    conv.weight.data = weight.data
+    if conv.bias is not None:
+        conv.bias.data = bias.data
+    else:
+        conv.bias = torch.nn.Parameter(bias)