Commit ee8f53f5 by lvzhengyang

Initial commit

# train.py
#!/usr/bin/env python3
""" train network using pytorch
author baiyu
"""
import os
import sys
import argparse
import time
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from conf import settings
from utils import get_network, get_my_loader3, WarmUpLR, \
most_recent_folder, most_recent_weights, last_epoch, best_acc_weights
import pdb
LABEL_THRESHOLD = 0.05
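# Note (added): the raw labels appear to be continuous values; in train() and eval_training()
# below they are binarized, so any label above LABEL_THRESHOLD becomes class 1 and the rest
# class 0, which is why get_network() is called with num_class=2.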
def train(epoch):
start = time.time()
net.train(True)
for batch_index, (imgs, rudys, labels) in enumerate(train_loader):
if args.gpu:
labels = labels.cuda()
rudys = rudys.cuda()
imgs = imgs.cuda()
optimizer.zero_grad()
outputs = net(rudys, imgs)
labels = labels.squeeze()
labels = torch.where(labels > LABEL_THRESHOLD, 1, 0)
loss = loss_function(outputs, labels)
loss.backward()
optimizer.step()
n_iter = (epoch - 1) * len(train_loader) + batch_index + 1
last_layer = list(net.children())[-1]
for name, para in last_layer.named_parameters():
if 'weight' in name:
writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter)
if 'bias' in name:
writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter)
print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
trained_samples=batch_index * args.b + len(rudys),
total_samples=len(train_loader.dataset)
))
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= args.warm:
warmup_scheduler.step()
for name, param in net.named_parameters():
layer, attr = os.path.splitext(name)
attr = attr[1:]
writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
finish = time.time()
print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))
@torch.no_grad()
def eval_training(epoch=0, tb=True):
start = time.time()
net.eval()
test_loss = 0.0 # cost function error
correct = 0.0
for (imgs, rudys, labels) in test_loader:
if args.gpu:
rudys = rudys.cuda()
imgs = imgs.cuda()
labels = labels.cuda()
outputs = net(rudys, imgs)
labels = labels.squeeze()
labels = torch.where(labels > LABEL_THRESHOLD, 1, 0)
loss = loss_function(outputs, labels)
test_loss += loss.item()
_, preds = outputs.max(1)
correct += preds.eq(labels).sum()
finish = time.time()
if args.gpu:
print('GPU INFO.....')
print(torch.cuda.memory_summary(), end='')
print('Evaluating Network.....')
print('Test set: Epoch: {}, Average loss: {:.4f}, Accuracy: {:.4f}, Time consumed:{:.2f}s'.format(
epoch,
test_loss / len(test_loader.dataset),
correct.float() / len(test_loader.dataset),
finish - start
))
"""
print('Test set: Epoch: {}, Average loss: {:.4f}, Time consumed:{:.2f}s'.format(
epoch,
test_loss / len(test_loader.dataset),
finish - start
))
"""
#add informations to tensorboard
if tb:
writer.add_scalar('Test/Average loss', test_loss / len(test_loader.dataset), epoch)
writer.add_scalar('Test/Accuracy', correct.float() / len(test_loader.dataset), epoch)
return correct.float() / len(test_loader.dataset)
if __name__ == '__main__':
# python train_6.py -gpu -net config_1
# use image and RUDY
torch.multiprocessing.set_start_method('spawn')
torch.multiprocessing.set_sharing_strategy('file_system')
parser = argparse.ArgumentParser()
parser.add_argument('-net', default='config_1', type=str, required=True, help='net type')
parser.add_argument('-gpu', action='store_true', default=False, help='use gpu or not')
parser.add_argument('-b', type=int, default=128, help='batch size for dataloader')
parser.add_argument('-warm', type=int, default=1, help='warm up training phase')
parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate')
parser.add_argument('-resume', action='store_true', default=False, help='resume training')
args = parser.parse_args()
net = get_network(args, num_class=2)
train_loader = get_my_loader3(
dataset_dir='./dataset/dataset4/train',
num_workers=4,
batch_size=args.b,
shuffle=True
)
test_loader = get_my_loader3(
dataset_dir='./dataset/dataset4/test',
num_workers=4,
batch_size=args.b,
shuffle=True
)
loss_function = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay
# iter_per_epoch = len(cifar100_training_loader)
iter_per_epoch = len(train_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
if args.resume:
recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
if not recent_folder:
raise Exception('no recent folder was found')
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
else:
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
#use tensorboard
if not os.path.exists(settings.LOG_DIR):
os.mkdir(settings.LOG_DIR)
#since tensorboard can't overwrite old values,
#we create a new log folder for each run
writer = SummaryWriter(log_dir=os.path.join(
settings.LOG_DIR, args.net, settings.TIME_NOW))
rudy_tensor = torch.Tensor(1, 9)
img_tensor = torch.Tensor(1, 1, 128, 128)
if args.gpu:
rudy_tensor = rudy_tensor.cuda()
img_tensor = img_tensor.cuda()
writer.add_graph(net, [rudy_tensor, img_tensor])
#create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
if args.resume:
best_weights = best_acc_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
if best_weights:
weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, best_weights)
print('found best acc weights file:{}'.format(weights_path))
print('load best training file to test acc...')
net.load_state_dict(torch.load(weights_path))
# best_acc = eval_training(tb=False)
eval_training(tb=False)
# print('best acc is {:0.2f}'.format(best_acc))
recent_weights_file = most_recent_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
if not recent_weights_file:
raise Exception('no recent weights file was found')
weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, recent_weights_file)
print('loading weights file {} to resume training.....'.format(weights_path))
net.load_state_dict(torch.load(weights_path))
resume_epoch = last_epoch(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
for epoch in range(1, settings.EPOCH + 1):
if epoch > args.warm:
train_scheduler.step(epoch)
if args.resume:
if epoch <= resume_epoch:
continue
train(epoch)
acc = eval_training(epoch)
#start to save best performance model after learning rate decay to 0.01
if epoch > settings.MILESTONES[1] and best_acc < acc:
weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='best')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
best_acc = acc
continue
if not epoch % settings.SAVE_EPOCH:
weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
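# utils.py (inferred: this file defines get_network, get_my_loader3, WarmUpLR and the
# checkpoint helpers imported by train.py above)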
""" helper function
author baiyu
"""
import os
import sys
import re
import datetime
import numpy
import torch
from torch.optim.lr_scheduler import _LRScheduler
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from dataset import CADataset, CADataset2, CADataset3, CADataset4
import pdb
def get_network(args, num_class=4):
""" return given network
"""
if args.net == 'vgg16':
from models.vgg import vgg16_bn
net = vgg16_bn(num_class=num_class)
elif args.net == 'vgg13':
from models.vgg import vgg13_bn
net = vgg13_bn()
elif args.net == 'vgg11':
from models.vgg import vgg11_bn
net = vgg11_bn()
elif args.net == 'vgg19':
from models.vgg import vgg19_bn
net = vgg19_bn()
elif args.net == 'densenet121':
from models.densenet import densenet121
net = densenet121()
elif args.net == 'densenet161':
from models.densenet import densenet161
net = densenet161()
elif args.net == 'densenet169':
from models.densenet import densenet169
net = densenet169()
elif args.net == 'densenet201':
from models.densenet import densenet201
net = densenet201()
elif args.net == 'googlenet':
from models.googlenet import googlenet
net = googlenet()
elif args.net == 'inceptionv3':
from models.inceptionv3 import inceptionv3
net = inceptionv3()
elif args.net == 'inceptionv4':
from models.inceptionv4 import inceptionv4
net = inceptionv4()
elif args.net == 'inceptionresnetv2':
from models.inceptionv4 import inception_resnet_v2
net = inception_resnet_v2()
elif args.net == 'xception':
from models.xception import xception
net = xception()
elif args.net == 'resnet18':
from models.resnet import resnet18
net = resnet18()
elif args.net == 'resnet34':
from models.resnet import resnet34
net = resnet34()
elif args.net == 'resnet50':
from models.resnet import resnet50
net = resnet50()
elif args.net == 'resnet101':
from models.resnet import resnet101
net = resnet101()
elif args.net == 'resnet152':
from models.resnet import resnet152
net = resnet152()
elif args.net == 'preactresnet18':
from models.preactresnet import preactresnet18
net = preactresnet18()
elif args.net == 'preactresnet34':
from models.preactresnet import preactresnet34
net = preactresnet34()
elif args.net == 'preactresnet50':
from models.preactresnet import preactresnet50
net = preactresnet50()
elif args.net == 'preactresnet101':
from models.preactresnet import preactresnet101
net = preactresnet101()
elif args.net == 'preactresnet152':
from models.preactresnet import preactresnet152
net = preactresnet152()
elif args.net == 'resnext50':
from models.resnext import resnext50
net = resnext50()
elif args.net == 'resnext101':
from models.resnext import resnext101
net = resnext101()
elif args.net == 'resnext152':
from models.resnext import resnext152
net = resnext152()
elif args.net == 'shufflenet':
from models.shufflenet import shufflenet
net = shufflenet()
elif args.net == 'shufflenetv2':
from models.shufflenetv2 import shufflenetv2
net = shufflenetv2()
elif args.net == 'squeezenet':
from models.squeezenet import squeezenet
net = squeezenet()
elif args.net == 'mobilenet':
from models.mobilenet import mobilenet
net = mobilenet()
elif args.net == 'mobilenetv2':
from models.mobilenetv2 import mobilenetv2
net = mobilenetv2()
elif args.net == 'nasnet':
from models.nasnet import nasnet
net = nasnet()
elif args.net == 'attention56':
from models.attention import attention56
net = attention56()
elif args.net == 'attention92':
from models.attention import attention92
net = attention92()
elif args.net == 'seresnet18':
from models.senet import seresnet18
net = seresnet18()
elif args.net == 'seresnet34':
from models.senet import seresnet34
net = seresnet34()
elif args.net == 'seresnet50':
from models.senet import seresnet50
net = seresnet50()
elif args.net == 'seresnet101':
from models.senet import seresnet101
net = seresnet101()
elif args.net == 'seresnet152':
from models.senet import seresnet152
net = seresnet152()
elif args.net == 'wideresnet':
from models.wideresidual import wideresnet
net = wideresnet()
elif args.net == 'stochasticdepth18':
from models.stochasticdepth import stochastic_depth_resnet18
net = stochastic_depth_resnet18()
elif args.net == 'stochasticdepth34':
from models.stochasticdepth import stochastic_depth_resnet34
net = stochastic_depth_resnet34()
elif args.net == 'stochasticdepth50':
from models.stochasticdepth import stochastic_depth_resnet50
net = stochastic_depth_resnet50()
elif args.net == 'stochasticdepth101':
from models.stochasticdepth import stochastic_depth_resnet101
net = stochastic_depth_resnet101()
elif args.net == 'mlp':
from models.mlp import mlp
net = mlp()
elif args.net == 'config_1':
from models.config_1 import config_1
net = config_1()
elif args.net == 'config_2':
from models.config_2 import config_2
net = config_2()
elif args.net == 'config_3':
from models.config_3 import config_3
net = config_3()
elif args.net == 'config_4':
from models.config_4 import config_4
net = config_4()
else:
print('the network name you have entered is not supported yet')
sys.exit()
if args.gpu: #use_gpu
net = net.cuda()
return net
def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
""" return training dataloader
Args:
mean: mean of cifar100 training dataset
std: std of cifar100 training dataset
path: path to cifar100 training python dataset
batch_size: dataloader batchsize
num_workers: dataloader num_works
shuffle: whether to shuffle
Returns: train_data_loader:torch dataloader object
"""
transform_train = transforms.Compose([
#transforms.ToPILImage(),
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(15),
transforms.ToTensor(),
transforms.Normalize(mean, std)
])
#cifar100_training = CIFAR100Train(path, transform=transform_train)
cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
cifar100_training_loader = DataLoader(
cifar100_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)
return cifar100_training_loader
def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
""" return training dataloader
Args:
mean: mean of cifar100 test dataset
std: std of cifar100 test dataset
path: path to cifar100 test python dataset
batch_size: dataloader batchsize
num_workers: dataloader num_works
shuffle: whether to shuffle
Returns: cifar100_test_loader:torch dataloader object
"""
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean, std)
])
#cifar100_test = CIFAR100Test(path, transform=transform_test)
cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
cifar100_test_loader = DataLoader(
cifar100_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)
return cifar100_test_loader
def compute_mean_std(cifar100_dataset):
"""compute the mean and std of cifar100 dataset
Args:
cifar100_training_dataset or cifar100_test_dataset
which derives from torch.utils.data.Dataset
Returns:
a tuple containing the mean and std values of the entire dataset
"""
data_r = numpy.dstack([cifar100_dataset[i][1][:, :, 0] for i in range(len(cifar100_dataset))])
data_g = numpy.dstack([cifar100_dataset[i][1][:, :, 1] for i in range(len(cifar100_dataset))])
data_b = numpy.dstack([cifar100_dataset[i][1][:, :, 2] for i in range(len(cifar100_dataset))])
mean = numpy.mean(data_r), numpy.mean(data_g), numpy.mean(data_b)
std = numpy.std(data_r), numpy.std(data_g), numpy.std(data_b)
return mean, std
class WarmUpLR(_LRScheduler):
"""warmup_training learning rate scheduler
Args:
optimizer: optimizer (e.g. SGD)
total_iters: total iters of the warmup phase
"""
def __init__(self, optimizer, total_iters, last_epoch=-1):
self.total_iters = total_iters
super().__init__(optimizer, last_epoch)
def get_lr(self):
"""we will use the first m batches, and set the learning
rate to base_lr * m / total_iters
"""
return [base_lr * self.last_epoch / (self.total_iters + 1e-8) for base_lr in self.base_lrs]
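# Usage sketch (added; the numbers are illustrative, not from the original code):
#   warmup = WarmUpLR(optimizer, total_iters=100)
#   # after 10 warmup steps, each param group's LR is roughly base_lr * 10 / 100
#   for _ in range(10):
#       warmup.step()
# The 1e-8 term in get_lr() only guards against division by zero when total_iters is 0.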
def most_recent_folder(net_weights, fmt):
"""
return most recent created folder under net_weights
if no non-empty folder is found, return an empty string
"""
# get subfolders in net_weights
folders = os.listdir(net_weights)
# filter out empty folders
folders = [f for f in folders if len(os.listdir(os.path.join(net_weights, f)))]
if len(folders) == 0:
return ''
# sort folders by folder created time
folders = sorted(folders, key=lambda f: datetime.datetime.strptime(f, fmt))
return folders[-1]
def most_recent_weights(weights_folder):
"""
return the most recently created weights file
if the folder is empty, return an empty string
"""
weight_files = os.listdir(weights_folder)
if len(weight_files) == 0:
return ''
regex_str = r'([A-Za-z0-9]+)-([0-9]+)-(regular|best)'
# sort files by epoch
weight_files = sorted(weight_files, key=lambda w: int(re.search(regex_str, w).groups()[1]))
return weight_files[-1]
def last_epoch(weights_folder):
weight_file = most_recent_weights(weights_folder)
if not weight_file:
raise Exception('no recent weights were found')
resume_epoch = int(weight_file.split('-')[1])
return resume_epoch
def best_acc_weights(weights_folder):
"""
return the best acc .pth file in the given folder; if no
best acc weights file is found, return an empty string
"""
files = os.listdir(weights_folder)
if len(files) == 0:
return ''
regex_str = r'([A-Za-z0-9]+)-([0-9]+)-(regular|best)'
best_files = [w for w in files if re.search(regex_str, w).groups()[2] == 'best']
if len(best_files) == 0:
return ''
best_files = sorted(best_files, key=lambda w: int(re.search(regex_str, w).groups()[1]))
return best_files[-1]
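# Example (added, illustrative filename): a checkpoint saved by train.py as
# 'config_1-30-best.pth' is matched by regex_str with epoch group '30' and type group
# 'best', so best_acc_weights() returns the best-accuracy file with the largest epoch.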
def get_train_val_test_dataloader(dataset_dir="dataset/dataset0",
batch_size=16, num_workers=2, shuffle=True):
# OSError: [Errno 24] Too many open files:
# _data = numpy.dstack([dataset[i][0] for i in range(len(dataset))])
# mean = numpy.mean(_data)
# std = numpy.std(_data)
_transform = transforms.Compose([
transforms.Resize(180),
transforms.ToTensor(),
transforms.Normalize(0.7130, 0.2601214961302188)
])
# dataset.transform = _transform
dataset = CADataset(dataset_dir=dataset_dir, transform=_transform)
data_num = len(dataset)
train_num = round(data_num * 0.6)
val_num = round(data_num * 0.2)
test_num = data_num - train_num - val_num
train_data, val_data, test_data = torch.utils.data.random_split(dataset,
[train_num, val_num, test_num])
train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
shuffle=shuffle, num_workers=num_workers)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size,
shuffle=shuffle, num_workers=num_workers)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size,
shuffle=shuffle, num_workers=num_workers)
return train_loader, val_loader, test_loader
def get_my_loader(dataset_dir, mean=2.086080, std=2.618769, batch_size=16, num_workers=2, shuffle=True):
# only load rudy tensor
transform = transforms.Normalize(mean, std)
dataset = CADataset2(dataset_dir, transform=transform)
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
return loader
def get_my_loader3(dataset_dir, batch_size=16, num_workers=2, shuffle=True):
transform_rudy = transforms.Normalize(mean=2.087080, std=2.618769)
transform_img = transforms.Compose([
transforms.Resize(128),
transforms.ToTensor(),
transforms.Normalize(mean=0.8070, std=0.2214)
])
dataset = CADataset3(dataset_dir, transform_rudy=transform_rudy, transform_img=transform_img)
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
return loader
def get_my_loader4(dataset_dir, batch_size=16, num_workers=2, shuffle=True):
transform = transforms.Compose([
transforms.Resize(128),
transforms.ToTensor(),
transforms.Normalize(mean=0.8070, std=0.2214)
])
dataset = CADataset4(dataset_dir, transform=transform)
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
return loader
def get_my_loader5(dataset_dir, batch_size=16, num_workers=2, shuffle=True):
transform = transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),
transforms.Normalize(mean=0.8070, std=0.2214)
])
dataset = CADataset4(dataset_dir, transform=transform)
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
return loader
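# inceptionv4.py (inferred: the Inception-v4 backbone imported by the wafer training scripts below)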
# -*- coding: UTF-8 -*-
""" inceptionv4 in pytorch
[1] Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning
https://arxiv.org/abs/1602.07261
"""
import torch
import torch.nn as nn
import pdb
class BasicConv2d(nn.Module):
def __init__(self, input_channels, output_channels, **kwargs):
super().__init__()
self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs)
self.bn = nn.BatchNorm2d(output_channels)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x
class Inception_Stem(nn.Module):
#"""Figure 3. The schema for stem of the pure Inception-v4 and
#Inception-ResNet-v2 networks. This is the input part of those
#networks."""
def __init__(self, input_channels):
super().__init__()
self.conv1 = nn.Sequential(
BasicConv2d(input_channels, 32, kernel_size=3, stride=2, padding=0),
BasicConv2d(32, 32, kernel_size=3, padding=0),
BasicConv2d(32, 64, kernel_size=3, padding=1)
)
self.branch3x3_conv = BasicConv2d(64, 96, kernel_size=3, stride=2, padding=0)
self.branch3x3_pool = nn.MaxPool2d(3, stride=2, padding=0)
self.branch7x7a = nn.Sequential(
BasicConv2d(160, 64, kernel_size=1),
BasicConv2d(64, 64, kernel_size=(7, 1), padding=(3, 0)),
BasicConv2d(64, 64, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(64, 96, kernel_size=3, padding=0)
)
self.branch7x7b = nn.Sequential(
BasicConv2d(160, 64, kernel_size=3, padding=1),
BasicConv2d(64, 96, kernel_size=3, padding=0)
)
self.branchpoola = nn.MaxPool2d(kernel_size=3, stride=1, padding=0)
self.branchpoolb = BasicConv2d(192, 192, kernel_size=3, stride=1, padding=0)
def forward(self, x):
x = self.conv1(x)
x = [
self.branch3x3_conv(x),
self.branch3x3_pool(x)
]
x = torch.cat(x, 1)
x = [
self.branch7x7a(x),
self.branch7x7b(x)
]
x = torch.cat(x, 1)
x = [
self.branchpoola(x),
self.branchpoolb(x)
]
x = torch.cat(x, 1)
return x
class InceptionA(nn.Module):
#"""Figure 4. The schema for 35 × 35 grid modules of the pure
#Inception-v4 network. This is the Inception-A block of Figure 9."""
def __init__(self, input_channels):
super().__init__()
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, 64, kernel_size=1),
BasicConv2d(64, 96, kernel_size=3, padding=1),
BasicConv2d(96, 96, kernel_size=3, padding=1)
)
self.branch3x3 = nn.Sequential(
BasicConv2d(input_channels, 64, kernel_size=1),
BasicConv2d(64, 96, kernel_size=3, padding=1)
)
self.branch1x1 = BasicConv2d(input_channels, 96, kernel_size=1)
self.branchpool = nn.Sequential(
nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
BasicConv2d(input_channels, 96, kernel_size=1)
)
def forward(self, x):
x = [
self.branch3x3stack(x),
self.branch3x3(x),
self.branch1x1(x),
self.branchpool(x)
]
return torch.cat(x, 1)
class ReductionA(nn.Module):
#"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module.
#Different variants of this blocks (with various number of filters)
#are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1,
#-ResNet-v2) variants presented in this paper. The k, l, m, n numbers
#represent filter bank sizes which can be looked up in Table 1.
def __init__(self, input_channels, k, l, m, n):
super().__init__()
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, k, kernel_size=1),
BasicConv2d(k, l, kernel_size=3, padding=1),
BasicConv2d(l, m, kernel_size=3, stride=2)
)
self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2)
self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2)
self.output_channels = input_channels + n + m
def forward(self, x):
x = [
self.branch3x3stack(x),
self.branch3x3(x),
self.branchpool(x)
]
return torch.cat(x, 1)
class InceptionB(nn.Module):
#"""Figure 5. The schema for 17 × 17 grid modules of the pure Inception-v4 network.
#This is the Inception-B block of Figure 9."""
def __init__(self, input_channels):
super().__init__()
self.branch7x7stack = nn.Sequential(
BasicConv2d(input_channels, 192, kernel_size=1),
BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(192, 224, kernel_size=(7, 1), padding=(3, 0)),
BasicConv2d(224, 224, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0))
)
self.branch7x7 = nn.Sequential(
BasicConv2d(input_channels, 192, kernel_size=1),
BasicConv2d(192, 224, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(224, 256, kernel_size=(7, 1), padding=(3, 0))
)
self.branch1x1 = BasicConv2d(input_channels, 384, kernel_size=1)
self.branchpool = nn.Sequential(
nn.AvgPool2d(3, stride=1, padding=1),
BasicConv2d(input_channels, 128, kernel_size=1)
)
def forward(self, x):
x = [
self.branch1x1(x),
self.branch7x7(x),
self.branch7x7stack(x),
self.branchpool(x)
]
return torch.cat(x, 1)
class ReductionB(nn.Module):
#"""Figure 8. The schema for 17 × 17 to 8 × 8 grid-reduction mod- ule.
#This is the reduction module used by the pure Inception-v4 network in
#Figure 9."""
def __init__(self, input_channels):
super().__init__()
self.branch7x7 = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1),
BasicConv2d(256, 256, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(256, 320, kernel_size=(7, 1), padding=(3, 0)),
BasicConv2d(320, 320, kernel_size=3, stride=2, padding=1)
)
self.branch3x3 = nn.Sequential(
BasicConv2d(input_channels, 192, kernel_size=1),
BasicConv2d(192, 192, kernel_size=3, stride=2, padding=1)
)
self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
def forward(self, x):
x = [
self.branch3x3(x),
self.branch7x7(x),
self.branchpool(x)
]
return torch.cat(x, 1)
class InceptionC(nn.Module):
def __init__(self, input_channels):
#"""Figure 6. The schema for 8×8 grid modules of the pure
#Inceptionv4 network. This is the Inception-C block of Figure 9."""
super().__init__()
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, 384, kernel_size=1),
BasicConv2d(384, 448, kernel_size=(1, 3), padding=(0, 1)),
BasicConv2d(448, 512, kernel_size=(3, 1), padding=(1, 0)),
)
self.branch3x3stacka = BasicConv2d(512, 256, kernel_size=(1, 3), padding=(0, 1))
self.branch3x3stackb = BasicConv2d(512, 256, kernel_size=(3, 1), padding=(1, 0))
self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=1)
self.branch3x3a = BasicConv2d(384, 256, kernel_size=(3, 1), padding=(1, 0))
self.branch3x3b = BasicConv2d(384, 256, kernel_size=(1, 3), padding=(0, 1))
self.branch1x1 = BasicConv2d(input_channels, 256, kernel_size=1)
self.branchpool = nn.Sequential(
nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
BasicConv2d(input_channels, 256, kernel_size=1)
)
def forward(self, x):
branch3x3stack_output = self.branch3x3stack(x)
branch3x3stack_output = [
self.branch3x3stacka(branch3x3stack_output),
self.branch3x3stackb(branch3x3stack_output)
]
branch3x3stack_output = torch.cat(branch3x3stack_output, 1)
branch3x3_output = self.branch3x3(x)
branch3x3_output = [
self.branch3x3a(branch3x3_output),
self.branch3x3b(branch3x3_output)
]
branch3x3_output = torch.cat(branch3x3_output, 1)
branch1x1_output = self.branch1x1(x)
branchpool = self.branchpool(x)
output = [
branch1x1_output,
branch3x3_output,
branch3x3stack_output,
branchpool
]
return torch.cat(output, 1)
class InceptionV4(nn.Module):
def __init__(self, A, B, C, k=192, l=224, m=256, n=384, class_nums=9):
super().__init__()
self.stem = Inception_Stem(1)
self.inception_a = self._generate_inception_module(384, 384, A, InceptionA)
self.reduction_a = ReductionA(384, k, l, m, n)
output_channels = self.reduction_a.output_channels
self.inception_b = self._generate_inception_module(output_channels, 1024, B, InceptionB)
self.reduction_b = ReductionB(1024)
self.inception_c = self._generate_inception_module(1536, 1536, C, InceptionC)
self.avgpool = nn.AvgPool2d(8)
#"""Dropout (keep 0.8)"""
self.dropout = nn.Dropout2d(1 - 0.8)
self.linear = nn.Linear(1536, class_nums)
def forward(self, x):
# x is of shape [#batch_size, 1, 256, 256]
x = self.stem(x)
x = self.inception_a(x)
x = self.reduction_a(x)
x = self.inception_b(x)
x = self.reduction_b(x)
x = self.inception_c(x)
x = self.avgpool(x)
x = self.dropout(x)
x = x.view(-1, 1536)
# we only need the embedding
# x = self.linear(x)
return x
@staticmethod
def _generate_inception_module(input_channels, output_channels, block_num, block):
layers = nn.Sequential()
for l in range(block_num):
layers.add_module("{}_{}".format(block.__name__, l), block(input_channels))
input_channels = output_channels
return layers
class InceptionResNetA(nn.Module):
#"""Figure 16. The schema for 35 × 35 grid (Inception-ResNet-A)
#module of the Inception-ResNet-v2 network."""
def __init__(self, input_channels):
super().__init__()
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, 32, kernel_size=1),
BasicConv2d(32, 48, kernel_size=3, padding=1),
BasicConv2d(48, 64, kernel_size=3, padding=1)
)
self.branch3x3 = nn.Sequential(
BasicConv2d(input_channels, 32, kernel_size=1),
BasicConv2d(32, 32, kernel_size=3, padding=1)
)
self.branch1x1 = BasicConv2d(input_channels, 32, kernel_size=1)
self.reduction1x1 = nn.Conv2d(128, 384, kernel_size=1)
self.shortcut = nn.Conv2d(input_channels, 384, kernel_size=1)
self.bn = nn.BatchNorm2d(384)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = [
self.branch1x1(x),
self.branch3x3(x),
self.branch3x3stack(x)
]
residual = torch.cat(residual, 1)
residual = self.reduction1x1(residual)
shortcut = self.shortcut(x)
output = self.bn(shortcut + residual)
output = self.relu(output)
return output
class InceptionResNetB(nn.Module):
#"""Figure 17. The schema for 17 × 17 grid (Inception-ResNet-B) module of
#the Inception-ResNet-v2 network."""
def __init__(self, input_channels):
super().__init__()
self.branch7x7 = nn.Sequential(
BasicConv2d(input_channels, 128, kernel_size=1),
BasicConv2d(128, 160, kernel_size=(1, 7), padding=(0, 3)),
BasicConv2d(160, 192, kernel_size=(7, 1), padding=(3, 0))
)
self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1)
self.reduction1x1 = nn.Conv2d(384, 1154, kernel_size=1)
self.shortcut = nn.Conv2d(input_channels, 1154, kernel_size=1)
self.bn = nn.BatchNorm2d(1154)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = [
self.branch1x1(x),
self.branch7x7(x)
]
residual = torch.cat(residual, 1)
#"""In general we picked some scaling factors between 0.1 and 0.3 to scale the residuals
#before their being added to the accumulated layer activations (cf. Figure 20)."""
residual = self.reduction1x1(residual) * 0.1
shortcut = self.shortcut(x)
output = self.bn(residual + shortcut)
output = self.relu(output)
return output
class InceptionResNetC(nn.Module):
def __init__(self, input_channels):
#Figure 19. The schema for 8×8 grid (Inception-ResNet-C)
#module of the Inception-ResNet-v2 network."""
super().__init__()
self.branch3x3 = nn.Sequential(
BasicConv2d(input_channels, 192, kernel_size=1),
BasicConv2d(192, 224, kernel_size=(1, 3), padding=(0, 1)),
BasicConv2d(224, 256, kernel_size=(3, 1), padding=(1, 0))
)
self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1)
self.reduction1x1 = nn.Conv2d(448, 2048, kernel_size=1)
self.shorcut = nn.Conv2d(input_channels, 2048, kernel_size=1)
self.bn = nn.BatchNorm2d(2048)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
residual = [
self.branch1x1(x),
self.branch3x3(x)
]
residual = torch.cat(residual, 1)
residual = self.reduction1x1(residual) * 0.1
shorcut = self.shorcut(x)
output = self.bn(shorcut + residual)
output = self.relu(output)
return output
class InceptionResNetReductionA(nn.Module):
#"""Figure 7. The schema for 35 × 35 to 17 × 17 reduction module.
#Different variants of this blocks (with various number of filters)
#are used in Figure 9, and 15 in each of the new Inception(-v4, - ResNet-v1,
#-ResNet-v2) variants presented in this paper. The k, l, m, n numbers
#represent filter bank sizes which can be looked up in Table 1.
def __init__(self, input_channels, k, l, m, n):
super().__init__()
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, k, kernel_size=1),
BasicConv2d(k, l, kernel_size=3, padding=1),
BasicConv2d(l, m, kernel_size=3, stride=2)
)
self.branch3x3 = BasicConv2d(input_channels, n, kernel_size=3, stride=2)
self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2)
self.output_channels = input_channels + n + m
def forward(self, x):
x = [
self.branch3x3stack(x),
self.branch3x3(x),
self.branchpool(x)
]
return torch.cat(x, 1)
class InceptionResNetReductionB(nn.Module):
#"""Figure 18. The schema for 17 × 17 to 8 × 8 grid-reduction module.
#Reduction-B module used by the wider Inception-ResNet-v1 network in
#Figure 15."""
#I believe it was a typo(Inception-ResNet-v1 should be Inception-ResNet-v2)
def __init__(self, input_channels):
super().__init__()
self.branchpool = nn.MaxPool2d(3, stride=2)
self.branch3x3a = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1),
BasicConv2d(256, 384, kernel_size=3, stride=2)
)
self.branch3x3b = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1),
BasicConv2d(256, 288, kernel_size=3, stride=2)
)
self.branch3x3stack = nn.Sequential(
BasicConv2d(input_channels, 256, kernel_size=1),
BasicConv2d(256, 288, kernel_size=3, padding=1),
BasicConv2d(288, 320, kernel_size=3, stride=2)
)
def forward(self, x):
x = [
self.branch3x3a(x),
self.branch3x3b(x),
self.branch3x3stack(x),
self.branchpool(x)
]
x = torch.cat(x, 1)
return x
class InceptionResNetV2(nn.Module):
def __init__(self, A, B, C, k=256, l=256, m=384, n=384, class_nums=100):
super().__init__()
self.stem = Inception_Stem(3)
self.inception_resnet_a = self._generate_inception_module(384, 384, A, InceptionResNetA)
self.reduction_a = InceptionResNetReductionA(384, k, l, m, n)
output_channels = self.reduction_a.output_channels
self.inception_resnet_b = self._generate_inception_module(output_channels, 1154, B, InceptionResNetB)
self.reduction_b = InceptionResNetReductionB(1154)
self.inception_resnet_c = self._generate_inception_module(2146, 2048, C, InceptionResNetC)
#6x6 featuresize
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
#"""Dropout (keep 0.8)"""
self.dropout = nn.Dropout2d(1 - 0.8)
self.linear = nn.Linear(2048, class_nums)
def forward(self, x):
x = self.stem(x)
x = self.inception_resnet_a(x)
x = self.reduction_a(x)
x = self.inception_resnet_b(x)
x = self.reduction_b(x)
x = self.inception_resnet_c(x)
x = self.avgpool(x)
x = self.dropout(x)
x = x.view(-1, 2048)
x = self.linear(x)
return x
@staticmethod
def _generate_inception_module(input_channels, output_channels, block_num, block):
layers = nn.Sequential()
for l in range(block_num):
layers.add_module("{}_{}".format(block.__name__, l), block(input_channels))
input_channels = output_channels
return layers
def inceptionv4():
# return InceptionV4(4, 7, 3)
return InceptionV4(4, 7, 3, class_nums=9)
def inception_resnet_v2():
return InceptionResNetV2(5, 10, 5)
"""
@brief: the network structure, and the backbone is inception-v4
I copied the inception-v4 code from baiyu's work
@author: Zhengyang Lyu
@date: 2022.8.23
"""
from utils import *
from model.inceptionv4 import inceptionv4
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
imgs_nonlabel = iter_nonlabel.next()
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
# --- following is for test ---
loss = loss_fs + ALPHA * loss_ssl
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # don't know what it means...
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which we run the script
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the weight for the norm of the NN params added to loss, L2 penalty
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = loader_nonlabel._get_iterator()
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
from utils import *
from model.inceptionv4 import inceptionv4
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
"""
imgs_nonlabel = iter_nonlabel.next()
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
"""
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
# --- following is for test ---
# loss = loss_fs + ALPHA * loss_ssl
loss = loss_fs
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # don't know what it means...
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which we run the script
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the weight for the norm of the NN params added to loss, L2 penalty
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, '1', TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = loader_nonlabel._get_iterator()
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
from utils import *
from model.inceptionv4 import inceptionv4
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
imgs_nonlabel = iter_nonlabel.next()
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
"""
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
"""
# --- following is for test ---
# loss = loss_fs + ALPHA * loss_ssl
loss = loss_ssl
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # don't know what it means...
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which we run the script
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the weight for the norm of the NN params added to loss, L2 penalty
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, '2', TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = loader_nonlabel._get_iterator()
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
"""
@brief: build distributed training on multi-GPU
@author: Zhengyang Lyu
@date 2022.8.26
@note: https://github.com/tczhangzhi/pytorch-distributed
"""
# use torch.distributed
# n processes for n GPUs
# To launch, type the following command in terminal:
# `CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py`
import argparse
import os
import random
import shutil
import time
import warnings
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
from model.inceptionv4 import inceptionv4
from utils import get_dataset_list_withlabel
# --- Args ---
parser = argparse.ArgumentParser(description='PyTorch Distributed Training')
parser.add_argument('--data',
metavar='DIR',
default='/lustre/S/lvzhengyang/wafer_failure/dataset/LSWMD.pkl',
help='path to dataset')
parser.add_argument('-j',
'--workers',
default=4,
type=int,
metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs',
default=90,
type=int,
metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch',
default=0,
type=int,
metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b',
'--batch-size',
default=128,
type=int,
metavar='N',
help='mini-batch size (default: 128), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr',
'--learning-rate',
default=0.1,
type=float,
metavar='LR',
help='initial learning rate',
dest='lr')
parser.add_argument('--momentum',
default=0.9,
type=float,
metavar='M',
help='momentum')
parser.add_argument('--local_rank',
default=-1,
type=int,
help='node rank for distributed training')
parser.add_argument('--wd',
'--weight-decay',
default=1e-4,
type=float,
metavar='W',
help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('-p',
'--print-freq',
default=10,
type=int,
metavar='N',
help='print frequency (default: 10)')
parser.add_argument('-e',
'--evaluate',
dest='evaluate',
action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained',
dest='pretrained',
action='store_true',
help='use pre-trained model')
parser.add_argument('--seed',
default=None,
type=int,
help='seed for initializing training. ')
def reduce_mean(tensor, nprocs):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= nprocs
return rt
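# Example (added): with 4 processes each holding a scalar loss of 1.0, 2.0, 3.0 and 4.0,
# reduce_mean() all-reduces the sum (10.0) and divides by nprocs, so every rank ends up
# with the average value 2.5.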
def main():
args = parser.parse_args()
args.nprocs = torch.cuda.device_count()
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
cudnn.deterministic = True
warnings.warn('You have chosen to seed training. '
'This will turn on the CUDNN deterministic setting, '
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.')
main_worker(args.local_rank, args.nprocs, args)
def main_worker(local_rank, nprocs, args):
best_acc1 = .0
dist.init_process_group(backend='nccl')
# create model
if args.pretrained:
raise NotImplementedError
else:
print("=> creating model '{}'".format('Inception-V4'))
model = inceptionv4()
torch.cuda.set_device(local_rank)
model.cuda(local_rank)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
args.batch_size = int(args.batch_size / nprocs)
model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[local_rank])
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(local_rank)
"""
optimizer = torch.optim.SGD(model.parameters(),
args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
"""
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
cudnn.benchmark = True
# Data loading code
traindir = os.path.join(args.data, 'train')
train_dataset_list = get_dataset_list_withlabel
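# NOTE (added): the data-loading and evaluation code below is still being adapted from the
# reference repo linked above; `train_dataset`, `valdir`, `normalize`, `datasets`,
# `validate`, `adjust_learning_rate` and `save_checkpoint` are not defined in this file yet,
# and get_dataset_list_withlabel is referenced here without being called.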
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=args.batch_size,
num_workers=2,
pin_memory=True,
sampler=train_sampler)
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]))
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset,
batch_size=args.batch_size,
num_workers=2,
pin_memory=True,
sampler=val_sampler)
if args.evaluate:
validate(val_loader, model, criterion, local_rank, args)
return
for epoch in range(args.start_epoch, args.epochs):
train_sampler.set_epoch(epoch)
val_sampler.set_epoch(epoch)
adjust_learning_rate(optimizer, epoch, args)
# train for one epoch
train(train_loader, model, criterion, optimizer, epoch, local_rank,
args)
# evaluate on validation set
acc1 = validate(val_loader, model, criterion, local_rank, args)
# remember best acc@1 and save checkpoint
is_best = acc1 > best_acc1
best_acc1 = max(acc1, best_acc1)
if args.local_rank == 0:
save_checkpoint(
{
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.module.state_dict(),
'best_acc1': best_acc1,
}, is_best)
"""
@brief: utils
@author: Zhengyang Lyu
@date: 2022.8.23
"""
import pandas as pd
import numpy as np
import pdb
import torch
import sys
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import torchvision.transforms.functional as trans_fn
from torchvision.transforms.functional import InterpolationMode
from torch.utils.data import DataLoader
from typing import Union
from torch import linalg as LA
import torch.nn.functional as fn
from torch.optim.lr_scheduler import _LRScheduler
import random
import copy
import torch.nn as nn
#TODO: replace pandas by cuDF, which can be executed on GPU
class LSWMD_Dataset_withlabel(Dataset):
def __init__(self, df: pd.core.frame.DataFrame) -> None:
super().__init__()
df = df.reset_index()
self.df = df
self.t = transforms.Compose([
transforms.ToTensor(),
transforms.Resize([256, 256], interpolation=InterpolationMode.NEAREST),
])
def __len__(self):
return self.df.shape[0]
def __getitem__(self, index):
label = self.df['failureNum'][index]
img = self.df['waferMap'][index]
# img only has 3 values: 0, 1, 2
# scale them to 0, 127, 255
img = np.floor(img / 2 * 255)
# add transforms according to ICCAD 2021 paper:
# "When Wafer Failure Pattern Classification Meets Few-shot Learning and Self-Supervised Learning"
img = self.t(img)
return label, img
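# Worked example of the scaling above (added): np.floor(0/2*255) = 0, np.floor(1/2*255) = 127,
# np.floor(2/2*255) = 255, so the three wafer-map values are spread across the 0-255 range.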
class LSWMD_Dataset_nonlabel(Dataset):
def __init__(self, df: pd.core.frame.DataFrame) -> None:
super().__init__()
df = df.reset_index()
self.df = df
self.t = transforms.Compose([
transforms.ToTensor(),
transforms.Resize([256, 256], interpolation=InterpolationMode.NEAREST),
])
def __len__(self):
return self.df.shape[0]
def __getitem__(self, index):
img = self.df['waferMap'][index]
# img only has 3 values: 0, 1, 2
# scale them to 0, 127, 255
img = np.floor(img / 2 * 255)
# add transforms according to ICCAD 2021 paper:
# "When Wafer Failure Pattern Classification Meets Few-shot Learning and Self-Supervised Learning"
img = self.t(img)
return img
def get_loader(dataset: Union[LSWMD_Dataset_withlabel, LSWMD_Dataset_nonlabel],
batch_size=8, num_workers=4, shuffle=True):
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
return loader
def read_pkl(path='./dataset/LSWMD.pkl'):
# reference: https://www.kaggle.com/code/ashishpatel26/wm-811k-wafermap
df = pd.read_pickle(path)
# --- drop some non-needed attributes ---
# don't need the 'waferIndex' feature, which indicates where the data came from
df = df.drop(['waferIndex'], axis = 1)
# don't need the 'trianTestLabel' column; we split the train and test sets manually
# (note that 'train' is misspelled in the original column name)
df = df.drop(['trianTestLabel'], axis = 1)
df = df.drop(['lotName'], axis = 1)
df = df.drop(['dieSize'], axis = 1)
# encode the mapping type
mapping_type={'Center':0,'Donut':1,'Edge-Loc':2,'Edge-Ring':3,'Loc':4,'Random':5,'Scratch':6,'Near-full':7,'none':8}
df['failureNum']=df.failureType
df = df.replace({'failureNum':mapping_type})
# this dataset contains data with labels and without labels
df_withlabel = df[(df['failureNum']>=0) & (df['failureNum']<=8)]
df_withlabel =df_withlabel.reset_index()
df_nonlabel = df[((df['failureNum']>=0) & (df['failureNum']<=8)) == False]
df_nonlabel =df_nonlabel.reset_index()
# among the data with labels, some of them are labeled as 'none'
df_withpattern = df[(df['failureNum']>=0) & (df['failureNum']<=7)]
df_withpattern = df_withpattern.reset_index()
df_nonpattern = df[(df['failureNum']==8)]
df_nonpattern = df_nonpattern.reset_index()
# a brief overview of this dataset
# df_withlabel.shape[0], df_withpattern.shape[0], df_nonpattern.shape[0]
# (172950, 25519, 147431)
# the die size for each instance is different, varying from (300, 202) to (6, 21)
# need to make input the same dim
# df
# /\
# df_withlabel df_nonlabel
# /\
# df_withpattern df_nonpattern (defect-free)
return df_withlabel, df_nonlabel, df_withpattern, df_nonpattern
def extract_exact_label(df: pd.core.frame.DataFrame, label: int):
df_label = df[(df['failureNum'] == label)]
return df_label
def get_df_of_label(df: pd.core.frame.DataFrame):
# label: 0, 1, ..., 8
df_list = []
for label in range(9):
df_list.append(extract_exact_label(df, label))
return df_list
def get_dataset_list_withlabel(df_list):
dataset_list = []
for df in df_list:
dataset_list.append(LSWMD_Dataset_withlabel(df))
return dataset_list
def get_loader_list(dataset_list):
loader_list = []
for dataset in dataset_list:
loader_list.append(get_loader(dataset))
return loader_list
def get_iter_list(loader_list):
iter_list = []
for loader in loader_list:
iter_list.append(loader._get_iterator())
return iter_list
def get_few_shot_batch(iter_list):
"""
@brief make a batch for few-shot learning
@param iter_list: iterator of each class
@return list of label batch: [(label batch of type i) for i in range(types)]
list of img batch: [(img batch of type i) for i in range(types)]
"""
labels_list = []
imgs_list = []
for _iter in iter_list:
try:
labels, imgs = _iter.next()
labels_list.append(labels)
imgs_list.append(imgs)
except StopIteration:
return [], [], -1
return labels_list, imgs_list, 0
# Consider using df.sample directly instead of a DataLoader?
# Done by the DataLoader for now
def few_shot_loss(imgs_list, labels_list, net):
"""
@brief Compute loss for the few-shot learner
@params labels_list: a list of batches from the loaders
@params imgs_list: a list of batches from the loaders
@params net: backbone_net, the inception-v4 net
@return loss of the few-shot learner
"""
batch_size = labels_list[0].shape[-1]
num_types = len(labels_list)
# labels = torch.stack(labels_list, dim=0).cuda()
""" imgs
[
[type 0 img]
...
[type 0 img]
[type 1 img]
...
[type 1 img]
...
[type k img]
...
[type k img]
]
"""
imgs = torch.cat(imgs_list, dim=0).cuda().float()
""" outputs
[
output embedding for type 0
...
output embedding for type 0
output embedding for type 1
...
output embedding for type 1
...
output embedding for type k
...
output embedding for type k
]
"""
outputs = net(imgs) # shape [#batch_size * #type_num, #output_dim]
""" reshape outputs as:
[
[
output embedding for type 0
...
output embedding for type 0
]
[
output embedding for type 1
...
output embedding for type 1
]
...
[
output embedding for type k
...
output embedding for type k
]
]
"""
outputs = outputs.view(num_types, batch_size, -1) # [#num_types, #batch_size, #dim]
prototypes = torch.sum(outputs, dim=1) / batch_size # [#num_types, #dim]
# transpose for better understanding
outputs.transpose_(1, 0) # [#batch_size, #num_types, #dim]
""" the tensor subtraction rule in torch, an example:
shape (#batch, #num_types, 1, #dims) - shape (1, #num_types, #dims)
= shape (#batch, #num_types, #num_types, #dims)
"""
distances = torch.sub(outputs.unsqueeze(2), prototypes.unsqueeze(0))
distances = LA.norm(distances, dim=-1).pow(2) # [#batch_size, #num_types, #num_types]
distances = -1.0 * fn.log_softmax(-1.0 * distances, dim=-1)
# now, distances[k][i, j] means: in the k-th data in the batch, the distance
# between the embeddings labeled i and the prototype j
diag_elem_per_batch = torch.diagonal(distances, dim1=1, dim2=2) # [#batch_size, #num_types]
# regularizer is implemented by PyTorch Optimizers
loss = diag_elem_per_batch.sum()
return loss
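# Shape sketch for few_shot_loss (added; the numbers assume the defaults in this file:
# 9 classes, batch_size 8 per class and the 1536-dim InceptionV4 embedding):
#   imgs      -> [72, 1, 256, 256]   (9 classes * 8 images each)
#   outputs   -> [9, 8, 1536] after view, prototypes -> [9, 1536]
#   distances -> [8, 9, 9]; the diagonal picks the distance between each embedding and the
#   prototype of its own class, which is what the loss sums.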
def self_supervised_loss(imgs, net):
"""
@brief compute the loss for self-supervised learner
@param imgs: the batch of imgs of shape [#batch_size, 1, H, W]
@param net: the backbone net
@return the loss of the self-supervised learner
"""
# the data augmentation module
# rotation, top-bottom, left-right flipping
aug_trans_list = [
trans_fn.vflip,
trans_fn.hflip,
transforms.RandomRotation(degrees=(0, 360)),
]
aug_trans = random.sample(aug_trans_list, 2)
imgs = imgs.cuda().float()
imgs_0 = aug_trans[0](imgs)
imgs_1 = aug_trans[1](imgs)
embed_0 = net(imgs_0)
# use the embed_1 as label
with torch.no_grad():
embed_1 = net(imgs_1)
# try to use MSE directly
loss = nn.functional.mse_loss(embed_0, embed_1)
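# (added) embed_1 is computed under torch.no_grad(), so it acts as a fixed target and
# gradients from the MSE loss only flow back through the embed_0 branch of the network.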
"""
# loss function
# build correlation matrix in shape [1536, 1536]
embed_0.unsqueeze_(-1)
embed_1.unsqueeze_(1)
dividend = torch.sum(torch.mul(embed_0, embed_1), dim=0)
"""
"""
divisor = torch.mul(
embed_0.pow(2).sum(dim=0).sqrt(),
embed_1.pow(2).sum(dim=0).sqrt()
)
"""
"""
divisor = torch.mul(
torch.norm(embed_0, dim=0),
torch.norm(embed_1, dim=0)
)
EPSILON = 1e-8 # in case of divided by 0
cor_mat = torch.div(dividend, divisor.add_(EPSILON))
LAMBDA = 1e-4
# loss = cor_mat.sub(1.0).pow(2).trace()
loss = (torch.diagonal(cor_mat) - 1.0).pow(2).sum()
cor_mat.pow_(2)
loss += LAMBDA * (cor_mat.sum() - cor_mat.trace())
"""
return loss
class WarmUpLR(_LRScheduler):
"""warmup_training learning rate scheduler
Args:
optimizer: optimizer (e.g. SGD)
total_iters: total iters of the warmup phase
"""
def __init__(self, optimizer, total_iters, last_epoch=-1):
self.total_iters = total_iters
super().__init__(optimizer, last_epoch)
def get_lr(self):
"""we will use the first m batches, and set the learning
rate to base_lr * m / total_iters
"""
return [base_lr * self.last_epoch / (self.total_iters + 1e-8) for base_lr in self.base_lrs]
def get_iter_per_epoch(loader_list):
iter_num = sys.maxsize
for loader in loader_list:
num_batch = len(loader)
if num_batch < iter_num:
iter_num = num_batch
return iter_num
if __name__ == '__main__':
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
dataset = LSWMD_Dataset_withlabel(df_withlabel)
loader_withlabel = get_loader(dataset)
pdb.set_trace()