Commit ee8f53f5 by lvzhengyang

Initial commit

# train.py
#!/usr/bin/env python3
""" train network using pytorch
author baiyu
"""
import os
import sys
import argparse
import time
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from conf import settings
from utils import get_network, get_my_loader3, WarmUpLR, \
most_recent_folder, most_recent_weights, last_epoch, best_acc_weights
import pdb
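# labels are binarized against this threshold in train()/eval_training(): values above it map to class 1, the rest to class 0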
LABEL_THRESHOLD = 0.05
def train(epoch):
start = time.time()
net.train(True)
for batch_index, (imgs, rudys, labels) in enumerate(train_loader):
if args.gpu:
labels = labels.cuda()
rudys = rudys.cuda()
imgs = imgs.cuda()
optimizer.zero_grad()
outputs = net(rudys, imgs)
labels = labels.squeeze()
labels = torch.where(labels > LABEL_THRESHOLD, 1, 0)
loss = loss_function(outputs, labels)
loss.backward()
optimizer.step()
n_iter = (epoch - 1) * len(train_loader) + batch_index + 1
last_layer = list(net.children())[-1]
for name, para in last_layer.named_parameters():
if 'weight' in name:
writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter)
if 'bias' in name:
writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter)
print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
trained_samples=batch_index * args.b + len(rudys),
total_samples=len(train_loader.dataset)
))
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= args.warm:
warmup_scheduler.step()
for name, param in net.named_parameters():
layer, attr = os.path.splitext(name)
attr = attr[1:]
writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
finish = time.time()
print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))
@torch.no_grad()
def eval_training(epoch=0, tb=True):
start = time.time()
net.eval()
test_loss = 0.0 # cost function error
correct = 0.0
for (imgs, rudys, labels) in test_loader:
if args.gpu:
rudys = rudys.cuda()
imgs = imgs.cuda()
labels = labels.cuda()
outputs = net(rudys, imgs)
labels = labels.squeeze()
labels = torch.where(labels > LABEL_THRESHOLD, 1, 0)
loss = loss_function(outputs, labels)
test_loss += loss.item()
_, preds = outputs.max(1)
correct += preds.eq(labels).sum()
finish = time.time()
if args.gpu:
print('GPU INFO.....')
print(torch.cuda.memory_summary(), end='')
print('Evaluating Network.....')
print('Test set: Epoch: {}, Average loss: {:.4f}, Accuracy: {:.4f}, Time consumed:{:.2f}s'.format(
epoch,
test_loss / len(test_loader.dataset),
correct.float() / len(test_loader.dataset),
finish - start
))
"""
print('Test set: Epoch: {}, Average loss: {:.4f}, Time consumed:{:.2f}s'.format(
epoch,
test_loss / len(test_loader.dataset),
finish - start
))
"""
#add information to tensorboard
if tb:
writer.add_scalar('Test/Average loss', test_loss / len(test_loader.dataset), epoch)
writer.add_scalar('Test/Accuracy', correct.float() / len(test_loader.dataset), epoch)
return correct.float() / len(test_loader.dataset)
if __name__ == '__main__':
# python train_6.py -gpu -net config_1
# use image and RUDY
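# 'spawn' avoids CUDA re-initialization errors in forked DataLoader workers;
# the 'file_system' sharing strategy avoids running out of file descriptors with many workers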
torch.multiprocessing.set_start_method('spawn')
torch.multiprocessing.set_sharing_strategy('file_system')
parser = argparse.ArgumentParser()
parser.add_argument('-net', default='config_1', type=str, required=True, help='net type')
parser.add_argument('-gpu', action='store_true', default=False, help='use gpu or not')
parser.add_argument('-b', type=int, default=128, help='batch size for dataloader')
parser.add_argument('-warm', type=int, default=1, help='warm up training phase')
parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate')
parser.add_argument('-resume', action='store_true', default=False, help='resume training')
args = parser.parse_args()
net = get_network(args, num_class=2)
train_loader = get_my_loader3(
dataset_dir='./dataset/dataset4/train',
num_workers=4,
batch_size=args.b,
shuffle=True
)
test_loader = get_my_loader3(
dataset_dir='./dataset/dataset4/test',
num_workers=4,
batch_size=args.b,
shuffle=True
)
loss_function = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay
# iter_per_epoch = len(cifar100_training_loader)
iter_per_epoch = len(train_loader)
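# WarmUpLR (from utils) is stepped once per batch during the first args.warm epoch(s),
# warming the learning rate up over iter_per_epoch * args.warm iterations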
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
if args.resume:
recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
if not recent_folder:
raise Exception('no recent folder was found')
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
else:
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
#use tensorboard
if not os.path.exists(settings.LOG_DIR):
os.mkdir(settings.LOG_DIR)
#since tensorboard can't overwrite old values,
#create a new log directory for every run
writer = SummaryWriter(log_dir=os.path.join(
settings.LOG_DIR, args.net, settings.TIME_NOW))
rudy_tensor = torch.Tensor(1, 9)
img_tensor = torch.Tensor(1, 1, 128, 128)
if args.gpu:
rudy_tensor = rudy_tensor.cuda()
img_tensor = img_tensor.cuda()
writer.add_graph(net, [rudy_tensor, img_tensor])
#create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
if args.resume:
best_weights = best_acc_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
if best_weights:
weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, best_weights)
print('found best acc weights file:{}'.format(weights_path))
print('load best training file to test acc...')
net.load_state_dict(torch.load(weights_path))
# best_acc = eval_training(tb=False)
eval_training(tb=False)
# print('best acc is {:0.2f}'.format(best_acc))
recent_weights_file = most_recent_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
if not recent_weights_file:
raise Exception('no recent weights file was found')
weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, recent_weights_file)
print('loading weights file {} to resume training.....'.format(weights_path))
net.load_state_dict(torch.load(weights_path))
resume_epoch = last_epoch(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
for epoch in range(1, settings.EPOCH + 1):
if epoch > args.warm:
train_scheduler.step(epoch)
if args.resume:
if epoch <= resume_epoch:
continue
train(epoch)
acc = eval_training(epoch)
#start saving the best-performing model after the learning rate has decayed to 0.01
if epoch > settings.MILESTONES[1] and best_acc < acc:
weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='best')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
best_acc = acc
continue
if not epoch % settings.SAVE_EPOCH:
weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
"""
@brief: the network structure, and the backbone is inception-v4
I copy the inception-v4 code from baiyu's work
@author: Zhengyang Lyu
@date: 2022.8.23
"""
from utils import *
from model.inceptionv4 import inceptionv4
import torch  # used directly below (torch.isnan, torch.Tensor, torch.save)
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
import pdb  # set_trace() is called when the loss becomes NaN
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
imgs_nonlabel = next(iter_nonlabel)
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
# --- following is for test ---
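# total loss: the supervised few-shot loss plus the self-supervised loss weighted by ALPHA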
loss = loss_fs + ALPHA * loss_ssl
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # epochs at which MultiStepLR decays the learning rate by gamma
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which the script is run
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the coefficient of the L2 penalty on the network parameters added to the loss
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = iter(loader_nonlabel)
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
from utils import *
from model.inceptionv4 import inceptionv4
import torch  # used directly below (torch.isnan, torch.Tensor, torch.save)
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
import pdb  # set_trace() is called when the loss becomes NaN
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
"""
imgs_nonlabel = iter_nonlabel.next()
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
"""
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
# --- following is for test ---
# loss = loss_fs + ALPHA * loss_ssl
loss = loss_fs
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # epochs at which MultiStepLR decays the learning rate by gamma
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which the script is run
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the coefficient of the L2 penalty on the network parameters added to the loss
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, '1', TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = iter(loader_nonlabel)
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
from utils import *
from model.inceptionv4 import inceptionv4
import torch  # used directly below (torch.isnan, torch.Tensor, torch.save)
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import os
import pdb  # set_trace() is called when the loss becomes NaN
def train(epoch):
net.train(True)
for i in range(iter_per_epoch - 1):
optimizer.zero_grad()
imgs_nonlabel = next(iter_nonlabel)
loss_ssl = self_supervised_loss(imgs_nonlabel, net)
"""
labels_list, imgs_list, flag = get_few_shot_batch(iter_list)
if flag == -1:
return -1
loss_fs = few_shot_loss(imgs_list, labels_list, net)
"""
# --- following is for test ---
# loss = loss_fs + ALPHA * loss_ssl
loss = loss_ssl
# loss = loss_fs
if torch.isnan(loss):
pdb.set_trace()
loss.backward()
optimizer.step()
print("Training epoch: {epoch}\tIter: {cur_iter}/{iter_per_epoch}\tloss: {:0.4f}\tLR: {:0.6f}".format(
loss.item(),
optimizer.param_groups[0]['lr'],
epoch=epoch,
cur_iter=i + 1,
iter_per_epoch=iter_per_epoch - 1
))
n_iter = (epoch - 1) * (iter_per_epoch - 1) + i
#update training loss for each iteration
writer.add_scalar('Train/loss', loss.item(), n_iter)
if epoch <= WARM:
warmup_scheduler.step()
return 0
if __name__ == '__main__':
# --- parameters setting ---
lr = 0.1
epoch_num = 500
ALPHA = 1.0
MILESTONES = [60, 120, 160] # epochs at which MultiStepLR decays the learning rate by gamma
WARM = 1
# --- log setting ---
LOG_DIR = 'runs'
DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time at which the script is run
TIME_NOW = datetime.now().strftime(DATE_FORMAT)
MODEL_NAME = 'InceptionV4'
# --- save weight ---
CHECKPOINT_PATH = 'checkpoint'
SAVE_EPOCH = 10
# --- data loading ---
df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl()
# data with label
df_list = get_df_of_label(df_withlabel)
dataset_list = get_dataset_list_withlabel(df_list)
loader_list = get_loader_list(dataset_list) # for labeled data
# data without label
# TODO: can we treat those with label as nonlabel?
dataset_nonlabel = LSWMD_Dataset_nonlabel(df_nonlabel)
# --- model to be trained ---
net = inceptionv4()
net.cuda()
# --- train setting ---
# weight_decay is the coefficient of the L2 penalty on the network parameters added to the loss
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-3)
# optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = get_iter_per_epoch(loader_list)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * WARM)
# --- reuse pretrained weights ---
# TODO: To be implemented
# --- use Tensorboard ---
if not os.path.exists(LOG_DIR):
os.mkdir(LOG_DIR)
writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, MODEL_NAME, '2', TIME_NOW))
img_tensor = torch.Tensor(1, 1, 256, 256).cuda()
writer.add_graph(net, img_tensor)
# --- setting for saving model ---
checkpoint_path = os.path.join(CHECKPOINT_PATH, MODEL_NAME, TIME_NOW)
if not os.path.exists(checkpoint_path):
os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
best_acc = 0.0
# --- logging pretrained info ---
# TODO: To be implemented
# --- training process ---
for epoch in range(1, epoch_num):
# --- warming up ---
# TODO: To be implemented
# if epoch > WARM:
# train_scheduler.step(epoch)
# --- re-initialize the iterators ---
loader_list = get_loader_list(dataset_list) # for labeled data
iter_list = get_iter_list(loader_list)
loader_nonlabel = get_loader(dataset_nonlabel)
iter_nonlabel = iter(loader_nonlabel)
train(epoch)
# --- evaluating model ---
# TODO: To be implemented
# TODO: divide training set and evaluating set
# --- save model ---
# when best
# TODO: To be implemented
# regularly
if not epoch % SAVE_EPOCH:
weights_path = checkpoint_path.format(net=MODEL_NAME, epoch=epoch, type='regular')
print('saving weights file to {}'.format(weights_path))
torch.save(net.state_dict(), weights_path)
writer.close()
"""
@brief: build distributed training on multi-GPU
@author: Zhengyang Lyu
@date 2022.8.26
@note: https://github.com/tczhangzhi/pytorch-distributed
"""
# use torch.distributed
# n processes for n GPUs
# To launch, type the following command in terminal:
# `CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py`
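# (note: on recent PyTorch versions the equivalent launcher is `torchrun --nproc_per_node=4 main.py`,
#  which passes the local rank via the LOCAL_RANK environment variable rather than the --local_rank argument)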
import argparse
import os
import random
import shutil
import time
import warnings
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
from model.inceptionv4 import inceptionv4
from utils import get_dataset_list_withlabel
# --- Args ---
parser = argparse.ArgumentParser(description='PyTorch Distributed Training')
parser.add_argument('--data',
metavar='DIR',
default='/lustre/S/lvzhengyang/wafer_failure/dataset/LSWMD.pkl',
help='path to dataset')
parser.add_argument('-j',
'--workers',
default=4,
type=int,
metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs',
default=90,
type=int,
metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch',
default=0,
type=int,
metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b',
'--batch-size',
default=128,
type=int,
metavar='N',
help='mini-batch size (default: 128), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr',
'--learning-rate',
default=0.1,
type=float,
metavar='LR',
help='initial learning rate',
dest='lr')
parser.add_argument('--momentum',
default=0.9,
type=float,
metavar='M',
help='momentum')
parser.add_argument('--local_rank',
default=-1,
type=int,
help='node rank for distributed training')
parser.add_argument('--wd',
'--weight-decay',
default=1e-4,
type=float,
metavar='W',
help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('-p',
'--print-freq',
default=10,
type=int,
metavar='N',
help='print frequency (default: 10)')
parser.add_argument('-e',
'--evaluate',
dest='evaluate',
action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained',
dest='pretrained',
action='store_true',
help='use pre-trained model')
parser.add_argument('--seed',
default=None,
type=int,
help='seed for initializing training. ')
def reduce_mean(tensor, nprocs):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= nprocs
return rt
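# rough usage sketch (hypothetical names, mirroring the reference repo): average a
# per-process metric across all ranks before logging, e.g.
#   torch.distributed.barrier()
#   reduced_loss = reduce_mean(loss, args.nprocs)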
def main():
args = parser.parse_args()
args.nprocs = torch.cuda.device_count()
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
cudnn.deterministic = True
warnings.warn('You have chosen to seed training. '
'This will turn on the CUDNN deterministic setting, '
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.')
main_worker(args.local_rank, args.nprocs, args)
def main_worker(local_rank, nprocs, args):
best_acc1 = .0
dist.init_process_group(backend='nccl')
# create model
if args.pretrained:
raise NotImplementedError
else:
print("=> creating model '{}'".format('Inception-V4'))
model = inceptionv4()
torch.cuda.set_device(local_rank)
model.cuda(local_rank)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
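# e.g. with the default --batch-size 128 and the 4-GPU launch command above, each process uses a per-GPU batch of 32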
args.batch_size = int(args.batch_size / nprocs)
model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[local_rank])
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(local_rank)
"""
optimizer = torch.optim.SGD(model.parameters(),
args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
"""
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
cudnn.benchmark = True
# Data loading code
traindir = os.path.join(args.data, 'train')
train_dataset_list = get_dataset_list_withlabel  # TODO: this call is incomplete; train_dataset used below is not defined yet
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=args.batch_size,
num_workers=2,
pin_memory=True,
sampler=train_sampler)
# TODO: valdir, normalize, datasets and transforms are not defined here; this block is still the ImageFolder template
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]))
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
val_loader = torch.utils.data.DataLoader(val_dataset,
batch_size=args.batch_size,
num_workers=2,
pin_memory=True,
sampler=val_sampler)
if args.evaluate:
validate(val_loader, model, criterion, local_rank, args)
return
for epoch in range(args.start_epoch, args.epochs):
train_sampler.set_epoch(epoch)
val_sampler.set_epoch(epoch)
adjust_learning_rate(optimizer, epoch, args)
# train for one epoch
train(train_loader, model, criterion, optimizer, epoch, local_rank,
args)
# evaluate on validation set
acc1 = validate(val_loader, model, criterion, local_rank, args)
# remember best acc@1 and save checkpoint
is_best = acc1 > best_acc1
best_acc1 = max(acc1, best_acc1)
if args.local_rank == 0:
save_checkpoint(
{
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.module.state_dict(),
'best_acc1': best_acc1,
}, is_best)