# Commit bfe16e7a by lvzhengyang
#
# fix dataset
#
# parent ee8f53f5
"""
@brief: reorganize the dataset for few-shot learning and self-supervised learning
@author: Zhengyang Lyu
@date: 2022.8.26
@note: run this script under 'dataset' dir
"""
import os
import sys
sys.path.append('..')
from utils import read_pkl, get_df_of_label
import pandas as pd
import random
import numpy as np
import pdb
"""
Categories   Training Set            Testing Set
             #        Percent (%)    #        Percent (%)
Center       2576     2.48           1718     2.48
Donut        333      0.32           222      0.32
Edge-Loc     3113     2.99           2076     3.00
Edge-Ring    5808     5.60           3872     5.60
Location     2155     2.08           1438     2.08
Near-Full    89       0.09           60       0.09
Random       519      0.50           347      0.50
Scratch      715      0.69           478      0.69
None         88459    85.25          58972    85.25
Total        103767   100            69183    100
"""
def reorganize_dataset():
    """Split the raw dataset into unlabeled, test and (in-memory) training parts.

    Reads the pickle at ``dataset_path`` through ``read_pkl``, then:

    1. writes the unlabeled frame to ``nonlabel_path``;
    2. groups the labeled frame with ``get_df_of_label`` (presumably one
       DataFrame per label -- confirm against ``utils``), samples 40 % of each
       group with a fixed seed, and writes the concatenation to
       ``withlabel_test_path``;
    3. keeps the remaining 60 % of each group as the training split.

    The function takes no arguments; it reads the module-level names
    ``dataset_path``, ``nonlabel_path`` and ``withlabel_test_path``, which the
    ``__main__`` section defines before calling it.

    NOTE(review): the training split is not written anywhere yet, and the
    class sampling at the end is unfinished scaffolding whose result is not
    used.
    """
    df_withlabel, df_nonlabel, _df_withpattern, _df_nonpattern = read_pkl(
        path=dataset_path
    )
    df_nonlabel.to_pickle(nonlabel_path)

    # --- following code is for df_withlabel ---
    df_list = get_df_of_label(df_withlabel)
    # group sizes noted by the author:
    # [4294, 555, 5189, 9680, 3593, 866, 1193, 149, 147431]

    # divide training / validation set (0.6 : 0.4); the fixed random_state
    # keeps the split reproducible between runs
    val_list = [df.sample(frac=0.4, random_state=60) for df in df_list]
    df_val = pd.concat(val_list).reset_index(drop=True)
    df_val.to_pickle(withlabel_test_path)

    # Training rows are whatever was not drawn into the validation sample.
    train_list = [
        df.drop(val.index).reset_index(drop=True)
        for df, val in zip(df_list, val_list)
    ]
    train_len = [len(df) for df in train_list]
    # training sizes noted by the author:
    # [2576, 333, 3113, 5808, 2156, 520, 716, 89, 88459]

    # np.nonzero returns one index array per dimension, i.e. a 1-tuple for a
    # 1-D input, so unpacking it into two names raised ValueError. Convert to
    # a plain list because random.sample only accepts sequences.
    non_zero = np.nonzero(train_len)[0].tolist()

    # The original wrapped this draw in `while non_zero.size >= 2`, but nothing
    # in the loop removed rows from train_list, so the condition could never
    # become false. Until the step that consumes the selected classes exists,
    # make a single draw instead of looping forever.
    if len(non_zero) >= 2:
        num_classes = random.randint(2, len(non_zero))
        # TODO: build the few-shot task from these class indices and remove
        # the used rows from train_list before drawing again.
        selected_classes = random.sample(non_zero, num_classes)  # noqa: F841
if __name__ == '__main__':
    # The path variables assigned here are module-level globals that
    # reorganize_dataset() reads directly; keep their names unchanged.
    dataset_path = '/lustre/S/lvzhengyang/wafer_failure/dataset/LSWMD.pkl'

    # Output layout, relative to the current working directory:
    #   ./nonlabel/nonlabel.pkl
    #   ./withlabel/train/
    #   ./withlabel/test/withlabel_test.pkl
    nonlabel_dir = os.path.join('.', 'nonlabel')
    withlabel_dir = os.path.join('.', 'withlabel')
    withlabel_dir_train = os.path.join(withlabel_dir, 'train')
    withlabel_dir_test = os.path.join(withlabel_dir, 'test')

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pairs: it has no
    # window between the check and the creation, and it creates the shared
    # 'withlabel' parent on the way to its two sub-directories.
    for out_dir in (nonlabel_dir, withlabel_dir_train, withlabel_dir_test):
        os.makedirs(out_dir, exist_ok=True)

    nonlabel_path = os.path.join(nonlabel_dir, 'nonlabel.pkl')
    withlabel_test_path = os.path.join(withlabel_dir_test, 'withlabel_test.pkl')

    reorganize_dataset()
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment