fix dataset

bfe16e7a · lvzhengyang · ee8f53f5 · ee8f53f5 · bfe16e7a
Commit bfe16e7a authored Sep 06, 2022 by lvzhengyang
Show whitespace changes
Inline Side-by-side

Showing with 71 additions and 0 deletions

dataset/LSWMD.pkl.zip
+0 -0

dataset/reorganize_dataset.py
+71 -0

No files found.
--- a/dataset/LSWMD.pkl.zip
+++ b/dataset/LSWMD.pkl.zip
--- a/dataset/reorganize_dataset.py
+++ b/dataset/reorganize_dataset.py
+"""
+@brief: reorganize the dataset for few-shot learning and self-supervised learning
+@author: Zhengyang Lyu
+@date: 2022.8.26
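+@note: run this script from inside the 'dataset' directory: it appends '..' to
+       sys.path to import utils and writes its output under the current directory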
+"""
+import os
+import sys
+sys.path.append('..')
+from utils import read_pkl, get_df_of_label
+import pandas as pd
+import random
+import numpy as np
+import pdb
+
+"""
+Categories  Training Set  Testing Set
+            # Percent (%) # Percent (%)
+Center        2576 2.48     1718 2.48
+Donut         333  0.32      222 0.32
+Edge-Loc      3113 2.99     2076 3.00
+Edge-Ring     5808 5.60     3872 5.60
+Location      2155 2.08     1438 2.08
+Near-Full     89   0.09       60 0.09
+Random        519  0.50      347 0.50
+Scratch       715  0.69      478 0.69
+None          88459 85.25   58972 85.25
+Total         103767 100    69183 100
+"""
+
+def reorganize_dataset():
+  df_withlabel, df_nonlabel, df_withpattern, df_nonpattern = read_pkl(path=dataset_path)
+  df_nonlabel.to_pickle(nonlabel_path)
+  # --- following code is for df_withlabel ---
+  df_list = get_df_of_label(df_withlabel)
+  df_len = [len(df) for df in df_list]  # [4294, 555, 5189, 9680, 3593, 866, 1193, 149, 147431]
+  # divide training / validation set (0.6 : 0.4)
+  val_list = [df.sample(frac=0.4,random_state=60) for df in df_list]
+  df_val = pd.concat(val_list).reset_index(drop=True)
+  df_val.to_pickle(withlabel_test_path)
+  train_list = [df_list[i].drop(val_list[i].index).reset_index(drop=True) 
+                for i in range(len(val_list))]
+  train_len = [len(df) for df in train_list]  # [2576, 333, 3113, 5808, 2156, 520, 716, 89, 88459]
+  non_zero, _ = np.nonzero(train_len)
+  while non_zero.size >= 2:
+    num_classes = random.randint(2, non_zero.size)
+    random.sample(non_zero, num_classes)
+    train_len = [len(df) for df in train_list]
+    non_zero, _ = np.nonzero(train_len)
+  pdb.set_trace()
+
+if __name__ == '__main__':
+  dataset_path = '/lustre/S/lvzhengyang/wafer_failure/dataset/LSWMD.pkl'
+  nonlabel_dir = os.path.join('.', 'nonlabel')
+  if not os.path.exists(nonlabel_dir):
+    os.mkdir(nonlabel_dir)
+  nonlabel_path = os.path.join(nonlabel_dir, 'nonlabel.pkl')
+
+  withlabel_dir = os.path.join('.', 'withlabel')
+  if not os.path.exists(withlabel_dir):
+    os.mkdir(withlabel_dir)
+  withlabel_dir_train = os.path.join(withlabel_dir, 'train')
+  if not os.path.exists(withlabel_dir_train):
+    os.mkdir(withlabel_dir_train)
+
+  withlabel_dir_test = os.path.join(withlabel_dir, 'test')
+  if not os.path.exists(withlabel_dir_test):
+    os.mkdir(withlabel_dir_test)
+  withlabel_test_path = os.path.join(withlabel_dir_test, 'withlabel_test.pkl')
+
+  reorganize_dataset()