Commit bc0ad88a by Pengwei-Jin

避免重复加载info文件夹

parent 91ab82b3
...@@ -73,14 +73,11 @@ def load_fake_niuren(): ...@@ -73,14 +73,11 @@ def load_fake_niuren():
return fake_niuren_names, fake_niuren_papers return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title): def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
niuren = [] niuren = []
order = [] order = []
suspected_niuren = [] suspected_niuren = []
niuren_pool_names = load_niuren_pool()
true_niuren_names, true_niuren_papers = load_true_niuren()
fake_niuren_names, fake_niuren_papers = load_fake_niuren()
for author_idx, author in enumerate(authors): for author_idx, author in enumerate(authors):
if author == "": if author == "":
...@@ -112,13 +109,13 @@ def check_niuren(authors, title): ...@@ -112,13 +109,13 @@ def check_niuren(authors, title):
return niuren_str, order_str, suspected_niuren_str return niuren_str, order_str, suspected_niuren_str
def process_row(index, row): def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
authors = row.get('引文作者', '') authors = row.get('引文作者', '')
title = row.get('引文名称', '') title = row.get('引文名称', '')
if not isinstance(authors, str): if not isinstance(authors, str):
return index, "", "", "" return index, "", "", ""
authors = [i.strip() for i in authors.split(";") if i!=""] authors = [i.strip() for i in authors.split(";") if i!=""]
niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title) niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
return index, niuren_true, niuren_true_order, suspected_niuren return index, niuren_true, niuren_true_order, suspected_niuren
...@@ -126,6 +123,10 @@ if __name__ == "__main__": ...@@ -126,6 +123,10 @@ if __name__ == "__main__":
# 读取原始 Excel 文件的前两行 # 读取原始 Excel 文件的前两行
original_header = pd.read_excel(input_file_path, nrows=2, header=None) original_header = pd.read_excel(input_file_path, nrows=2, header=None)
niuren_pool_names = load_niuren_pool()
true_niuren_names, true_niuren_papers = load_true_niuren()
fake_niuren_names, fake_niuren_papers = load_fake_niuren()
# 读取数据并跳过前两行 # 读取数据并跳过前两行
input_df = pd.read_excel(input_file_path, skiprows=2) input_df = pd.read_excel(input_file_path, skiprows=2)
...@@ -135,17 +136,12 @@ if __name__ == "__main__": ...@@ -135,17 +136,12 @@ if __name__ == "__main__":
input_df['疑似牛人'] = None input_df['疑似牛人'] = None
results = Parallel(n_jobs=-1)(delayed(process_row)(index, row) for index, row in tqdm(input_df.iterrows())) results = Parallel(n_jobs=-1)(delayed(process_row)(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers) for index, row in tqdm(input_df.iterrows()))
for index, niuren_true, niuren_true_order, suspected_niuren in results: for index, niuren_true, niuren_true_order, suspected_niuren in results:
input_df.at[index, '牛人'] = niuren_true input_df.at[index, '牛人'] = niuren_true
input_df.at[index, '牛人署名顺序'] = niuren_true_order input_df.at[index, '牛人署名顺序'] = niuren_true_order
input_df.at[index, '疑似牛人'] = suspected_niuren input_df.at[index, '疑似牛人'] = suspected_niuren
# for index, row in tqdm(input_df.iterrows()):
# index, niuren_true, niuren_true_order, suspected_niuren = process_row(index, row)
# input_df.at[index, '牛人'] = niuren_true
# input_df.at[index, '牛人署名顺序'] = niuren_true_order
# input_df.at[index, '疑似牛人'] = suspected_niuren
max_cols = max(original_header.shape[1], input_df.shape[1]) max_cols = max(original_header.shape[1], input_df.shape[1])
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment