Commit bc0ad88a by Pengwei-Jin

避免重复加载info文件夹

parent 91ab82b3
......@@ -73,14 +73,11 @@ def load_fake_niuren():
return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title):
def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
niuren = []
order = []
suspected_niuren = []
niuren_pool_names = load_niuren_pool()
true_niuren_names, true_niuren_papers = load_true_niuren()
fake_niuren_names, fake_niuren_papers = load_fake_niuren()
for author_idx, author in enumerate(authors):
if author == "":
......@@ -112,13 +109,13 @@ def check_niuren(authors, title):
return niuren_str, order_str, suspected_niuren_str
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were lost. The two `def` lines below appear to be the old and
# the new signature of the same function; the longer one additionally takes
# the five name/paper collections as parameters -- confirm against the repo.
def process_row(index, row):
def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
# Handle a single row: read the author and title cells, split the author
# string into a list, and delegate the matching to check_niuren.
# Returns a 4-tuple (index, niuren_true, niuren_true_order, suspected_niuren);
# `index` is passed through unchanged.
authors = row.get('引文作者', '')
title = row.get('引文名称', '')
# A non-string authors value short-circuits to three empty result strings.
if not isinstance(authors, str):
return index, "", "", ""
# Split on ";" and strip each piece. The emptiness filter is applied to the
# unstripped piece, so a whitespace-only piece is kept and becomes "".
authors = [i.strip() for i in authors.split(";") if i!=""]
# The two calls below appear to be the old and the new form of the same
# statement; the longer one forwards the five collections to check_niuren.
niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title)
niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
return index, niuren_true, niuren_true_order, suspected_niuren
......@@ -126,6 +123,10 @@ if __name__ == "__main__":
# 读取原始 Excel 文件的前两行
original_header = pd.read_excel(input_file_path, nrows=2, header=None)
niuren_pool_names = load_niuren_pool()
true_niuren_names, true_niuren_papers = load_true_niuren()
fake_niuren_names, fake_niuren_papers = load_fake_niuren()
# 读取数据并跳过前两行
input_df = pd.read_excel(input_file_path, skiprows=2)
......@@ -135,17 +136,12 @@ if __name__ == "__main__":
input_df['疑似牛人'] = None
results = Parallel(n_jobs=-1)(delayed(process_row)(index, row) for index, row in tqdm(input_df.iterrows()))
results = Parallel(n_jobs=-1)(delayed(process_row)(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers) for index, row in tqdm(input_df.iterrows()))
for index, niuren_true, niuren_true_order, suspected_niuren in results:
input_df.at[index, '牛人'] = niuren_true
input_df.at[index, '牛人署名顺序'] = niuren_true_order
input_df.at[index, '疑似牛人'] = suspected_niuren
# for index, row in tqdm(input_df.iterrows()):
# index, niuren_true, niuren_true_order, suspected_niuren = process_row(index, row)
# input_df.at[index, '牛人'] = niuren_true
# input_df.at[index, '牛人署名顺序'] = niuren_true_order
# input_df.at[index, '疑似牛人'] = suspected_niuren
max_cols = max(original_header.shape[1], input_df.shape[1])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment