Commit 2cb16c72 by Pengwei-Jin

add niurenshaixuan

parent bc37faa6
# 使用方法
修改`main.py`中的`input_file_path`和`output_file_path`。
输出的excel中包含`牛人`、`牛人署名顺序`、`疑似牛人`三列。其中疑似牛人的格式为 `疑似牛人名字(疑似牛人在info/new_niuren_format-merged_turing.csv中的索引)`。
请初步判断是否为牛人。如果认为是牛人,则找李慧老师复核,由李慧老师添加到全局牛人表中。请不要修改全局牛人表。
如果是脚本运行相关的问题,请联系马天云同学。
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
import os
import json
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from utils import firstlast2lastfirst, name_in_niuren_list
input_file_path = '测试输入3.xlsx'
output_file_path = '测试输出3.xlsx'
def load_niuren_pool():
    """Read the candidate-name pool from the merged CSV.

    Returns:
        The values of the ``name`` column, in file order, with every
        non-breaking space (``\\xa0``) replaced by an ordinary space.
    """
    pool_table = pd.read_csv(
        "info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig'
    )
    cleaned_names = []
    for raw_name in pool_table["name"].tolist():
        # Normalise non-breaking spaces so later string comparisons
        # see plain spaces only.
        cleaned_names.append(raw_name.replace("\xa0", " "))
    return cleaned_names
def load_true_niuren():
    """Load the confirmed-person sheet and each person's known paper titles.

    Reads the "全局牛人" sheet of the citation workbook. Every row yields one
    entry in the returned name list: a plain string (the row's "姓名" passed
    through ``firstlast2lastfirst``) when the row has no aliases, or a list
    ``[name, alias1, alias2, ...]`` when the alias column holds a
    ``;``-separated string.

    For the row at 0-based position ``i``, paper titles are read from the
    first column of ``info/niuren_papers/{i + 1}.xlsx`` when that file
    exists; otherwise the entry is an empty list.

    Returns:
        ``(names, papers)`` — two lists of equal length, aligned by row.
        Paper titles are stripped and lower-cased.
    """
    true_niuren = pd.read_excel(
        "info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx",
        sheet_name="全局牛人",
    )
    alias_column = "别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"

    # Build the name list (string, or [name, *aliases] when aliases exist).
    true_niuren_names = []
    for primary, raw_aliases in zip(true_niuren["姓名"], true_niuren[alias_column]):
        if isinstance(raw_aliases, str):
            # Filter on the *stripped* fragment: a trailing "; " would
            # otherwise produce an empty alias and break the name converter.
            aliases = [
                firstlast2lastfirst(fragment.strip())
                for fragment in raw_aliases.split(";")
                if fragment.strip()
            ]
        else:
            aliases = []
        primary_name = firstlast2lastfirst(primary)
        if aliases:
            true_niuren_names.append([primary_name] + aliases)
        else:
            true_niuren_names.append(primary_name)

    # Build the per-person paper list; files are numbered from 1 in row order.
    true_niuren_papers = []
    for file_number in range(1, len(true_niuren_names) + 1):
        paper_file_path = os.path.join("info/niuren_papers", f"{file_number}.xlsx")
        if not os.path.exists(paper_file_path):
            true_niuren_papers.append([])
            continue
        titles = pd.read_excel(paper_file_path, usecols=[0]).iloc[:, 0].dropna().tolist()
        # str() tolerates non-text cells; strip() mirrors the stripped,
        # lower-cased title used when these lists are searched.
        true_niuren_papers.append([str(paper).strip().lower() for paper in titles])

    return true_niuren_names, true_niuren_papers
def load_fake_niuren():
    """Load the rejected-person sheet and each person's known paper titles.

    Reads the "全局非牛人" sheet of the citation workbook and converts every
    "姓名" value with ``firstlast2lastfirst``.

    For the row at 0-based position ``i``, paper titles are read from the
    first column of ``info/feiniuren_papers/{i + 1}.xlsx`` when that file
    exists; otherwise the entry is an empty list.

    Returns:
        ``(names, papers)`` — two lists of equal length, aligned by row.
        Paper titles are stripped and lower-cased.
    """
    fake_niuren = pd.read_excel(
        "info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx",
        sheet_name="全局非牛人",
    )

    # Build the name list.
    fake_niuren_names = [
        firstlast2lastfirst(raw_name) for raw_name in fake_niuren["姓名"]
    ]

    # Build the per-person paper list; files are numbered from 1 in row order.
    fake_niuren_papers = []
    for file_number in range(1, len(fake_niuren_names) + 1):
        paper_file_path = os.path.join("info/feiniuren_papers", f"{file_number}.xlsx")
        if not os.path.exists(paper_file_path):
            fake_niuren_papers.append([])
            continue
        titles = pd.read_excel(paper_file_path, usecols=[0]).iloc[:, 0].dropna().tolist()
        # str() tolerates non-text cells; strip() mirrors the stripped,
        # lower-cased title used when these lists are searched.
        fake_niuren_papers.append([str(paper).strip().lower() for paper in titles])

    return fake_niuren_names, fake_niuren_papers
# Memo for the reference tables so they are read from disk on the first
# call only, instead of once per processed row.
_REFERENCE_DATA_CACHE = {}


def _get_reference_data():
    """Return ``(pool_names, (true_names, true_papers), (fake_names, fake_papers))``.

    The three loaders are invoked on the first call; later calls reuse the
    stored result. The cache entry is written only after all three loaders
    succeed, so a failed load is retried on the next call.
    NOTE(review): when rows are dispatched to joblib worker processes, each
    worker (or task batch) may hold its own copy of this cache — confirm.
    """
    if "data" not in _REFERENCE_DATA_CACHE:
        _REFERENCE_DATA_CACHE["data"] = (
            load_niuren_pool(),
            load_true_niuren(),
            load_fake_niuren(),
        )
    return _REFERENCE_DATA_CACHE["data"]


def check_niuren(authors, title):
    """Classify the authors of one cited paper.

    For every non-empty author, in order:

    1. Skip the author unless the converted name is found in the pool list.
    2. If the name is found in the confirmed list *and* ``title`` is among
       that person's papers, record the author as confirmed together with
       the 1-based position in ``authors``.
    3. Otherwise, if the name is found in the rejected list *and* ``title``
       is among that person's papers, skip the author.
    4. Otherwise record the author as suspected, with the 1-based index of
       the matching pool entry.

    Args:
        authors: list of author name strings; empty strings are skipped but
            still count towards positions.
        title: title of the cited paper. Non-string values (e.g. an empty
            spreadsheet cell) never match any paper list.

    Returns:
        ``(niuren_str, order_str, suspected_niuren_str)`` — three
        ``;``-joined strings: confirmed author names, their positions, and
        suspected authors formatted as ``name(pool_index)``.
    """
    niuren_pool_names, true_data, fake_data = _get_reference_data()
    true_niuren_names, true_niuren_papers = true_data
    fake_niuren_names, fake_niuren_papers = fake_data

    # None is never contained in a list of title strings, so a missing title
    # simply fails both paper checks instead of raising on .strip().
    title_key = title.strip().lower() if isinstance(title, str) else None

    niuren = []
    order = []
    suspected_niuren = []
    for author_idx, author in enumerate(authors):
        if author == "":
            continue
        canonical_name = firstlast2lastfirst(author)

        # Is the author in the candidate pool at all?
        name_index = name_in_niuren_list(canonical_name, niuren_pool_names)
        if name_index == -1:
            continue

        # Is the author a confirmed person for this very paper?
        true_name_index = name_in_niuren_list(canonical_name, true_niuren_names)
        if true_name_index != -1 and title_key in true_niuren_papers[true_name_index]:
            niuren.append(author)
            order.append(author_idx + 1)
            continue

        # Is the author already rejected for this very paper?
        fake_name_index = name_in_niuren_list(canonical_name, fake_niuren_names)
        if fake_name_index != -1 and title_key in fake_niuren_papers[fake_name_index]:
            continue

        suspected_niuren.append((author, name_index + 1))

    niuren_str = ";".join(niuren)
    order_str = ";".join(str(position) for position in order)
    suspected_niuren_str = ";".join(
        f"{name}({index})" for name, index in suspected_niuren
    )
    return niuren_str, order_str, suspected_niuren_str
def process_row(index, row):
    """Run the author check for one input row.

    Args:
        index: row label, returned unchanged so the caller can write the
            results back to the right row.
        row: mapping-like row providing ``.get``; the '引文作者' entry is a
            ``;``-separated author string and '引文名称' is the cited title.

    Returns:
        ``(index, niuren, niuren_order, suspected_niuren)``. The three
        strings are empty when the author cell is not a string or contains
        no non-blank author.
    """
    authors = row.get('引文作者', '')
    title = row.get('引文名称', '')
    if not isinstance(authors, str):
        return index, "", "", ""

    # Strip first, then drop blanks: a whitespace-only fragment ("A; ;B")
    # must not survive as an empty author, or it would shift the reported
    # author positions relative to the "A;;B" case.
    author_list = [fragment.strip() for fragment in authors.split(";")]
    author_list = [author for author in author_list if author]
    if not author_list:
        # Nothing to classify; skip the lookup entirely.
        return index, "", "", ""

    niuren_true, niuren_true_order, suspected_niuren = check_niuren(author_list, title)
    return index, niuren_true, niuren_true_order, suspected_niuren
if __name__ == "__main__":
    # Keep the first two rows of the input workbook verbatim so they can be
    # written back on top of the output.
    original_header = pd.read_excel(input_file_path, nrows=2, header=None)
    # Read the data, skipping those two rows (the third row becomes the
    # column header).
    # NOTE(review): pandas may rename blank/duplicate header cells
    # ("Unnamed: N", "x.1"); those labels are what gets written back — confirm
    # this is acceptable.
    input_df = pd.read_excel(input_file_path, skiprows=2)

    # Add the result columns.
    input_df['牛人'] = None
    input_df['牛人署名顺序'] = None
    input_df['疑似牛人'] = None

    # iterrows() is a generator, so tqdm needs an explicit total to show a
    # percentage / ETA.
    results = Parallel(n_jobs=-1)(
        delayed(process_row)(index, row)
        for index, row in tqdm(input_df.iterrows(), total=len(input_df))
    )
    for index, niuren_true, niuren_true_order, suspected_niuren in results:
        input_df.at[index, '牛人'] = niuren_true
        input_df.at[index, '牛人署名顺序'] = niuren_true_order
        input_df.at[index, '疑似牛人'] = suspected_niuren

    max_cols = max(original_header.shape[1], input_df.shape[1])

    def pad_df(df, max_cols):
        """Return a copy of ``df`` with exactly ``max_cols`` columns.

        Missing columns are appended as empty strings, extra columns are
        cut off, and the columns are relabelled ``0 .. max_cols - 1`` so
        frames with different headers can be concatenated.
        """
        df = df.copy()
        for i in range(df.shape[1], max_cols):
            df[i] = ""
        df = df.iloc[:, :max_cols]  # limit to max_cols columns
        df.columns = range(max_cols)  # use integers as column labels
        return df

    # Bring all three pieces to the same width.
    original_header_padded = pad_df(original_header, max_cols)
    input_header = pd.DataFrame([input_df.columns.tolist()])
    input_header_padded = pad_df(input_header, max_cols)
    input_data_padded = pad_df(input_df, max_cols)

    # Stack: original two rows, column-name row, data rows.
    output_df = pd.concat(
        [original_header_padded, input_header_padded, input_data_padded],
        ignore_index=True,
    )

    # Save.
    output_df.to_excel(output_file_path, index=False, header=False)
    print(f"处理后的数据已保存到 {output_file_path}")
\ No newline at end of file
def firstlast2lastfirst(name):
    """Convert a "First Last" style name to "Last, First".

    A name that already contains a comma is returned unchanged. Otherwise
    the name is split on spaces, honorific/suffix tokens are dropped, and
    the remaining tokens are rearranged:

    * no tokens left (empty or honorific-only input) -> ``""``
    * one token -> that token, unchanged
    * two tokens -> ``"Last, First"``
    * three tokens -> see the branch comments below
    * four or more -> last token is the family name, the rest the given name

    Examples:
        >>> firstlast2lastfirst("John Doe")
        'Doe, John'
        >>> firstlast2lastfirst("M. Jane Smith")
        'Smith, Jane M.'
        >>> firstlast2lastfirst("Ye Fred (Ying)")
        'Ye, Ying (Fred)'

    Args:
        name: the name string to convert.

    Returns:
        The converted name string.
    """
    if "," in name:
        return name

    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
    tokens = [item.strip() for item in name.split(" ")]
    tokens = [item for item in tokens if item != "" and item not in special_str]

    def in_parens(token):
        return token.startswith("(") and token.endswith(")")

    if not tokens:
        # Empty / honorific-only input: nothing to rearrange (previously an
        # IndexError).
        return ""
    if len(tokens) == 1:
        # A lone token has no given name; avoid emitting a dangling "X, ".
        return tokens[0]

    if len(tokens) == 3:
        head, middle, tail = tokens
        if in_parens(head):
            # (Alexander) Philip Dawid --> Dawid, Philip (Alexander)
            first_name, last_name = f"{middle} {head}", tail
        elif in_parens(middle):
            # Xinyan (Tracy) Cui --> Cui, Xinyan (Tracy)
            first_name, last_name = f"{head} {middle}", tail
        elif in_parens(tail):
            # Family name first, given name in parentheses, English name in
            # the middle:
            # Ye Fred (Ying) --> Ye, Ying (Fred)
            # Zhu Jesse (Jingxu) --> Zhu, Jingxu (Jesse)
            first_name, last_name = f"{tail[1:-1]} ({middle})", head
        elif head.endswith("."):
            # M. Jane Smith --> Smith, Jane M.
            # K.W. Michael Siu --> Siu, Michael K.W.
            first_name, last_name = f"{middle} {head}", tail
        elif middle.endswith("."):
            # Jane M. Smith --> Smith, Jane M.
            # Pierre J.H. Richardson --> Richardson, Pierre J.H.
            first_name, last_name = f"{head} {middle}", tail
        elif tail.endswith("."):
            # Wimmer-Schweingruber Robert F. --> Wimmer-Schweingruber, Robert F.
            # Wilderer Peter A. --> Wilderer, Peter A.
            first_name, last_name = f"{middle} {tail}", head
        else:
            # William Nelson Joy --> Joy, William Nelson
            # Michael J Carey --> Carey, Michael J
            first_name, last_name = f"{head} {middle}", tail
    else:
        # Two tokens, or four and more: the last token is the family name.
        first_name, last_name = " ".join(tokens[:-1]), tokens[-1]

    return f"{last_name}, {first_name}"
def name_in_niuren_list(name, niuren_name_list):
    """Locate ``name`` in a list of names or alias groups.

    Each entry of ``niuren_name_list`` is either a single name string or a
    list of alias strings; entries of any other type are ignored.

    Matching runs in two passes over the whole list:

    1. exact equality with the entry (or with any alias in it);
    2. prefix match in either direction, where the shorter string must be
       followed by a space in the longer one.

    An exact match anywhere in the list wins over any prefix match.

    :param name: the name to look up
    :param niuren_name_list: list of name strings and/or alias lists
    :return: 0-based index of the first matching entry, or -1 if none
    """
    def spellings(entry):
        if isinstance(entry, str):
            return (entry,)
        if isinstance(entry, list):
            return entry
        return ()

    # Pass 1: exact matches.
    for position, entry in enumerate(niuren_name_list):
        if name in spellings(entry):
            return position

    # Pass 2: one name extends the other at a space boundary.
    name_with_space = name + " "
    for position, entry in enumerate(niuren_name_list):
        for spelling in spellings(entry):
            if spelling.startswith(name_with_space):
                return position
            if name.startswith(spelling + " "):
                return position

    return -1
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment