Commit e46e4e8c by zhengxinhan

Solve the compound surname (surname-prefix) problem in name standardization

parent e02d76c7
......@@ -9,8 +9,8 @@ from copy import copy
from joblib import Parallel, delayed
from utils import standardized_name, name_in_niuren_list
input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
input_file_path = 'info/doubohan.xlsx'
output_file_path = 'output/doubohan_牛人筛选.xlsx'
output_dir = os.path.dirname(output_file_path)
if not os.path.exists(output_dir):
......
......@@ -9,8 +9,14 @@ from copy import copy
from joblib import Parallel, delayed
from utils import standardized_name, name_in_niuren_list
<<<<<<< Updated upstream:papertools_niuren_ccfa/niurenpipei_xinbiao.py
input_file_path = 'info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = 'output/新表-论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
=======
input_file_path = 'info/doubohan.xlsx'
# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = 'output/doubohan_牛人筛选.xlsx'
>>>>>>> Stashed changes:papertools_niuren_ccfa/niurenpipei_update.py
output_dir = os.path.dirname(output_file_path)
if not os.path.exists(output_dir):
......@@ -153,11 +159,11 @@ if __name__ == "__main__":
fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren,fake_niuren_papers_path)
# 读取表头(第4行作为列名)
original_header = pd.read_excel(input_file_path, nrows=0, header=3)
original_header = pd.read_excel(input_file_path, nrows=0, header=2)
column_names = original_header.columns.tolist()
# 读取数据(从第8行开始)
input_df = pd.read_excel(input_file_path, skiprows=7, header=None, names=column_names)
input_df = pd.read_excel(input_file_path, skiprows=2, header=None, names=column_names)
# 输出表头和数据的基本信息
print("表头元素:")
......@@ -273,9 +279,16 @@ if __name__ == "__main__":
ws_new.cell(row=4, column=suspected_niuren_col_index, value='疑似牛人')
# 更新数据(从第8行开始)
<<<<<<< Updated upstream:papertools_niuren_ccfa/niurenpipei_xinbiao.py
for i, row in input_df.iterrows():
excel_row = i + 8 # 转换为Excel的行号(从1开始)
=======
# for i, row in input_df.iterrows():
for i, (index, row) in enumerate(input_df.iterrows(), start=3):
# excel_row = i + 8 # 转换为Excel的行号(从1开始)
print(i, niuren_col_index)
>>>>>>> Stashed changes:papertools_niuren_ccfa/niurenpipei_update.py
# 只更新牛人相关的三列
ws_new.cell(row=excel_row, column=niuren_col_index, value=row[niuren_col])
ws_new.cell(row=excel_row, column=niuren_order_col_index, value=row[niuren_order_col])
......
......@@ -2,8 +2,8 @@
Author: zhengxinhan
Date: 2025-05-08 15:02:02
LastEditors: zhengxinhan
LastEditTime: 2025-05-09 02:46:29
FilePath: /papertools-master/niurenshaixuan/utils.py
LastEditTime: 2025-05-11 12:42:43
FilePath: /papertools_niuren_ccfa/utils.py
Description:
Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
......@@ -11,66 +11,132 @@ Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
# Surname particles that bind to the following token(s) to form a compound
# family name, e.g. "De Salvo". Matching is case-sensitive.
_SURNAME_PREFIXES = frozenset({
    "De", "Del", "Della", "Di", "Da", "Van", "Von", "Le", "La", "O'", "Mc",
    "Mac", "Al", "El", "Bin", "Ben", "Ibn", "Ter", "Saint", "St.", "Dos",
    "Das", "Los", "Las", "San",
})

# Titles, honorifics and stray tokens that are dropped before parsing.
_IGNORED_TOKENS = frozenset({
    ".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.",
    "The", "Honorable",
})


def _is_parenthesized(token):
    """Return True when the token is wrapped in parentheses, e.g. "(Tracy)"."""
    return token.startswith("(") and token.endswith(")")


def _split_three_tokens(tokens):
    """Return (last_name, first_name) for a name made of exactly three tokens."""
    head, middle, tail = tokens

    if middle in _SURNAME_PREFIXES:
        # Barbara De Salvo --> De Salvo, Barbara
        return f"{middle} {tail}", head
    if _is_parenthesized(head):
        # (Alexander) Philip Dawid --> Dawid, Philip (Alexander)
        return tail, f"{middle} {head}"
    if _is_parenthesized(middle):
        # Xinyan (Tracy) Cui --> Cui, Xinyan (Tracy)
        return tail, f"{head} {middle}"
    if _is_parenthesized(tail):
        # Family name first, then the English name, then the given name in
        # parentheses: Ye Fred (Ying) --> Ye, Ying (Fred)
        return head, f"{tail[1:-1]} ({middle})"
    if head.endswith("."):
        # M. Jane Smith --> Smith, Jane M.
        # K.W. Michael Siu --> Siu, Michael K.W.
        return tail, f"{middle} {head}"
    if middle.endswith("."):
        # Jane M. Smith --> Smith, Jane M.
        # Pierre J.H. Richardson --> Richardson, Pierre J.H.
        return tail, f"{head} {middle}"
    if tail.endswith("."):
        # Wimmer-Schweingruber Robert F. --> Wimmer-Schweingruber, Robert F.
        # Wilderer Peter A. --> Wilderer, Peter A.
        return head, f"{middle} {tail}"
    # William Nelson Joy --> Joy, William Nelson
    # Michael J Carey --> Carey, Michael J
    return tail, f"{head} {middle}"


def _split_long_name(tokens):
    """Return (last_name, first_name) for a name made of four or more tokens."""
    # A prefix right before the final token: Juan Carlos De Leon --> De Leon
    if tokens[-2] in _SURNAME_PREFIXES:
        return " ".join(tokens[-2:]), " ".join(tokens[:-2])

    # Look for an earlier prefix that opens a longer compound surname, e.g.
    # "Del Carmen Gomez" in "Maria Del Carmen Gomez". The scan starts at 1:
    # a prefix in position 0 would leave no given name at all.
    for start in range(1, len(tokens) - 2):
        if tokens[start] not in _SURNAME_PREFIXES:
            continue
        between = tokens[start + 1:-1]
        if all(t in _SURNAME_PREFIXES or t[0].isupper() for t in between):
            return " ".join(tokens[start:]), " ".join(tokens[:start])

    # No compound surname detected: the last token is the family name.
    return tokens[-1], " ".join(tokens[:-1])


def standardized_name(name):
    """Convert a "First Last" style name to "Last, First" format.

    Names that already contain a comma are returned unchanged, as are
    non-string values (e.g. an empty spreadsheet cell). Titles and honorifics
    such as "Dr." or "Prof." are removed before parsing.

    Examples:
        'John Doe'               -> 'Doe, John'
        'M. Jane Smith'          -> 'Smith, Jane M.'
        'Barbara De Salvo'       -> 'De Salvo, Barbara'
        'Maria Del Carmen Gomez' -> 'Del Carmen Gomez, Maria'

    :param name: the name to standardize
    :return: the standardized name; the original value when there is nothing
        to parse; the bare token when only one token remains
    """
    if not isinstance(name, str) or "," in name:
        return name

    tokens = [part.strip() for part in name.split(" ") if part.strip()]
    tokens = [part for part in tokens if part not in _IGNORED_TOKENS]

    # Nothing left after removing titles: return the original name.
    if not tokens:
        return name
    # A single remaining token cannot be split into family/given name.
    if len(tokens) == 1:
        return tokens[0]

    if len(tokens) == 2:
        first_name, last_name = tokens
    elif len(tokens) == 3:
        last_name, first_name = _split_three_tokens(tokens)
    else:
        last_name, first_name = _split_long_name(tokens)

    return f"{last_name}, {first_name}"
def convert_to_lowercase(element):
    """Return a lower-cased copy of a string or of a nested list/tuple of strings.

    Lists and tuples are processed recursively and keep their container kind;
    any value that is not a string, list or tuple is returned unchanged.

    :param element: a string, a (possibly nested) list/tuple, or any other value
    :return: the lower-cased counterpart of ``element``
    """
    if isinstance(element, str):
        return element.lower()
    if isinstance(element, list):
        return [convert_to_lowercase(item) for item in element]
    if isinstance(element, tuple):
        return tuple(convert_to_lowercase(item) for item in element)
    return element
def name_in_niuren_list(name, niuren_name_list):
"""
判断名字是否在牛人列表中
......@@ -78,6 +144,8 @@ def name_in_niuren_list(name, niuren_name_list):
:param niuren_name_list: 牛人列表
:return: 如果在牛人列表中,返回 True,否则返回 False
"""
name = convert_to_lowercase(name)
niuren_name_list = convert_to_lowercase(niuren_name_list)
for idx, niuren_name in enumerate(niuren_name_list):
if isinstance(niuren_name, str):
if name == niuren_name:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment