Commit 34bba471 by Pengwei-Jin

author变为全小写形式匹配

parent 2cb16c72
......@@ -12,7 +12,7 @@ output_file_path = '测试输出3.xlsx'
def load_niuren_pool():
niuren_pool = pd.read_csv("info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
niuren_pool_names = niuren_pool["name"].tolist()
niuren_pool_names = [name.replace("\xa0", " ") for name in niuren_pool_names] # 去除空格
niuren_pool_names = [name.replace("\xa0", " ").lower() for name in niuren_pool_names] # 去除空格
return niuren_pool_names
......@@ -22,7 +22,7 @@ def load_true_niuren():
true_niuren["别名列表"] = None
true_niuren["别名列表"] = true_niuren["别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"].apply(
lambda x: [firstlast2lastfirst(i.strip()) for i in x.split(";") if i!=""] if isinstance(x, str) else []
lambda x: [firstlast2lastfirst(i.strip()).lower() for i in x.split(";") if i!=""] if isinstance(x, str) else []
)
true_niuren_names = []
......@@ -31,9 +31,9 @@ def load_true_niuren():
# 构建牛人姓名列表
for _, row in true_niuren.iterrows():
if row["别名列表"]:
true_niuren_names.append([firstlast2lastfirst(row["姓名"])] + row["别名列表"])
true_niuren_names.append([firstlast2lastfirst(row["姓名"]).lower()] + row["别名列表"])
else:
true_niuren_names.append(firstlast2lastfirst(row["姓名"]))
true_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())
# 构建牛人论文列表
for true_niuren_idx, _ in enumerate(true_niuren_names):
......@@ -58,7 +58,7 @@ def load_fake_niuren():
# 构建非牛人姓名列表
for _, row in fake_niuren.iterrows():
fake_niuren_names.append(firstlast2lastfirst(row["姓名"]))
fake_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())
# 构建非牛人论文列表
for fake_niuren_idx, _ in enumerate(fake_niuren_names):
......@@ -87,12 +87,12 @@ def check_niuren(authors, title):
continue
# 作者姓名能否在牛人池中找到
name_index = name_in_niuren_list(firstlast2lastfirst(author), niuren_pool_names)
name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), niuren_pool_names)
if name_index == -1:
continue
# 作者姓名能否在全局牛人中找到
true_name_index = name_in_niuren_list(firstlast2lastfirst(author), true_niuren_names)
true_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), true_niuren_names)
if true_name_index != -1:
if title.strip().lower() in true_niuren_papers[true_name_index]:
niuren.append(author)
......@@ -100,7 +100,7 @@ def check_niuren(authors, title):
continue
# 作者姓名能否在全局非牛人中找到
fake_name_index = name_in_niuren_list(firstlast2lastfirst(author), fake_niuren_names)
fake_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), fake_niuren_names)
if fake_name_index != -1:
if title.strip().lower() in fake_niuren_papers[fake_name_index]:
continue
......
......@@ -4,6 +4,7 @@ def firstlast2lastfirst(name):
# 名人列表中存在一些名字是First Name Last Name的格式,将其转换为Last Name, First Name格式
# 例如:'John Doe' -> 'Doe, John','M. Jane Smith' -> 'Smith, Jane M.'
special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
special_str = [item.lower() for item in special_str]
name_split = name.split(" ")
name_split = [item.strip() for item in name_split if item.strip() != ""]
name_split = [item for item in name_split if item not in special_str]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment