Commit 34bba471 by Pengwei-Jin

author变为全小写形式匹配

parent 2cb16c72
...@@ -12,7 +12,7 @@ output_file_path = '测试输出3.xlsx' ...@@ -12,7 +12,7 @@ output_file_path = '测试输出3.xlsx'
def load_niuren_pool(): def load_niuren_pool():
niuren_pool = pd.read_csv("info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig') niuren_pool = pd.read_csv("info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
niuren_pool_names = niuren_pool["name"].tolist() niuren_pool_names = niuren_pool["name"].tolist()
niuren_pool_names = [name.replace("\xa0", " ") for name in niuren_pool_names] # 去除空格 niuren_pool_names = [name.replace("\xa0", " ").lower() for name in niuren_pool_names] # 去除空格
return niuren_pool_names return niuren_pool_names
...@@ -22,7 +22,7 @@ def load_true_niuren(): ...@@ -22,7 +22,7 @@ def load_true_niuren():
true_niuren["别名列表"] = None true_niuren["别名列表"] = None
true_niuren["别名列表"] = true_niuren["别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"].apply( true_niuren["别名列表"] = true_niuren["别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"].apply(
lambda x: [firstlast2lastfirst(i.strip()) for i in x.split(";") if i!=""] if isinstance(x, str) else [] lambda x: [firstlast2lastfirst(i.strip()).lower() for i in x.split(";") if i!=""] if isinstance(x, str) else []
) )
true_niuren_names = [] true_niuren_names = []
...@@ -31,9 +31,9 @@ def load_true_niuren(): ...@@ -31,9 +31,9 @@ def load_true_niuren():
# 构建牛人姓名列表 # 构建牛人姓名列表
for _, row in true_niuren.iterrows(): for _, row in true_niuren.iterrows():
if row["别名列表"]: if row["别名列表"]:
true_niuren_names.append([firstlast2lastfirst(row["姓名"])] + row["别名列表"]) true_niuren_names.append([firstlast2lastfirst(row["姓名"]).lower()] + row["别名列表"])
else: else:
true_niuren_names.append(firstlast2lastfirst(row["姓名"])) true_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())
# 构建牛人论文列表 # 构建牛人论文列表
for true_niuren_idx, _ in enumerate(true_niuren_names): for true_niuren_idx, _ in enumerate(true_niuren_names):
...@@ -58,7 +58,7 @@ def load_fake_niuren(): ...@@ -58,7 +58,7 @@ def load_fake_niuren():
# 构建非牛人姓名列表 # 构建非牛人姓名列表
for _, row in fake_niuren.iterrows(): for _, row in fake_niuren.iterrows():
fake_niuren_names.append(firstlast2lastfirst(row["姓名"])) fake_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())
# 构建非牛人论文列表 # 构建非牛人论文列表
for fake_niuren_idx, _ in enumerate(fake_niuren_names): for fake_niuren_idx, _ in enumerate(fake_niuren_names):
...@@ -87,12 +87,12 @@ def check_niuren(authors, title): ...@@ -87,12 +87,12 @@ def check_niuren(authors, title):
continue continue
# 作者姓名能否在牛人池中找到 # 作者姓名能否在牛人池中找到
name_index = name_in_niuren_list(firstlast2lastfirst(author), niuren_pool_names) name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), niuren_pool_names)
if name_index == -1: if name_index == -1:
continue continue
# 作者姓名能否在全局牛人中找到 # 作者姓名能否在全局牛人中找到
true_name_index = name_in_niuren_list(firstlast2lastfirst(author), true_niuren_names) true_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), true_niuren_names)
if true_name_index != -1: if true_name_index != -1:
if title.strip().lower() in true_niuren_papers[true_name_index]: if title.strip().lower() in true_niuren_papers[true_name_index]:
niuren.append(author) niuren.append(author)
...@@ -100,7 +100,7 @@ def check_niuren(authors, title): ...@@ -100,7 +100,7 @@ def check_niuren(authors, title):
continue continue
# 作者姓名能否在全局非牛人中找到 # 作者姓名能否在全局非牛人中找到
fake_name_index = name_in_niuren_list(firstlast2lastfirst(author), fake_niuren_names) fake_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), fake_niuren_names)
if fake_name_index != -1: if fake_name_index != -1:
if title.strip().lower() in fake_niuren_papers[fake_name_index]: if title.strip().lower() in fake_niuren_papers[fake_name_index]:
continue continue
......
...@@ -4,6 +4,7 @@ def firstlast2lastfirst(name): ...@@ -4,6 +4,7 @@ def firstlast2lastfirst(name):
# 名人列表中存在一些名字是First Name Last Name的格式,将其转换为Last Name, First Name格式 # 名人列表中存在一些名字是First Name Last Name的格式,将其转换为Last Name, First Name格式
# 例如:'John Doe' -> 'Doe, John','M. Jane Smith' -> 'Smith, Jane M.' # 例如:'John Doe' -> 'Doe, John','M. Jane Smith' -> 'Smith, Jane M.'
special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"] special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
special_str = [item.lower() for item in special_str]
name_split = name.split(" ") name_split = name.split(" ")
name_split = [item.strip() for item in name_split if item.strip() != ""] name_split = [item.strip() for item in name_split if item.strip() != ""]
name_split = [item for item in name_split if item not in special_str] name_split = [item for item in name_split if item not in special_str]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment