Commit ab7aabf8 by Tian_zi_kang

tianzikang update: Add niuren, location, zhimingqiye for good_papers.jsonl. Add location for hpca

parent 264ecd70
from utils import contains_chinese, chinese_to_english_name, firstlast2lastfirst, name_in_niuren_list
import json
import pandas as pd

# Script 3: tag every paper in good_papers.jsonl with its celebrity ("niuren")
# authors and their honorary titles, writing good_papers-niuren.jsonl.
# Celebrity table layout: column "name" holds normalized names; columns 2+ are
# assumed to be one 0/1 flag per honorary title -- TODO confirm against
# new_niuren_format-merged_turing.csv.
niuren_df = pd.read_csv("new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
niuren_title_names = niuren_df.columns[2:].tolist()
niuren_name = pd.read_csv("new_niuren_format-merged_turing.csv", encoding='utf-8-sig')["name"].tolist()
niuren_name = [name.replace("\xa0", " ") for name in niuren_name]  # normalize non-breaking spaces

# Read the JSON Lines input file line by line.
file_path = 'good_papers.jsonl'  # replace with your own input path if needed
# Parsed JSON objects, one per input line.
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse this line's JSON document.
        json_line = json.loads(line)
        data.append(json_line)

papers_with_niuren = []
for item in data:
    item_insert_flag = False  # becomes True once any celebrity author is found
    authors = item['authors']
    authors_name_list = [author['name'] for author in authors]
    # Some author names are non-strings (NaN, e.g. input row 8788); use "".
    authors_name_list = [item if isinstance(item, str) else "" for item in authors_name_list]
    # Transliterate Chinese names to pinyin "Last, First" form.
    authors_name_list = [item if not contains_chinese(item) else chinese_to_english_name(item) for item in authors_name_list]
    # Normalize "First Last" order to the "Last, First" form of the celebrity list.
    authors_name_list = [firstlast2lastfirst(item) for item in authors_name_list]
    item_niuren_authors = []  # 1-based indices of celebrity authors in this paper
    # Per title: 1-based positions (among this paper's celebrities) holding it.
    niuren_title_names_dict = {niuren_title: [] for niuren_title in niuren_title_names}
    niuren_idx = 0  # running count of celebrities found in this paper
    for idx, author_name in enumerate(authors_name_list):
        name_index = name_in_niuren_list(author_name, niuren_name)
        if name_index != -1:
            niuren_idx += 1
            if not item_insert_flag:
                item_insert_flag = True
            item_niuren_authors.append(idx + 1)  # author positions are 1-based
            # Per-title 0/1 flags for this celebrity (columns 2+ of the CSV row).
            author_title = niuren_df.iloc[name_index][2:].tolist()
            assert len(author_title) == len(niuren_title_names)
            for author_title_name, author_title_value in zip(niuren_title_names, author_title):
                if author_title_value == 1.0:
                    niuren_title_names_dict[author_title_name].append(niuren_idx)
    item['niuren_authors'] = item_niuren_authors
    # Serialize each title's position list as a "; "-joined string.
    for k, v in niuren_title_names_dict.items():
        item[k] = "; ".join([str(i) for i in v])
    papers_with_niuren.append(item)

# Write the augmented records back out as JSON Lines.
with open('good_papers-niuren.jsonl', 'w', encoding='utf-8') as f:
    for item in papers_with_niuren:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')
# Honorary-title columns expected in every record of good_papers-niuren.jsonl.
# Only the KEYS are used below; the values are never read
# (the "12" under "NAI" looks like leftover debug data -- TODO confirm).
niuren_titles = {
    "ACM Fellow": "",
    "IEEE Fellow": "",
    "AAAS": "",
    "NAS ": "",  # NOTE: trailing space is intentional -- it must match the JSONL key
    "NAE": "",
    "NAI": "12",
    "American Academy of Arts and Sciences": "",
    "European Academy of Sciences": "",
    "European Academy of Sciences and Arts": "",
    "Academia Europaea (AE)": "",
    "中国科学院": "",
    "中国工程院": "",
    "The Royal Society": "",
    "The Royal Society of Canada": "",
    "Turing Award": ""
}
from utils import read_jsonl
from copy import deepcopy

# Script 4: flatten good_papers-niuren.jsonl (keys: 'title', 'authors',
# 'filename', 'niuren_authors', plus one key per title above) into a CSV.
papers_with_niuren_final = read_jsonl("good_papers-niuren.jsonl")
import pandas as pd

df_data = {'title': [], 'authors': [], 'affiliations': [], 'filename': [], 'niuren_authors': []}
# One extra CSV column per honorary title.
for key in niuren_titles.keys():
    if key not in df_data.keys():
        df_data[key] = []
for item in papers_with_niuren_final:
    if "title" not in item.keys():
        # Records without a title carry an "applicant" field instead; use it.
        item["title"] = item["applicant"]
        # In the applicant case authors usually lack affiliations; default to [].
        for author in item['authors']:
            if "affiliations" not in author.keys():
                author['affiliations'] = []
    df_data['title'].append(item['title'])
    author_names = []
    author_affiliations = []
    for author in item['authors']:
        author_name = author['name']
        author_names.append(author_name)
        author_affiliations.extend(author['affiliations'])
    # Map 1-based celebrity indices back to author names.
    niuren_authors = [author_names[idx - 1] for idx in item['niuren_authors']]
    author_affiliations = list(set(author_affiliations))  # de-duplicate
    df_data['authors'].append(deepcopy("; ".join(author_names)))
    df_data['affiliations'].append(deepcopy("; ".join(author_affiliations)))
    df_data['filename'].append(deepcopy(item['filename']))
    df_data['niuren_authors'].append(deepcopy("; ".join(niuren_authors)))
    for key in niuren_titles.keys():
        df_data[key].append(deepcopy(item[key]))

# Save as CSV.
df = pd.DataFrame(df_data)
df.to_csv("good_papers-niuren.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from copy import deepcopy
from utils import get_continent
import pandas as pd

# Script 5: add a "continent_country" column to good_papers-niuren.csv by
# matching affiliation fragments against an institution->country table.
# In the Excel sheet the real column names ("Institution Name", "Country", ...)
# live in the first data row, so promote that row to the header.
df = pd.read_excel("机构国家汇总-初版.xlsx")
# Use the first row's values as the new column names.
new_columns = df.iloc[0].values
df.columns = new_columns
# Drop the first row (it has been consumed as the header).
df = df.drop(0).reset_index(drop=True)
institution_name = df["Institution Name"].tolist()

file_name = "good_papers-niuren.csv"
final = pd.read_csv(file_name, encoding='utf-8-sig')
# Add the continent_country column.
final["continent_country"] = ""
for row_idx, row in final.iterrows():
    continent_country_row = []
    try:
        # affiliations is a "; "-joined string; NaN (float) rows raise here.
        aff_split = row["affiliations"].split(";")
    except Exception as e:
        print("没有机构信息:", row["affiliations"], "type:", type(row["affiliations"]))
        continent_country_row = []
        aff_split = []
    aff_split = [x.strip() for x in aff_split]
    for aff in aff_split:
        # Try each comma-separated fragment of the affiliation against the
        # known institution names; first hit wins.
        aff_in_flag = False  # NOTE(review): set but never read -- vestigial
        aff_split_2 = aff.split(",")
        aff_split_2 = [x.strip() for x in aff_split_2]
        row_index = -1
        for x in aff_split_2:
            if x in institution_name:
                row_index = institution_name.index(x)
                break
        if row_index != -1:
            country = df.iloc[row_index]["Country"]
            continent = get_continent(country)
            continent_country_row.append(f"{continent}-{country}")
    continent_country_row = list(set(continent_country_row))  # de-duplicate
    continent_country_row = "; ".join(continent_country_row)
    final.at[row_idx, "continent_country"] = deepcopy(continent_country_row)

# Save the augmented table as CSV.
final.to_csv("good_papers-niuren-task_10.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from copy import deepcopy
from utils import get_continent
import pandas as pd

# Script 6: add a "知名企业" (well-known company) column by substring-matching
# company English names against each paper's affiliations string.
df = pd.read_excel("知名企业.xlsx")
qiye_name = df["企业-英文名"].tolist()  # company English names

file_name = "good_papers-niuren-task_10.csv"
final = pd.read_csv(file_name, encoding='utf-8-sig')
# Add the company column.
final["知名企业"] = ""
for row_idx, row in final.iterrows():
    row_qiye = []
    for qiye in qiye_name:
        try:
            if qiye in row["affiliations"]:
                if qiye == "Intel":
                    # Exclude false positives where "Intel" only appears inside
                    # "Intelligence" or "Intelligent".
                    aff_str = deepcopy(row["affiliations"])
                    aff_str = aff_str.replace("Intelligence", "")
                    aff_str = aff_str.replace("Intelligent", "")
                    if qiye in aff_str:
                        row_qiye.append(deepcopy(qiye))
                else:
                    row_qiye.append(deepcopy(qiye))
        except Exception as e:
            # affiliations may be NaN (not a string); skip such rows.
            continue
    row_qiye = list(set(row_qiye))  # de-duplicate (likely already unique)
    row_qiye = "; ".join(row_qiye)
    final.at[row_idx, "知名企业"] = deepcopy(row_qiye)

# Save the augmented table as CSV.
final.to_csv("good_papers-niuren-task_11.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from utils import get_continent
import pandas as pd
from copy import deepcopy

# Script 8: add "第一作者洲-国家" (first author continent-country) and
# "通讯作者洲-国家" (corresponding author continent-country) columns to the
# HPCA paper table by matching affiliation fragments against an
# institution->country lookup table.
hpca = pd.read_excel("hpca.xlsx")
new_hpca_columns = [
    "届数",
    "论文名称",
    "第一作者",
    "第一作者机构",
    "第一作者洲-国家",
    "通讯作者",
    "通讯作者机构",
    "通讯作者洲-国家",
]
new_hpca = pd.DataFrame(columns=new_hpca_columns)
# Copy over the original columns; the two 洲-国家 columns are filled below.
new_hpca["届数"] = hpca["届数"]
new_hpca["论文名称"] = hpca["论文名称"]
new_hpca["第一作者"] = hpca["第一作者"]
new_hpca["第一作者机构"] = hpca["第一作者机构"]
new_hpca["通讯作者"] = hpca["通讯作者"]
new_hpca["通讯作者机构"] = hpca["通讯作者机构"]

# Institution->country table; its first data row holds the real column names.
df = pd.read_excel("机构国家汇总-初版.xlsx")
# Use the first row's values as the new column names.
new_columns = df.iloc[0].values
df.columns = new_columns
# Drop the first row (it has been consumed as the header).
df = df.drop(0).reset_index(drop=True)
institution_name = df["Institution Name"].tolist()
country_name = df["Country"].tolist()

for row_idx, row in new_hpca.iterrows():
    diyi_flag = False     # first author matched (only used by commented-out fallback)
    tongxun_flag = False  # corresponding author matched (ditto)
    diyi_continent_country_row = []
    tongxun_continent_country_row = []
    try:
        # Affiliation cells are comma-separated; NaN cells raise here.
        # The assert guards the assumption that ";" is never used as a separator.
        diyi_aff = row["第一作者机构"].replace("\xa0", " ").split(",")
        assert ";" not in row["第一作者机构"]
    except Exception as e:
        diyi_aff = []
    try:
        tongxun_aff = row["通讯作者机构"].replace("\xa0", " ").split(",")
        assert ";" not in row["通讯作者机构"]
    except Exception as e:
        tongxun_aff = []
    diyi_aff = [x.strip() for x in diyi_aff]
    # Also try individual whitespace-separated tokens as match candidates.
    diyi_aff_more_detail = []
    for aff in diyi_aff:
        diyi_aff_more_detail.extend(aff.split(" "))
    diyi_aff_more_detail = [x.strip() for x in diyi_aff_more_detail]
    diyi_aff_more_detail.extend(diyi_aff)
    tongxun_aff = [x.strip() for x in tongxun_aff]
    tongxun_aff_more_detail = []
    for aff in tongxun_aff:
        tongxun_aff_more_detail.extend(aff.split(" "))
    tongxun_aff_more_detail = [x.strip() for x in tongxun_aff_more_detail]
    tongxun_aff_more_detail.extend(tongxun_aff)
    # First author: match whole comma-fragments against institutions/countries.
    for aff in diyi_aff:
        if aff in institution_name:
            diyi_flag = True
            diyi_row_index = institution_name.index(aff)
            country = df.iloc[diyi_row_index]["Country"]
            continent = get_continent(country)
            diyi_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            diyi_flag = True
            diyi_row_index = country_name.index(aff)
            continent = get_continent(aff)
            diyi_continent_country_row.append(f"{continent}-{aff}")
    # First author: same matching on the finer-grained token list.
    for aff in diyi_aff_more_detail:
        if aff in institution_name:
            diyi_flag = True
            diyi_row_index = institution_name.index(aff)
            country = df.iloc[diyi_row_index]["Country"]
            continent = get_continent(country)
            diyi_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            diyi_flag = True
            diyi_row_index = country_name.index(aff)
            continent = get_continent(aff)
            diyi_continent_country_row.append(f"{continent}-{aff}")
    # Corresponding author: match whole comma-fragments.
    for aff in tongxun_aff:
        if aff in institution_name:
            tongxun_flag = True
            tongxun_row_index = institution_name.index(aff)
            country = df.iloc[tongxun_row_index]["Country"]
            continent = get_continent(country)
            tongxun_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            tongxun_flag = True
            tongxun_row_index = country_name.index(aff)
            continent = get_continent(aff)
            tongxun_continent_country_row.append(f"{continent}-{aff}")
    # Corresponding author: same matching on the finer-grained token list.
    for aff in tongxun_aff_more_detail:
        if aff in institution_name:
            tongxun_flag = True
            tongxun_row_index = institution_name.index(aff)
            country = df.iloc[tongxun_row_index]["Country"]
            continent = get_continent(country)
            tongxun_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            tongxun_flag = True
            tongxun_row_index = country_name.index(aff)
            continent = get_continent(aff)
            tongxun_continent_country_row.append(f"{continent}-{aff}")
    diyi_continent_country_row = list(set(diyi_continent_country_row))  # de-duplicate
    diyi_continent_country_row = "; ".join(diyi_continent_country_row)
    new_hpca.at[row_idx, "第一作者洲-国家"] = deepcopy(diyi_continent_country_row)
    tongxun_continent_country_row = list(set(tongxun_continent_country_row))  # de-duplicate
    tongxun_continent_country_row = "; ".join(tongxun_continent_country_row)
    new_hpca.at[row_idx, "通讯作者洲-国家"] = deepcopy(tongxun_continent_country_row)

# Save the augmented table as CSV.
new_hpca.to_csv("hpca-洲-国家.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
### 3-find_celebrity_from_paper_with_authors.py
运行:`python 3-find_celebrity_from_paper_with_authors.py`
功能:为`good_papers.jsonl`文件中的每篇论文增加niuren_authors(作者索引列表,如[2,6]表示第2个和第6个是牛人,从1开始编号)和各头衔(如"1.2"表示牛人作者中的第1个和第2个是对应头衔,从1开始编号),得到`good_papers-niuren.jsonl`.
输入:`good_papers.jsonl`,每一行格式为
```python
{"title": "Automating multi-task learning on optical neural networks with weight sharing and physical rotation", "authors": [{"name": "Shanglin Zhou", "affiliations": ["School of Computing, University of Connecticut, Storrs 06269, USA"]}, {"name": "Yingjie Li", "affiliations": ["A. James Clark School of Engineering, University of Maryland, College Park 20742, USA"]}, {"name": "Weilu Gao", "affiliations": ["Electrical and Computer Engineering, University of Utah, Salt Lake City 84112, USA"]}, {"name": "Cunxi Yu", "affiliations": ["A. James Clark School of Engineering, University of Maryland, College Park 20742, USA"]}, {"name": "Caiwen Ding", "affiliations": ["Department of Computer Science & Engineering, University of Minnesota Twin Cities, Minneapolis 55455, USA"]}], "filename": "0001-0001-Automating multi-task learning on optical neural networks with weight sharing and physical rotation.pdf"}
```
输出:`good_papers-niuren.jsonl`,每一行格式为
```python
{"title": "On Efficiency, Fairness and Security in AI Accelerator Resource Sharing: A Survey", "authors": [{"name": "Jiahua Huang", "affiliations": ["South China University of Technology, China", "Pengcheng Laboratory, Guangzhou, China"]}, {"name": "Weiwei Lin", "affiliations": ["South China University of Technology, China", "Pengcheng Laboratory, Guangzhou, China"]}, {"name": "Wentai Wu", "affiliations": ["Department of Computer Science, Jinan University, Guangzhou, China"]}, {"name": "Yang Wang", "affiliations": ["Shenzhen Institutes of Advanced Technology Chinese Academy of Sciences, Shenzhen, China"]}, {"name": "Haocheng Zhong", "affiliations": ["South China University of Technology, Guangzhou, China"]}, {"name": "Xinhua Wang", "affiliations": ["South China University of Technology, Guangzhou, China"]}, {"name": "Keqin Li", "affiliations": ["Department of Computer Science, State University of New York, New Paltz, United States"]}], "filename": "0001-0009-On Efficiency Fairness and Security in AI Accelerator Resource Sharing A Survey.pdf", "niuren_authors": [4, 7], "ACM Fellow": "", "IEEE Fellow": "1; 2", "AAAS": "", "NAS ": "", "NAE": "", "NAI": "", "American Academy of Arts and Sciences": "", "European Academy of Sciences": "", "European Academy of Sciences and Arts": "", "Academia Europaea (AE)": "", "中国科学院": "", "中国工程院": "", "The Royal Society": "", "The Royal Society of Canada": "", "Turing Award": ""}
```
### 4-papers_with_niuren-final-to_csv.py
运行:`python 4-papers_with_niuren-final-to_csv.py`
功能:将`good_papers-niuren.jsonl`由jsonl格式转换为csv格式`good_papers-niuren.csv`
输入:good_papers-niuren.jsonl
输出:good_papers-niuren.csv
### 5-find_location_of_papers_with_niuren.py
运行:`python 5-find_location_of_papers_with_niuren.py`
功能:给`good_papers-niuren.csv`文件添加`continent_country`列,得到`good_papers-niuren-task_10.csv`
输入:good_papers-niuren.csv
输出:good_papers-niuren-task_10.csv
注意:第3步因为有一些机构无法和`机构国家汇总-初版.xlsx`中的机构匹配,因此需要手动检查`good_papers-niuren-task_10.csv`的`continent_country`列的空行,手动添加机构(但应该只是很少量的)
### 6-zhimingqiye.py
运行:`python 6-zhimingqiye.py`
功能:给`good_papers-niuren-task_10.csv`文件添加`知名企业`列,得到`good_papers-niuren-task_11.csv`
### 8-add_country_hpca.py
运行:`python 8-add_country_hpca.py`
功能:给`hpca_1-15.csv`文件增加`第一作者洲-国家`列和`通讯作者洲-国家`
输入:`hpca_1-15.csv`,包括"届数、论文名称、第一作者、第一作者机构、通讯作者、通讯作者机构"这几列
def contains_chinese(s_str):
    """Return True if *s_str* contains any CJK ideograph.

    Checks the CJK Unified Ideographs block (U+4E00..U+9FFF), Extension A
    (U+3400..U+4DBF) and Extension B (U+20000..U+2A6DF).

    Non-string inputs (e.g. NaN author names from pandas) return False
    instead of dropping into a debugger as the original did.
    """
    if not isinstance(s_str, str):
        return False
    for char in s_str:
        # Bug fix: the original wrote '\u20000', which Python parses as the
        # TWO-character string '\u2000' + '0', so Extension B characters
        # were never matched.  Supplementary-plane code points require the
        # 8-digit \U escape.
        if ('\u4e00' <= char <= '\u9fff'
                or '\u3400' <= char <= '\u4dbf'
                or '\U00020000' <= char <= '\U0002a6df'):
            return True
    return False
from pypinyin import pinyin, Style
def chinese_to_english_name(chinese_name):
    """Convert a Chinese name to pinyin in "Last, First" form.

    The first character is treated as the surname and everything after it as
    the given name.  Each pinyin syllable of the given name is capitalized and
    the syllables are joined without separators, e.g. 子康 -> "ZiKang".

    Raises:
        ValueError: if the name is shorter than two characters.
    """
    if len(chinese_name) < 2:
        raise ValueError("名字长度不足")
    surname = chinese_name[0]
    given_name = chinese_name[1:]
    # pypinyin returns one [syllable] list per character.
    surname_pinyin = pinyin(surname, style=Style.NORMAL, heteronym=False)[0][0].capitalize()
    given_pinyin = ''.join(
        syllable[0].capitalize()
        for syllable in pinyin(given_name, style=Style.NORMAL, heteronym=False)
    )
    return f"{surname_pinyin}, {given_pinyin}"
def firstlast2lastfirst(name):
    """Normalize a "First Last" style name to "Last, First".

    Names that already contain a comma, or that contain Chinese characters,
    are returned unchanged.  Honorifics/suffixes such as "Dr." or "Jr." are
    dropped before rearranging.

    NOTE(review): an empty string (after token filtering) would raise
    IndexError in the final branch, and a single remaining token yields
    "Token, " -- confirm callers never pass such values.
    """
    if "," in name or contains_chinese(name):
        return name
    # The celebrity list has some names in "First Name Last Name" order;
    # convert them to "Last Name, First Name" format.
    # e.g. 'John Doe' -> 'Doe, John', 'M. Jane Smith' -> 'Smith, Jane M.'
    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
    name_split = name.split(" ")
    name_split = [item.strip() for item in name_split if item.strip() != ""]
    name_split = [item for item in name_split if item not in special_str]
    if len(name_split) == 2:
        first_name = name_split[0]
        last_name = name_split[1]
        new_name = f"{last_name}, {first_name}"
    elif len(name_split) == 3:
        # Branch order matters: parenthesized nicknames are recognized before
        # initials, and initials before the default "two given names" case.
        if name_split[0].startswith("(") and name_split[0].endswith(")"):
            # (Alexander) Philip Dawid -> Dawid, Philip (Alexander)
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = f"{name_split[2]}"
        elif name_split[1].startswith("(") and name_split[1].endswith(")"):
            # Xinyan (Tracy) Cui -> Cui, Xinyan (Tracy)
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = f"{name_split[2]}"
        elif name_split[2].startswith("(") and name_split[2].endswith(")"):
            # "Ye Fred (Ying)": surname Ye, given name Ying, English name Fred
            #   -> "Ye, Ying (Fred)"-style output
            # "Zhu Jesse (Jingxu)": surname Zhu, given name Jingxu, English
            #   name Jesse -> "Zhu, Jingxu (Jesse)"
            first_name = f"{name_split[2][1:-1]} ({name_split[1]})"
            last_name = f"{name_split[0]}"
        elif name_split[0].endswith("."):
            # M. Jane Smith -> Smith, Jane M.
            # K.W. Michael Siu -> Siu, Michael K.W.
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = name_split[2]
        elif name_split[1].endswith("."):
            # Jane M. Smith -> Smith, Jane M.
            # Pierre J.H. Richardson -> Richardson, Pierre J.H.
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
        elif name_split[2].endswith("."):
            # Wimmer-Schweingruber Robert F. -> Wimmer-Schweingruber, Robert F.
            # Wilderer Peter A. -> Wilderer, Peter A.
            first_name = f"{name_split[1]} {name_split[2]}"
            last_name = f"{name_split[0]}"
        else:
            # William Nelson Joy -> Joy, William Nelson
            # Michael J Carey -> Carey, Michael J
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
        new_name = f"{last_name}, {first_name}"
    else:
        # Length > 3 (also covers length 1): treat the final token as the
        # surname.  Such long names are assumed not to collide elsewhere.
        last_name = name_split[-1]
        first_name = " ".join(name_split[:-1])
        new_name = f"{last_name}, {first_name}"
    return new_name
import numpy as np
def rule_of_same_names(item, row_values):
    """Score whether a group of same-name rows belongs to one person.

    *row_values* is a 2-D array of per-row flags; the rows are treated as the
    same person (return 1.0) when every column total is below 2, otherwise
    not (return 0.0).  *item* is kept for interface compatibility but unused
    (the commented-out draft in the original used it -- presumably a list of
    (idx, name) pairs; confirm before reviving that logic).
    """
    column_totals = np.sum(row_values, axis=0)
    same_person = all(total < 2 for total in column_totals)
    return 1.0 if same_person else 0.0
import json
# 创建一个空列表来存储解析后的 JSON 对象
def read_jsonl(file_path):
    """Read a JSON Lines file and return the parsed records.

    :param file_path: path to the JSONL file (UTF-8 encoded)
    :return: list with one parsed JSON object per input line
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # One JSON document per line; parse each as it streams in.
        return [json.loads(raw_line) for raw_line in handle]
def name_in_niuren_list(name, niuren_name_list):
    """Locate *name* in the celebrity name list.

    Exact matches are preferred; failing that, a prefix match counts when the
    longer string continues with a space right after the shorter one (so
    "Doe, John" matches "Doe, John Michael" but not "Doe, Johnson").

    :param name: normalized "Last, First" name to look up
    :param niuren_name_list: list of celebrity names
    :return: index of the first match, or -1 if none
    """
    # Pass 1: exact equality wins over any prefix match.
    for pos, candidate in enumerate(niuren_name_list):
        if candidate == name:
            return pos
    # Pass 2: one-sided prefix match, requiring a following space so that a
    # shorter name only matches at a word boundary of the longer one.
    for pos, candidate in enumerate(niuren_name_list):
        if candidate.startswith(name) and candidate[candidate.find(name) + len(name)] == " ":
            return pos
        if name.startswith(candidate) and name[name.find(candidate) + len(candidate)] == " ":
            return pos
    return -1
import pycountry
import pycountry_convert as pc
def get_continent(country_name):
    """Map a country name to its continent name using pycountry.

    Falls back to a case-insensitive substring search over all country names
    when the exact lookup fails.  Returns "未知洲" ("unknown continent") when
    the country cannot be resolved, or "出错:<err>" on any lookup exception.
    """
    # pycountry only knows South Korea under "Korea".
    if country_name == "South Korea":
        country_name = "Korea"
    try:
        match = pycountry.countries.get(name=country_name)
        if not match:
            # The name may be non-canonical; try a fuzzy substring match.
            needle = country_name.lower()
            match = next((c for c in pycountry.countries if needle in c.name.lower()), None)
        if not match:
            return "未知洲"
        continent_code = pc.country_alpha2_to_continent_code(match.alpha_2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception as e:
        return f"出错:{e}"
# # 示例
# print(get_continent("China")) # 输出: Asia
# print(get_continent("Germany")) # 输出: Europe
# print(get_continent("Brazil")) # 输出: South America
def parse_name(name):
    """Expand a "Last, First[-Middle]" name into lookup candidates.

    For a hyphen- or space-split given name, generates the six spelling
    variants used when matching against the celebrity list (first part alone,
    hyphenated, space-joined, with/without a trailing dot, and with the
    second part abbreviated to its initial), followed by the name as given.

    :param name: name expected in "Last, First" form
    :return: ordered list of candidate spellings (may contain duplicates),
             or None when *name* has no comma.
    """

    def _variants(last, given_1, given_2):
        # One-line purpose: the six spelling variants for a two-part given name.
        return [
            f"{last}, {given_1}",
            f"{last}, {given_1}-{given_2}",
            f"{last}, {given_1} {given_2}",
            f"{last}, {given_1} {given_2}.",
            f"{last}, {given_1} {given_2[0]}",
            f"{last}, {given_1} {given_2[0]}.",
        ]

    parts = name.split(',')
    if len(parts) < 2:
        return None  # not in "Last, First" form; caller skips it
    last_name = parts[0].strip()
    first_name_part = parts[1].strip()

    candidate_names = []
    # Given name split on "-" ("FeiFei-Q") and/or on " " ("FeiFei Q"); both
    # branches may fire for names like "Fei-Fei Q".
    if "-" in first_name_part:
        pieces = first_name_part.split('-')
        candidate_names.extend(_variants(last_name, pieces[0].strip(), pieces[1].strip()))
    # NOTE: the original also had a ", " branch here; it was dead code, since
    # first_name_part comes from name.split(',') and can never contain a comma.
    if " " in first_name_part:
        pieces = first_name_part.split(' ')
        candidate_names.extend(_variants(last_name, pieces[0].strip(), pieces[1].strip()))
    # Always include the name exactly as given, last.
    candidate_names.append(f"{last_name}, {first_name_part}")
    return candidate_names
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment