Commit ab7aabf8 by Tian_zi_kang

tianzikang update: Add niuren, location, zhimingqiye for good_papers.jsonl. Add location for hpca

parent 264ecd70
from utils import contains_chinese, chinese_to_english_name, firstlast2lastfirst, name_in_niuren_list
import json
import pandas as pd

# Script 3: tag every paper in good_papers.jsonl with its celebrity ("niuren")
# authors and their honorary titles, writing good_papers-niuren.jsonl.
# Celebrity table layout: column "name" holds normalized names; columns 2+ are
# assumed to be one 0/1 flag per honorary title -- TODO confirm against
# new_niuren_format-merged_turing.csv.
niuren_df = pd.read_csv("new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
niuren_title_names = niuren_df.columns[2:].tolist()
niuren_name = pd.read_csv("new_niuren_format-merged_turing.csv", encoding='utf-8-sig')["name"].tolist()
niuren_name = [name.replace("\xa0", " ") for name in niuren_name]  # normalize non-breaking spaces

# Read the JSON Lines input file line by line.
file_path = 'good_papers.jsonl'  # replace with your own input path if needed
# Parsed JSON objects, one per input line.
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse this line's JSON document.
        json_line = json.loads(line)
        data.append(json_line)

papers_with_niuren = []
for item in data:
    item_insert_flag = False  # becomes True once any celebrity author is found
    authors = item['authors']
    authors_name_list = [author['name'] for author in authors]
    # Some author names are non-strings (NaN, e.g. input row 8788); use "".
    authors_name_list = [item if isinstance(item, str) else "" for item in authors_name_list]
    # Transliterate Chinese names to pinyin "Last, First" form.
    authors_name_list = [item if not contains_chinese(item) else chinese_to_english_name(item) for item in authors_name_list]
    # Normalize "First Last" order to the "Last, First" form of the celebrity list.
    authors_name_list = [firstlast2lastfirst(item) for item in authors_name_list]
    item_niuren_authors = []  # 1-based indices of celebrity authors in this paper
    # Per title: 1-based positions (among this paper's celebrities) holding it.
    niuren_title_names_dict = {niuren_title: [] for niuren_title in niuren_title_names}
    niuren_idx = 0  # running count of celebrities found in this paper
    for idx, author_name in enumerate(authors_name_list):
        name_index = name_in_niuren_list(author_name, niuren_name)
        if name_index != -1:
            niuren_idx += 1
            if not item_insert_flag:
                item_insert_flag = True
            item_niuren_authors.append(idx + 1)  # author positions are 1-based
            # Per-title 0/1 flags for this celebrity (columns 2+ of the CSV row).
            author_title = niuren_df.iloc[name_index][2:].tolist()
            assert len(author_title) == len(niuren_title_names)
            for author_title_name, author_title_value in zip(niuren_title_names, author_title):
                if author_title_value == 1.0:
                    niuren_title_names_dict[author_title_name].append(niuren_idx)
    item['niuren_authors'] = item_niuren_authors
    # Serialize each title's position list as a "; "-joined string.
    for k, v in niuren_title_names_dict.items():
        item[k] = "; ".join([str(i) for i in v])
    papers_with_niuren.append(item)

# Write the augmented records back out as JSON Lines.
with open('good_papers-niuren.jsonl', 'w', encoding='utf-8') as f:
    for item in papers_with_niuren:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')
# Honorary-title columns expected in every record of good_papers-niuren.jsonl.
# Only the KEYS are used below; the values are never read
# (the "12" under "NAI" looks like leftover debug data -- TODO confirm).
niuren_titles = {
    "ACM Fellow": "",
    "IEEE Fellow": "",
    "AAAS": "",
    "NAS ": "",  # NOTE: trailing space is intentional -- it must match the JSONL key
    "NAE": "",
    "NAI": "12",
    "American Academy of Arts and Sciences": "",
    "European Academy of Sciences": "",
    "European Academy of Sciences and Arts": "",
    "Academia Europaea (AE)": "",
    "中国科学院": "",
    "中国工程院": "",
    "The Royal Society": "",
    "The Royal Society of Canada": "",
    "Turing Award": ""
}
from utils import read_jsonl
from copy import deepcopy

# Script 4: flatten good_papers-niuren.jsonl (keys: 'title', 'authors',
# 'filename', 'niuren_authors', plus one key per title above) into a CSV.
papers_with_niuren_final = read_jsonl("good_papers-niuren.jsonl")
import pandas as pd

df_data = {'title': [], 'authors': [], 'affiliations': [], 'filename': [], 'niuren_authors': []}
# One extra CSV column per honorary title.
for key in niuren_titles.keys():
    if key not in df_data.keys():
        df_data[key] = []
for item in papers_with_niuren_final:
    if "title" not in item.keys():
        # Records without a title carry an "applicant" field instead; use it.
        item["title"] = item["applicant"]
        # In the applicant case authors usually lack affiliations; default to [].
        for author in item['authors']:
            if "affiliations" not in author.keys():
                author['affiliations'] = []
    df_data['title'].append(item['title'])
    author_names = []
    author_affiliations = []
    for author in item['authors']:
        author_name = author['name']
        author_names.append(author_name)
        author_affiliations.extend(author['affiliations'])
    # Map 1-based celebrity indices back to author names.
    niuren_authors = [author_names[idx - 1] for idx in item['niuren_authors']]
    author_affiliations = list(set(author_affiliations))  # de-duplicate
    df_data['authors'].append(deepcopy("; ".join(author_names)))
    df_data['affiliations'].append(deepcopy("; ".join(author_affiliations)))
    df_data['filename'].append(deepcopy(item['filename']))
    df_data['niuren_authors'].append(deepcopy("; ".join(niuren_authors)))
    for key in niuren_titles.keys():
        df_data[key].append(deepcopy(item[key]))

# Save as CSV.
df = pd.DataFrame(df_data)
df.to_csv("good_papers-niuren.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from copy import deepcopy
from utils import get_continent
import pandas as pd

# Script 5: add a "continent_country" column to good_papers-niuren.csv by
# matching affiliation fragments against an institution->country table.
# In the Excel sheet the real column names ("Institution Name", "Country", ...)
# live in the first data row, so promote that row to the header.
df = pd.read_excel("机构国家汇总-初版.xlsx")
# Use the first row's values as the new column names.
new_columns = df.iloc[0].values
df.columns = new_columns
# Drop the first row (it has been consumed as the header).
df = df.drop(0).reset_index(drop=True)
institution_name = df["Institution Name"].tolist()

file_name = "good_papers-niuren.csv"
final = pd.read_csv(file_name, encoding='utf-8-sig')
# Add the continent_country column.
final["continent_country"] = ""
for row_idx, row in final.iterrows():
    continent_country_row = []
    try:
        # affiliations is a "; "-joined string; NaN (float) rows raise here.
        aff_split = row["affiliations"].split(";")
    except Exception as e:
        print("没有机构信息:", row["affiliations"], "type:", type(row["affiliations"]))
        continent_country_row = []
        aff_split = []
    aff_split = [x.strip() for x in aff_split]
    for aff in aff_split:
        # Try each comma-separated fragment of the affiliation against the
        # known institution names; first hit wins.
        aff_in_flag = False  # NOTE(review): set but never read -- vestigial
        aff_split_2 = aff.split(",")
        aff_split_2 = [x.strip() for x in aff_split_2]
        row_index = -1
        for x in aff_split_2:
            if x in institution_name:
                row_index = institution_name.index(x)
                break
        if row_index != -1:
            country = df.iloc[row_index]["Country"]
            continent = get_continent(country)
            continent_country_row.append(f"{continent}-{country}")
    continent_country_row = list(set(continent_country_row))  # de-duplicate
    continent_country_row = "; ".join(continent_country_row)
    final.at[row_idx, "continent_country"] = deepcopy(continent_country_row)

# Save the augmented table as CSV.
final.to_csv("good_papers-niuren-task_10.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from copy import deepcopy
from utils import get_continent
import pandas as pd

# Script 6: add a "知名企业" (well-known company) column by substring-matching
# company English names against each paper's affiliations string.
df = pd.read_excel("知名企业.xlsx")
qiye_name = df["企业-英文名"].tolist()  # company English names

file_name = "good_papers-niuren-task_10.csv"
final = pd.read_csv(file_name, encoding='utf-8-sig')
# Add the company column.
final["知名企业"] = ""
for row_idx, row in final.iterrows():
    row_qiye = []
    for qiye in qiye_name:
        try:
            if qiye in row["affiliations"]:
                if qiye == "Intel":
                    # Exclude false positives where "Intel" only appears inside
                    # "Intelligence" or "Intelligent".
                    aff_str = deepcopy(row["affiliations"])
                    aff_str = aff_str.replace("Intelligence", "")
                    aff_str = aff_str.replace("Intelligent", "")
                    if qiye in aff_str:
                        row_qiye.append(deepcopy(qiye))
                else:
                    row_qiye.append(deepcopy(qiye))
        except Exception as e:
            # affiliations may be NaN (not a string); skip such rows.
            continue
    row_qiye = list(set(row_qiye))  # de-duplicate (likely already unique)
    row_qiye = "; ".join(row_qiye)
    final.at[row_idx, "知名企业"] = deepcopy(row_qiye)

# Save the augmented table as CSV.
final.to_csv("good_papers-niuren-task_11.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
from utils import get_continent
import pandas as pd
from copy import deepcopy

# Script 8: add "第一作者洲-国家" (first author continent-country) and
# "通讯作者洲-国家" (corresponding author continent-country) columns to the
# HPCA paper table by matching affiliation fragments against an
# institution->country lookup table.
hpca = pd.read_excel("hpca.xlsx")
new_hpca_columns = [
    "届数",
    "论文名称",
    "第一作者",
    "第一作者机构",
    "第一作者洲-国家",
    "通讯作者",
    "通讯作者机构",
    "通讯作者洲-国家",
]
new_hpca = pd.DataFrame(columns=new_hpca_columns)
# Copy over the original columns; the two 洲-国家 columns are filled below.
new_hpca["届数"] = hpca["届数"]
new_hpca["论文名称"] = hpca["论文名称"]
new_hpca["第一作者"] = hpca["第一作者"]
new_hpca["第一作者机构"] = hpca["第一作者机构"]
new_hpca["通讯作者"] = hpca["通讯作者"]
new_hpca["通讯作者机构"] = hpca["通讯作者机构"]

# Institution->country table; its first data row holds the real column names.
df = pd.read_excel("机构国家汇总-初版.xlsx")
# Use the first row's values as the new column names.
new_columns = df.iloc[0].values
df.columns = new_columns
# Drop the first row (it has been consumed as the header).
df = df.drop(0).reset_index(drop=True)
institution_name = df["Institution Name"].tolist()
country_name = df["Country"].tolist()

for row_idx, row in new_hpca.iterrows():
    diyi_flag = False     # first author matched (only used by commented-out fallback)
    tongxun_flag = False  # corresponding author matched (ditto)
    diyi_continent_country_row = []
    tongxun_continent_country_row = []
    try:
        # Affiliation cells are comma-separated; NaN cells raise here.
        # The assert guards the assumption that ";" is never used as a separator.
        diyi_aff = row["第一作者机构"].replace("\xa0", " ").split(",")
        assert ";" not in row["第一作者机构"]
    except Exception as e:
        diyi_aff = []
    try:
        tongxun_aff = row["通讯作者机构"].replace("\xa0", " ").split(",")
        assert ";" not in row["通讯作者机构"]
    except Exception as e:
        tongxun_aff = []
    diyi_aff = [x.strip() for x in diyi_aff]
    # Also try individual whitespace-separated tokens as match candidates.
    diyi_aff_more_detail = []
    for aff in diyi_aff:
        diyi_aff_more_detail.extend(aff.split(" "))
    diyi_aff_more_detail = [x.strip() for x in diyi_aff_more_detail]
    diyi_aff_more_detail.extend(diyi_aff)
    tongxun_aff = [x.strip() for x in tongxun_aff]
    tongxun_aff_more_detail = []
    for aff in tongxun_aff:
        tongxun_aff_more_detail.extend(aff.split(" "))
    tongxun_aff_more_detail = [x.strip() for x in tongxun_aff_more_detail]
    tongxun_aff_more_detail.extend(tongxun_aff)
    # First author: match whole comma-fragments against institutions/countries.
    for aff in diyi_aff:
        if aff in institution_name:
            diyi_flag = True
            diyi_row_index = institution_name.index(aff)
            country = df.iloc[diyi_row_index]["Country"]
            continent = get_continent(country)
            diyi_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            diyi_flag = True
            diyi_row_index = country_name.index(aff)
            continent = get_continent(aff)
            diyi_continent_country_row.append(f"{continent}-{aff}")
    # First author: same matching on the finer-grained token list.
    for aff in diyi_aff_more_detail:
        if aff in institution_name:
            diyi_flag = True
            diyi_row_index = institution_name.index(aff)
            country = df.iloc[diyi_row_index]["Country"]
            continent = get_continent(country)
            diyi_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            diyi_flag = True
            diyi_row_index = country_name.index(aff)
            continent = get_continent(aff)
            diyi_continent_country_row.append(f"{continent}-{aff}")
    # Corresponding author: match whole comma-fragments.
    for aff in tongxun_aff:
        if aff in institution_name:
            tongxun_flag = True
            tongxun_row_index = institution_name.index(aff)
            country = df.iloc[tongxun_row_index]["Country"]
            continent = get_continent(country)
            tongxun_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            tongxun_flag = True
            tongxun_row_index = country_name.index(aff)
            continent = get_continent(aff)
            tongxun_continent_country_row.append(f"{continent}-{aff}")
    # Corresponding author: same matching on the finer-grained token list.
    for aff in tongxun_aff_more_detail:
        if aff in institution_name:
            tongxun_flag = True
            tongxun_row_index = institution_name.index(aff)
            country = df.iloc[tongxun_row_index]["Country"]
            continent = get_continent(country)
            tongxun_continent_country_row.append(f"{continent}-{country}")
        if aff in country_name:
            tongxun_flag = True
            tongxun_row_index = country_name.index(aff)
            continent = get_continent(aff)
            tongxun_continent_country_row.append(f"{continent}-{aff}")
    diyi_continent_country_row = list(set(diyi_continent_country_row))  # de-duplicate
    diyi_continent_country_row = "; ".join(diyi_continent_country_row)
    new_hpca.at[row_idx, "第一作者洲-国家"] = deepcopy(diyi_continent_country_row)
    tongxun_continent_country_row = list(set(tongxun_continent_country_row))  # de-duplicate
    tongxun_continent_country_row = "; ".join(tongxun_continent_country_row)
    new_hpca.at[row_idx, "通讯作者洲-国家"] = deepcopy(tongxun_continent_country_row)

# Save the augmented table as CSV.
new_hpca.to_csv("hpca-洲-国家.csv", index=False, encoding='utf-8-sig')
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
### 3-find_celebrity_from_paper_with_authors.py
运行:`python 3-find_celebrity_from_paper_with_authors.py`
功能:为`good_papers.jsonl`文件中的每篇论文增加niuren_authors(作者索引列表,如[2,6]表示第2个和第6个是牛人,从1开始编号)和各头衔(如"1.2"表示牛人作者中的第1个和第2个是对应头衔,从1开始编号),得到`good_papers-niuren.jsonl`.
输入:`good_papers.jsonl`,每一行格式为
```python
{"title": "Automating multi-task learning on optical neural networks with weight sharing and physical rotation", "authors": [{"name": "Shanglin Zhou", "affiliations": ["School of Computing, University of Connecticut, Storrs 06269, USA"]}, {"name": "Yingjie Li", "affiliations": ["A. James Clark School of Engineering, University of Maryland, College Park 20742, USA"]}, {"name": "Weilu Gao", "affiliations": ["Electrical and Computer Engineering, University of Utah, Salt Lake City 84112, USA"]}, {"name": "Cunxi Yu", "affiliations": ["A. James Clark School of Engineering, University of Maryland, College Park 20742, USA"]}, {"name": "Caiwen Ding", "affiliations": ["Department of Computer Science & Engineering, University of Minnesota Twin Cities, Minneapolis 55455, USA"]}], "filename": "0001-0001-Automating multi-task learning on optical neural networks with weight sharing and physical rotation.pdf"}
```
输出:`good_papers-niuren.jsonl`,每一行格式为
```python
{"title": "On Efficiency, Fairness and Security in AI Accelerator Resource Sharing: A Survey", "authors": [{"name": "Jiahua Huang", "affiliations": ["South China University of Technology, China", "Pengcheng Laboratory, Guangzhou, China"]}, {"name": "Weiwei Lin", "affiliations": ["South China University of Technology, China", "Pengcheng Laboratory, Guangzhou, China"]}, {"name": "Wentai Wu", "affiliations": ["Department of Computer Science, Jinan University, Guangzhou, China"]}, {"name": "Yang Wang", "affiliations": ["Shenzhen Institutes of Advanced Technology Chinese Academy of Sciences, Shenzhen, China"]}, {"name": "Haocheng Zhong", "affiliations": ["South China University of Technology, Guangzhou, China"]}, {"name": "Xinhua Wang", "affiliations": ["South China University of Technology, Guangzhou, China"]}, {"name": "Keqin Li", "affiliations": ["Department of Computer Science, State University of New York, New Paltz, United States"]}], "filename": "0001-0009-On Efficiency Fairness and Security in AI Accelerator Resource Sharing A Survey.pdf", "niuren_authors": [4, 7], "ACM Fellow": "", "IEEE Fellow": "1; 2", "AAAS": "", "NAS ": "", "NAE": "", "NAI": "", "American Academy of Arts and Sciences": "", "European Academy of Sciences": "", "European Academy of Sciences and Arts": "", "Academia Europaea (AE)": "", "中国科学院": "", "中国工程院": "", "The Royal Society": "", "The Royal Society of Canada": "", "Turing Award": ""}
```
### 4-papers_with_niuren-final-to_csv.py
运行:`python 4-papers_with_niuren-final-to_csv.py`
功能:将`good_papers-niuren.jsonl`由jsonl格式转换为csv格式`good_papers-niuren.csv`
输入:good_papers-niuren.jsonl
输出:good_papers-niuren.csv
### 5-find_location_of_papers_with_niuren.py
运行:`python 5-find_location_of_papers_with_niuren.py`
功能:给`good_papers-niuren.csv`文件添加`continent_country`列,得到`good_papers-niuren-task_10.csv`
输入:good_papers-niuren.csv
输出:good_papers-niuren-task_10.csv
注意:第3步因为有一些机构无法和`机构国家汇总-初版.xlsx`中的机构匹配,因此需要手动检查`good_papers-niuren-task_10.csv`的`continent_country`列的空行,手动添加机构(但应该只是很少量的)
### 6-zhimingqiye.py
运行:`python 6-zhimingqiye.py`
功能:给`good_papers-niuren-task_10.csv`文件添加`知名企业`列,得到`good_papers-niuren-task_11.csv`
### 8-add_country_hpca.py
运行:`python 8-add_country_hpca.py`
功能:给`hpca_1-15.csv`文件增加`第一作者洲-国家`列和`通讯作者洲-国家`
输入:`hpca_1-15.csv`,包括"届数、论文名称、第一作者、第一作者机构、通讯作者、通讯作者机构"这几列
def contains_chinese(s_str):
    """Return True if *s_str* contains any CJK ideograph.

    Checks the CJK Unified Ideographs block (U+4E00..U+9FFF), Extension A
    (U+3400..U+4DBF) and Extension B (U+20000..U+2A6DF).

    Non-string inputs (e.g. NaN author names from pandas) return False
    instead of dropping into a debugger as the original did.
    """
    if not isinstance(s_str, str):
        return False
    for char in s_str:
        # Bug fix: the original wrote '\u20000', which Python parses as the
        # TWO-character string '\u2000' + '0', so Extension B characters
        # were never matched.  Supplementary-plane code points require the
        # 8-digit \U escape.
        if ('\u4e00' <= char <= '\u9fff'
                or '\u3400' <= char <= '\u4dbf'
                or '\U00020000' <= char <= '\U0002a6df'):
            return True
    return False
from pypinyin import pinyin, Style
def chinese_to_english_name(chinese_name):
    """Convert a Chinese name to pinyin in "Last, First" form.

    The first character is treated as the surname and everything after it as
    the given name.  Each pinyin syllable of the given name is capitalized and
    the syllables are joined without separators, e.g. 子康 -> "ZiKang".

    Raises:
        ValueError: if the name is shorter than two characters.
    """
    if len(chinese_name) < 2:
        raise ValueError("名字长度不足")
    surname = chinese_name[0]
    given_name = chinese_name[1:]
    # pypinyin returns one [syllable] list per character.
    surname_pinyin = pinyin(surname, style=Style.NORMAL, heteronym=False)[0][0].capitalize()
    given_pinyin = ''.join(
        syllable[0].capitalize()
        for syllable in pinyin(given_name, style=Style.NORMAL, heteronym=False)
    )
    return f"{surname_pinyin}, {given_pinyin}"
def firstlast2lastfirst(name):
    """Normalize a "First Last" style name to "Last, First".

    Names that already contain a comma, or that contain Chinese characters,
    are returned unchanged.  Honorifics/suffixes such as "Dr." or "Jr." are
    dropped before rearranging.

    NOTE(review): an empty string (after token filtering) would raise
    IndexError in the final branch, and a single remaining token yields
    "Token, " -- confirm callers never pass such values.
    """
    if "," in name or contains_chinese(name):
        return name
    # The celebrity list has some names in "First Name Last Name" order;
    # convert them to "Last Name, First Name" format.
    # e.g. 'John Doe' -> 'Doe, John', 'M. Jane Smith' -> 'Smith, Jane M.'
    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
    name_split = name.split(" ")
    name_split = [item.strip() for item in name_split if item.strip() != ""]
    name_split = [item for item in name_split if item not in special_str]
    if len(name_split) == 2:
        first_name = name_split[0]
        last_name = name_split[1]
        new_name = f"{last_name}, {first_name}"
    elif len(name_split) == 3:
        # Branch order matters: parenthesized nicknames are recognized before
        # initials, and initials before the default "two given names" case.
        if name_split[0].startswith("(") and name_split[0].endswith(")"):
            # (Alexander) Philip Dawid -> Dawid, Philip (Alexander)
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = f"{name_split[2]}"
        elif name_split[1].startswith("(") and name_split[1].endswith(")"):
            # Xinyan (Tracy) Cui -> Cui, Xinyan (Tracy)
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = f"{name_split[2]}"
        elif name_split[2].startswith("(") and name_split[2].endswith(")"):
            # "Ye Fred (Ying)": surname Ye, given name Ying, English name Fred
            #   -> "Ye, Ying (Fred)"-style output
            # "Zhu Jesse (Jingxu)": surname Zhu, given name Jingxu, English
            #   name Jesse -> "Zhu, Jingxu (Jesse)"
            first_name = f"{name_split[2][1:-1]} ({name_split[1]})"
            last_name = f"{name_split[0]}"
        elif name_split[0].endswith("."):
            # M. Jane Smith -> Smith, Jane M.
            # K.W. Michael Siu -> Siu, Michael K.W.
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = name_split[2]
        elif name_split[1].endswith("."):
            # Jane M. Smith -> Smith, Jane M.
            # Pierre J.H. Richardson -> Richardson, Pierre J.H.
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
        elif name_split[2].endswith("."):
            # Wimmer-Schweingruber Robert F. -> Wimmer-Schweingruber, Robert F.
            # Wilderer Peter A. -> Wilderer, Peter A.
            first_name = f"{name_split[1]} {name_split[2]}"
            last_name = f"{name_split[0]}"
        else:
            # William Nelson Joy -> Joy, William Nelson
            # Michael J Carey -> Carey, Michael J
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
        new_name = f"{last_name}, {first_name}"
    else:
        # Length > 3 (also covers length 1): treat the final token as the
        # surname.  Such long names are assumed not to collide elsewhere.
        last_name = name_split[-1]
        first_name = " ".join(name_split[:-1])
        new_name = f"{last_name}, {first_name}"
    return new_name
import numpy as np
def rule_of_same_names(item, row_values):
    """Score whether a group of same-name rows belongs to one person.

    *row_values* is a 2-D array of per-row flags; the rows are treated as the
    same person (return 1.0) when every column total is below 2, otherwise
    not (return 0.0).  *item* is kept for interface compatibility but unused
    (the commented-out draft in the original used it -- presumably a list of
    (idx, name) pairs; confirm before reviving that logic).
    """
    column_totals = np.sum(row_values, axis=0)
    same_person = all(total < 2 for total in column_totals)
    return 1.0 if same_person else 0.0
import json
# 创建一个空列表来存储解析后的 JSON 对象
def read_jsonl(file_path):
    """Read a JSON Lines file and return the parsed records.

    :param file_path: path to the JSONL file (UTF-8 encoded)
    :return: list with one parsed JSON object per input line
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # One JSON document per line; parse each as it streams in.
        return [json.loads(raw_line) for raw_line in handle]
def name_in_niuren_list(name, niuren_name_list):
    """Locate *name* in the celebrity name list.

    Exact matches are preferred; failing that, a prefix match counts when the
    longer string continues with a space right after the shorter one (so
    "Doe, John" matches "Doe, John Michael" but not "Doe, Johnson").

    :param name: normalized "Last, First" name to look up
    :param niuren_name_list: list of celebrity names
    :return: index of the first match, or -1 if none
    """
    # Pass 1: exact equality wins over any prefix match.
    for pos, candidate in enumerate(niuren_name_list):
        if candidate == name:
            return pos
    # Pass 2: one-sided prefix match, requiring a following space so that a
    # shorter name only matches at a word boundary of the longer one.
    for pos, candidate in enumerate(niuren_name_list):
        if candidate.startswith(name) and candidate[candidate.find(name) + len(name)] == " ":
            return pos
        if name.startswith(candidate) and name[name.find(candidate) + len(candidate)] == " ":
            return pos
    return -1
import pycountry
import pycountry_convert as pc
def get_continent(country_name):
    """Map a country name to its continent name using pycountry.

    Falls back to a case-insensitive substring search over all country names
    when the exact lookup fails.  Returns "未知洲" ("unknown continent") when
    the country cannot be resolved, or "出错:<err>" on any lookup exception.
    """
    # pycountry only knows South Korea under "Korea".
    if country_name == "South Korea":
        country_name = "Korea"
    try:
        match = pycountry.countries.get(name=country_name)
        if not match:
            # The name may be non-canonical; try a fuzzy substring match.
            needle = country_name.lower()
            match = next((c for c in pycountry.countries if needle in c.name.lower()), None)
        if not match:
            return "未知洲"
        continent_code = pc.country_alpha2_to_continent_code(match.alpha_2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception as e:
        return f"出错:{e}"
# # 示例
# print(get_continent("China")) # 输出: Asia
# print(get_continent("Germany")) # 输出: Europe
# print(get_continent("Brazil")) # 输出: South America
def parse_name(name):
    """Expand a "Last, First[-Middle]" name into lookup candidates.

    For a hyphen- or space-split given name, generates the six spelling
    variants used when matching against the celebrity list (first part alone,
    hyphenated, space-joined, with/without a trailing dot, and with the
    second part abbreviated to its initial), followed by the name as given.

    :param name: name expected in "Last, First" form
    :return: ordered list of candidate spellings (may contain duplicates),
             or None when *name* has no comma.
    """

    def _variants(last, given_1, given_2):
        # One-line purpose: the six spelling variants for a two-part given name.
        return [
            f"{last}, {given_1}",
            f"{last}, {given_1}-{given_2}",
            f"{last}, {given_1} {given_2}",
            f"{last}, {given_1} {given_2}.",
            f"{last}, {given_1} {given_2[0]}",
            f"{last}, {given_1} {given_2[0]}.",
        ]

    parts = name.split(',')
    if len(parts) < 2:
        return None  # not in "Last, First" form; caller skips it
    last_name = parts[0].strip()
    first_name_part = parts[1].strip()

    candidate_names = []
    # Given name split on "-" ("FeiFei-Q") and/or on " " ("FeiFei Q"); both
    # branches may fire for names like "Fei-Fei Q".
    if "-" in first_name_part:
        pieces = first_name_part.split('-')
        candidate_names.extend(_variants(last_name, pieces[0].strip(), pieces[1].strip()))
    # NOTE: the original also had a ", " branch here; it was dead code, since
    # first_name_part comes from name.split(',') and can never contain a comma.
    if " " in first_name_part:
        pieces = first_name_part.split(' ')
        candidate_names.extend(_variants(last_name, pieces[0].strip(), pieces[1].strip()))
    # Always include the name exactly as given, last.
    candidate_names.append(f"{last_name}, {first_name_part}")
    return candidate_names
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment