Solve the compound surname (family name) problem

e46e4e8c · zhengxinhan · e02d76c7 · e46e4e8c · e46e4e8c · e46e4e8c
Commit e46e4e8c authored May 11, 2025 by zhengxinhan
6 changed files
--- a/papertools_niuren_ccfa/__pycache__/utils.cpython-310.pyc
+++ b/papertools_niuren_ccfa/__pycache__/utils.cpython-310.pyc
--- a/papertools_niuren_ccfa/info/doubohan.xlsx
+++ b/papertools_niuren_ccfa/info/doubohan.xlsx
--- a/papertools_niuren_ccfa/niurenpipei.py
+++ b/papertools_niuren_ccfa/niurenpipei.py
@@ -9,8 +9,8 @@ from copy import copy
 from joblib import Parallel, delayed
 from utils import standardized_name, name_in_niuren_list

-input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
-output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
+input_file_path = 'info/doubohan.xlsx'
+output_file_path = 'output/doubohan_牛人筛选.xlsx'

 output_dir = os.path.dirname(output_file_path)
 if not os.path.exists(output_dir):

--- a/papertools_niuren_ccfa/niurenpipei_xinbiao.py
+++ b/papertools_niuren_ccfa/niurenpipei_xinbiao.py
@@ -9,8 +9,14 @@ from copy import copy
 from joblib import Parallel, delayed
 from utils import standardized_name, name_in_niuren_list

+<<<<<<< Updated upstream:papertools_niuren_ccfa/niurenpipei_xinbiao.py
 input_file_path = 'info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
 output_file_path = 'output/新表-论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
+=======
+input_file_path = 'info/doubohan.xlsx'
+# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
+output_file_path = 'output/doubohan_牛人筛选.xlsx'
+>>>>>>> Stashed changes:papertools_niuren_ccfa/niurenpipei_update.py

 output_dir = os.path.dirname(output_file_path)
 if not os.path.exists(output_dir):
@@ -153,11 +159,11 @@ if __name__ == "__main__":
    fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren,fake_niuren_papers_path)
    
    # 读取表头（第4行作为列名）
-    original_header = pd.read_excel(input_file_path, nrows=0, header=3)
+    original_header = pd.read_excel(input_file_path, nrows=0, header=2)
    column_names = original_header.columns.tolist()
    
    # 读取数据（从第8行开始）
-    input_df = pd.read_excel(input_file_path, skiprows=7, header=None, names=column_names)
+    input_df = pd.read_excel(input_file_path, skiprows=2, header=None, names=column_names)
    
    # 输出表头和数据的基本信息
    print("表头元素：")
@@ -273,9 +279,16 @@ if __name__ == "__main__":
        ws_new.cell(row=4, column=suspected_niuren_col_index, value='疑似牛人')
    
    # 更新数据（从第8行开始）
+<<<<<<< Updated upstream:papertools_niuren_ccfa/niurenpipei_xinbiao.py
    for i, row in input_df.iterrows():
        excel_row = i + 8  # 转换为Excel的行号（从1开始）
        
+=======
+    # for i, row in input_df.iterrows():
+    for i, (index, row) in enumerate(input_df.iterrows(), start=3):
+        # excel_row = i + 8  # 转换为Excel的行号（从1开始）
+        print(i, niuren_col_index)
+>>>>>>> Stashed changes:papertools_niuren_ccfa/niurenpipei_update.py
        # 只更新牛人相关的三列
        ws_new.cell(row=excel_row, column=niuren_col_index, value=row[niuren_col])
        ws_new.cell(row=excel_row, column=niuren_order_col_index, value=row[niuren_order_col])

--- a/papertools_niuren_ccfa/output/doubohan_牛人筛选.xlsx
+++ b/papertools_niuren_ccfa/output/doubohan_牛人筛选.xlsx
--- a/papertools_niuren_ccfa/utils.py
+++ b/papertools_niuren_ccfa/utils.py
@@ -2,8 +2,8 @@
 Author: zhengxinhan
 Date: 2025-05-08 15:02:02
 LastEditors: zhengxinhan
-LastEditTime: 2025-05-09 02:46:29
-FilePath: /papertools-master/niurenshaixuan/utils.py
+LastEditTime: 2025-05-11 12:42:43
+FilePath: /papertools_niuren_ccfa/utils.py
 Description: 

 Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved. 
@@ -11,66 +11,132 @@ Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
 def standardized_name(name):
    if "," in name:
        return name
-    # 名人列表中存在一些名字是First Name Last Name的格式，将其转换为Last Name, First Name格式
-    # 例如：'John Doe' -> 'Doe, John'，'M. Jane Smith' -> 'Smith, Jane M.'
+    
+    # 添加常见的姓氏前缀列表
+    surname_prefixes = ["De", "Del", "Della", "Di", "Da", "Van", "Von", "Le", "La", "O'", "Mc", "Mac", 
+                        "Al", "El", "Bin", "Ben", "Ibn", "Ter", "Saint", "St.", "Dos", "Das", "Los", "Las", "San"]
+    
+    # 特殊字符串处理
    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
    name_split = name.split(" ")
    name_split = [item.strip() for item in name_split if item.strip() != ""]
    name_split = [item for item in name_split if item not in special_str]
    
-    # Grigory Isaakovich Barenblatt --> Barenblatt, Alexa, Marc Isaakovich
+    # 如果分割后没有部分，直接返回原名
+    if not name_split:
+        return name
    
+    # 单部分名字处理（只有一个单词）
+    if len(name_split) == 1:
+        return name_split[0]
+    
+    # 两部分名字处理
    if len(name_split) == 2:
        first_name = name_split[0]
        last_name = name_split[1]
        new_name = f"{last_name}, {first_name}"
+    
+    # 三部分名字处理
    elif len(name_split) == 3:
-        if name_split[0].startswith("(") and name_split[0].endswith(")"):
+        # 检查中间部分是否为姓氏前缀
+        if name_split[1] in surname_prefixes:
+            # 例如 "Barbara De Salvo" 应该变成 "De Salvo, Barbara"
+            first_name = name_split[0]
+            last_name = f"{name_split[1]} {name_split[2]}"
+            new_name = f"{last_name}, {first_name}"
+        
+        # 以下是原有的其他情况处理
+        elif name_split[0].startswith("(") and name_split[0].endswith(")"):
            # (Alexander) Philip Dawid
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = f"{name_split[2]}"
+            new_name = f"{last_name}, {first_name}"
        elif name_split[1].startswith("(") and name_split[1].endswith(")"):
            # Xinyan (Tracy) Cui --> Cui, Xinyan (Tracy)
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = f"{name_split[2]}"
+            new_name = f"{last_name}, {first_name}"
        elif name_split[2].startswith("(") and name_split[2].endswith(")"):
            # Ye Fred (Ying)实际是姓Ye，名Ying，英文名Fred, 需要将其转换为"Ye, Ying Fred"
-            # Zhu Jesse (Jingxu)实际是姓Zhu，名Jingxu，英文名Jesse, 需要将其转换为"Zhu, Jingxu Jesse"
            first_name = f"{name_split[2][1:-1]} ({name_split[1]})"
            last_name = f"{name_split[0]}"
+            new_name = f"{last_name}, {first_name}"
        elif name_split[0].endswith("."):
            # M. Jane Smith --> Smith, Jane M.
-            # K.W. Michael Siu --> Siu, Michael K.W.
            first_name = f"{name_split[1]} {name_split[0]}"
            last_name = name_split[2]
+            new_name = f"{last_name}, {first_name}"
        elif name_split[1].endswith("."):
            # Jane M. Smith --> Smith, Jane M.
-            # Pierre J.H. Richardson --> Richardson, Pierre J.H.
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
+            new_name = f"{last_name}, {first_name}"
        elif name_split[2].endswith("."):
            # Wimmer-Schweingruber Robert F. --> Wimmer-Schweingruber, Robert F.
-            # Wilderer Peter A. --> Wilderer, Peter A.
            first_name = f"{name_split[1]} {name_split[2]}"
            last_name = f"{name_split[0]}"
+            new_name = f"{last_name}, {first_name}"
        else:
            # William Nelson Joy --> Joy, William Nelson
-            # Michael J Carey --> Carey, Michael J
-            # len_3_list.append(" ".join(name_split))
            first_name = f"{name_split[0]} {name_split[1]}"
            last_name = name_split[2]
-        
-        new_name = f"{last_name}, {first_name}"
-        # print(f"{name} --> {new_name}")
+            new_name = f"{last_name}, {first_name}"
+    
+    # 四部分或更多的名字处理
    else:
-        # lens greater than 3 这部分名字应该不会跟其他部分名字出现重复
-        last_name = name_split[-1]
-        first_name = " ".join(name_split[:-1])
-        new_name = f"{last_name}, {first_name}"
+        # 检查复合姓氏模式
+        compound_found = False
+        
+        # 检查倒数第二部分是否为姓氏前缀（确保列表长度足够）
+        if len(name_split) >= 2 and name_split[-2] in surname_prefixes:
+            last_name = f"{name_split[-2]} {name_split[-1]}"
+            first_name = " ".join(name_split[:-2])
+            compound_found = True
+        
+        # 从前往后检查是否有姓氏前缀组合
+        if not compound_found:
+            for i in range(len(name_split) - 1):
+                if name_split[i] in surname_prefixes:
+                    # 可能是复合姓氏的开始
+                    # 例如 "Maria Del Carmen Gomez" 中的 "Del Carmen"
+                    potential_compound = True
+                    compound_parts = [name_split[i]]
+                    
+                    # 查看后续部分是否也可能是复合姓氏的一部分
+                    j = i + 1
+                    while j < len(name_split) - 1 and potential_compound:
+                        if name_split[j] in surname_prefixes or name_split[j][0].isupper():
+                            compound_parts.append(name_split[j])
+                            j += 1
+                        else:
+                            potential_compound = False
+                    
+                    if potential_compound and j < len(name_split):
+                        # 找到了复合姓氏
+                        compound_parts.append(name_split[j])
+                        last_name = " ".join(compound_parts)
+                        first_name = " ".join(name_split[:i])
+                        compound_found = True
+                        break
+        
+        # 如果没有找到复合姓氏，使用默认处理
+        if not compound_found:
+            last_name = name_split[-1]
+            first_name = " ".join(name_split[:-1])
        
+        new_name = f"{last_name}, {first_name}"
+    
    return new_name


+def convert_to_lowercase(element):
+    if isinstance(element, list):
+        return [convert_to_lowercase(e) for e in element]
+    elif isinstance(element, str):
+        return element.lower()
+    else:
+        return element
+    
 def name_in_niuren_list(name, niuren_name_list):
    """
    判断名字是否在牛人列表中
@@ -78,6 +144,8 @@ def name_in_niuren_list(name, niuren_name_list):
    :param niuren_name_list: 牛人列表
    :return: 如果在牛人列表中，返回 True，否则返回 False
    """
+    name = convert_to_lowercase(name)
+    niuren_name_list = convert_to_lowercase(niuren_name_list)
    for idx, niuren_name in enumerate(niuren_name_list):
        if isinstance(niuren_name, str):
            if name == niuren_name: