将 author 姓名统一转换为全小写后再进行匹配

34bba471 · Pengwei-Jin · 2cb16c72 · 34bba471 · 34bba471
Commit 34bba471 authored May 09, 2025 by Pengwei-Jin
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 8 deletions

niurenshaixuan/main.py
+8 -8

niurenshaixuan/utils.py
+1 -0

No files found.
--- a/niurenshaixuan/main.py
+++ b/niurenshaixuan/main.py
@@ -12,7 +12,7 @@ output_file_path = '测试输出3.xlsx'
 def load_niuren_pool():
    niuren_pool = pd.read_csv("info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
    niuren_pool_names = niuren_pool["name"].tolist()
-    niuren_pool_names = [name.replace("\xa0", " ") for name in niuren_pool_names]  # 去除空格
+    niuren_pool_names = [name.replace("\xa0", " ").lower() for name in niuren_pool_names]  # 去除空格
    return niuren_pool_names


@@ -22,7 +22,7 @@ def load_true_niuren():

    true_niuren["别名列表"] = None
    true_niuren["别名列表"] = true_niuren["别名列表（各种奇奇怪怪的名字格式，比如first name和second name的顺序，以;分隔）"].apply(
-        lambda x: [firstlast2lastfirst(i.strip()) for i in x.split(";") if i!=""] if isinstance(x, str) else []
+        lambda x: [firstlast2lastfirst(i.strip()).lower() for i in x.split(";") if i!=""] if isinstance(x, str) else []
    )

    true_niuren_names = []
@@ -31,9 +31,9 @@ def load_true_niuren():
    # 构建牛人姓名列表
    for _, row in true_niuren.iterrows():
        if row["别名列表"]:
-            true_niuren_names.append([firstlast2lastfirst(row["姓名"])] + row["别名列表"])
+            true_niuren_names.append([firstlast2lastfirst(row["姓名"]).lower()] + row["别名列表"])
        else:
-            true_niuren_names.append(firstlast2lastfirst(row["姓名"]))
+            true_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())

    # 构建牛人论文列表
    for true_niuren_idx, _ in enumerate(true_niuren_names):
@@ -58,7 +58,7 @@ def load_fake_niuren():

    # 构建非牛人姓名列表
    for _, row in fake_niuren.iterrows():
-        fake_niuren_names.append(firstlast2lastfirst(row["姓名"]))
+        fake_niuren_names.append(firstlast2lastfirst(row["姓名"]).lower())

    # 构建非牛人论文列表
    for fake_niuren_idx, _ in enumerate(fake_niuren_names):
@@ -87,12 +87,12 @@ def check_niuren(authors, title):
            continue

        # 作者姓名能否在牛人池中找到
-        name_index = name_in_niuren_list(firstlast2lastfirst(author), niuren_pool_names)
+        name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), niuren_pool_names)
        if name_index == -1:
            continue

        # 作者姓名能否在全局牛人中找到
-        true_name_index = name_in_niuren_list(firstlast2lastfirst(author), true_niuren_names)
+        true_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), true_niuren_names)
        if true_name_index != -1:
            if title.strip().lower() in true_niuren_papers[true_name_index]:
                niuren.append(author)
@@ -100,7 +100,7 @@ def check_niuren(authors, title):
                continue

        # 作者姓名能否在全局非牛人中找到
-        fake_name_index = name_in_niuren_list(firstlast2lastfirst(author), fake_niuren_names)
+        fake_name_index = name_in_niuren_list(firstlast2lastfirst(author).lower(), fake_niuren_names)
        if fake_name_index != -1:
            if title.strip().lower() in fake_niuren_papers[fake_name_index]:
                continue

--- a/niurenshaixuan/utils.py
+++ b/niurenshaixuan/utils.py
@@ -4,6 +4,7 @@ def firstlast2lastfirst(name):
    # 名人列表中存在一些名字是First Name Last Name的格式，将其转换为Last Name, First Name格式
    # 例如：'John Doe' -> 'Doe, John'，'M. Jane Smith' -> 'Smith, Jane M.'
    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
+    special_str = [item.lower() for item in special_str]
    name_split = name.split(" ")
    name_split = [item.strip() for item in name_split if item.strip() != ""]
    name_split = [item for item in name_split if item not in special_str]