Upload New File

2693f2e7 · matianyun · 6da5da74 · 2693f2e7
Commit 2693f2e7 authored May 09, 2025 by matianyun
Hide whitespace changes
Inline Side-by-side

Showing with 301 additions and 0 deletions

papertools_niuren_ccfa/niurenpipei_update.py
+301 -0

No files found.
--- a/papertools_niuren_ccfa/niurenpipei_update.py
+++ b/papertools_niuren_ccfa/niurenpipei_update.py
+import os
+import sys
+# Run relative to the script's own directory so the 'info/' and 'output/'
+# paths below resolve. abspath() is required: when __file__ is a bare relative
+# filename, dirname() returns '' and os.chdir('') raises FileNotFoundError.
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+import json
+import pandas as pd
+from tqdm import tqdm
+import openpyxl
+from copy import copy
+# from joblib import Parallel, delayed
+from utils import standardized_name, name_in_niuren_list
+
+# Workbook that is read (main sheet plus the "全局牛人" / "全局非牛人" sheets)
+# and the path the annotated copy is written to.
+input_file_path = 'info/测试大表2.xlsx'
+# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
+output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'
+
+# Create the output directory up front; exist_ok avoids the
+# check-then-create race of a separate os.path.exists() test.
+output_dir = os.path.dirname(output_file_path)
+os.makedirs(output_dir, exist_ok=True)
+
+# Name-pool CSV and the directories holding the per-person paper workbooks.
+niuren_pool_path = 'info/niuren_pool.csv'
+true_niuren_papers_path = 'info/true_niuren_papers'
+fake_niuren_papers_path = 'info/fake_niuren_papers'
+
+# 全局变量，存储加载的数据
+# NIUREN_POOL_NAMES = []
+# TRUE_NIUREN_NAMES = []
+# TRUE_NIUREN_PAPERS = []
+# FAKE_NIUREN_NAMES = []
+# FAKE_NIUREN_PAPERS = []
+
+def load_niuren_pool(niuren_pool_path):
+    """Load the candidate-name pool from a CSV file.
+
+    Reads the ``name`` column of the CSV at ``niuren_pool_path`` (decoded as
+    'utf-8-sig', so a leading BOM is tolerated) and returns its values as a
+    list in file order. Every non-breaking space (U+00A0) in a name is
+    replaced by a regular space; ordinary spaces are left alone.
+    """
+    pool_frame = pd.read_csv(niuren_pool_path, encoding='utf-8-sig')
+    raw_names = pool_frame["name"].tolist()
+    # Normalise NBSP to a plain space so later name comparisons see one kind of space.
+    return [entry.replace("\xa0", " ") for entry in raw_names]
+
+
+def load_true_niuren(true_niuren, true_niuren_papers_path):
+    """Build the confirmed-expert name list and the matching paper-title lists.
+
+    Args:
+        true_niuren: DataFrame with a "姓名" column and the long-named alias
+            column (aliases separated by ";"). A parsed "别名列表" column
+            (list of standardized aliases per row) is added to it in place.
+        true_niuren_papers_path: Directory containing one ``<n>.xlsx`` per
+            person, where ``n`` is the person's 1-based position in the
+            returned name list.
+
+    Returns:
+        ``(names, papers)``, two lists of equal length. ``names[i]`` is the
+        standardized name, or ``[name, *aliases]`` when the row has aliases.
+        ``papers[i]`` holds the stripped, lower-cased titles from the first
+        column of that person's workbook, or ``[]`` when the file is missing.
+    """
+    alias_source_col = "别名列表（各种奇奇怪怪的名字格式，比如first name和second name的顺序，以;分隔）"
+
+    def _parse_aliases(cell):
+        # Non-text cells (e.g. NaN for an empty cell) carry no aliases.
+        if not isinstance(cell, str):
+            return []
+        # Strip BEFORE testing for emptiness so whitespace-only fragments
+        # ("A; ;B", "A; ") do not turn into blank aliases.
+        fragments = (part.strip() for part in cell.split(";"))
+        return [standardized_name(part) for part in fragments if part]
+
+    true_niuren["别名列表"] = true_niuren[alias_source_col].apply(_parse_aliases)
+
+    # Build the name list; reading stops at the first row without a name.
+    true_niuren_names = []
+    for _, row in true_niuren.iterrows():
+        if pd.isna(row["姓名"]):
+            break
+        primary_name = standardized_name(row["姓名"])
+        aliases = row["别名列表"]
+        true_niuren_names.append(([primary_name] + aliases) if aliases else primary_name)
+
+    # Build the paper lists; workbook numbering follows the name list (1-based).
+    true_niuren_papers = []
+    for position in range(1, len(true_niuren_names) + 1):
+        paper_file_path = os.path.join(true_niuren_papers_path, f"{position}.xlsx")
+        if not os.path.exists(paper_file_path):
+            true_niuren_papers.append([])
+            continue
+        titles = pd.read_excel(paper_file_path, usecols=[0]).iloc[:, 0].dropna().tolist()
+        # Normalise exactly like the lookup side (strip + lower) so padded
+        # titles still match; str() keeps non-text cells from crashing.
+        true_niuren_papers.append([str(title).strip().lower() for title in titles])
+    return true_niuren_names, true_niuren_papers
+
+
+
+def load_fake_niuren(fake_niuren, fake_niuren_papers_path):
+    """Build the known-non-expert name list and the matching paper-title lists.
+
+    Args:
+        fake_niuren: DataFrame with a "姓名" column.
+        fake_niuren_papers_path: Directory containing one ``<n>.xlsx`` per
+            person, where ``n`` is the person's 1-based position in the
+            returned name list.
+
+    Returns:
+        ``(names, papers)``, two lists of equal length. ``names[i]`` is the
+        standardized name; ``papers[i]`` holds the stripped, lower-cased
+        titles from the first column of that person's workbook, or ``[]``
+        when the file is missing.
+    """
+    # Build the name list; reading stops at the first row without a name.
+    fake_niuren_names = []
+    for _, row in fake_niuren.iterrows():
+        if pd.isna(row["姓名"]):
+            break
+        fake_niuren_names.append(standardized_name(row["姓名"]))
+
+    # Build the paper lists; workbook numbering follows the name list (1-based).
+    fake_niuren_papers = []
+    for position in range(1, len(fake_niuren_names) + 1):
+        paper_file_path = os.path.join(fake_niuren_papers_path, f"{position}.xlsx")
+        if not os.path.exists(paper_file_path):
+            fake_niuren_papers.append([])
+            continue
+        titles = pd.read_excel(paper_file_path, usecols=[0]).iloc[:, 0].dropna().tolist()
+        # Normalise exactly like the lookup side (strip + lower) so padded
+        # titles still match; str() keeps non-text cells from crashing.
+        fake_niuren_papers.append([str(title).strip().lower() for title in titles])
+    return fake_niuren_names, fake_niuren_papers
+
+
+def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
+    """Classify the authors of one citing paper against the reference lists.
+
+    For every non-empty author whose standardized name is found in the pool:
+      * if the name is in the confirmed list and ``title`` is one of that
+        person's papers, the author is reported as confirmed;
+      * else if the name is in the non-expert list and ``title`` is one of
+        that person's papers, the author is dropped;
+      * otherwise the author is reported as suspected.
+
+    Args:
+        authors: Author names in signing order.
+        title: Title of the citing paper. A non-string value (e.g. NaN from
+            an empty cell) is treated as matching no stored title.
+        niuren_pool_names: Candidate-name pool.
+        true_niuren_names, true_niuren_papers: Confirmed names and, at the
+            same index, their lower-cased paper titles.
+        fake_niuren_names, fake_niuren_papers: Same layout for non-experts.
+
+    Returns:
+        ``(niuren_str, order_str, suspected_niuren_str)``, each a ";"-joined
+        string: confirmed authors, their 1-based positions in ``authors``,
+        and suspected authors formatted as ``name(pool_position)`` where
+        ``pool_position`` is the pool match index plus one.
+    """
+    # Normalise once; None can never equal a stored title string, so a
+    # missing title degrades to "no paper match" instead of raising.
+    normalized_title = title.strip().lower() if isinstance(title, str) else None
+
+    niuren = []
+    order = []
+    suspected_niuren = []
+
+    for author_idx, author in enumerate(authors):
+        if author == "":
+            continue
+
+        # Standardize once and reuse for all three lookups.
+        std_author = standardized_name(author)
+
+        # Authors outside the pool are ignored entirely.
+        name_index = name_in_niuren_list(std_author, niuren_pool_names)
+        if name_index == -1:
+            continue
+
+        # Confirmed: the name is known and this paper is on the person's list.
+        true_name_index = name_in_niuren_list(std_author, true_niuren_names)
+        if true_name_index != -1 and normalized_title in true_niuren_papers[true_name_index]:
+            niuren.append(author)
+            order.append(author_idx + 1)
+            continue
+
+        # Known non-expert for this paper: drop silently.
+        fake_name_index = name_in_niuren_list(std_author, fake_niuren_names)
+        if fake_name_index != -1 and normalized_title in fake_niuren_papers[fake_name_index]:
+            continue
+
+        # In the pool but neither confirmed nor ruled out for this paper.
+        suspected_niuren.append((author, name_index + 1))
+
+    niuren_str = ";".join(niuren)
+    order_str = ";".join(str(position) for position in order)
+    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
+    return niuren_str, order_str, suspected_niuren_str
+
+
+def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
+    """Classify the authors of one data row.
+
+    Reads the ";"-separated author list from ``row['引文作者']`` and the title
+    from ``row['引文名称']`` and delegates to ``check_niuren``.
+
+    Args:
+        index: Row label, returned unchanged so the caller can write back.
+        row: Mapping-like row supporting ``.get`` (e.g. a pandas Series).
+        niuren_pool_names, true_niuren_names, true_niuren_papers,
+        fake_niuren_names, fake_niuren_papers: Reference lists passed through
+            to ``check_niuren``.
+
+    Returns:
+        ``(index, confirmed, confirmed_order, suspected)``. The three strings
+        are empty when the author cell is not text or when processing the
+        row raises.
+    """
+    try:
+        authors = row.get('引文作者', '')
+        title = row.get('引文名称', '')
+        if not isinstance(authors, str):
+            return index, "", "", ""
+        # Strip first, then drop blanks: a whitespace-only fragment ("A; ;B")
+        # must not occupy a slot, otherwise later authors' positions shift
+        # compared with the equivalent "A;;B".
+        authors = [name for name in (part.strip() for part in authors.split(";")) if name]
+        niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
+        return index, niuren_true, niuren_true_order, suspected_niuren
+    except Exception as e:
+        # Per-row best effort: report the failure and leave this row blank
+        # rather than aborting the whole run.
+        print(f"处理行 {index} 时发生错误: {e}")
+        return index, "", "", ""
+
+
+if __name__ == "__main__":
+    # Load the two reference sheets, the name pool and the per-person paper lists.
+    print("正在读取全局牛人...")
+    true_niuren = pd.read_excel(input_file_path, sheet_name="全局牛人")
+    print("正在读取全局非牛人...")
+    fake_niuren = pd.read_excel(input_file_path, sheet_name="全局非牛人")
+    print("正在处理牛人池...")
+    niuren_pool_names = load_niuren_pool(niuren_pool_path)
+    print("正在处理真牛人数据...")
+    true_niuren_names, true_niuren_papers = load_true_niuren(true_niuren,true_niuren_papers_path)
+    print("正在处理非牛人数据...")
+    fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren,fake_niuren_papers_path)
+
+    # Column names come from Excel row 4 of the first sheet (header=3 is zero-based).
+    original_header = pd.read_excel(input_file_path, nrows=0, header=3)
+    column_names = original_header.columns.tolist()
+
+    # Data rows: the first 7 Excel rows are skipped, so row 0 of input_df is Excel row 8.
+    input_df = pd.read_excel(input_file_path, skiprows=7, header=None, names=column_names)
+
+    # Basic information about the header and the data.
+    print("表头元素：")
+    print(column_names)
+    print("\n数据行数：", input_df.shape[0])
+    print("数据列数：", input_df.shape[1])
+
+    # Result columns; create any that the sheet does not already have.
+    niuren_col = '牛人\n(参考全局牛人列表)'
+    niuren_order_col = '牛人署名顺序\n'
+    for result_col in ('疑似牛人', niuren_col, niuren_order_col):
+        if result_col not in input_df.columns:
+            input_df[result_col] = None
+
+    print("开始并行处理数据...")
+    # Rows are processed one after another. Results are collected first and
+    # written back afterwards so the frame is not modified while iterating it.
+    results = []
+    for index, row in tqdm(input_df.iterrows(), total=input_df.shape[0]):
+        results.append(
+            process_row(
+                index, row,
+                niuren_pool_names, true_niuren_names, true_niuren_papers,
+                fake_niuren_names, fake_niuren_papers
+            )
+        )
+    for index, niuren_true, niuren_true_order, suspected_niuren in results:
+        input_df.at[index, niuren_col] = niuren_true
+        input_df.at[index, niuren_order_col] = niuren_true_order
+        input_df.at[index, '疑似牛人'] = suspected_niuren
+
+    # Re-open the input with openpyxl so cell styling can be carried over.
+    print("正在读取原始Excel文件以保留格式...")
+    wb_original = openpyxl.load_workbook(input_file_path)
+
+    # Start from an empty workbook (drop the default blank sheet).
+    wb_new = openpyxl.Workbook()
+    if 'Sheet' in wb_new.sheetnames:
+        del wb_new['Sheet']
+
+    # Copy every sheet: properties, cell values, cell styles and merged ranges.
+    for sheet_name in wb_original.sheetnames:
+        ws_original = wb_original[sheet_name]
+        ws_new = wb_new.create_sheet(sheet_name)
+
+        ws_new.sheet_properties = copy(ws_original.sheet_properties)
+        ws_new.sheet_format = copy(ws_original.sheet_format)
+
+        for row in ws_original.rows:
+            for cell in row:
+                new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
+                if cell.has_style:
+                    new_cell.font = copy(cell.font)
+                    new_cell.border = copy(cell.border)
+                    new_cell.fill = copy(cell.fill)
+                    new_cell.number_format = copy(cell.number_format)
+                    new_cell.protection = copy(cell.protection)
+                    new_cell.alignment = copy(cell.alignment)
+
+        for merged_cell_range in ws_original.merged_cells.ranges:
+            ws_new.merge_cells(str(merged_cell_range))
+
+    # The main sheet is the first sheet of the workbook.
+    main_sheet_name = wb_original.sheetnames[0]
+    ws_new = wb_new[main_sheet_name]
+
+    print("正在更新主工作表中的牛人数据...")
+
+    # Locate the three result columns in header row 4 (openpyxl columns are 1-based).
+    niuren_col_index = None
+    niuren_order_col_index = None
+    suspected_niuren_col_index = None
+
+    header_row = [cell.value for cell in ws_new[4]]
+    for i, cell_value in enumerate(header_row, start=1):
+        if cell_value == niuren_col:
+            niuren_col_index = i
+        elif cell_value == niuren_order_col:
+            niuren_order_col_index = i
+        elif cell_value == '疑似牛人':
+            suspected_niuren_col_index = i
+
+    # Any column not found in the header is appended to the right of the sheet.
+    max_col = ws_new.max_column
+
+    if niuren_col_index is None:
+        niuren_col_index = max_col + 1
+        ws_new.cell(row=4, column=niuren_col_index, value=niuren_col)
+        max_col += 1
+
+    if niuren_order_col_index is None:
+        niuren_order_col_index = max_col + 1
+        ws_new.cell(row=4, column=niuren_order_col_index, value=niuren_order_col)
+        max_col += 1
+
+    if suspected_niuren_col_index is None:
+        suspected_niuren_col_index = max_col + 1
+        ws_new.cell(row=4, column=suspected_niuren_col_index, value='疑似牛人')
+
+    # Write only the three result columns back into the copied main sheet.
+    # NOTE(review): writing starts at Excel row 7, but input_df was read with
+    # skiprows=7 (first data row = Excel row 8). Confirm whether start should be 8.
+    for excel_row, (_, row) in enumerate(input_df.iterrows(), start=7):
+        ws_new.cell(row=excel_row, column=niuren_col_index, value=row[niuren_col])
+        ws_new.cell(row=excel_row, column=niuren_order_col_index, value=row[niuren_order_col])
+        ws_new.cell(row=excel_row, column=suspected_niuren_col_index, value=row['疑似牛人'])
+
+    # Save the annotated copy.
+    print(f"正在保存为Excel格式 {output_file_path} ...")
+    wb_new.save(output_file_path)
+    print(f"成功保存到 {output_file_path}，保留了原始格式")
+
+    print("处理完成!")
\ No newline at end of file