Commit 2693f2e7 by matianyun

Upload New File

parent 6da5da74
import os
import sys
# Run relative to the script's own directory so the relative 'info/' and
# 'output/' paths used in this file resolve regardless of the launch directory.
# abspath() guards against __file__ being a bare filename: its dirname would
# be '' and os.chdir('') raises.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
import json
import pandas as pd
from tqdm import tqdm
import openpyxl
from copy import copy
# from joblib import Parallel, delayed
from utils import standardized_name, name_in_niuren_list
# Workbook to process. An alternative input is kept below, commented out.
input_file_path = 'info/测试大表2.xlsx'
# input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'

# Make sure the output directory exists before anything is saved into it.
# exist_ok=True avoids the check-then-create race; the guard skips the call
# when the output path has no directory component (makedirs('') would raise).
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Locations of the candidate pool CSV and the per-person paper workbooks.
niuren_pool_path = 'info/niuren_pool.csv'
true_niuren_papers_path = 'info/true_niuren_papers'
fake_niuren_papers_path = 'info/fake_niuren_papers'
def load_niuren_pool(niuren_pool_path):
    """Load the candidate-pool names from a CSV file.

    Reads the ``name`` column of the CSV at *niuren_pool_path* (UTF-8, an
    optional BOM is tolerated) and returns it as a list of strings in file
    order, with non-breaking spaces (``\\xa0``) replaced by ordinary spaces.

    Rows whose ``name`` cell is empty are returned as ``""`` instead of being
    dropped, so a position in the returned list still identifies the same CSV
    row (``check_niuren`` reports 1-based positions from this list).

    Raises:
        KeyError: if the CSV has no ``name`` column.
    """
    niuren_pool = pd.read_csv(niuren_pool_path, encoding='utf-8-sig')
    # fillna/astype keep missing or non-text cells from crashing the replace.
    names = niuren_pool["name"].fillna("").astype(str)
    return names.str.replace("\xa0", " ", regex=False).tolist()
def load_true_niuren(true_niuren, true_niuren_papers_path):
    """Build the confirmed-person name list and each person's paper titles.

    Args:
        true_niuren: DataFrame holding a "姓名" column and the
            semicolon-separated alias column. A parsed "别名列表" column
            (list of standardized aliases per row) is added to it in place.
        true_niuren_papers_path: Directory with one workbook per person named
            ``<1-based position>.xlsx``; titles are taken from its first column.

    Returns:
        ``(names, papers)``, two lists of equal length. ``names[i]`` is the
        standardized name, or ``[name, alias, ...]`` when the row has aliases.
        ``papers[i]`` is that person's titles, stripped and lower-cased, or
        ``[]`` when no workbook exists for that position.
    """
    alias_source_col = "别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"

    def parse_aliases(cell):
        # Non-text cells (e.g. empty cells read as NaN) carry no aliases.
        if not isinstance(cell, str):
            return []
        # Strip first, then filter, so whitespace-only fragments are dropped.
        fragments = (part.strip() for part in cell.split(";"))
        return [standardized_name(part) for part in fragments if part]

    true_niuren["别名列表"] = true_niuren[alias_source_col].apply(parse_aliases)

    true_niuren_names = []
    for _, row in true_niuren.iterrows():
        # The first blank name ends the list; later rows are ignored.
        if pd.isna(row["姓名"]):
            break
        primary = standardized_name(row["姓名"])
        aliases = row["别名列表"]
        true_niuren_names.append([primary] + aliases if aliases else primary)

    true_niuren_papers = []
    for position in range(1, len(true_niuren_names) + 1):
        paper_file_path = os.path.join(true_niuren_papers_path, f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            true_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        # Normalise exactly like the lookup side (strip + lower); str() keeps
        # non-text cells such as numeric titles from raising.
        true_niuren_papers.append(
            [str(paper).strip().lower() for paper in papers_df.iloc[:, 0].dropna()]
        )

    return true_niuren_names, true_niuren_papers
def load_fake_niuren(fake_niuren, fake_niuren_papers_path):
    """Build the rejected-person name list and each person's paper titles.

    Args:
        fake_niuren: DataFrame holding a "姓名" column.
        fake_niuren_papers_path: Directory with one workbook per person named
            ``<1-based position>.xlsx``; titles are taken from its first column.

    Returns:
        ``(names, papers)``, two lists of equal length. ``names[i]`` is the
        standardized name. ``papers[i]`` is that person's titles, stripped and
        lower-cased, or ``[]`` when no workbook exists for that position.
    """
    fake_niuren_names = []
    for _, row in fake_niuren.iterrows():
        # The first blank name ends the list; later rows are ignored.
        if pd.isna(row["姓名"]):
            break
        fake_niuren_names.append(standardized_name(row["姓名"]))

    fake_niuren_papers = []
    for position in range(1, len(fake_niuren_names) + 1):
        paper_file_path = os.path.join(fake_niuren_papers_path, f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            fake_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        # Normalise exactly like the lookup side (strip + lower); str() keeps
        # non-text cells such as numeric titles from raising.
        fake_niuren_papers.append(
            [str(paper).strip().lower() for paper in papers_df.iloc[:, 0].dropna()]
        )

    return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Classify the authors of one citing paper against the reference lists.

    Empty author strings and authors not found in *niuren_pool_names* are
    ignored. Every remaining author falls into exactly one case:

    * confirmed: the name is found in *true_niuren_names* and the normalised
      title is among that person's papers; the author and the 1-based
      position in *authors* are recorded.
    * rejected: the name is found in *fake_niuren_names* and the normalised
      title is among that person's papers; nothing is recorded.
    * suspected: anything else; recorded as ``name(1-based pool index)``.

    ``name_in_niuren_list`` is relied on to return the matching list index,
    or -1 when there is no match.

    Args:
        authors: Author names of the citing paper, in byline order.
        title: Title of the citing paper; a non-string title matches no paper.

    Returns:
        ``(confirmed, positions, suspected)``, each a ``;``-joined string
        (empty when there is nothing to report).
    """
    # Normalise once; None can never equal a stored title string.
    normalized_title = title.strip().lower() if isinstance(title, str) else None

    niuren = []
    order = []
    suspected_niuren = []
    for position, author in enumerate(authors, start=1):
        if author == "":
            continue
        std_name = standardized_name(author)

        # Only authors present in the candidate pool are considered at all.
        pool_index = name_in_niuren_list(std_name, niuren_pool_names)
        if pool_index == -1:
            continue

        true_index = name_in_niuren_list(std_name, true_niuren_names)
        if true_index != -1 and normalized_title in true_niuren_papers[true_index]:
            niuren.append(author)
            order.append(position)
            continue

        fake_index = name_in_niuren_list(std_name, fake_niuren_names)
        if fake_index != -1 and normalized_title in fake_niuren_papers[fake_index]:
            continue

        suspected_niuren.append((author, pool_index + 1))

    niuren_str = ";".join(niuren)
    order_str = ";".join(str(position) for position in order)
    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
    return niuren_str, order_str, suspected_niuren_str
def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Run the author check for one row of the citation table.

    Args:
        index: Row label, returned unchanged so results can be matched to rows.
        row: Mapping-like row (anything with ``.get``) providing '引文作者'
            (``;``-separated author names) and '引文名称' (the title).

    Returns:
        ``(index, confirmed, positions, suspected)``. The three strings are
        empty when the author cell is not a string or when processing fails.
    """
    try:
        authors = row.get('引文作者', '')
        title = row.get('引文名称', '')
        if not isinstance(authors, str):
            return index, "", "", ""
        # A missing title (e.g. NaN) must not abort the row: authors can still
        # be reported as suspected even though no paper title can match.
        if not isinstance(title, str):
            title = ""
        # Strip before filtering so whitespace-only fragments (such as after a
        # trailing "; ") neither count as authors nor shift author positions.
        stripped = (part.strip() for part in authors.split(";"))
        author_list = [name for name in stripped if name]
        niuren_true, niuren_true_order, suspected_niuren = check_niuren(
            author_list, title,
            niuren_pool_names, true_niuren_names, true_niuren_papers,
            fake_niuren_names, fake_niuren_papers,
        )
        return index, niuren_true, niuren_true_order, suspected_niuren
    except Exception as e:
        # Deliberately best-effort: one bad row is reported and left blank
        # rather than stopping the whole run.
        print(f"处理行 {index} 时发生错误: {e}")
        return index, "", "", ""
# 1-based Excel row numbers of the main sheet layout. Reading (pandas) and
# writing (openpyxl) both derive their offsets from these, so results always
# land on the same rows the data was read from.
HEADER_ROW = 4
FIRST_DATA_ROW = 8

# Header texts of the three result columns.
NIUREN_COL = '牛人\n(参考全局牛人列表)'
NIUREN_ORDER_COL = '牛人署名顺序\n'
SUSPECTED_NIUREN_COL = '疑似牛人'


def _copy_workbook(wb_original):
    """Return a new workbook with every sheet of *wb_original* copied.

    Copies sheet properties, sheet format, cell values, per-cell styles and
    merged ranges. Nothing else is copied by this helper.
    """
    wb_new = openpyxl.Workbook()
    # Drop the blank sheet openpyxl creates by default.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']

    for sheet_name in wb_original.sheetnames:
        ws_original = wb_original[sheet_name]
        ws_new = wb_new.create_sheet(sheet_name)
        ws_new.sheet_properties = copy(ws_original.sheet_properties)
        ws_new.sheet_format = copy(ws_original.sheet_format)

        for row in ws_original.rows:
            for cell in row:
                new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
                if cell.has_style:
                    new_cell.font = copy(cell.font)
                    new_cell.border = copy(cell.border)
                    new_cell.fill = copy(cell.fill)
                    new_cell.number_format = copy(cell.number_format)
                    new_cell.protection = copy(cell.protection)
                    new_cell.alignment = copy(cell.alignment)

        for merged_cell_range in ws_original.merged_cells.ranges:
            ws_new.merge_cells(str(merged_cell_range))

    return wb_new


def _find_or_append_columns(ws, headers):
    """Map each header text to its 1-based column index on HEADER_ROW.

    When a header occurs more than once, the right-most occurrence is used.
    Headers not present are appended to the right of the last used column,
    in the order given.
    """
    found = {}
    for col_index, cell in enumerate(ws[HEADER_ROW], start=1):
        if cell.value in headers:
            found[cell.value] = col_index

    next_col = ws.max_column
    for header in headers:
        if header not in found:
            next_col += 1
            ws.cell(row=HEADER_ROW, column=next_col, value=header)
            found[header] = next_col
    return found


def main():
    """Check every row of the main sheet and save an annotated workbook copy."""
    print("正在读取全局牛人...")
    true_niuren = pd.read_excel(input_file_path, sheet_name="全局牛人")
    print("正在读取全局非牛人...")
    fake_niuren = pd.read_excel(input_file_path, sheet_name="全局非牛人")

    print("正在处理牛人池...")
    niuren_pool_names = load_niuren_pool(niuren_pool_path)
    print("正在处理真牛人数据...")
    true_niuren_names, true_niuren_papers = load_true_niuren(true_niuren, true_niuren_papers_path)
    print("正在处理非牛人数据...")
    fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren, fake_niuren_papers_path)

    # Column names come from the header row; data starts at FIRST_DATA_ROW.
    original_header = pd.read_excel(input_file_path, nrows=0, header=HEADER_ROW - 1)
    column_names = original_header.columns.tolist()
    input_df = pd.read_excel(
        input_file_path, skiprows=FIRST_DATA_ROW - 1, header=None, names=column_names
    )

    print("表头元素:")
    print(column_names)
    print("\n数据行数:", input_df.shape[0])
    print("数据列数:", input_df.shape[1])

    # Rows are processed sequentially, in sheet order.
    print("开始并行处理数据...")
    results = [
        process_row(
            index, row,
            niuren_pool_names, true_niuren_names, true_niuren_papers,
            fake_niuren_names, fake_niuren_papers,
        )
        for index, row in tqdm(input_df.iterrows(), total=input_df.shape[0])
    ]

    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    wb_new = _copy_workbook(wb_original)

    # The main sheet is the first sheet, the same one pandas read above.
    ws_main = wb_new[wb_original.sheetnames[0]]

    print("正在更新主工作表中的牛人数据...")
    columns = _find_or_append_columns(
        ws_main, (NIUREN_COL, NIUREN_ORDER_COL, SUSPECTED_NIUREN_COL)
    )

    # results[k] belongs to the k-th data row, i.e. Excel row FIRST_DATA_ROW + k.
    for excel_row, (_, niuren_true, niuren_true_order, suspected_niuren) in enumerate(
        results, start=FIRST_DATA_ROW
    ):
        ws_main.cell(row=excel_row, column=columns[NIUREN_COL], value=niuren_true)
        ws_main.cell(row=excel_row, column=columns[NIUREN_ORDER_COL], value=niuren_true_order)
        ws_main.cell(row=excel_row, column=columns[SUSPECTED_NIUREN_COL], value=suspected_niuren)

    print(f"正在保存为Excel格式 {output_file_path} ...")
    wb_new.save(output_file_path)
    print(f"成功保存到 {output_file_path},保留了原始格式")
    print("处理完成!")


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment