Commit a5a65fbb by zhengxinhan

zhengxinhan restructure niuren and ccfa scripts

parent 2e7b5dfe
# 使用方法
修改`main.py`中的`input_file_path`和`output_file_path`。
输出的 Excel 中包含`牛人`、`牛人署名顺序`、`疑似牛人`三列。其中疑似牛人的格式为 `疑似牛人名字(疑似牛人在info/new_niuren_format-merged_turing.csv中的索引)`。
请初步判断是否为牛人。如果认为是牛人,则找李慧老师复核,由李慧老师添加到全局牛人表中。请不要修改全局牛人表。
如果是脚本运行相关的问题,请联系马天云同学。
\ No newline at end of file
"""
功能:判断论文是否属于CCF-A类会议
输入文件:CCF_A_list.csv,包含两列:abbr,fullname
输入文件:info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx
输出文件:处理后的Excel文件,保持原格式不变,添加是否CCF-A信息
"""
import os
import pandas as pd
import re
from rapidfuzz import fuzz, process
from collections import defaultdict
import openpyxl
from copy import copy
# Path parameters.
input_file_path = "info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx"
output_file_path = "output/论文被引用统计-陈老师-截止2025年X月XX日-CCFA标记.xlsx"
ccf_a_file = "info/CCF_A_list.csv"  # adjust to the actual location if needed

# Load the CCF-A table (columns: abbr, fullname) and build lowercase lookup sets.
ccf_a = pd.read_csv(ccf_a_file)
_abbr_series = ccf_a['abbr'].dropna()
_fullname_series = ccf_a['fullname'].dropna()
# Abbreviations are compared with every space removed.
ccf_a_abbrs = set(_abbr_series.str.lower().str.replace(' ', '').str.strip().tolist())
ccf_a_fullnames = set(_fullname_series.str.lower().str.strip().tolist())
# Union of both sets, used for whole-name matching.
ccf_a_names = ccf_a_abbrs.union(ccf_a_fullnames)
def clean_venue(venue):
    """Normalise a journal/conference name for matching.

    Drops any bracketed note that mentions "volume", removes digits and a
    fixed set of punctuation characters, then lowercases and trims the text.

    Args:
        venue: Raw cell value; may be a string, a number or a missing value.

    Returns:
        The cleaned name, or "" when the value is missing or blank.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return ""
    text = str(venue)
    # Accept both ASCII and full-width brackets. The full-width forms are
    # written as \uXXXX escapes so the pattern survives text normalisation.
    text = re.sub(
        r'[\(\uff08][^)\uff09]*?volume[^)\uff09]*?[\)\uff09]',
        '',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"[0-9'\",\.\!\/\\\?\;\-\_]", "", text)
    return text.lower().strip()
def extract_brackets(text):
    """Return the content of the first bracketed group in ``text``.

    Both ASCII ``( )`` and full-width brackets are recognised.

    Args:
        text: Venue string to search.

    Returns:
        The trimmed, lowercased bracket content, or ``None`` when ``text``
        contains no bracketed group.
    """
    # Full-width brackets are spelled as \uXXXX escapes so they cannot be
    # silently folded into their ASCII look-alikes.
    match = re.search(r'[\(\uff08](.*?)[\)\uff09]', text)
    if match is None:
        return None
    return match.group(1).strip().lower()
def extract_before_colon(text):
    """Return the part of ``text`` that precedes its first colon.

    Both the ASCII colon and the full-width colon are recognised.

    Args:
        text: Venue string to split.

    Returns:
        The trimmed, lowercased prefix, or ``None`` when ``text`` contains
        no colon.
    """
    # Only the first separator matters, so split at most once.
    parts = re.split(r'[:\uff1a]', text, maxsplit=1)
    if len(parts) > 1:
        return parts[0].strip().lower()
    return None
# Substrings that force a venue to be treated as CCF-A.
_CCF_A_FORCE_KEYWORDS = (
    'ieee micro',
    'design automation conference',
    'ieee transactions on parallel & distributed systems',
    'ieee transactions on computers',
    'transactions on architecture and code optimization',
    '信息科学',
    'pldi',
    'transactions on computer systems',
    'conference on computer vision and pattern recognition',
    'transactions on computeraided design of integrated circuits and systems',
    'aaai conference on artificial intelligence',
    'mm ',
    'transactions on parallel and distributed systems',
)

# Substrings that force a venue to be treated as NOT CCF-A. Entries that were
# listed twice in the original condition appear once here.
_NON_CCF_A_KEYWORDS = (
    'ijcnn',
    'iccd',
    'icpr',
    'aicas',
    'acm transactions on embedded computing systems',
    'iccad',
    'artificial intelligence review',
    'ieee journal of selected topics in quantum electronics',
    'future generation computer systems',
    'ieee transactions on services computing',
    'ipdps',
    'isocc',
    'microprocessors and microsystems',
    'ccet',
    'microelectronics',
    'asid',
    'caai',
    'euromicro conference on',
    'asicon',
    'transactions on system and lsi design methodology',
    'selected areas in communications',
    'high performance computing data and analytics',
    'innovation communication and engineering',
    'sysml conference',
    'international conference on digital signal processing',
    'international conference on computer and communication systems',
    'international symposium on performance analysis of systems and software',
    'wireless communications and mobile computing',
    'sigops operating systems review',
    'siggraphasia',
    'international journal of computational intelligence systems',
    'journal of ikeee',
    'artificial intelligence and security',
    'journal of software',
    'advances in artificial intelligence',
    'international symposium on networksonchip',
    'international journal of web information systems',
    'journal of timecritical computing systems',
    'journal of advanced computer science and applications',
    'conference on computing and informatics',
    'journal of big data',
    'journal of parallel programming',
    'international conference on computer and communications',
    'journal of supercomputing',
    'conference on computing frontiers',
    'international conference on industry applications',
    'artificial intelligence advances',
    'pattern recognition and artificial intelligence',
    'journal of electronics and communications',
    'journal of data science and analytics',
    'journal of parallel emergent and distributed systems',
    'conference on advances in electrical engineering',
    'international conference on frontiers in computing and systems',
    'conference on sentiment analysis and deep learning',
    'acm symposium on cloud computing',
    'conference on tools with artificial intelligence',
    'conference on supercomputing',
    'frontiers in artificial intelligence ',
    'conference on communication technology',
    'big data information and computer network',
    'conference on software engineering education and training',
    'journal of computers',
    'washington',
    'international conference on cloud computing',
    'acm on programming languages',
    'conference on high performance computing',
    'ubm designcon conference',
    'scientia sinica informationis',
    'symposium on computing and networking',
    'journal of mathematics',
    'iet image processing',
    'journal of computing and digital systems',
    'conference on very large scale integration',
    'journal of networking and computing',
    'transactions on cryptographic hardware and embedded systems',
    # Lowercased: the text it is compared with is always lowercase.
    'international conference on hybrid intelligent systems',
    'international conference on asic',
    'international conference on nexgen technologies',
    'international conference on computer and information technology',
    'international journal of high performance systems architecture',
    'international conference on power and energy engineering',
    'international symposium on smart electronic systems',
)

# Venues starting with one of these generic prefixes are only accepted on an
# exact match against the cleaned CCF-A full names (no fuzzy matching).
_STRICT_MATCH_PREFIXES = (
    "ieee transactions on",
    "ieee international conference on",
    "ccf transactions on",
    "engineering applications of",
    "acm transactions on",
    "international conference on",
    "ieice transactions on",
)


def is_ccf_a(venue, threshold=70):
    """Decide whether a venue name refers to a CCF-A conference or journal.

    The checks run in this order; the first one that applies decides:

    1. Missing or blank value -> 0.
    2. Hand-maintained exclusions and inclusions (substring tests).
    3. Names with a generic prefix (see ``_STRICT_MATCH_PREFIXES``) must equal
       a cleaned CCF-A full name exactly.
    4. The content of the first bracket, or the text before the first colon,
       equals a CCF-A abbreviation once spaces are removed.
    5. Whole-name match: exact only for names of at most 6 characters,
       otherwise exact or fuzzy (``fuzz.ratio``) against all CCF-A names.

    Args:
        venue: Venue name; the caller in this file passes ``clean_venue``
            output.
        threshold: Minimum fuzzy score (0-100) that counts as a match.

    Returns:
        1 if the venue is considered CCF-A, otherwise 0.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return 0
    # str() keeps non-string cell values (e.g. numbers) from raising on .lower().
    venue_clean = str(venue).lower().strip()

    if 'south pacific design automation conference' in venue_clean:
        return 0

    # "International Journal ..." is rejected unless it is one of the two
    # CCF-A journals with that prefix. Literals are lowercase because
    # venue_clean is; the hyphen-free spelling covers clean_venue() output.
    if 'international journal' in venue_clean:
        is_ijcv = 'computer vision' in venue_clean
        is_ijhcs = (
            'human-computer studies' in venue_clean
            or 'humancomputer studies' in venue_clean
        )
        if not is_ijcv and not is_ijhcs:
            return 0

    if any(keyword in venue_clean for keyword in _CCF_A_FORCE_KEYWORDS):
        return 1
    if any(keyword in venue_clean for keyword in _NON_CCF_A_KEYWORDS):
        return 0

    # Generic prefixes: exact match against the cleaned full names only.
    if venue_clean.startswith(_STRICT_MATCH_PREFIXES):
        ccf_a_fullnames_clean = {clean_venue(name) for name in ccf_a_fullnames}
        return 1 if venue_clean in ccf_a_fullnames_clean else 0

    # Step 1: abbreviation inside brackets (exact match, spaces removed).
    bracket_content = extract_brackets(venue_clean)
    if bracket_content and bracket_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # Step 2: abbreviation before a colon (exact match, spaces removed).
    colon_content = extract_before_colon(venue_clean)
    if colon_content and colon_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # Step 3: whole-name match.
    if venue_clean in ccf_a_names:
        return 1
    if len(venue_clean) <= 6:
        # Short strings are matched exactly only; fuzzy matching is skipped.
        return 0
    result = process.extractOne(venue_clean, ccf_a_names, scorer=fuzz.ratio)
    if result:
        _, score, _ = result
        if score >= threshold:
            return 1
    return 0
def _copy_cell_style(source_cell, target_cell):
    """Copy the style attributes of ``source_cell`` onto ``target_cell``.

    Nothing is copied when the source cell carries no style.
    """
    if not source_cell.has_style:
        return
    target_cell.font = copy(source_cell.font)
    target_cell.border = copy(source_cell.border)
    target_cell.fill = copy(source_cell.fill)
    target_cell.number_format = copy(source_cell.number_format)
    target_cell.protection = copy(source_cell.protection)
    target_cell.alignment = copy(source_cell.alignment)


if __name__ == "__main__":
    header_row = 4      # worksheet row that holds the column titles
    first_data_row = 8  # worksheet row of the first data record

    # Column titles come from worksheet row 4; data starts at row 8.
    original_header = pd.read_excel(input_file_path, nrows=0, header=header_row - 1)
    column_names = original_header.columns.tolist()
    input_df = pd.read_excel(
        input_file_path, skiprows=first_data_row - 1, header=None, names=column_names
    )

    # Basic information about the header and the data.
    print("表头元素:")
    print(column_names)
    print("\n数据行数:", input_df.shape[0])
    print("数据列数:", input_df.shape[1])

    # The venue column is mandatory.
    venue_col_name = '期刊/会议名称'
    if venue_col_name not in input_df.columns:
        print(f"错误:找不到'{venue_col_name}'列")
        raise SystemExit(1)

    # The CCF-A column is optional; add it right after the venue column.
    ccfa_col_name = '是否是CCF-A'
    if ccfa_col_name not in input_df.columns:
        print(f"'{ccfa_col_name}'列不存在,将添加该列")
        venue_col_index = list(input_df.columns).index(venue_col_name)
        input_df.insert(venue_col_index + 1, ccfa_col_name, None)

    # Classify every distinct cleaned venue once, in first-seen order.
    input_df['venue_clean'] = input_df[venue_col_name].apply(clean_venue)
    venue_ccfa_dict = {}
    for v_clean in input_df['venue_clean'].unique():
        venue_ccfa_dict[v_clean] = '是' if is_ccf_a(v_clean) else '否'
        print(f'{v_clean} -> {venue_ccfa_dict[v_clean]}')
    input_df[ccfa_col_name] = input_df['venue_clean'].map(venue_ccfa_dict)
    input_df.drop('venue_clean', axis=1, inplace=True)

    # Clone the workbook cell by cell so the original formatting is kept.
    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    wb_new = openpyxl.Workbook()
    # Remove the blank sheet a new workbook starts with.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']

    for sheet_name in wb_original.sheetnames:
        ws_original = wb_original[sheet_name]
        ws_new = wb_new.create_sheet(sheet_name)
        ws_new.sheet_properties = copy(ws_original.sheet_properties)
        ws_new.sheet_format = copy(ws_original.sheet_format)
        for row in ws_original.rows:
            for cell in row:
                new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
                _copy_cell_style(cell, new_cell)
        # Best effort: a failure to copy merged ranges is reported, not fatal.
        try:
            for merged_cell_range in ws_original.merged_cells.ranges:
                ws_new.merge_cells(str(merged_cell_range))
        except Exception as e:
            print(f"警告: 复制合并单元格时出错: {e}")

    # The first sheet is the main sheet that receives the CCF-A labels.
    ws_new = wb_new[wb_original.sheetnames[0]]

    # Locate the venue and CCF-A columns in the header row (1-based indexes).
    venue_col_index = None
    ccfa_col_index = None
    for cell in ws_new[header_row]:
        if cell.value == venue_col_name:
            venue_col_index = cell.column
        elif cell.value == ccfa_col_name:
            ccfa_col_index = cell.column

    # No CCF-A column yet: open a gap right after the venue column.
    if ccfa_col_index is None and venue_col_index is not None:
        ccfa_col_index = venue_col_index + 1
        last_col = ws_new.max_column
        last_row = ws_new.max_row
        # NOTE(review): merged ranges are not relocated by this shift, and
        # cells inside a merged range may not accept a value - confirm the
        # main sheet has no merged cells at or right of the new column.
        for row in range(1, last_row + 1):
            # Walk right to left, starting one past the last used column so
            # that the final column is moved instead of being overwritten.
            for col in range(last_col + 1, venue_col_index, -1):
                source_cell = ws_new.cell(row=row, column=col - 1)
                target_cell = ws_new.cell(row=row, column=col)
                target_cell.value = source_cell.value
                _copy_cell_style(source_cell, target_cell)
            # The gap inherits the venue column's style but must start empty.
            ws_new.cell(row=row, column=ccfa_col_index).value = None
        ws_new.cell(row=header_row, column=ccfa_col_index, value=ccfa_col_name)

    # Write one label per data row.
    if venue_col_index is not None and ccfa_col_index is not None:
        for i in input_df.index:
            excel_row = i + first_data_row  # DataFrame index -> worksheet row
            venue_value = ws_new.cell(row=excel_row, column=venue_col_index).value
            # Rows without a venue name are left untouched.
            if venue_value:
                venue_clean = clean_venue(venue_value)
                is_ccfa_value = venue_ccfa_dict.get(venue_clean)
                if is_ccfa_value is None:
                    # Not classified above: classify now so the cell always
                    # receives a '是'/'否' label.
                    is_ccfa_value = '是' if is_ccf_a(venue_clean) else '否'
                    venue_ccfa_dict[venue_clean] = is_ccfa_value
                ws_new.cell(row=excel_row, column=ccfa_col_index, value=is_ccfa_value)

    # Save the result, creating the output directory first if necessary.
    print(f"正在保存为Excel格式 {output_file_path} ...")
    try:
        output_dir = os.path.dirname(output_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        wb_new.save(output_file_path)
        print(f"成功保存到 {output_file_path},保留了原始格式")
    except Exception as e:
        print(f"保存Excel失败: {e}")
    print("处理完成!")
\ No newline at end of file
abbr,fullname
PPoPP,"ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming"
FAST,"USENIX Conference on File and Storage Technologies"
DAC,"Design Automation Conference"
HPCA,"IEEE International Symposium on High Performance Computer Architecture"
MICRO,"IEEE/ACM International Symposium on Microarchitecture"
SC,"International Conference for High Performance Computing, Networking, Storage, and Analysis"
ASPLOS,"International Conference on Architectural Support for Programming Languages and Operating Systems"
ISCA,"International Symposium on Computer Architecture"
USENIX ATC,"USENIX Annual Technical Conference"
EuroSys,"European Conference on Computer Systems"
SIGCOMM,"ACM International Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication"
MobiCom,"ACM International Conference on Mobile Computing and Networking"
INFOCOM,"IEEE International Conference on Computer Communications"
NSDI,"Symposium on Network System Design and Implementation"
CCS,"ACM Conference on Computer and Communications Security"
EUROCRYPT,"International Conference on the Theory and Applications of Cryptographic Techniques"
S&P,"IEEE Symposium on Security and Privacy"
CRYPTO,"International Cryptology Conference"
USENIX Security,"USENIX Security Symposium"
NDSS,"Network and Distributed System Security Symposium"
PLDI,"ACM SIGPLAN Conference on Programming Language Design and Implementation"
POPL,"ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages"
FSE,"ACM International Conference on the Foundations of Software Engineering"
SOSP,"ACM Symposium on Operating Systems Principles"
OOPSLA,"Conference on Object-Oriented Programming Systems, Languages,and Applications"
ASE,"International Conference on Automated Software Engineering"
ICSE,"International Conference on Software Engineering"
ISSTA,"International Symposium on Software Testing and Analysis"
OSDI,"USENIX Symposium on Operating Systems Design and Implementations"
FM,"International Symposium on Formal Methods"
SIGMOD,"ACM SIGMOD Conference"
SIGKDD,"ACM SIGKDD Conference on Knowledge Discovery and Data Mining"
ICDE,"IEEE International Conference on Data Engineering"
SIGIR,"International ACM SIGIR Conference on Research and Development in Information Retrieval"
VLDB,"International Conference on Very Large Data Bases"
STOC,"ACM Symposium on Theory of Computing"
SODA,"ACM-SIAM Symposium on Discrete Algorithms"
CAV,"International Conference on Computer Aided Verification"
FOCS,"IEEE Annual Symposium on Foundations of Computer Science"
LICS,"ACM/IEEE Symposium on Logic in Computer Science"
ACM MM,"ACM International Conference on Multimedia"
SIGGRAPH,"ACM Special Interest Group on Computer Graphics"
VR,"IEEE Virtual Reality"
IEEE VIS,"IEEE Visualization Conference"
AAAI,"AAAI Conference on Artificial Intelligence"
NeurIPS,"Conference on Neural Information Processing Systems"
ACL,"Annual Meeting of the Association for Computational Linguistics"
CVPR,"IEEE/CVF Computer Vision and Pattern Recognition Conference"
ICCV,"International Conference on Computer Vision"
ICML,"International Conference on Machine Learning"
IJCAI,"International Joint Conference on Artificial Intelligence"
CSCW,"ACM Conference on Computer Supported Cooperative Work and Social Computing"
CHI,"ACM Conference on Human Factors in Computing Systems"
UbiComp/IMWUT,"ACM international joint conference on Pervasive and Ubiquitous Computing/ Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"
UIST,"ACM Symposium on User Interface Software and Technology"
WWW,"International World Wide Web Conference"
RTSS,"IEEE Real-Time Systems Symposium"
WINE,"Conference on Web and Internet Economics"
TOCS,"ACM Transactions on Computer Systems"
TOS,"ACM Transactions on Storage"
TCAD,"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems"
TC,"IEEE Transactions on Computers"
TPDS,"IEEE Transactions on Parallel and Distributed Systems"
TACO,"ACM Transactions on Architecture and Code Optimization"
JSAC,"IEEE Journal on Selected Areas in Communications"
TMC,"IEEE Transactions on Mobile Computing"
TON,"IEEE/ACM Transactions on Networking"
TDSC,"IEEE Transactions on Dependable and Secure Computing"
TIFS,"IEEE Transactions on Information Forensics and Security"
,"Journal of Cryptology"
TOPLAS,"ACM Transactions on Programming Languages and Systems"
TOSEM,"ACM Transactions on Software Engineering and Methodology"
TSE,"IEEE Transactions on Software Engineering"
TSC,"IEEE Transactions on Services Computing"
TODS,"ACM Transactions on Database Systems"
TOIS,"ACM Transactions on Information Systems"
TKDE,"IEEE Transactions on Knowledge and Data Engineering"
VLDBJ,"The VLDB Journal"
TIT,"IEEE Transactions on Information Theory"
IANDC,"Information and Computation"
SICOMP,"SIAM Journal on Computing"
TOG,"ACM Transactions on Graphics"
TIP,"IEEE Transactions on Image Processing"
TVCG,"IEEE Transactions on Visualization and Computer Graphics"
AI,"Artificial Intelligence"
TPAMI,"IEEE Transactions on Pattern Analysis and Machine Intelligence"
IJCV,"International Journal of Computer Vision"
JMLR,"Journal of Machine Learning Research"
TOCHI,"ACM Transactions on Computer-Human Interaction"
IJHCS,"International Journal of Human-Computer Studies"
JACM,"Journal of the ACM"
Proc. IEEE,"Proceedings of the IEEE"
SCIS,"Science China Information Sciences"
This source diff could not be displayed because it is too large. You can view the blob instead.
import os
import sys
# Work from this script's directory so the relative info/ and output/ paths
# resolve. abspath() guards against a relative __file__, whose dirname can be
# an empty string that os.chdir() rejects.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
import json
import pandas as pd
from tqdm import tqdm
import openpyxl
from copy import copy
from joblib import Parallel, delayed
from utils import standardized_name, name_in_niuren_list
# Input workbook and the screened copy that is written out.
input_file_path = 'info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = 'output/论文被引用统计-陈老师-截止2025年X月XX日_牛人筛选.xlsx'

# Reference data: the candidate pool and the per-person paper lists.
niuren_pool_path = 'info/niuren_pool.csv'
true_niuren_papers_path = 'info/true_niuren_papers'
fake_niuren_papers_path = 'info/fake_niuren_papers'

# Create the output directory when it does not exist yet.
output_dir = os.path.dirname(output_file_path)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Loaded data is passed to the functions below as arguments; the earlier
# module-level containers (NIUREN_POOL_NAMES, TRUE_NIUREN_NAMES,
# TRUE_NIUREN_PAPERS, FAKE_NIUREN_NAMES, FAKE_NIUREN_PAPERS) are not defined.
def load_niuren_pool(niuren_pool_path):
    """Read the candidate pool CSV and return its ``name`` column as a list.

    Non-breaking spaces (``\\xa0``) in a name are replaced by ordinary spaces.
    The list keeps the CSV row order.
    """
    pool_table = pd.read_csv(niuren_pool_path, encoding='utf-8-sig')
    cleaned_names = []
    for raw_name in pool_table["name"].tolist():
        cleaned_names.append(raw_name.replace("\xa0", " "))
    return cleaned_names
def _load_paper_titles(papers_dir, count):
    """Load the normalised paper titles of persons 1..count.

    Person ``k`` is expected in ``<papers_dir>/<k>.xlsx`` with the titles in
    the first column; a missing file yields an empty list.

    Titles are stripped and lowercased so they compare equal to the
    ``title.strip().lower()`` key used in ``check_niuren``. ``str()`` keeps
    non-text cells from raising.
    """
    titles_per_person = []
    for number in range(1, count + 1):
        paper_file_path = os.path.join(papers_dir, f"{number}.xlsx")
        if not os.path.exists(paper_file_path):
            titles_per_person.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        titles_per_person.append(
            [str(paper).strip().lower() for paper in papers_df.iloc[:, 0].dropna()]
        )
    return titles_per_person


def load_true_niuren(true_niuren, true_niuren_papers_path):
    """Build the confirmed-person name list and the matching paper lists.

    Args:
        true_niuren: DataFrame of the "全局牛人" sheet. A parsed alias column
            "别名列表" is added to it in place.
        true_niuren_papers_path: Directory holding ``<n>.xlsx`` paper lists,
            numbered from 1 in sheet row order.

    Returns:
        ``(names, papers)``. ``names[i]`` is the standardised name, or a list
        ``[name, alias, ...]`` when aliases exist; ``papers[i]`` is that
        person's list of normalised titles.
    """
    alias_source_col = "别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"
    # Whitespace-only fragments (e.g. after a trailing ';') are not aliases.
    true_niuren["别名列表"] = true_niuren[alias_source_col].apply(
        lambda x: [standardized_name(i.strip()) for i in x.split(";") if i.strip()]
        if isinstance(x, str) else []
    )

    true_niuren_names = []
    for _, row in true_niuren.iterrows():
        # The first row without a name ends the list.
        if pd.isna(row["姓名"]):
            break
        primary_name = standardized_name(row["姓名"])
        aliases = row["别名列表"]
        true_niuren_names.append(([primary_name] + aliases) if aliases else primary_name)

    true_niuren_papers = _load_paper_titles(true_niuren_papers_path, len(true_niuren_names))
    return true_niuren_names, true_niuren_papers


def load_fake_niuren(fake_niuren, fake_niuren_papers_path):
    """Build the rejected-person name list and the matching paper lists.

    Args:
        fake_niuren: DataFrame of the "全局非牛人" sheet.
        fake_niuren_papers_path: Directory holding ``<n>.xlsx`` paper lists,
            numbered from 1 in sheet row order.

    Returns:
        ``(names, papers)`` with one standardised name and one list of
        normalised titles per person.
    """
    fake_niuren_names = []
    for _, row in fake_niuren.iterrows():
        # The first row without a name ends the list.
        if pd.isna(row["姓名"]):
            break
        fake_niuren_names.append(standardized_name(row["姓名"]))

    fake_niuren_papers = _load_paper_titles(fake_niuren_papers_path, len(fake_niuren_names))
    return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Classify the authors of one citing paper.

    An author found in the candidate pool is:
      * confirmed, when also found in the confirmed list and ``title`` is
        among that person's papers;
      * ignored, when found in the rejected list and ``title`` is among that
        person's papers;
      * suspected otherwise.
    Authors not found in the pool are skipped.

    Args:
        authors: Author name strings in byline order.
        title: Title of the citing paper; a non-string value (e.g. an empty
            cell) is treated as an empty title.
        niuren_pool_names: Candidate pool names.
        true_niuren_names / true_niuren_papers: Confirmed names and their
            per-person paper title lists.
        fake_niuren_names / fake_niuren_papers: Rejected names and their
            per-person paper title lists.

    Returns:
        Three ';'-joined strings: confirmed authors, their 1-based byline
        positions, and suspected authors formatted as ``name(pool_index)``
        with a 1-based pool index.
    """
    # Normalise the title once. A non-string title used to raise on .strip(),
    # and the caller's broad except then discarded the whole row.
    title_key = title.strip().lower() if isinstance(title, str) else ""

    niuren = []
    order = []
    suspected_niuren = []
    for author_idx, author in enumerate(authors):
        if author == "":
            continue
        std_name = standardized_name(author)

        # Is the author in the candidate pool?
        name_index = name_in_niuren_list(std_name, niuren_pool_names)
        if name_index == -1:
            continue

        # Confirmed for this very paper?
        true_name_index = name_in_niuren_list(std_name, true_niuren_names)
        if true_name_index != -1 and title_key in true_niuren_papers[true_name_index]:
            niuren.append(author)
            order.append(author_idx + 1)
            continue

        # Rejected for this very paper?
        fake_name_index = name_in_niuren_list(std_name, fake_niuren_names)
        if fake_name_index != -1 and title_key in fake_niuren_papers[fake_name_index]:
            continue

        suspected_niuren.append((author, name_index + 1))

    niuren_str = ";".join(niuren)
    order_str = ";".join(str(position) for position in order)
    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
    return niuren_str, order_str, suspected_niuren_str
def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Run ``check_niuren`` for one data row.

    Reads the ';'-separated author string from '引文作者' and the title from
    '引文名称'. Returns ``(index, confirmed, positions, suspected)``; the three
    strings are empty when the author cell is not a string or when any
    exception occurs (the error is printed).
    """
    blank_result = (index, "", "", "")
    try:
        raw_authors = row.get('引文作者', '')
        title = row.get('引文名称', '')
        if not isinstance(raw_authors, str):
            return blank_result
        author_list = []
        for piece in raw_authors.split(";"):
            if piece != "":
                author_list.append(piece.strip())
        confirmed, positions, suspected = check_niuren(
            author_list, title,
            niuren_pool_names, true_niuren_names, true_niuren_papers,
            fake_niuren_names, fake_niuren_papers,
        )
        return index, confirmed, positions, suspected
    except Exception as e:
        print(f"处理行 {index} 时发生错误: {e}")
        return blank_result
if __name__ == "__main__":
    # Load the reference sheets and lookup tables.
    print("正在读取全局牛人...")
    true_niuren = pd.read_excel(input_file_path, sheet_name="全局牛人")
    print("正在读取全局非牛人...")
    fake_niuren = pd.read_excel(input_file_path, sheet_name="全局非牛人")
    print("正在处理牛人池...")
    niuren_pool_names = load_niuren_pool(niuren_pool_path)
    print("正在处理真牛人数据...")
    true_niuren_names, true_niuren_papers = load_true_niuren(true_niuren, true_niuren_papers_path)
    print("正在处理非牛人数据...")
    fake_niuren_names, fake_niuren_papers = load_fake_niuren(fake_niuren, fake_niuren_papers_path)

    # Column titles come from worksheet row 4; data starts at row 8.
    column_names = pd.read_excel(input_file_path, nrows=0, header=3).columns.tolist()
    input_df = pd.read_excel(input_file_path, skiprows=7, header=None, names=column_names)

    # Basic information about the header and the data.
    print("表头元素:")
    print(column_names)
    print("\n数据行数:", input_df.shape[0])
    print("数据列数:", input_df.shape[1])

    # Result columns; each is added to the DataFrame when absent.
    # NOTE(review): these titles must match the workbook header text exactly
    # (including the embedded newline and the bracket characters) - confirm.
    suspected_col = '疑似牛人'
    niuren_col = '牛人\n(参考全局牛人列表)'
    niuren_order_col = '牛人署名顺序\n'
    for result_col in (suspected_col, niuren_col, niuren_order_col):
        if result_col not in input_df.columns:
            input_df[result_col] = None

    # Process the rows in parallel; the loaded tables go to every task.
    print("开始并行处理数据...")
    shared_args = (
        niuren_pool_names, true_niuren_names, true_niuren_papers,
        fake_niuren_names, fake_niuren_papers,
    )
    tasks = (
        delayed(process_row)(index, row, *shared_args)
        for index, row in tqdm(input_df.iterrows())
    )
    results = Parallel(n_jobs=-1)(tasks)
    for index, niuren_true, niuren_true_order, suspected_niuren in results:
        input_df.at[index, niuren_col] = niuren_true
        input_df.at[index, niuren_order_col] = niuren_true_order
        input_df.at[index, suspected_col] = suspected_niuren

    # Clone the workbook cell by cell so the original formatting is kept.
    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    wb_new = openpyxl.Workbook()
    # Remove the blank sheet a new workbook starts with.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']
    for sheet_name in wb_original.sheetnames:
        ws_original = wb_original[sheet_name]
        ws_new = wb_new.create_sheet(sheet_name)
        ws_new.sheet_properties = copy(ws_original.sheet_properties)
        ws_new.sheet_format = copy(ws_original.sheet_format)
        for source_row in ws_original.rows:
            for cell in source_row:
                new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
                if not cell.has_style:
                    continue
                new_cell.font = copy(cell.font)
                new_cell.border = copy(cell.border)
                new_cell.fill = copy(cell.fill)
                new_cell.number_format = copy(cell.number_format)
                new_cell.protection = copy(cell.protection)
                new_cell.alignment = copy(cell.alignment)
        for merged_cell_range in ws_original.merged_cells.ranges:
            ws_new.merge_cells(str(merged_cell_range))

    # The first sheet is the main sheet that receives the results.
    ws_new = wb_new[wb_original.sheetnames[0]]
    print("正在更新主工作表中的牛人数据...")

    # Find the 1-based index of each result column in header row 4.
    column_positions = {niuren_col: None, niuren_order_col: None, suspected_col: None}
    for position, header_text in enumerate((cell.value for cell in ws_new[4]), start=1):
        if header_text in column_positions:
            column_positions[header_text] = position

    # Result columns missing from the sheet are appended after the last column.
    next_free_column = ws_new.max_column
    for header_text in (niuren_col, niuren_order_col, suspected_col):
        if column_positions[header_text] is None:
            next_free_column += 1
            column_positions[header_text] = next_free_column
            ws_new.cell(row=4, column=next_free_column, value=header_text)

    # Write only the three result columns, starting at worksheet row 8.
    for i, row in input_df.iterrows():
        excel_row = i + 8  # DataFrame index -> worksheet row
        for header_text, column in column_positions.items():
            ws_new.cell(row=excel_row, column=column, value=row[header_text])

    # Save the new workbook.
    print(f"正在保存为Excel格式 {output_file_path} ...")
    wb_new.save(output_file_path)
    print(f"成功保存到 {output_file_path},保留了原始格式")
    print("处理完成!")
\ No newline at end of file
import os
import json
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from utils import firstlast2lastfirst, name_in_niuren_list
# Input workbook (absolute path on one developer's machine) and output file.
# NOTE(review): the absolute path only resolves on that machine.
input_file_path = '/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx'
output_file_path = '测试输出3.xlsx'
# Module-level containers intended to hold the loaded data.
# NOTE(review): none of the functions visible in this chunk reads or writes
# them - confirm they are still needed.
NIUREN_POOL_NAMES = []
TRUE_NIUREN_NAMES = []
TRUE_NIUREN_PAPERS = []
FAKE_NIUREN_NAMES = []
FAKE_NIUREN_PAPERS = []
def load_niuren_pool():
    """Read the candidate pool CSV and return its ``name`` column as a list.

    Non-breaking spaces (``\\xa0``) in a name are replaced by ordinary spaces.
    The list keeps the CSV row order.
    """
    pool_table = pd.read_csv(
        "/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/new_niuren_format-merged_turing.csv",
        encoding='utf-8-sig',
    )
    cleaned_names = []
    for raw_name in pool_table["name"].tolist():
        cleaned_names.append(raw_name.replace("\xa0", " "))
    return cleaned_names
def _load_paper_titles(papers_dir, count):
    """Load the normalised paper titles of persons 1..count.

    Person ``k`` is expected in ``<papers_dir>/<k>.xlsx`` with the titles in
    the first column; a missing file yields an empty list.

    Titles are stripped and lowercased so they compare equal to the
    ``title.strip().lower()`` key used in ``check_niuren``. ``str()`` keeps
    non-text cells from raising.
    """
    titles_per_person = []
    for number in range(1, count + 1):
        paper_file_path = os.path.join(papers_dir, f"{number}.xlsx")
        if not os.path.exists(paper_file_path):
            titles_per_person.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        titles_per_person.append(
            [str(paper).strip().lower() for paper in papers_df.iloc[:, 0].dropna()]
        )
    return titles_per_person


def load_true_niuren():
    """Build the confirmed-person name list and the matching paper lists.

    Reads the "全局牛人" sheet of the workbook and the per-person paper files
    ``<n>.xlsx`` (numbered from 1 in sheet row order) from hard-coded paths.

    Returns:
        ``(names, papers)``. ``names[i]`` is the converted name, or a list
        ``[name, alias, ...]`` when aliases exist; ``papers[i]`` is that
        person's list of normalised titles.
    """
    true_niuren = pd.read_excel("/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx", sheet_name="全局牛人")
    alias_source_col = "别名列表(各种奇奇怪怪的名字格式,比如first name和second name的顺序,以;分隔)"
    # Whitespace-only fragments (e.g. after a trailing ';') are not aliases.
    true_niuren["别名列表"] = true_niuren[alias_source_col].apply(
        lambda x: [firstlast2lastfirst(i.strip()) for i in x.split(";") if i.strip()]
        if isinstance(x, str) else []
    )

    true_niuren_names = []
    for _, row in true_niuren.iterrows():
        # The first row without a name ends the list.
        if pd.isna(row["姓名"]):
            break
        primary_name = firstlast2lastfirst(row["姓名"])
        aliases = row["别名列表"]
        true_niuren_names.append(([primary_name] + aliases) if aliases else primary_name)

    true_niuren_papers = _load_paper_titles(
        "/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/niuren_papers",
        len(true_niuren_names),
    )
    return true_niuren_names, true_niuren_papers


def load_fake_niuren():
    """Build the rejected-person name list and the matching paper lists.

    Reads the "全局非牛人" sheet of the workbook and the per-person paper files
    ``<n>.xlsx`` (numbered from 1 in sheet row order) from hard-coded paths.

    Returns:
        ``(names, papers)`` with one converted name and one list of
        normalised titles per person.
    """
    fake_niuren = pd.read_excel("/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx", sheet_name="全局非牛人")

    fake_niuren_names = []
    for _, row in fake_niuren.iterrows():
        # The first row without a name ends the list.
        if pd.isna(row["姓名"]):
            break
        fake_niuren_names.append(firstlast2lastfirst(row["姓名"]))

    fake_niuren_papers = _load_paper_titles(
        "/Users/zhengxinhan/Desktop/papertools-master-2cb16c72b94585f63d8a221966bdc782a086fda5/niurenshaixuan/info/feiniuren_papers",
        len(fake_niuren_names),
    )
    return fake_niuren_names, fake_niuren_papers
def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Classify the authors of one citing paper.

    An author found in the candidate pool is:
      * confirmed, when also found in the confirmed list and ``title`` is
        among that person's papers;
      * ignored, when found in the rejected list and ``title`` is among that
        person's papers;
      * suspected otherwise.
    Authors not found in the pool are skipped.

    Args:
        authors: Author name strings in byline order.
        title: Title of the citing paper; a non-string value (e.g. an empty
            cell) is treated as an empty title.
        niuren_pool_names: Candidate pool names.
        true_niuren_names / true_niuren_papers: Confirmed names and their
            per-person paper title lists.
        fake_niuren_names / fake_niuren_papers: Rejected names and their
            per-person paper title lists.

    Returns:
        Three ';'-joined strings: confirmed authors, their 1-based byline
        positions, and suspected authors formatted as ``name(pool_index)``
        with a 1-based pool index. The suspected string is cut to 2000
        characters plus "..." when longer.
    """
    # Normalise the title once. A non-string title used to raise on .strip(),
    # and the caller's broad except then discarded the whole row.
    title_key = title.strip().lower() if isinstance(title, str) else ""

    niuren = []
    order = []
    suspected_niuren = []
    for author_idx, author in enumerate(authors):
        if author == "":
            continue
        converted_name = firstlast2lastfirst(author)

        # Is the author in the candidate pool?
        name_index = name_in_niuren_list(converted_name, niuren_pool_names)
        if name_index == -1:
            continue

        # Confirmed for this very paper?
        true_name_index = name_in_niuren_list(converted_name, true_niuren_names)
        if true_name_index != -1 and title_key in true_niuren_papers[true_name_index]:
            niuren.append(author)
            order.append(author_idx + 1)
            continue

        # Rejected for this very paper?
        fake_name_index = name_in_niuren_list(converted_name, fake_niuren_names)
        if fake_name_index != -1 and title_key in fake_niuren_papers[fake_name_index]:
            continue

        suspected_niuren.append((author, name_index + 1))

    niuren_str = ";".join(niuren)
    order_str = ";".join(str(position) for position in order)
    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
    # Truncate an over-long string to avoid problems when exporting to Excel.
    if len(suspected_niuren_str) > 2000:
        suspected_niuren_str = suspected_niuren_str[:2000] + "..."
    return niuren_str, order_str, suspected_niuren_str
def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Run the niuren check for a single row of the input sheet.

    :param index: row label, returned unchanged so results can be written
        back to the right row.
    :param row: mapping-like row exposing ``.get``; the ``引文作者`` field
        holds ``;``-separated author names and ``引文名称`` holds the title.
    :param niuren_pool_names: forwarded to ``check_niuren``.
    :param true_niuren_names: forwarded to ``check_niuren``.
    :param true_niuren_papers: forwarded to ``check_niuren``.
    :param fake_niuren_names: forwarded to ``check_niuren``.
    :param fake_niuren_papers: forwarded to ``check_niuren``.
    :return: ``(index, niuren, niuren_order, suspected_niuren)``; the three
        strings are empty when the author field is not a string or when
        processing the row raises.
    """
    try:
        authors = row.get('引文作者', '')
        title = row.get('引文名称', '')
        if not isinstance(authors, str):
            return index, "", "", ""
        # Strip before filtering so a whitespace-only segment ("A; ;B") is
        # dropped exactly like an empty one ("A;;B") and does not shift the
        # signature order of the authors that follow it.
        stripped_parts = (part.strip() for part in authors.split(";"))
        authors = [part for part in stripped_parts if part]
        niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
        return index, niuren_true, niuren_true_order, suspected_niuren
    except Exception as e:
        # Deliberate best effort: one bad row must not abort the whole batch.
        print(f"处理行 {index} 时发生错误: {e}")
        return index, "", "", ""
if __name__ == "__main__":
    # Load every reference table exactly once, before the parallel stage.
    print("正在加载牛人池...")
    niuren_pool_names = load_niuren_pool()
    print("正在加载真牛人数据...")
    true_niuren_names, true_niuren_papers = load_true_niuren()
    print("正在加载假牛人数据...")
    fake_niuren_names, fake_niuren_papers = load_fake_niuren()

    # Keep the first two rows of the original workbook verbatim.
    original_header = pd.read_excel(input_file_path, nrows=2, header=None)
    # Read the data; the row after the skipped ones becomes the column header.
    # NOTE(review): the original comment said the first two rows are skipped,
    # but skiprows=3 skips three, so the third sheet row is not carried into
    # the output - confirm this matches the sheet layout.
    input_df = pd.read_excel(input_file_path, skiprows=3)

    # Add the result columns.
    input_df['牛人'] = None
    input_df['牛人署名顺序'] = None
    input_df['疑似牛人'] = None

    print("开始并行处理数据...")
    # Process rows in parallel, handing the preloaded tables to every task.
    # total= lets tqdm show real progress; iterrows() is a generator with no
    # length of its own.
    results = Parallel(n_jobs=-1)(
        delayed(process_row)(
            index, row,
            niuren_pool_names, true_niuren_names, true_niuren_papers,
            fake_niuren_names, fake_niuren_papers
        )
        for index, row in tqdm(input_df.iterrows(), total=len(input_df))
    )
    for index, niuren_true, niuren_true_order, suspected_niuren in results:
        input_df.at[index, '牛人'] = niuren_true
        input_df.at[index, '牛人署名顺序'] = niuren_true_order
        input_df.at[index, '疑似牛人'] = suspected_niuren

    max_cols = max(original_header.shape[1], input_df.shape[1])

    def pad_df(df, max_cols):
        """Return a copy of ``df`` with exactly ``max_cols`` columns named 0..max_cols-1.

        Missing columns are appended as empty strings so frames of different
        widths can be concatenated positionally.
        """
        df = df.copy()
        for i in range(df.shape[1], max_cols):
            df[i] = ""
        df = df.iloc[:, :max_cols]  # cap at max_cols columns
        df.columns = range(max_cols)  # positional integer column names
        return df

    # Pad every frame to the same width.
    original_header_padded = pad_df(original_header, max_cols)
    input_header = pd.DataFrame([input_df.columns.tolist()])
    input_header_padded = pad_df(input_header, max_cols)
    input_data_padded = pad_df(input_df, max_cols)
    # Stack: original top rows, the column-name row, then the data.
    output_df = pd.concat([original_header_padded, input_header_padded, input_data_padded], ignore_index=True)

    # Make sure the output directory exists, so the results of the long
    # parallel run are not lost to a missing folder.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save a CSV first as a fallback.  splitext only swaps the real
    # extension, unlike str.replace which would also touch ".xlsx" appearing
    # elsewhere in the path.
    csv_path = os.path.splitext(output_file_path)[0] + '.csv'
    print(f"先保存为CSV格式 {csv_path} ...")
    output_df.to_csv(csv_path, index=False, header=False)

    # Then try the Excel export.
    try:
        print(f"尝试保存为Excel格式 {output_file_path} ...")
        output_df.to_excel(output_file_path, index=False, header=False, engine='openpyxl')
        print(f"成功保存到 {output_file_path}")
    except Exception as e:
        print(f"保存Excel失败: {e}")
        print(f"请使用CSV格式文件 {csv_path}")
    print("处理完成!")
\ No newline at end of file
'''
Author: zhengxinhan
Date: 2025-05-08 15:02:02
LastEditors: zhengxinhan
LastEditTime: 2025-05-09 02:46:29
FilePath: /papertools-master/niurenshaixuan/utils.py
Description:
Copyright (c) 2025 by m13521952989@163.com, All Rights Reserved.
'''
def _is_parenthesized(token):
    """Return True when ``token`` is wrapped in parentheses, e.g. ``(Tracy)``."""
    return token.startswith("(") and token.endswith(")")


def standardized_name(name):
    """
    Convert a "First Last" style name to "Last, First" format.

    A name that already contains a comma is returned unchanged.  Otherwise the
    name is split on whitespace, honorific/suffix tokens are dropped, and the
    remaining tokens are rearranged:

    * fewer than two tokens: the remaining token (or an empty string) is
      returned as is, because there is nothing to reorder;
    * three tokens: a set of heuristics decides which token is the surname
      (see the inline examples);
    * any other count: the last token is the surname.

    :param name: the raw name string.
    :return: the name in "Last, First" format.
    """
    if "," in name:
        return name
    # Some names in the list are "First Last"; convert them to "Last, First".
    # e.g. 'John Doe' -> 'Doe, John', 'M. Jane Smith' -> 'Smith, Jane M.'
    special_str = [".", "Ms.", "Mr.", "Mrs.", "Dr.", "Prof.", "PhD", "MD", "Jr.", "Sr.", "The", "Honorable"]
    # str.split() with no argument splits on any whitespace run and never
    # yields empty tokens.
    name_split = [item for item in name.split() if item not in special_str]

    # Empty / honorific-only input or a mononym: indexing for a surname would
    # raise or yield a dangling "Token, ", so return what is left unchanged.
    if len(name_split) < 2:
        return " ".join(name_split)

    if len(name_split) == 3:
        first, second, third = name_split
        if _is_parenthesized(first):
            # (Alexander) Philip Dawid --> Dawid, Philip (Alexander)
            first_name, last_name = f"{second} {first}", third
        elif _is_parenthesized(second):
            # Xinyan (Tracy) Cui --> Cui, Xinyan (Tracy)
            first_name, last_name = f"{first} {second}", third
        elif _is_parenthesized(third):
            # Surname first, given name in parentheses, English name between:
            # Ye Fred (Ying) --> Ye, Ying (Fred)
            # Zhu Jesse (Jingxu) --> Zhu, Jingxu (Jesse)
            first_name, last_name = f"{third[1:-1]} ({second})", first
        elif first.endswith("."):
            # M. Jane Smith --> Smith, Jane M.
            # K.W. Michael Siu --> Siu, Michael K.W.
            first_name, last_name = f"{second} {first}", third
        elif second.endswith("."):
            # Jane M. Smith --> Smith, Jane M.
            # Pierre J.H. Richardson --> Richardson, Pierre J.H.
            first_name, last_name = f"{first} {second}", third
        elif third.endswith("."):
            # Wimmer-Schweingruber Robert F. --> Wimmer-Schweingruber, Robert F.
            # Wilderer Peter A. --> Wilderer, Peter A.
            first_name, last_name = f"{second} {third}", first
        else:
            # William Nelson Joy --> Joy, William Nelson
            # Grigory Isaakovich Barenblatt --> Barenblatt, Grigory Isaakovich
            # Michael J Carey --> Carey, Michael J
            first_name, last_name = f"{first} {second}", third
    else:
        # Two tokens, or four and more: the last token is the surname.
        # Names longer than three tokens are not expected to collide with
        # the other cases.
        last_name = name_split[-1]
        first_name = " ".join(name_split[:-1])
    return f"{last_name}, {first_name}"
def name_in_niuren_list(name, niuren_name_list):
    """
    Find the position of a name in a niuren list.

    Each entry of the list is either a single name string or a list of alias
    strings for one person; entries of any other type are ignored.  Matching
    is done in two passes over the whole list:

    1. exact match against an entry (or any of its aliases);
    2. word-boundary prefix match: one of the two names equals the other
       followed by a space and more text (e.g. "Doe, John" and
       "Doe, John M.").

    An exact match anywhere in the list therefore wins over an earlier
    prefix match.

    :param name: the name to look up
    :param niuren_name_list: list of name strings and/or alias lists
    :return: the index of the first matching entry, or -1 when nothing matches
    """
    # Pass 1: exact matches.
    for idx, niuren_name in enumerate(niuren_name_list):
        if isinstance(niuren_name, str):
            if name == niuren_name:
                return idx
        elif isinstance(niuren_name, list):
            if name in niuren_name:
                return idx

    # Pass 2: prefix matches that end on a word boundary.
    for idx, niuren_name in enumerate(niuren_name_list):
        if isinstance(niuren_name, str):
            aliases = [niuren_name]
        elif isinstance(niuren_name, list):
            aliases = niuren_name
        else:
            continue
        for alias in aliases:
            # Skip non-string aliases (e.g. NaN from a blank cell), the same
            # way non-string top-level entries are skipped.
            if not isinstance(alias, str):
                continue
            # "x.startswith(y + ' ')" is "x starts with y and the next
            # character is a space", without any manual indexing.
            if alias.startswith(name + " ") or name.startswith(alias + " "):
                return idx
    return -1
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment