Commit 7c363521 by matianyun

Upload New File

parent 03c9d49c
"""
功能:判断论文是否属于CCF-A类会议
输入文件:CCF_A_list.csv,包含两列:abbr,fullname
输入文件:info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx
输出文件:处理后的Excel文件,保持原格式不变,添加是否CCF-A信息
"""
import os
import pandas as pd
import re
from rapidfuzz import fuzz, process
from collections import defaultdict
import openpyxl
from copy import copy
# ---------------------------------------------------------------------------
# Path parameters
# ---------------------------------------------------------------------------
input_file_path = "info/测试大表2.xlsx"
output_file_path = "output/论文被引用统计-陈老师-截止2025年X月XX日-CCFA标记.xlsx"
ccf_a_file = "info/CCF_A_list.csv"  # adjust to the actual location if needed

# ---------------------------------------------------------------------------
# CCF-A reference list (the CSV provides the columns 'abbr' and 'fullname')
# ---------------------------------------------------------------------------
ccf_a = pd.read_csv(ccf_a_file)

# Abbreviations: lower-cased, every space removed, then trimmed.
ccf_a_abbrs = set(
    ccf_a['abbr']
    .dropna()
    .str.lower()
    .str.replace(' ', '')
    .str.strip()
    .tolist()
)

# Full names: lower-cased and trimmed at both ends.
ccf_a_fullnames = set(
    ccf_a['fullname']
    .dropna()
    .str.lower()
    .str.strip()
    .tolist()
)

# Every spelling (abbreviation or full name) that identifies a CCF-A venue.
ccf_a_names = ccf_a_abbrs.union(ccf_a_fullnames)
def clean_venue(venue):
    """Normalise a raw journal/conference name for matching.

    Steps, in order:
      1. Missing values and blank strings become ``""``.
      2. Any bracketed group that mentions "volume" (case-insensitive) is
         removed, e.g. ``"(Volume: 12)"``.  Both ASCII ``()`` and full-width
         brackets (U+FF08 / U+FF09) are recognised.
      3. Digits and the punctuation characters ``' " , . ! / \\ ? ; - _`` are
         deleted (not replaced by a space).
      4. The result is lower-cased and trimmed.

    Brackets without "volume" and colons are kept on purpose: later matching
    steps read the abbreviation out of them.

    Args:
        venue: Raw cell value; anything ``str()`` can render, or a missing
            value as understood by ``pandas.isnull``.

    Returns:
        The normalised venue string, possibly empty.
    """
    if pd.isnull(venue):
        return ""
    text = str(venue)
    if text.strip() == "":
        return ""
    # \uff08 / \uff09 are the full-width parentheses; they are written as
    # escapes so the pattern survives editors that normalise such characters.
    text = re.sub(
        r'[(\uff08][^)\uff09]*?volume[^)\uff09]*?[)\uff09]',
        '',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"[0-9'\",\.\!\/\\\?\;\-\_]", "", text)
    return text.lower().strip()
def extract_brackets(text):
    """Return the content of the first bracketed group in *text*.

    Both ASCII parentheses and full-width brackets (U+FF08 / U+FF09) are
    accepted as delimiters.  The match is non-greedy, so it stops at the
    first closing bracket.

    Args:
        text: String to search.

    Returns:
        The bracket content, trimmed and lower-cased (may be ``""`` for an
        empty pair of brackets), or ``None`` when no bracket pair is found.
    """
    # Full-width brackets are spelled as \uXXXX escapes so they cannot be
    # silently collapsed into their ASCII look-alikes.
    match = re.search(r'[(\uff08](.*?)[)\uff09]', text)
    if match is None:
        return None
    return match.group(1).strip().lower()
def extract_before_colon(text):
    """Return the part of *text* that precedes its first colon.

    Both the ASCII colon and the full-width colon (U+FF1A) count as the
    separator.

    Args:
        text: String to search.

    Returns:
        The text before the first colon, trimmed and lower-cased (``""`` when
        the string starts with a colon), or ``None`` when there is no colon.
    """
    # Only the leading segment is needed, so split at most once.
    parts = re.split(r'[:\uff1a]', text, maxsplit=1)
    if len(parts) < 2:
        return None
    return parts[0].strip().lower()
# "International Journal ..." venues are rejected unless they contain one of
# these fragments.  The hyphen-free spelling covers names that already went
# through clean_venue(), which deletes '-'.
_INTL_JOURNAL_EXCEPTIONS = (
    'computer vision',
    'human-computer studies',
    'humancomputer studies',
)

# Substrings that mark a venue as CCF-A straight away.
_CCF_A_KEYWORDS = (
    'ieee micro',
    'design automation conference',
    'ieee transactions on parallel & distributed systems',
    'ieee transactions on computers',
    'transactions on architecture and code optimization',
    '信息科学',
    'pldi',
    'transactions on computer systems',
    'conference on computer vision and pattern recognition',
    'transactions on computeraided design of integrated circuits and systems',
    'aaai conference on artificial intelligence',
    'mm ',
    'transactions on parallel and distributed systems',
)

# Substrings that mark a venue as NOT CCF-A (checked after the list above).
_NON_CCF_A_KEYWORDS = (
    'ijcnn',
    'iccd',
    'icpr',
    'aicas',
    'acm transactions on embedded computing systems',
    'iccad',
    'artificial intelligence review',
    'ieee journal of selected topics in quantum electronics',
    'future generation computer systems',
    'ieee transactions on services computing',
    'ipdps',
    'isocc',
    'microprocessors and microsystems',
    'ccet',
    'microelectronics',
    'asid',
    'caai',
    'euromicro conference on',
    'asicon',
    'transactions on system and lsi design methodology',
    'selected areas in communications',
    'high performance computing data and analytics',
    'innovation communication and engineering',
    'sysml conference',
    'international conference on digital signal processing',
    'international conference on computer and communication systems',
    'international symposium on performance analysis of systems and software',
    'wireless communications and mobile computing',
    'sigops operating systems review',
    'siggraphasia',
    'international journal of computational intelligence systems',
    'journal of ikeee',
    'artificial intelligence and security',
    'journal of software',
    'advances in artificial intelligence',
    'international symposium on networksonchip',
    'international journal of web information systems',
    'journal of timecritical computing systems',
    'journal of advanced computer science and applications',
    'conference on computing and informatics',
    'journal of big data',
    'journal of parallel programming',
    'international conference on computer and communications',
    'journal of supercomputing',
    'conference on computing frontiers',
    'international conference on industry applications',
    'artificial intelligence advances',
    'pattern recognition and artificial intelligence',
    'journal of electronics and communications',
    'journal of data science and analytics',
    'journal of parallel emergent and distributed systems',
    'conference on advances in electrical engineering',
    'international conference on frontiers in computing and systems',
    'conference on sentiment analysis and deep learning',
    'acm symposium on cloud computing',
    'conference on tools with artificial intelligence',
    'conference on supercomputing',
    'frontiers in artificial intelligence ',
    'conference on communication technology',
    'big data information and computer network',
    'conference on software engineering education and training',
    'journal of computers',
    'washington',
    'international conference on cloud computing',
    'acm on programming languages',
    'conference on high performance computing',
    'ubm designcon conference',
    'scientia sinica informationis',
    'symposium on computing and networking',
    'journal of mathematics',
    'iet image processing',
    'journal of computing and digital systems',
    'conference on very large scale integration',
    'journal of networking and computing',
    'transactions on cryptographic hardware and embedded systems',
    'international conference on hybrid intelligent systems',
    'international conference on asic',
    'international conference on nexgen technologies',
    'international conference on computer and information technology',
    'international journal of high performance systems architecture',
    'international conference on power and energy engineering',
    'international symposium on smart electronic systems',
)

# Venues starting with one of these generic prefixes are too similar to each
# other for fuzzy matching, so they must equal a cleaned CCF-A full name.
_EXACT_MATCH_PREFIXES = (
    "ieee transactions on",
    "ieee international conference on",
    "ccf transactions on",
    "engineering applications of",
    "acm transactions on",
    "international conference on",
    "ieice transactions on",
)


def is_ccf_a(venue, threshold=70):
    """Decide whether a venue name denotes a CCF-A conference or journal.

    The venue is lower-cased and trimmed, then tested against a sequence of
    rules; the first rule that applies determines the result:

      1. Missing / blank venue                                    -> 0
      2. Contains 'south pacific design automation conference'    -> 0
      3. Contains 'international journal' without one of the
         ``_INTL_JOURNAL_EXCEPTIONS`` fragments                   -> 0
      4. Contains any ``_CCF_A_KEYWORDS`` entry                   -> 1
      5. Contains any ``_NON_CCF_A_KEYWORDS`` entry               -> 0
      6. Starts with any ``_EXACT_MATCH_PREFIXES`` entry          -> 1 only if
         it equals a CCF-A full name after ``clean_venue``; else 0
      7. First bracket content, spaces removed, is a CCF-A
         abbreviation                                             -> 1
      8. Text before the first colon, spaces removed, is a CCF-A
         abbreviation                                             -> 1
      9. Exact member of ``ccf_a_names``                          -> 1
     10. Six characters or fewer (too short for fuzzy matching)   -> 0
     11. Best ``fuzz.ratio`` score against ``ccf_a_names`` is at
         least ``threshold``                                      -> 1, else 0

    Args:
        venue: Venue name.  The keyword lists use hyphen-free spellings, so
            the caller is expected to pass the output of ``clean_venue``.
        threshold: Minimum ``fuzz.ratio`` score accepted in rule 11.

    Returns:
        1 when the venue is judged to be CCF-A, otherwise 0.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return 0
    # str() keeps non-string cell values (e.g. numbers) from raising here.
    venue_clean = str(venue).lower().strip()

    if 'south pacific design automation conference' in venue_clean:
        return 0

    # All comparisons are lower-case because venue_clean is lower-case.  The
    # venue is rejected only when NONE of the exception fragments is present.
    if 'international journal' in venue_clean and not any(
        fragment in venue_clean for fragment in _INTL_JOURNAL_EXCEPTIONS
    ):
        return 0

    if any(keyword in venue_clean for keyword in _CCF_A_KEYWORDS):
        return 1
    if any(keyword in venue_clean for keyword in _NON_CCF_A_KEYWORDS):
        return 0

    # Generic prefixes: exact match against the cleaned full names only.
    if venue_clean.startswith(_EXACT_MATCH_PREFIXES):
        ccf_a_fullnames_clean = {clean_venue(name) for name in ccf_a_fullnames}
        return 1 if venue_clean in ccf_a_fullnames_clean else 0

    # Abbreviation inside brackets (exact match after removing spaces).
    bracket_content = extract_brackets(venue_clean)
    if bracket_content and bracket_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # Abbreviation in front of a colon (exact match after removing spaces).
    colon_content = extract_before_colon(venue_clean)
    if colon_content and colon_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # Whole-string match: exact first, fuzzy only for longer names.
    if venue_clean in ccf_a_names:
        return 1
    if len(venue_clean) <= 6:
        return 0

    result = process.extractOne(venue_clean, ccf_a_names, scorer=fuzz.ratio)
    if not result:
        return 0
    score = result[1]
    return 1 if score >= threshold else 0
# Worksheet layout (1-based Excel row numbers).
_HEADER_ROW = 4        # row holding the column titles
_FIRST_DATA_ROW = 8    # first row holding a paper record

# Cell attributes copied when a cell's formatting is duplicated.
_STYLE_ATTRS = ('font', 'border', 'fill', 'number_format', 'protection', 'alignment')


def _copy_cell_style(source_cell, target_cell):
    """Copy the formatting of *source_cell* onto *target_cell* (no-op if unstyled)."""
    if not source_cell.has_style:
        return
    for attr in _STYLE_ATTRS:
        if hasattr(source_cell, attr):
            setattr(target_cell, attr, copy(getattr(source_cell, attr)))


def _find_header_columns(worksheet, header_row, names):
    """Map each title in *names* to its column index in *header_row* (None if absent)."""
    found = dict.fromkeys(names)
    for cell in worksheet[header_row]:
        if cell.value in found:
            found[cell.value] = cell.column
    return found


def _clone_worksheet(source_ws, target_ws, insert_before=None):
    """Copy values, styles and merged ranges from *source_ws* into *target_ws*.

    When *insert_before* is a column index, an empty column is opened at that
    position: every source column at or to the right of it lands one column
    further right, merged ranges move/grow accordingly, and the new column
    inherits the formatting of the column on its left.
    """
    def shifted(column):
        if insert_before is not None and column >= insert_before:
            return column + 1
        return column

    # Copy sheet-level properties, but only those that exist.
    if hasattr(source_ws, 'sheet_properties'):
        target_ws.sheet_properties = copy(source_ws.sheet_properties)
    if hasattr(source_ws, 'sheet_format'):
        target_ws.sheet_format = copy(source_ws.sheet_format)

    for row in source_ws.rows:
        for cell in row:
            new_cell = target_ws.cell(
                row=cell.row, column=shifted(cell.column), value=cell.value
            )
            _copy_cell_style(cell, new_cell)
            if insert_before is not None and cell.column == insert_before - 1:
                # Give the newly opened column the look of its left neighbour.
                _copy_cell_style(
                    cell, target_ws.cell(row=cell.row, column=insert_before)
                )

    # Merged ranges are copied best-effort: a failure only produces a warning.
    if hasattr(source_ws, 'merged_cells'):
        try:
            for merged_range in source_ws.merged_cells.ranges:
                target_ws.merge_cells(
                    start_row=merged_range.min_row,
                    start_column=shifted(merged_range.min_col),
                    end_row=merged_range.max_row,
                    end_column=shifted(merged_range.max_col),
                )
        except Exception as e:
            print(f"警告: 复制合并单元格时出错: {e}")


def main():
    """Label every paper in the input workbook as CCF-A or not and save a copy.

    The first worksheet is expected to carry its column titles in row
    ``_HEADER_ROW`` and its records from row ``_FIRST_DATA_ROW`` on.  The
    workbook is copied cell by cell (values, cell styles, merged ranges) into
    a new workbook; the '是否是CCF-A' column of the first sheet is filled with
    '是' / '否', and is created right after the venue column when missing.

    NOTE: formulas are copied as text and are not rewritten when a column is
    inserted.

    Raises:
        SystemExit: with status 1 when the venue column is missing or the
            output file cannot be written.
    """
    venue_col_name = '期刊/会议名称'
    ccfa_col_name = '是否是CCF-A'

    # Column titles come from the header row; records are read separately so
    # the rows between the header and the first record are ignored.
    original_header = pd.read_excel(input_file_path, nrows=0, header=_HEADER_ROW - 1)
    column_names = original_header.columns.tolist()
    input_df = pd.read_excel(
        input_file_path,
        skiprows=_FIRST_DATA_ROW - 1,
        header=None,
        names=column_names,
    )

    print("表头元素:")
    print(column_names)
    print("\n数据行数:", input_df.shape[0])
    print("数据列数:", input_df.shape[1])

    if venue_col_name not in input_df.columns:
        print(f"错误:找不到'{venue_col_name}'列")
        raise SystemExit(1)
    if ccfa_col_name not in input_df.columns:
        print(f"'{ccfa_col_name}'列不存在,将添加该列")

    # Classify each distinct cleaned venue once, in order of first appearance.
    venue_ccfa_dict = {}
    for v_clean in dict.fromkeys(input_df[venue_col_name].apply(clean_venue)):
        venue_ccfa_dict[v_clean] = '是' if is_ccf_a(v_clean) else '否'
        print(f'{v_clean} -> {venue_ccfa_dict[v_clean]}')

    # Re-open the workbook with openpyxl so cell formatting can be carried over.
    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    main_sheet_name = wb_original.sheetnames[0]

    columns = _find_header_columns(
        wb_original[main_sheet_name], _HEADER_ROW, (venue_col_name, ccfa_col_name)
    )
    venue_col_index = columns[venue_col_name]
    ccfa_col_index = columns[ccfa_col_name]
    if venue_col_index is None:
        print(f"错误:工作表第{_HEADER_ROW}行中找不到'{venue_col_name}'列")
        raise SystemExit(1)

    # Without a CCF-A column, open a new one right after the venue column.
    insert_before = None
    if ccfa_col_index is None:
        ccfa_col_index = venue_col_index + 1
        insert_before = ccfa_col_index

    wb_new = openpyxl.Workbook()
    # Drop the blank sheet that a new workbook starts with.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']
    for sheet_name in wb_original.sheetnames:
        _clone_worksheet(
            wb_original[sheet_name],
            wb_new.create_sheet(sheet_name),
            insert_before=insert_before if sheet_name == main_sheet_name else None,
        )

    ws_new = wb_new[main_sheet_name]
    if insert_before is not None:
        ws_new.cell(row=_HEADER_ROW, column=ccfa_col_index, value=ccfa_col_name)

    # Label every record row.  Rows are taken from the worksheet itself so the
    # Excel row number never depends on how pandas indexed the data.
    for excel_row in range(_FIRST_DATA_ROW, ws_new.max_row + 1):
        venue_value = ws_new.cell(row=excel_row, column=venue_col_index).value
        if not venue_value:
            continue
        venue_clean = clean_venue(venue_value)
        label = venue_ccfa_dict.get(venue_clean)
        if label is None:
            # Venue not seen in the pandas pass: classify it now.
            label = '是' if is_ccf_a(venue_clean) else '否'
            venue_ccfa_dict[venue_clean] = label
        ws_new.cell(row=excel_row, column=ccfa_col_index, value=label)

    print(f"正在保存为Excel格式 {output_file_path} ...")
    try:
        output_dir = os.path.dirname(output_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        wb_new.save(output_file_path)
    except Exception as e:
        # Script boundary: report the failure and exit with a non-zero status.
        print(f"保存Excel失败: {e}")
        raise SystemExit(1)
    print(f"成功保存到 {output_file_path},保留了原始格式")
    print("处理完成!")


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment