Upload New File

7c363521 · matianyun · 03c9d49c · 7c363521
Commit 7c363521 authored May 09, 2025 by matianyun
Hide whitespace changes
Inline Side-by-side

Showing with 286 additions and 0 deletions

papertools_niuren_ccfa/if_ccfa_update.py
+286 -0

No files found.
--- a/papertools_niuren_ccfa/if_ccfa_update.py
+++ b/papertools_niuren_ccfa/if_ccfa_update.py
+"""
+功能：判断论文是否属于CCF-A类会议
+输入文件：CCF_A_list.csv，包含两列：abbr,fullname
+输入文件：info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx
+输出文件：处理后的Excel文件，保持原格式不变，添加是否CCF-A信息
+"""
+import os
+import pandas as pd
+import re
+from rapidfuzz import fuzz, process
+from collections import defaultdict
+import openpyxl
+from copy import copy
+
# Path parameters
input_file_path = "info/测试大表2.xlsx"
output_file_path = "output/论文被引用统计-陈老师-截止2025年X月XX日-CCFA标记.xlsx"
ccf_a_file = "info/CCF_A_list.csv"  # adjust to the actual location of the list

# Load the CCF-A list; the CSV is expected to provide 'abbr' and 'fullname' columns.
ccf_a = pd.read_csv(ccf_a_file)

# Abbreviations: lower-cased with every space removed.
_abbr_column = ccf_a['abbr'].dropna()
_abbr_column = _abbr_column.str.lower().str.replace(' ', '').str.strip()
ccf_a_abbrs = set(_abbr_column.tolist())

# Full names: lower-cased and trimmed only.
_fullname_column = ccf_a['fullname'].dropna()
_fullname_column = _fullname_column.str.lower().str.strip()
ccf_a_fullnames = set(_fullname_column.tolist())

# Every known CCF-A name, abbreviations and full names together.
ccf_a_names = ccf_a_abbrs.union(ccf_a_fullnames)
+
def clean_venue(venue):
    """Normalize a raw venue (journal/conference) name for matching.

    Steps, in order:
      1. Null or blank input yields an empty string.
      2. Any parenthesised group (ASCII or full-width brackets) that mentions
         "volume" is dropped, case-insensitively.
      3. ASCII digits and the punctuation characters ' " , . ! / \\ ? ; - _
         are deleted (so "Computer-Aided" becomes "ComputerAided").
      4. Runs of whitespace (including tabs and newlines) are collapsed into a
         single space, so that deleting a token such as a year does not leave
         a double space that would break exact or substring comparisons.
      5. The result is lower-cased and trimmed.

    Args:
        venue: The raw cell value; may be a string, NaN/None, or any object
            convertible with ``str``.

    Returns:
        The normalized name, or "" when the input is null or blank.
    """
    if pd.isnull(venue):
        return ""
    text = str(venue)
    if not text.strip():
        return ""
    # Remove "(… volume …)" annotations before stripping punctuation, while
    # the brackets that delimit them are still present.
    text = re.sub(r'[\(\（][^)\）]*?volume[^)\）]*?[\)\）]', '', text, flags=re.IGNORECASE)
    text = re.sub(r"[0-9'\",\.\!\/\\\?\;\-\_]", "", text)
    # Collapse the gaps left behind by the deletions above.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()
+
def extract_brackets(text):
    """Return the content of the first bracket pair in *text*.

    Both ASCII and full-width brackets are recognised. The captured content
    is trimmed and lower-cased. Returns None when no bracket pair is found.
    """
    found = re.search(r'[\(\（](.*?)[\)\）]', text)
    if found is None:
        return None
    inner = found.group(1)
    return inner.strip().lower()
+
def extract_before_colon(text):
    """Return the part of *text* before its first colon.

    Both the ASCII colon and the full-width colon count as separators. The
    leading part is trimmed and lower-cased. Returns None when *text*
    contains no colon at all.
    """
    pieces = re.split(r'[:：]', text, maxsplit=1)
    if len(pieces) == 1:
        # Nothing was split off, i.e. there is no colon in the text.
        return None
    head = pieces[0]
    return head.strip().lower()
+
# Venues containing this phrase are rejected before the generic
# "design automation conference" keyword below can accept them.
_NOT_CCF_A_DAC_VARIANT = 'south pacific design automation conference'

# An "international journal" is rejected unless it contains one of these
# phrases. The hyphen-less spelling is listed as well because clean_venue()
# deletes '-' characters.
_INTL_JOURNAL_EXCEPTIONS = (
    'computer vision',
    'human-computer studies',
    'humancomputer studies',
)

# Substrings that mark a venue as CCF-A without consulting the CSV list.
_CCF_A_KEYWORDS = (
    'ieee micro',
    'design automation conference',
    'ieee transactions on parallel & distributed systems',
    'ieee transactions on computers',
    'transactions on architecture and code optimization',
    '信息科学',
    'pldi',
    'transactions on computer systems',
    'conference on computer vision and pattern recognition',
    'transactions on computeraided design of integrated circuits and systems',
    'aaai conference on artificial intelligence',
    'mm ',
    'transactions on parallel and distributed systems',
)

# Substrings that mark a venue as NOT CCF-A (checked after _CCF_A_KEYWORDS).
_NON_CCF_A_KEYWORDS = (
    'ijcnn',
    'iccd',
    'icpr',
    'aicas',
    'acm transactions on embedded computing systems',
    'iccad',
    'artificial intelligence review',
    'ieee journal of selected topics in quantum electronics',
    'future generation computer systems',
    'ieee transactions on services computing',
    'ipdps',
    'isocc',
    'microprocessors and microsystems',
    'ccet',
    'microelectronics',
    'asid',
    'caai',
    'euromicro conference on',
    'asicon',
    'transactions on system and lsi design methodology',
    'selected areas in communications',
    'high performance computing data and analytics',
    'innovation communication and engineering',
    'sysml conference',
    'international conference on digital signal processing',
    'international conference on computer and communication systems',
    'international symposium on performance analysis of systems and software',
    'wireless communications and mobile computing',
    'sigops operating systems review',
    'siggraphasia',
    'international journal of computational intelligence systems',
    'journal of ikeee',
    'artificial intelligence and security',
    'journal of software',
    'advances in artificial intelligence',
    'international symposium on networksonchip',
    'international journal of web information systems',
    'journal of timecritical computing systems',
    'journal of advanced computer science and applications',
    'conference on computing and informatics',
    'journal of big data',
    'journal of parallel programming',
    'international conference on computer and communications',
    'journal of supercomputing',
    'conference on computing frontiers',
    'international conference on industry applications',
    'artificial intelligence advances',
    'pattern recognition and artificial intelligence',
    'journal of electronics and communications',
    'journal of data science and analytics',
    'journal of parallel emergent and distributed systems',
    'conference on advances in electrical engineering',
    'international conference on frontiers in computing and systems',
    'conference on sentiment analysis and deep learning',
    'acm symposium on cloud computing',
    'conference on tools with artificial intelligence',
    'conference on supercomputing',
    'frontiers in artificial intelligence ',
    'conference on communication technology',
    'big data information and computer network',
    'conference on software engineering education and training',
    'journal of computers',
    'washington',
    'international conference on cloud computing',
    'acm on programming languages',
    'conference on high performance computing',
    'ubm designcon conference',
    'scientia sinica informationis',
    'symposium on computing and networking',
    'journal of mathematics',
    'iet image processing',
    'journal of computing and digital systems',
    'conference on very large scale integration',
    'journal of networking and computing',
    'transactions on cryptographic hardware and embedded systems',
    'international conference on hybrid intelligent systems',
    'international conference on asic',
    'international conference on nexgen technologies',
    'international conference on computer and information technology',
    'international journal of high performance systems architecture',
    'international conference on power and energy engineering',
    'international symposium on smart electronic systems',
)

# Venues starting with one of these generic prefixes are too similar to each
# other for fuzzy matching, so they must equal a cleaned CCF-A full name.
_EXACT_MATCH_PREFIXES = (
    "ieee transactions on",
    "ieee international conference on",
    "ccf transactions on",
    "engineering applications of",
    "acm transactions on",
    "international conference on",
    "ieice transactions on",
)


def is_ccf_a(venue, threshold=70):
    """Decide whether a venue name denotes a CCF-A journal or conference.

    The checks run in this order and the first decisive one wins:
      1. Null/blank input -> 0.
      2. Hard-coded rejections and acceptances (keyword substrings).
      3. Generic-prefix venues must exactly equal a cleaned CCF-A full name.
      4. The first bracketed token, then the text before the first colon,
         is compared (spaces removed) against the CCF-A abbreviations.
      5. Exact membership in the CCF-A names; names longer than six
         characters additionally get a fuzzy match via rapidfuzz.

    Relies on the module-level ``ccf_a_abbrs``, ``ccf_a_fullnames`` and
    ``ccf_a_names`` sets and on ``clean_venue``, ``extract_brackets`` and
    ``extract_before_colon``.

    Args:
        venue: Venue name; the script passes the output of ``clean_venue``.
            Non-string values are converted with ``str``.
        threshold: Minimum ``fuzz.ratio`` score (0-100) accepted as a match.

    Returns:
        1 if the venue is considered CCF-A, otherwise 0.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return 0
    # str() keeps non-string cell values (e.g. numbers) from raising here.
    venue_clean = str(venue).lower().strip()

    if _NOT_CCF_A_DAC_VARIANT in venue_clean:
        return 0

    # All literals are lower-case because venue_clean is lower-case, and the
    # journal is rejected only when NONE of the exceptions is present.
    if 'international journal' in venue_clean and not any(
        phrase in venue_clean for phrase in _INTL_JOURNAL_EXCEPTIONS
    ):
        return 0

    if any(keyword in venue_clean for keyword in _CCF_A_KEYWORDS):
        return 1

    if any(keyword in venue_clean for keyword in _NON_CCF_A_KEYWORDS):
        return 0

    # Special case: generic prefixes require an exact full-name match.
    if venue_clean.startswith(_EXACT_MATCH_PREFIXES):
        ccf_a_fullnames_clean = {clean_venue(name) for name in ccf_a_fullnames}
        return 1 if venue_clean in ccf_a_fullnames_clean else 0

    # 1. Abbreviation inside brackets (exact match after removing spaces).
    bracket_content = extract_brackets(venue_clean)
    if bracket_content and bracket_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 2. Abbreviation before a colon (exact match after removing spaces).
    colon_content = extract_before_colon(venue_clean)
    if colon_content and colon_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 3. Whole-name match.
    if venue_clean in ccf_a_names:
        return 1
    if len(venue_clean) <= 6:
        # Very short names are compared exactly only; fuzzy scores on a
        # handful of characters are not meaningful.
        return 0

    result = process.extractOne(venue_clean, ccf_a_names, scorer=fuzz.ratio)
    if not result:
        return 0
    _, score, _ = result
    return 1 if score >= threshold else 0
+
HEADER_ROW = 4      # 1-based Excel row that holds the column titles
FIRST_DATA_ROW = 8  # 1-based Excel row of the first record

_STYLE_ATTRIBUTES = ('font', 'border', 'fill', 'number_format', 'protection', 'alignment')


def _copy_cell_style(source_cell, target_cell):
    """Copy the style attributes of one openpyxl cell onto another."""
    if not source_cell.has_style:
        return
    for attribute in _STYLE_ATTRIBUTES:
        if hasattr(source_cell, attribute):
            setattr(target_cell, attribute, copy(getattr(source_cell, attribute)))


def _copy_sheet_cells(ws_original, ws_new):
    """Copy sheet properties plus every cell value and style into ws_new."""
    if hasattr(ws_original, 'sheet_properties'):
        ws_new.sheet_properties = copy(ws_original.sheet_properties)
    if hasattr(ws_original, 'sheet_format'):
        ws_new.sheet_format = copy(ws_original.sheet_format)
    for row in ws_original.rows:
        for cell in row:
            new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
            _copy_cell_style(cell, new_cell)


def _find_header_columns(ws, venue_col_name, ccfa_col_name):
    """Return the (venue, CCF-A) column indices found in the header row, or None each."""
    venue_col_index = None
    ccfa_col_index = None
    for cell in ws[HEADER_ROW]:
        if cell.value == venue_col_name:
            venue_col_index = cell.column
        elif cell.value == ccfa_col_name:
            ccfa_col_index = cell.column
    return venue_col_index, ccfa_col_index


def _insert_ccfa_column(ws, venue_col_index, ccfa_col_name):
    """Insert an empty, titled CCF-A column right after the venue column."""
    ccfa_col_index = venue_col_index + 1
    last_row = ws.max_row
    # insert_cols moves whole cell objects, so no column is overwritten and
    # the shifted cells keep their styles.
    ws.insert_cols(ccfa_col_index)
    for row in range(1, last_row + 1):
        # Give the new column the look of its left neighbour.
        _copy_cell_style(
            ws.cell(row=row, column=venue_col_index),
            ws.cell(row=row, column=ccfa_col_index),
        )
    ws.cell(row=HEADER_ROW, column=ccfa_col_index, value=ccfa_col_name)
    return ccfa_col_index


def _copy_merged_ranges(ws_original, ws_new, inserted_col=None):
    """Re-create merged ranges in ws_new, shifted past an inserted column if any."""
    if not hasattr(ws_original, 'merged_cells'):
        return
    try:
        for merged in ws_original.merged_cells.ranges:
            first_col, last_col = merged.min_col, merged.max_col
            if inserted_col is not None:
                if first_col >= inserted_col:
                    # Range lies entirely right of the new column: move it.
                    first_col += 1
                    last_col += 1
                elif last_col >= inserted_col:
                    # Range spans the new column: widen it by one.
                    last_col += 1
            ws_new.merge_cells(
                start_row=merged.min_row,
                start_column=first_col,
                end_row=merged.max_row,
                end_column=last_col,
            )
    except Exception as e:
        # Best effort: a failed merge must not abort the whole export.
        print(f"警告: 复制合并单元格时出错: {e}")


def main():
    """Label every record of the input workbook as CCF-A or not and save a copy.

    Reads the venue column with pandas, classifies each distinct cleaned
    venue with ``is_ccf_a``, copies the workbook cell by cell with openpyxl,
    and writes '是'/'否' into the CCF-A column of the first sheet.
    """
    # Column titles come from the header row; records start at FIRST_DATA_ROW.
    original_header = pd.read_excel(input_file_path, nrows=0, header=HEADER_ROW - 1)
    column_names = original_header.columns.tolist()
    input_df = pd.read_excel(
        input_file_path,
        skiprows=FIRST_DATA_ROW - 1,
        header=None,
        names=column_names,
    )

    print("表头元素：")
    print(column_names)
    print("\n数据行数：", input_df.shape[0])
    print("数据列数：", input_df.shape[1])

    venue_col_name = '期刊/会议名称'
    if venue_col_name not in input_df.columns:
        print(f"错误：找不到'{venue_col_name}'列")
        # exit() is only injected by the site module; SystemExit always works.
        raise SystemExit(1)

    ccfa_col_name = '是否是CCF-A'
    if ccfa_col_name not in input_df.columns:
        print(f"'{ccfa_col_name}'列不存在，将添加该列")

    # Classify each distinct cleaned venue once, in order of first appearance.
    venue_ccfa_dict = {}
    for raw_venue in input_df[venue_col_name]:
        v_clean = clean_venue(raw_venue)
        if v_clean not in venue_ccfa_dict:
            venue_ccfa_dict[v_clean] = '是' if is_ccf_a(v_clean) else '否'
            print(f'{v_clean} -> {venue_ccfa_dict[v_clean]}')

    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)

    wb_new = openpyxl.Workbook()
    # Drop the blank sheet that Workbook() creates by default.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']

    # The first sheet is the one that receives the CCF-A labels.
    main_sheet_name = wb_original.sheetnames[0]
    venue_col_index = None
    ccfa_col_index = None

    for sheet_name in wb_original.sheetnames:
        ws_original = wb_original[sheet_name]
        ws_new = wb_new.create_sheet(sheet_name)
        _copy_sheet_cells(ws_original, ws_new)

        inserted_col = None
        if sheet_name == main_sheet_name:
            venue_col_index, ccfa_col_index = _find_header_columns(
                ws_new, venue_col_name, ccfa_col_name
            )
            if ccfa_col_index is None and venue_col_index is not None:
                # Insert before merging: merged cells are read-only and
                # cannot be shifted afterwards.
                ccfa_col_index = _insert_ccfa_column(ws_new, venue_col_index, ccfa_col_name)
                inserted_col = ccfa_col_index

        _copy_merged_ranges(ws_original, ws_new, inserted_col)

    ws_main = wb_new[main_sheet_name]
    if venue_col_index is not None and ccfa_col_index is not None:
        # DataFrame row k corresponds to Excel row FIRST_DATA_ROW + k.
        for excel_row in range(FIRST_DATA_ROW, FIRST_DATA_ROW + len(input_df)):
            venue_value = ws_main.cell(row=excel_row, column=venue_col_index).value
            if not venue_value:
                continue
            venue_clean = clean_venue(venue_value)
            label = venue_ccfa_dict.get(venue_clean)
            if label is None:
                # Not seen through pandas: classify now instead of writing
                # a placeholder that differs from the '是'/'否' labels.
                label = '是' if is_ccf_a(venue_clean) else '否'
                venue_ccfa_dict[venue_clean] = label
            ws_main.cell(row=excel_row, column=ccfa_col_index, value=label)

    print(f"正在保存为Excel格式 {output_file_path} ...")
    try:
        output_dir = os.path.dirname(output_file_path)
        if output_dir:
            # The save would otherwise fail when the folder does not exist yet.
            os.makedirs(output_dir, exist_ok=True)
        wb_new.save(output_file_path)
        print(f"成功保存到 {output_file_path}，保留了原始格式")
    except Exception as e:
        print(f"保存Excel失败: {e}")

    print("处理完成!")


if __name__ == "__main__":
    main()
\ No newline at end of file