"""
功能：判断论文是否属于CCF-A类会议
输入文件：CCF_A_list.csv，包含两列：abbr,fullname
输入文件：info/论文被引用统计-陈老师-截止2025年X月XX日.xlsx
输出文件：处理后的Excel文件，保持原格式不变，添加是否CCF-A信息
"""
import os
import pandas as pd
import re
import sys
from rapidfuzz import fuzz, process
from collections import defaultdict
import openpyxl
from copy import copy
from tqdm import tqdm

# Path settings.
input_file_path = "info/test.xlsx"
output_file_path = "output/test-CCFA标记.xlsx"
ccf_a_file = "info/CCF_A_list.csv"  # adjust to the actual location

# Make sure the output directory exists.
# `exist_ok=True` avoids the check-then-create race, and the guard skips the
# call when the output path has no directory part (os.makedirs("") raises).
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

def load_ccf_a_list(ccf_a_file):
    """Load the CCF-A venue list from a CSV file.

    The CSV is read with pandas and must contain the columns ``abbr`` and
    ``fullname``. Missing values are dropped from each column independently.

    Args:
        ccf_a_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A 3-tuple ``(abbrs, fullnames, names)`` of sets:
        abbreviations lower-cased with all spaces removed, full names
        lower-cased and stripped, and the union of both.
    """
    table = pd.read_csv(ccf_a_file)

    # Abbreviations: lower-case, then drop every space, then strip the rest
    # of the surrounding whitespace.
    abbr_series = table['abbr'].dropna().str.lower()
    abbr_series = abbr_series.str.replace(' ', '').str.strip()

    # Full names keep their inner spaces.
    fullname_series = table['fullname'].dropna().str.lower().str.strip()

    ccf_a_abbrs = set(abbr_series.tolist())
    ccf_a_fullnames = set(fullname_series.tolist())
    return ccf_a_abbrs, ccf_a_fullnames, ccf_a_abbrs | ccf_a_fullnames

def clean_venue(venue):
    """Normalise a raw venue string for matching.

    Null or blank input yields an empty string. Otherwise any parenthesised
    group (ASCII or full-width brackets) containing the word "volume" is
    removed, then digits and a fixed set of punctuation characters are
    deleted, and the result is lower-cased and stripped.

    Args:
        venue: Raw venue value; anything ``pd.isnull`` and ``str`` accept.

    Returns:
        The normalised venue string (possibly empty).
    """
    if pd.isnull(venue):
        return ""

    raw = str(venue)
    if raw.strip() == "":
        return ""

    # Drop "(... volume ...)" style annotations, case-insensitively.
    without_volume = re.sub(
        r'[\(\（][^)\）]*?volume[^)\）]*?[\)\）]', '', raw, flags=re.IGNORECASE
    )
    # Drop digits and punctuation; brackets, colons and '&' are kept.
    without_noise = re.sub(r"[0-9'\",\.\!\/\\\?\;\-\_]", "", without_volume)
    return without_noise.lower().strip()

def extract_brackets(text):
    """Return the content of the first bracketed group in ``text``.

    Both ASCII and full-width parentheses are recognised. The content is
    stripped and lower-cased.

    Args:
        text: String to search.

    Returns:
        The bracket content, or ``None`` when no bracketed group is found.
    """
    found = re.search(r'[\(\（](.*?)[\)\）]', text)
    return found.group(1).strip().lower() if found else None

def extract_before_colon(text):
    """Return the part of ``text`` that precedes the first colon.

    Both the ASCII colon and the full-width colon count as separators. The
    returned prefix is stripped and lower-cased.

    Args:
        text: String to inspect.

    Returns:
        The prefix before the first colon, or ``None`` when ``text``
        contains no colon.
    """
    pieces = re.split(r'[:：]', text, maxsplit=1)
    if len(pieces) == 1:
        return None
    return pieces[0].strip().lower()

# Keywords that exempt an "international journal ..." venue from the blanket
# rejection below. The hyphen-less and spaced spellings are listed as well
# because clean_venue() deletes '-' from the venue before it gets here.
_CCF_A_INTERNATIONAL_JOURNAL_KEYWORDS = (
    'computer vision',
    'human-computer studies',
    'humancomputer studies',
    'human computer studies',
)

# Substrings that mark a venue as CCF-A without any further matching.
# NOTE(review): 'mm ' is a very short substring and matches any venue that
# contains "mm" followed by a space; kept as in the original rule set.
_CCF_A_FORCED_POSITIVE = (
    'ieee micro',
    'design automation conference',
    'ieee transactions on parallel & distributed systems',
    'ieee transactions on computers',
    'transactions on architecture and code optimization',
    '信息科学',
    'pldi',
    'transactions on computer systems',
    'conference on computer vision and pattern recognition',
    'transactions on computeraided design of integrated circuits and systems',
    'aaai conference on artificial intelligence',
    'mm ',
    'transactions on parallel and distributed systems',
)

# Substrings that mark a venue as NOT CCF-A without any further matching.
# All entries are lower-case because they are compared against a lower-cased
# venue string.
_CCF_A_FORCED_NEGATIVE = (
    'ijcnn',
    'iccd',
    'icpr',
    'aicas',
    'acm transactions on embedded computing systems',
    'iccad',
    'artificial intelligence review',
    'ieee journal of selected topics in quantum electronics',
    'future generation computer systems',
    'ieee transactions on services computing',
    'ipdps',
    'isocc',
    'microprocessors and microsystems',
    'ccet',
    'microelectronics',
    'asid',
    'caai',
    'euromicro conference on',
    'asicon',
    'transactions on system and lsi design methodology',
    'selected areas in communications',
    'high performance computing data and analytics',
    'innovation communication and engineering',
    'sysml conference',
    'international conference on digital signal processing',
    'international conference on computer and communication systems',
    'international symposium on performance analysis of systems and software',
    'wireless communications and mobile computing',
    'sigops operating systems review',
    'siggraphasia',
    'international journal of computational intelligence systems',
    'journal of ikeee',
    'artificial intelligence and security',
    'journal of software',
    'advances in artificial intelligence',
    'international symposium on networksonchip',
    'international journal of web information systems',
    'journal of timecritical computing systems',
    'journal of advanced computer science and applications',
    'conference on computing and informatics',
    'journal of big data',
    'journal of parallel programming',
    'international conference on computer and communications',
    'journal of supercomputing',
    'conference on computing frontiers',
    'international conference on industry applications',
    'artificial intelligence advances',
    'pattern recognition and artificial intelligence',
    'journal of electronics and communications',
    'journal of data science and analytics',
    'journal of parallel emergent and distributed systems',
    'conference on advances in electrical engineering',
    'international conference on frontiers in computing and systems',
    'conference on sentiment analysis and deep learning',
    'acm symposium on cloud computing',
    'conference on tools with artificial intelligence',
    'conference on supercomputing',
    'frontiers in artificial intelligence ',
    'conference on communication technology',
    'big data information and computer network',
    'conference on software engineering education and training',
    'journal of computers',
    'washington',
    'international conference on cloud computing',
    'acm on programming languages',
    'conference on high performance computing',
    'ubm designcon conference',
    'scientia sinica informationis',
    'symposium on computing and networking',
    'journal of mathematics',
    'iet image processing',
    'journal of computing and digital systems',
    'conference on very large scale integration',
    'journal of networking and computing',
    'transactions on cryptographic hardware and embedded systems',
    'international conference on hybrid intelligent systems',
    'international conference on asic',
    'international conference on nexgen technologies',
    'international conference on computer and information technology',
    'international journal of high performance systems architecture',
    'international conference on power and energy engineering',
    'international symposium on smart electronic systems',
)

# Venues starting with one of these generic prefixes are only accepted on an
# exact full-name match; they never reach the fuzzy matcher.
_CCF_A_EXACT_ONLY_PREFIXES = (
    "ieee transactions on",
    "ieee international conference on",
    "ccf transactions on",
    "engineering applications of",
    "acm transactions on",
    "international conference on",
    "ieice transactions on",
)


def is_ccf_a(venue, ccf_a_abbrs, ccf_a_fullnames, ccf_a_names, threshold=70):
    """Decide whether a venue name denotes a CCF-A conference or journal.

    The checks run in this order and the first one that decides wins:

    1. Null / blank venue -> 0.
    2. "south pacific design automation conference" -> 0.
    3. Any "international journal" that is neither "computer vision" nor
       "human-computer studies" -> 0.
    4. Hard-coded allow-list substrings -> 1.
    5. Hard-coded block-list substrings -> 0.
    6. Venues with a generic prefix ("ieee transactions on", ...) -> exact
       match against the cleaned full names only.
    7. Bracket content, then text before a colon, compared (spaces removed)
       against the abbreviation set -> 1 on a hit.
    8. Exact membership in ``ccf_a_names`` -> 1.
    9. Venues of at most 6 characters stop here with 0; longer ones are
       fuzzy-matched with ``fuzz.ratio`` against ``ccf_a_names``.

    Args:
        venue: Venue name. It is lower-cased and stripped here; the
            allow/block lists are written for input that already went
            through ``clean_venue``.
        ccf_a_abbrs: Set of lower-case, space-free CCF-A abbreviations.
        ccf_a_fullnames: Set of lower-case CCF-A full names.
        ccf_a_names: Union of the two sets above.
        threshold: Minimum ``fuzz.ratio`` score (0-100) that counts as a
            fuzzy match.

    Returns:
        1 if the venue is considered CCF-A, otherwise 0.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return 0
    # str() keeps non-string cell values (e.g. numbers) from crashing .lower().
    venue_clean = str(venue).lower().strip()

    if 'south pacific design automation conference' in venue_clean:
        return 0

    # Only IJCV and IJHCS survive among "international journal ..." venues.
    # The keywords must be lower-case, and a venue is rejected only when it
    # matches NONE of them.
    if 'international journal' in venue_clean and not any(
        keyword in venue_clean for keyword in _CCF_A_INTERNATIONAL_JOURNAL_KEYWORDS
    ):
        return 0

    if any(keyword in venue_clean for keyword in _CCF_A_FORCED_POSITIVE):
        return 1

    if any(keyword in venue_clean for keyword in _CCF_A_FORCED_NEGATIVE):
        return 0

    # Generic prefixes are shared by many non-A venues, so fuzzy matching
    # would over-accept them: require an exact full-name match instead.
    if venue_clean.startswith(_CCF_A_EXACT_ONLY_PREFIXES):
        ccf_a_fullnames_clean = {clean_venue(name) for name in ccf_a_fullnames}
        return 1 if venue_clean in ccf_a_fullnames_clean else 0

    # 1. Abbreviation inside brackets (exact match after removing spaces).
    bracket_content = extract_brackets(venue_clean)
    if bracket_content and bracket_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 2. Abbreviation before a colon (exact match after removing spaces).
    colon_content = extract_before_colon(venue_clean)
    if colon_content and colon_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 3. Whole-string match: exact first, fuzzy only for longer strings.
    if venue_clean in ccf_a_names:
        return 1
    if len(venue_clean) <= 6:
        # Short strings are only ever matched exactly.
        return 0

    result = process.extractOne(venue_clean, ccf_a_names, scorer=fuzz.ratio)
    if result is None:
        # No candidates to compare against.
        return 0
    _, score, _ = result
    return 1 if score >= threshold else 0

def _copy_cell_style(source_cell, target_cell):
    """Copy font, border, fill, number format, protection and alignment."""
    if not source_cell.has_style:
        return
    target_cell.font = copy(source_cell.font)
    target_cell.border = copy(source_cell.border)
    target_cell.fill = copy(source_cell.fill)
    target_cell.number_format = copy(source_cell.number_format)
    target_cell.protection = copy(source_cell.protection)
    target_cell.alignment = copy(source_cell.alignment)


def _insert_column_after(ws, col_index):
    """Open an empty column right after ``col_index`` and return its index.

    Every column to the right of ``col_index`` is moved one position to the
    right (values and cell styles). The freed column is blanked and given the
    style of the column at ``col_index``. Merged ranges and column widths are
    NOT relocated; cells belonging to a merged range are skipped because
    their value cannot be assigned.
    """
    from openpyxl.cell.cell import MergedCell

    new_col = col_index + 1
    # Captured once: creating cells at last_col + 1 grows ws.max_column.
    last_col = ws.max_column

    for row in range(1, ws.max_row + 1):
        # Start one past the last used column so the last column is moved
        # instead of being overwritten; go right-to-left to avoid clobbering.
        for col in range(last_col + 1, new_col, -1):
            target_cell = ws.cell(row=row, column=col)
            if isinstance(target_cell, MergedCell):
                continue
            source_cell = ws.cell(row=row, column=col - 1)
            target_cell.value = source_cell.value
            _copy_cell_style(source_cell, target_cell)

        # The freed column still holds the old neighbour's content: blank it.
        blank_cell = ws.cell(row=row, column=new_col)
        if isinstance(blank_cell, MergedCell):
            continue
        blank_cell.value = None
        _copy_cell_style(ws.cell(row=row, column=col_index), blank_cell)

    return new_col


def process_sheet(sheet_name, wb_original, wb_new, ccf_a_abbrs, ccf_a_fullnames, ccf_a_names,
                  *, header_row=2, first_data_row=3):
    """Classify every venue of one sheet and write the CCF-A flag into ``wb_new``.

    The sheet is read twice from the module-level ``input_file_path`` with
    pandas (once for the header, once for the data). Each distinct cleaned
    venue is classified with ``is_ccf_a`` and the result ('是' / '否') is
    written into the '是否是CCF-A' column of the same-named worksheet in
    ``wb_new``. That column is created right after the venue column when the
    worksheet header does not contain it.

    Args:
        sheet_name: Name of the sheet to process.
        wb_original: Unused; kept so existing callers keep working.
        wb_new: openpyxl workbook that receives the flags (modified in place).
        ccf_a_abbrs: CCF-A abbreviations, forwarded to ``is_ccf_a``.
        ccf_a_fullnames: CCF-A full names, forwarded to ``is_ccf_a``.
        ccf_a_names: Union of both sets, forwarded to ``is_ccf_a``.
        header_row: 1-based Excel row that holds the column titles.
        first_data_row: 1-based Excel row of the first data record.

    Returns:
        None. Prints progress and warnings to stdout.

    NOTE(review): the pandas reads originally used Excel rows 2 / 3 while the
    worksheet update used rows 4 / 8, so the two halves could not agree on
    one file. Both halves now use ``header_row`` / ``first_data_row``; the
    defaults follow the pandas side. Pass ``header_row=4, first_data_row=8``
    if that is the real layout - confirm against the input workbook.
    """
    print(f"正在处理工作表: {sheet_name}")

    # Read the column titles only.
    original_header = pd.read_excel(
        input_file_path, sheet_name=sheet_name, nrows=0, header=header_row - 1
    )
    column_names = original_header.columns.tolist()

    # Read the data rows; index 0 corresponds to Excel row `first_data_row`.
    input_df = pd.read_excel(
        input_file_path,
        sheet_name=sheet_name,
        skiprows=first_data_row - 1,
        header=None,
        names=column_names,
    )

    # Basic information about the header and the data.
    print(f"表头元素 ({sheet_name}):")
    print(column_names)
    print(f"\n数据行数 ({sheet_name}): {input_df.shape[0]}")
    print(f"数据列数 ({sheet_name}): {input_df.shape[1]}")

    # The venue column is mandatory.
    venue_col_name = '期刊/会议名称'
    if venue_col_name not in input_df.columns:
        print(f"警告：在工作表 {sheet_name} 中找不到'{venue_col_name}'列，跳过此工作表")
        return

    ccfa_col_name = '是否是CCF-A'
    if ccfa_col_name not in input_df.columns:
        print(f"'{ccfa_col_name}'列不存在，将添加该列")

    # Classify each distinct cleaned venue once, in order of first appearance.
    unique_venues = list(dict.fromkeys(input_df[venue_col_name].apply(clean_venue)))
    venue_ccfa_dict = {}
    for v_clean in tqdm(unique_venues, desc=f"分析 {sheet_name} 会议名称"):
        is_ccfa_result = is_ccf_a(v_clean, ccf_a_abbrs, ccf_a_fullnames, ccf_a_names)
        venue_ccfa_dict[v_clean] = '是' if is_ccfa_result else '否'
        print(f'{v_clean} -> {venue_ccfa_dict[v_clean]}')

    ws_new = wb_new[sheet_name]

    # Locate the venue column and the CCF-A column in the worksheet header.
    venue_col_index = None
    ccfa_col_index = None
    for cell in ws_new[header_row]:
        if cell.value == venue_col_name:
            venue_col_index = cell.column
        elif cell.value == ccfa_col_name:
            ccfa_col_index = cell.column

    if venue_col_index is None:
        # Previously this case was silently ignored and nothing was written.
        print(f"警告：在工作表 {sheet_name} 的第 {header_row} 行找不到'{venue_col_name}'列，未写入任何标记")
        print(f"工作表 {sheet_name} 处理完成")
        return

    # Create the CCF-A column right after the venue column when it is missing.
    if ccfa_col_index is None:
        ccfa_col_index = _insert_column_after(ws_new, venue_col_index)
        ws_new.cell(row=header_row, column=ccfa_col_index, value=ccfa_col_name)

    # Write the flag next to every row that has a venue in the worksheet.
    for i in tqdm(input_df.index, desc=f"更新 {sheet_name} CCF-A标记"):
        excel_row = i + first_data_row  # DataFrame index -> 1-based Excel row
        venue_value = ws_new.cell(row=excel_row, column=venue_col_index).value
        if venue_value:
            is_ccfa_value = venue_ccfa_dict.get(clean_venue(venue_value), '否')
            ws_new.cell(row=excel_row, column=ccfa_col_index, value=is_ccfa_value)

    print(f"工作表 {sheet_name} 处理完成")


def _duplicate_workbook(wb_original):
    """Return a new workbook holding a cell-by-cell copy of ``wb_original``.

    Copies sheet properties, sheet format, cell values, cell styles and
    merged ranges of every sheet. Nothing else is copied by this function.
    """
    wb_new = openpyxl.Workbook()
    # Remove the blank sheet that a fresh workbook starts with.
    if 'Sheet' in wb_new.sheetnames:
        del wb_new['Sheet']

    for sheet_name in wb_original.sheetnames:
        ws_original = wb_original[sheet_name]
        ws_new = wb_new.create_sheet(sheet_name)

        ws_new.sheet_properties = copy(ws_original.sheet_properties)
        ws_new.sheet_format = copy(ws_original.sheet_format)

        # Copy values and formatting of every cell.
        for row in ws_original.rows:
            for cell in row:
                new_cell = ws_new.cell(row=cell.row, column=cell.column, value=cell.value)
                if cell.has_style:
                    new_cell.font = copy(cell.font)
                    new_cell.border = copy(cell.border)
                    new_cell.fill = copy(cell.fill)
                    new_cell.number_format = copy(cell.number_format)
                    new_cell.protection = copy(cell.protection)
                    new_cell.alignment = copy(cell.alignment)

        # Merged ranges are copied on a best-effort basis.
        try:
            for merged_cell_range in ws_original.merged_cells.ranges:
                ws_new.merge_cells(str(merged_cell_range))
        except Exception as e:
            print(f"警告: 复制合并单元格时出错: {e}")

    return wb_new


def main():
    """Interactive entry point: choose sheets, flag CCF-A venues, save a copy.

    Exits with status 1 on malformed input, an empty selection, or a failed
    save.
    """
    # Load the CCF-A list.
    print("正在加载CCF-A列表...")
    ccf_a_abbrs, ccf_a_fullnames, ccf_a_names = load_ccf_a_list(ccf_a_file)

    # Collect the sheet names of the input workbook.
    workbook = openpyxl.load_workbook(input_file_path, read_only=True)
    all_sheets = workbook.sheetnames
    workbook.close()

    all_sheets = [sheet for sheet in all_sheets if sheet not in ["全局牛人", "全局非牛人"]]

    print("可用的工作表列表:")
    for i, sheet in enumerate(all_sheets, 1):
        print(f"{i}. {sheet}")

    # Ask which sheets to process.
    print("\n请选择要处理的工作表 (输入序号，用逗号分隔多个序号，输入 'all' 处理所有工作表):")
    choice = input().strip()

    if choice.lower() == 'all':
        selected_sheets = all_sheets
    else:
        try:
            sheet_indices = [int(idx) - 1 for idx in choice.split(',')]
        except ValueError:
            # Only a non-numeric token is an input-format error; a bare
            # `except:` would also swallow KeyboardInterrupt / SystemExit.
            print("输入格式错误，请重新运行程序")
            sys.exit(1)
        # Out-of-range numbers are ignored.
        selected_sheets = [all_sheets[idx] for idx in sheet_indices if 0 <= idx < len(all_sheets)]

    if not selected_sheets:
        print("未选择任何工作表，程序退出")
        sys.exit(1)

    print(f"\n将处理以下工作表: {', '.join(selected_sheets)}")

    # Load the original workbook with formatting and duplicate it.
    print("正在读取原始Excel文件以保留格式...")
    wb_original = openpyxl.load_workbook(input_file_path)
    wb_new = _duplicate_workbook(wb_original)

    # Process the selected sheets.
    for sheet_name in selected_sheets:
        process_sheet(sheet_name, wb_original, wb_new, ccf_a_abbrs, ccf_a_fullnames, ccf_a_names)

    # Save the result.
    print(f"正在保存为Excel格式 {output_file_path} ...")
    try:
        wb_new.save(output_file_path)
    except Exception as e:
        # Do not report overall success when nothing was written.
        print(f"保存Excel失败: {e}")
        sys.exit(1)
    print(f"成功保存到 {output_file_path}，保留了原始格式")

    print("所有工作表处理完成!")


if __name__ == "__main__":
    main()