"""
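Purpose: decide whether each paper was published at a CCF-A venue (conference or journal).
Input file: CCF_A_list.csv, with two columns: abbr,fullname
Input folder: data/title_venue, containing CSV files whose header row is title,venue
Output folder: data/is_ccfa; every CSV file in the title_venue folder is processed and written under the same file name, with the header row title,venue,is_ccf_a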
"""
import os
import pandas as pd
import re
from rapidfuzz import fuzz, process
from collections import defaultdict

# Path settings: per-file inputs, annotated outputs and the CCF-A list.
input_dir = "data/title_venue"
output_dir = "data/is_ccfa"
ccf_a_file = "CCF_A_list.csv"  # adjust to the actual location of the list

os.makedirs(output_dir, exist_ok=True)

# Load the CCF-A abbreviations and full names.
ccf_a = pd.read_csv(ccf_a_file)

# Abbreviations: lower-cased, every space removed, then stripped.
_abbr_series = ccf_a['abbr'].dropna().str.lower()
_abbr_series = _abbr_series.str.replace(' ', '').str.strip()
ccf_a_abbrs = set(_abbr_series.tolist())

# Full names: lower-cased and stripped; inner spaces and punctuation are kept.
_fullname_series = ccf_a['fullname'].dropna()
ccf_a_fullnames = set(_fullname_series.str.lower().str.strip().tolist())

# Every accepted name, abbreviation or full name, in a single lookup set.
ccf_a_names = ccf_a_abbrs.union(ccf_a_fullnames)

# Parenthesised note (ASCII or full-width brackets) that mentions "volume".
_VOLUME_NOTE_RE = re.compile(r'[\(\（][^)\）]*?volume[^)\）]*?[\)\）]', flags=re.IGNORECASE)
# Digits and the punctuation characters that are deleted outright.
_STRIP_CHARS_RE = re.compile(r"[0-9'\",\.\!\/\\\?\;\-\_]")
# Any run of whitespace, used to squeeze the gaps left by the deletions above.
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_venue(venue):
    """Normalise a raw venue string for matching.

    Steps: drop any bracketed note containing "volume", delete digits and the
    punctuation ``' " , . ! / \\ ? ; - _``, collapse runs of whitespace to a
    single space, then lower-case and strip.

    Args:
        venue: Raw venue value from the CSV; may be a missing value.

    Returns:
        The normalised venue, or "" when the value is missing or blank.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return ""
    text = _VOLUME_NOTE_RE.sub('', str(venue))
    text = _STRIP_CHARS_RE.sub("", text)
    # Deleting a token such as a year leaves two adjacent spaces
    # ("IEEE 2019 International ..." -> "IEEE  International ..."); collapse
    # them so literal substring/prefix comparisons on the result still match.
    text = _WHITESPACE_RUN_RE.sub(" ", text)
    return text.lower().strip()

def extract_brackets(text):
    """Return the content of the first bracket pair in *text*, or None.

    Both ASCII and full-width brackets are recognised. The content is
    stripped and lower-cased; an empty pair yields "".
    """
    found = re.search(r'[\(\（](.*?)[\)\）]', text)
    if found is None:
        return None
    inner = found.group(1)
    return inner.strip().lower()

def extract_before_colon(text):
    """Return the text before the first colon, or None if there is no colon.

    Both the ASCII and the full-width colon count as separators. The returned
    prefix is stripped and lower-cased.
    """
    pieces = re.split(r'[:：]', text, maxsplit=1)
    if len(pieces) == 1:
        # No colon was found, so there is nothing "before" it.
        return None
    head = pieces[0]
    return head.strip().lower()

# "International Journal ..." venues are rejected unless they contain one of
# these. The hyphen-free spelling is listed too because clean_venue() deletes
# "-" before this function is called from the main loop.
_INTL_JOURNAL_EXCEPTIONS = (
    'computer vision',
    'human-computer studies',
    'humancomputer studies',
)

# Substrings that mark a venue as CCF-A straight away.
_CCF_A_SUBSTRINGS = (
    'ieee micro',
    'design automation conference',
    'ieee transactions on parallel & distributed systems',
    'ieee transactions on computers',
    'transactions on architecture and code optimization',
    '信息科学',
    'pldi',
    'transactions on computer systems',
    'conference on computer vision and pattern recognition',
    'transactions on computeraided design of integrated circuits and systems',
    'aaai conference on artificial intelligence',
    'mm ’',
    'transactions on parallel and distributed systems',
)

# Substrings that mark a venue as NOT CCF-A (checked after the list above).
_NON_CCF_A_SUBSTRINGS = (
    'ijcnn',
    'iccd',
    'icpr',
    'aicas',
    'acm transactions on embedded computing systems',
    'iccad',
    'artificial intelligence review',
    'ieee journal of selected topics in quantum electronics',
    'future generation computer systems',
    'ieee transactions on services computing',
    'ipdps',
    'isocc',
    'microprocessors and microsystems',
    'ccet',
    'microelectronics',
    'asid',
    'caai',
    'euromicro conference on',
    'asicon',
    'transactions on system and lsi design methodology',
    'selected areas in communications',
    'high performance computing data and analytics',
    'innovation communication and engineering',
    'sysml conference',
    'international conference on digital signal processing',
    'international conference on computer and communication systems',
    'international symposium on performance analysis of systems and software',
    'wireless communications and mobile computing',
    'sigops operating systems review',
    'siggraphasia',
    'international journal of computational intelligence systems',
    'journal of ikeee',
    'artificial intelligence and security',
    'journal of software',
    'advances in artificial intelligence',
    'international symposium on networksonchip',
    'international journal of web information systems',
    'journal of timecritical computing systems',
    'journal of advanced computer science and applications',
    'conference on computing and informatics',
    'journal of big data',
    'journal of parallel programming',
    'international conference on computer and communications',
    'journal of supercomputing',
    'conference on computing frontiers',
    'international conference on industry applications',
    'artificial intelligence advances',
    'pattern recognition and artificial intelligence',
    'journal of electronics and communications',
    'journal of data science and analytics',
    'journal of parallel emergent and distributed systems',
    'conference on advances in electrical engineering',
    'international conference on frontiers in computing and systems',
    'conference on sentiment analysis and deep learning',
    'acm symposium on cloud computing',
    'conference on tools with artificial intelligence',
    'conference on supercomputing',
    # NOTE(review): the trailing space is kept from the original rule; it only
    # matches when more text follows the name - confirm this is intended.
    'frontiers in artificial intelligence ',
    'conference on communication technology',
    'big data information and computer network',
    'conference on software engineering education and training',
    'journal of computers',
    'washington',
    'international conference on cloud computing',
    'acm on programming languages',
    'conference on high performance computing',
    'ubm designcon conference',
    'scientia sinica informationis',
    'symposium on computing and networking',
    'journal of mathematics',
    'iet image processing',
    'journal of computing and digital systems',
    'conference on very large scale integration',
    'journal of networking and computing',
    'transactions on cryptographic hardware and embedded systems',
    'international conference on hybrid intelligent systems',
    'international conference on asic',
    'international conference on nexgen technologies',
    'international conference on computer and information technology',
    'international journal of high performance systems architecture',
    'international conference on power and energy engineering',
    'international symposium on smart electronic systems',
)

# Venues starting with one of these generic prefixes must equal a cleaned
# CCF-A full name exactly; fuzzy matching is skipped for them.
_EXACT_MATCH_PREFIXES = (
    "ieee transactions on",
    "ieee international conference on",
    "ccf transactions on",
    "engineering applications of",
    "acm transactions on",
    "international conference on",
    "ieice transactions on",
)

# Lazily built set of clean_venue(fullname) for every CCF-A full name.
_ccf_a_fullnames_clean_cache = None


def _cleaned_ccf_a_fullnames():
    """Return the CCF-A full names passed through clean_venue (built once)."""
    global _ccf_a_fullnames_clean_cache
    if _ccf_a_fullnames_clean_cache is None:
        _ccf_a_fullnames_clean_cache = frozenset(
            clean_venue(name) for name in ccf_a_fullnames
        )
    return _ccf_a_fullnames_clean_cache


def is_ccf_a(venue, threshold=70):
    """Classify a venue name as CCF-A (1) or not (0).

    The rules are applied in order and the first one that fires decides:

    1. Missing/blank venue -> 0.
    2. Hand-written overrides: "south pacific design automation conference"
       -> 0; "international journal" without one of the listed exceptions
       -> 0; allow-list substring -> 1; deny-list substring -> 0.
    3. Venues starting with a generic prefix (e.g. "ieee transactions on")
       -> 1 only if equal to a cleaned CCF-A full name, otherwise 0.
    4. Content of the first bracket pair, spaces removed, equals a CCF-A
       abbreviation -> 1.
    5. Text before the first colon, spaces removed, equals a CCF-A
       abbreviation -> 1.
    6. Exact membership in the CCF-A names -> 1. Names of at most 6
       characters stop here with 0; longer names are fuzzy-matched with
       ``fuzz.ratio`` and accepted when the best score is >= ``threshold``.

    Args:
        venue: Venue name; the main loop passes the output of clean_venue.
        threshold: Minimum ``fuzz.ratio`` score for a fuzzy match to count.

    Returns:
        1 if the venue is judged CCF-A, otherwise 0.
    """
    if pd.isnull(venue) or str(venue).strip() == "":
        return 0
    # str() keeps non-string cells (e.g. numbers) from crashing on .lower().
    venue_clean = str(venue).lower().strip()

    if 'south pacific design automation conference' in venue_clean:
        return 0

    # All literals here are lower-case because venue_clean is lower-cased; a
    # venue is spared only if it contains one of the exception names.
    if 'international journal' in venue_clean and not any(
        name in venue_clean for name in _INTL_JOURNAL_EXCEPTIONS
    ):
        return 0

    if any(name in venue_clean for name in _CCF_A_SUBSTRINGS):
        return 1

    if any(name in venue_clean for name in _NON_CCF_A_SUBSTRINGS):
        return 0

    # Special case: generic prefixes shared by many venues need an exact hit.
    if venue_clean.startswith(_EXACT_MATCH_PREFIXES):
        return 1 if venue_clean in _cleaned_ccf_a_fullnames() else 0

    # 1. Abbreviation inside brackets (exact match after removing spaces).
    bracket_content = extract_brackets(venue_clean)
    if bracket_content and bracket_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 2. Abbreviation before a colon (exact match after removing spaces).
    colon_content = extract_before_colon(venue_clean)
    if colon_content and colon_content.replace(' ', '') in ccf_a_abbrs:
        return 1

    # 3. Whole-name match.
    if venue_clean in ccf_a_names:
        return 1
    if len(venue_clean) <= 6:
        # Names of at most 6 characters are only matched exactly.
        return 0

    result = process.extractOne(
        venue_clean,
        ccf_a_names,
        scorer=fuzz.ratio
    )
    if not result:
        return 0
    _, score, _ = result
    return 1 if score >= threshold else 0
        
# File names listed here are skipped by the loop below. The entries are all
# commented out, so no file is skipped at the moment.
processed_files = [
    # 'c48_DaDiannao.csv',
    # 'j20-DaDianNao.csv',
    # 'j24-DianNao family.csv',
    # 'c43-DianNao.csv',
    # 'c35-Cambricon-X.csv',
    # 'c34-Cambricon.csv',
    # '',
    # 'c40-shidiannao.csv',
    # 'c32.csv',
    # 'c53-BenchNN.csv',
    # 'c66-Fast complete memory.csv',
    # 'j53-System Architecture.csv',
    # 'z45.csv',
    # 'j29.csv',
    # 'c64.csv',
    # 'c41.csv',
    # 'j35-A small-footprint.csv',
    # 'j21-An Accelerator for High.csv',
    # 'c51-Statistical....csv',
    # 'c47.csv',
    # 'c30.csv',
    # 'c57.csv',
    # 'c28.csv',
    # 'j16.csv',
    # 'c36-PuDianNao.csv'
]

# os.listdir() returns names in arbitrary order; sorting makes the printed
# progress counter mean the same file on every run.
file_list = sorted(f for f in os.listdir(input_dir) if f.endswith('.csv'))

for count, fname in enumerate(file_list, start=1):
    print(count)
    if fname in processed_files:
        print(f"Skipping {fname} ...")
        continue
    input_path = os.path.join(input_dir, fname)
    output_path = os.path.join(output_dir, fname)

    print(f"Processing {input_path} ...")
    papers = pd.read_csv(input_path)
    if 'title' not in papers.columns or 'venue' not in papers.columns:
        print(f"Warning: {fname} does not have required columns.")
        continue

    papers['venue_clean'] = papers['venue'].apply(clean_venue)

    # Classify each distinct cleaned venue once, in order of first appearance,
    # then broadcast the verdict back to every paper that shares it.
    unique_venues = papers['venue_clean'].unique()
    venue_ccfa_dict = {}
    for v_clean in unique_venues:
        venue_ccfa_dict[v_clean] = is_ccf_a(v_clean)
        print(f'{v_clean} -> {venue_ccfa_dict[v_clean]}')
        print(f'still have {len(unique_venues) - len(venue_ccfa_dict)} left')

    papers['is_ccf_a'] = papers['venue_clean'].map(venue_ccfa_dict)

    # utf-8-sig writes a BOM at the start of the output file.
    papers[['title', 'venue', 'is_ccf_a']].to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Saved to {output_path}")

print("全部处理完成！")