from errno import ESTALE
from pathlib import Path
import logging
from openai import OpenAI
import pypdf
import openpyxl
from fuzzywuzzy import fuzz
import json
import re
import bibtexparser
import psrc.stage1.country_to_idx as c2i

# ANSI SGR escape sequences interpolated into log messages to colour them
# on terminals that honour such codes.
RED = '\033[91m'    # bright red foreground
GREEN = '\033[92m'  # bright green foreground
BLUE = '\033[94m'   # bright blue foreground
RESET = '\033[0m'   # reset all attributes back to the terminal default


def chechCCFA(conferenceJournal, CCFA, configModel, client):
    """Ask the LLM whether a venue name matches an entry of the CCF-A list.

    Args:
        conferenceJournal: Conference/journal name to classify; sent verbatim
            as the user message.
        CCFA: Comma-separated CCF-A venue names, embedded in the system prompt.
        configModel: Model identifier forwarded to the chat-completions call.
        client: Object exposing ``chat.completions.create`` (this file builds
            an ``OpenAI`` client for that purpose).

    Returns:
        The message content of the first choice, unparsed. A JSON-object
        response format is requested, so callers are expected to decode it
        and read the ``IsCCFA`` key themselves; a non-empty reply is always
        truthy and must not be used directly as the verdict.
    """
    # The prompt asks for a real JSON boolean and a 0-100 confidence so that
    # the listed keys, the stated ranges and the example agree with each other.
    system_prompt = f"""
    You are an expert academic conference/journal classifier. Your task is to determine if the given conference/journal name matches any entry in the provided CCF-A list.
    CCF-A List (comma-separated): {CCFA}
    Analysis Guidelines:
    1. Perform fuzzy matching considering:
       - Abbreviations vs full names (e.g. 'PPoPP' vs 'ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming')
       - Common variations (e.g. 'IEEE Transactions' vs 'IEEE Trans.')
       - Minor spelling differences
    2. Return JSON with:
       - "IsCCFA": true/false (a JSON boolean)
       - "MatchedName": the matched name from CCF-A list (empty string if no match)
       - "Confidence": your confidence score (0-100)
       - "Reason": a short explanation of the decision
    Example Output:
    {{
        "IsCCFA": true,
        "MatchedName": "IEEE International Symposium on High Performance Computer Architecture",
        "Confidence": 95,
        "Reason": "The input matches HPCA's full name"
    }}
    """

    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": conferenceJournal},
        ],
        temperature=0.2,
        max_tokens=4096,
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

def get_key_info(content, configModel, client):
    """Extract title, authors, institutions, countries and venue via the LLM.

    Args:
        content: Text to analyse; sent verbatim as the user message.
        configModel: Model identifier forwarded to the chat-completions call.
        client: Object exposing ``chat.completions.create``.

    Returns:
        The message content of the first choice, unparsed. A JSON-object
        response format is requested; the reply is also logged at DEBUG level.
    """
    extraction_prompt = """
    Act as an expert metadata extraction assistant.
    Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
    Your goal is to extract the document title, all authors, and their corresponding affiliations.

    Extraction Guidelines:
    -   **Title:** Extract the main title of the document. If ambiguous or missing, use "".
    -   **Authors:**
        -   Identify all listed authors. Maintain the order presented in the text if possible.
        -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
    -   **Institutions:**
        -   Extract all associated institutions of authors
        -   相同机构仅保留一个
    -   **Countrys:**
        -   Extract all associated countrys of institutions.
        -   Try to use full names. Only write the names of universities/companies, excluding departments, postal codes, countries, house numbers, cities, etc.
        -   For foreign names: Translate them first, and then manually check them using Google (mark them).
        -   In cases like the Chinese Academy of Sciences, specify the institute level. For example, the Institute of Computing Technology, Chinese Academy of Sciences.
        -   In cases of branch campuses, specify the branch. For example, California State University, University of California, University of Maryland.
    -   **ISSUE:**
        -   Extract where the paper is published like journal or session.
    -   Title, authors, institutions and countrys should be four separate keys, not nested together.
    -   Use highcase for first letter of key.
    -   **Handling Missing Data:** If no data of a field can be identified in the text, the field in the JSON should be an empty list `[]`.
    
    Example Output:
    {
        "Title": "Laius: Towards Latency Awareness and Improved Utilization of Spatial Multitasking Accelerators in Datacenters",
        "Authors": [
            "Quan Chen",
            "Daniel Edward Mawhirter",
            "Bo Wu",
            "Chao Li",
        ],
        "Institutions": [
            "Shanghai Jiao Tong University",
            "Colorado School of Mines",
        ],
        "Countrys": [
            "China",
            "United States",
        ],
        "ISSUE": [
            "IEEE Transactions on Computers" 
        ]
    }
    """

    # Build the conversation first so the request itself stays compact.
    chat_messages = [
        {"role": "system", "content": extraction_prompt},
        {"role": "user", "content": content},
    ]
    request_options = {
        "model": configModel,
        "messages": chat_messages,
        "temperature": 0.25,
        "max_tokens": 4096,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)

    extracted = completion.choices[0].message.content
    logging.debug(extracted)
    return extracted

    # for chunk in response:
    #     if not chunk.choices:
    #         continue
    #     if chunk.choices[0].delta.content:
    #         # 增量输出返回值
    #         print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出，流式输出

# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
    """Return the whitespace-normalised text of a PDF's first page.

    Args:
        pdf_path: Location of the PDF, as ``str`` or ``pathlib.Path``.

    Returns:
        The first page's text with every run of whitespace collapsed to a
        single space, or ``None`` when the PDF cannot be read, has no pages,
        or the first page yields no non-whitespace text. Every ``None``
        outcome is logged as a warning.
    """
    # The log messages below use ``.name``; coercing here keeps a plain string
    # path from raising AttributeError inside the error handler.
    pdf_path = Path(pdf_path)

    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"{RED}PDF has no pages: {pdf_path.name}{RESET}")
            return None
        text = reader.pages[0].extract_text()
    except Exception as e:
        # Best effort: a PDF that cannot be parsed is reported and skipped.
        logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
        return None

    # split() with no arguments breaks on any whitespace and drops empty
    # pieces, so joining with one space collapses all whitespace runs.
    cleaned_text = " ".join(text.split()) if text else ""
    if not cleaned_text:
        # Also covers whitespace-only pages, which would otherwise be returned
        # as "" and slip past callers that only test for None.
        logging.warning(f"{RED}No text found on the first page of {pdf_path.name}{RESET}")
        return None

    # Characters that cannot be encoded as UTF-8 (e.g. lone surrogates) are
    # replaced so the result can be serialised safely later on.
    return cleaned_text.encode("utf-8", errors="replace").decode("utf-8")

def get_citation_ids(pdf_path, title, configModel, client):
    """Find the reference number under which a PDF cites the paper *title*.

    Pages are scanned from the last to the first; each page's text is sent to
    the LLM, and the first reply that is a plain number (optionally wrapped in
    brackets or followed by a full stop) is returned.

    Args:
        pdf_path: Location of the citing PDF, as ``str`` or ``pathlib.Path``.
        title: Title of the cited paper. When empty, no lookup is attempted.
        configModel: Model identifier forwarded to the chat-completions call.
        client: Object exposing ``chat.completions.create``.

    Returns:
        The reference number as a string of digits, or ``None`` when the title
        is empty, the PDF has no pages, no page produced a numeric answer, or
        an error occurred (errors are logged, not raised).
    """
    # Log messages use ``.name``; accept plain string paths as well.
    pdf_path = Path(pdf_path)

    # Without a title the model has nothing to match against, so every page
    # would cost a request whose answer could only be a guess.
    if not title or not str(title).strip():
        logging.warning(f"No cited title available; skipping citation lookup in {pdf_path.name}")
        return None

    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None

        system_prompt = f"""
        你是一个专业的学术论文引用分析助手。你的任务是从PDF页面文本中找出引用给定论文的引用编号。
        论文标题: {title}
        
        分析指南:
        1. 从文本中查找类似 [1], [2] 这样的引用标记
        2. 引用标记后面通常会跟着引用的论文标题
        3. 如果找到匹配的引用编号，返回该数字
        4. 如果没有找到匹配的引用，返回空字符串
        
        请直接返回引用编号数字，不要返回其他内容。
        """

        # Walk backwards: the scan starts at the last page and stops at the
        # first numeric answer.
        for page_num in range(len(reader.pages) - 1, -1, -1):
            text = reader.pages[page_num].extract_text()
            if not text:
                continue

            cleaned_text = " ".join(text.split())
            response = client.chat.completions.create(
                model=configModel,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": cleaned_text},
                ],
                temperature=0.1,
                max_tokens=10,
                response_format={"type": "text"},
            )

            # The content may be None; the model may also answer "[12]" or
            # "12." instead of the bare number, so trim that decoration.
            reply = response.choices[0].message.content or ""
            candidate = reply.strip().strip("[]()【】.。 ")
            if candidate.isdigit():
                return candidate

        logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
        return None
    except Exception as e:
        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
        return None

# Reads indices and paper names from the worksheet, starting at row 4.
def read_rough_nameIndex_from_excel(sheet, maxItem):
    """Collect (index, paper name) pairs from the worksheet rows.

    Rows are read from row 4 onwards via ``sheet.iter_rows``. At most
    ``maxItem`` rows are examined (skipped rows count towards that limit), and
    a row contributes only when both its first and third values are truthy.

    Args:
        sheet: Worksheet-like object providing ``iter_rows(min_row=...,
            values_only=...)``.
        maxItem: Maximum number of rows to examine.

    Returns:
        A tuple ``(indices, names)`` of two equally long lists.
    """
    indices = []
    names = []
    rows_seen = 0

    for record in sheet.iter_rows(min_row=4, values_only=True):
        if rows_seen >= maxItem:  # row budget exhausted
            break
        rows_seen += 1

        # Column 1 holds the index and column 3 the paper name; skip the row
        # unless both are present.
        if not record[0] or not record[2]:
            continue
        indices.append(record[0])
        names.append(record[2])

    return indices, names

def translate_countries(countries_str: str, configModel, client):
    """Translate English country names to Chinese via the LLM.

    Args:
        countries_str: Country names to translate; sent verbatim as the user
            message. The prompt tells the model they are semicolon-separated.
        configModel: Model identifier forwarded to the chat-completions call.
        client: Object exposing ``chat.completions.create``.

    Returns:
        The message content of the first choice with surrounding whitespace
        removed.
    """
    translation_prompt = """
    你是一个专业的翻译助手，请将输入的英文国家名称翻译为中文。
    输入可能是多个国家名称，以英文分号分隔。
    输出同样以英文分号分隔对应的中文国家名称。
    示例输入: United States; China
    示例输出: 美国; 中国
    """

    conversation = [
        {"role": "system", "content": translation_prompt},
        {"role": "user", "content": countries_str},
    ]
    completion = client.chat.completions.create(
        model=configModel,
        messages=conversation,
        temperature=0.2,
        max_tokens=4096,
        response_format={"type": "text"},
    )

    reply = completion.choices[0].message.content
    return reply.strip()

def _join_text_items(value):
    """Join the items of a list with ';'; return "" for anything that is not a list."""
    if not isinstance(value, list):
        return ""
    return ";".join(str(item) for item in value if item is not None)


def _first_venue(issue):
    """Return the venue name from the ISSUE field (a string or a list of strings), or ""."""
    if isinstance(issue, str):
        return issue.strip()
    if isinstance(issue, list) and issue and isinstance(issue[0], str):
        return issue[0].strip()
    return ""


def _load_ccfa_names(sheetCCF):
    """Build the comma-separated CCF-A name string from columns 2 and 3 of the sheet."""
    names = []
    for row in sheetCCF.iter_rows(min_row=2, values_only=True):  # row 1 is skipped
        if row[0] and row[1]:
            names.append(str(row[1]))
            # Column 3 may be empty; a None here would make join() fail.
            if len(row) > 2 and row[2]:
                names.append(str(row[2]))
    return ",".join(names)


def _is_ccfa_verdict(raw_reply):
    """Decode the JSON reply of ``chechCCFA`` and return its ``IsCCFA`` flag as a bool."""
    try:
        verdict = json.loads(raw_reply)
    except (TypeError, ValueError):
        return False
    if not isinstance(verdict, dict):
        return False
    flag = verdict.get("IsCCFA", False)
    if isinstance(flag, str):
        # Models may return the flag as text, including the misspelling "ture".
        return flag.strip().lower() in ("true", "ture")
    return flag is True


def _cited_title_from_sheet(sheet, sheet_name):
    """Parse the BibTeX text stored in row 3 of the sheet and return its title ("" on failure)."""
    bibtex_text = "".join(str(cell.value) for cell in sheet[3] if cell.value is not None)
    try:
        bib_database = bibtexparser.loads(bibtex_text)
    except Exception as e:
        logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
        return ""
    if bib_database.entries:
        return bib_database.entries[0].get('title', '')
    logging.warning(f"No BibTeX entry found in sheet {sheet_name} row 3")
    return ""


def _strip_title_noise(text):
    """Drop spaces, '_', ':' and '-' from the text so titles can be compared loosely."""
    return text.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")


def _find_excel_match(pdf_title, index_list, paperName_list):
    """Return ``(idx, excel_name)`` of the first Excel entry whose name fuzzily matches, else None."""
    clean_pdf_title = _strip_title_noise(pdf_title)
    if not clean_pdf_title:
        return None
    for idx, excel_name in zip(index_list, paperName_list):
        # Keep only what precedes ".pdf", since the Excel entry may be a file name.
        clean_excel_name = _strip_title_noise(str(excel_name).split('.pdf')[0])
        logging.debug(f"clean_excel_name: {clean_excel_name}")
        logging.debug(f"clean_pdf_title: {clean_pdf_title}")
        similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
        if similarity >= 85:
            return idx, excel_name
    return None


def citationProcess(config: dict):
    """Match citing-paper PDFs to Excel rows and fill the rows with LLM-extracted metadata.

    For each of the first ``config["sheetNum"]`` worksheets of the source
    workbook, every PDF below ``<cwd>/<pdf_dir>/<sheet name>`` is processed:
    the first page is sent to the LLM for metadata, the extracted title is
    fuzzily matched against the sheet's paper names, and on a match the PDF is
    renamed to ``<idx>-<title>.pdf``, the raw LLM reply is stored as
    ``<cwd>/<result_dir>/<sheet name>/<idx>.json`` and the row
    ``idx + config["content_start"]`` is updated (column 3 title, 4 venue,
    5 CCF-A flag, 6 citation number, 7 authors, 9 institutions, 10 country
    index, 11 translated countries). The workbook is saved to the target path
    after every matched PDF.

    Args:
        config: Settings dictionary. Keys read here: ``api_key``, ``base_url``,
            ``source_excel_path``, ``target_excel_path``, ``ccfa_excel_path``,
            ``sheetNum``, ``maxItem``, ``result_dir``, ``pdf_dir``, ``model``
            and ``content_start``.

    Returns:
        None. Results are written to the file system and the target workbook.
    """
    client = OpenAI(api_key=config["api_key"],
                    base_url=config["base_url"])

    excel_path = Path(config["source_excel_path"])
    target_path = Path(config["target_excel_path"])
    ccfa_excel_path = Path(config["ccfa_excel_path"])
    configModel = config["model"]

    wb = openpyxl.load_workbook(excel_path)
    ccfa_wb = openpyxl.load_workbook(ccfa_excel_path)
    # The CCF-A list does not depend on the PDF being processed: build it once.
    CCFA = _load_ccfa_names(ccfa_wb["CCF-A列表"])

    for sheet_idx, sheet_name in enumerate(wb.sheetnames):
        if sheet_idx == config["sheetNum"]:
            break
        sheet = wb[sheet_name]
        logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")

        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])

        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)

        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
        # Materialise the listing up front: files are renamed inside the loop,
        # and a lazy directory walk could otherwise revisit or miss entries.
        pdf_files = sorted(pdf_directory.rglob("*.pdf"))

        cited_title_str = _cited_title_from_sheet(sheet, sheet_name)

        for file in pdf_files:
            logging.info(f"{BLUE}Processing {file.name}{RESET}")

            first_page_text = extract_first_page_text(file)
            if first_page_text is None:
                logging.error(f"Failed to extract text from first page of {file.name}")
                continue

            result = get_key_info(first_page_text, configModel, client)
            try:
                result_dict = json.loads(result)
            except (TypeError, ValueError):
                # Covers both a missing reply (None) and malformed JSON.
                result_dict = None
            if not isinstance(result_dict, dict):
                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")
                continue

            pdf_title = result_dict.get("Title")
            if not isinstance(pdf_title, str):
                pdf_title = ""

            match = _find_excel_match(pdf_title, index_list, paperName_list)
            if match is None:
                logging.warning(f"{RED}Not matched: {file.name}{RESET}")
                continue
            idx, excel_name = match
            row_no = idx + config["content_start"]

            # Replace ': ', spaces, '?' and '/' so the title can serve as a file name.
            new_pdf_name = f"{idx}-{pdf_title.replace(': ', '_').replace(' ', '_').replace('?', '_').replace('/', '_')}.pdf"
            new_pdf_path = file.parent / new_pdf_name
            try:
                file.rename(new_pdf_path)
                logging.info(f"Renamed: {file.name} -> {new_pdf_name}")
            except FileExistsError:
                logging.warning(f"Renamed failed: filename {new_pdf_name} already exists with idx {idx}.")
                logging.warning(f"{RED}Not matched: {file.name}{RESET}")
                continue

            # The per-page citation lookup is only worth its requests once the
            # PDF is known to belong to a row; the file now lives at its new path.
            cit_id = get_citation_ids(new_pdf_path, cited_title_str, configModel, client)

            # Keep the raw LLM reply next to the results for later inspection.
            rst_path = rst_dir / (f"{idx}" + ".json")
            rst_path.write_text(result + "\n", encoding='utf-8')

            sheet.cell(row=row_no, column=3, value=pdf_title)  # column 3: paper title

            issue = result_dict.get("ISSUE", [])
            conferenceJournal = _first_venue(issue)
            if conferenceJournal:
                sheet.cell(row=row_no, column=4, value=conferenceJournal)  # column 4: venue
                logging.info("Standardization issue info.")
            else:
                logging.warning(f"{RED}Invalid ISSUE data: {issue}{RESET}")

            if cit_id is not None:
                sheet.cell(row=row_no, column=6, value=cit_id)  # column 6: citation number
            else:
                logging.warning(f"{RED}cit_id is None.可能存在多个pdf版本，请找到真正引用的版本(T_T){RESET}")

            authors = _join_text_items(result_dict.get("Authors", []))
            sheet.cell(row=row_no, column=7, value=authors)  # column 7: authors
            logging.info("Standardization author info.")

            institutions = _join_text_items(result_dict.get("Institutions", []))
            sheet.cell(row=row_no, column=9, value=institutions)  # column 9: institutions
            logging.info("Standardization institution info.")

            countrys = _join_text_items(result_dict.get("Countrys", []))
            if countrys:
                translated_countrys = translate_countries(countrys, configModel, client)
                sheet.cell(row=row_no, column=11, value=translated_countrys)  # column 11: Chinese country names
                logging.debug(f"Translated countrys info: {translated_countrys}")
                # The index is derived from the translation, so it is only
                # computed when a translation exists for this PDF.
                country_index = c2i.country_to_idx(translated_countrys)
                sheet.cell(row=row_no, column=10, value=country_index)  # column 10: country index
                logging.info("Standardization countrys info.")
            else:
                logging.warning(f"{RED}No countrys info to translate.{RESET}")

            logging.info("Judge CCFA.")
            if conferenceJournal:
                ccfa_reply = chechCCFA(conferenceJournal, CCFA, configModel, client)
                # The reply is a JSON string; its truthiness says nothing, so
                # decode it and read the flag.
                CCFA_flag = "是" if _is_ccfa_verdict(ccfa_reply) else "否"
            else:
                # NOTE(review): the message mentions falling back to the venue
                # in the Excel file, but no such fallback is implemented; the
                # flag is simply "否" and needs manual confirmation.
                logging.warning(f"{RED}LLM没有在文章中找到会议/期刊信息, 默认使用输入Excel中的会议名称, 请人工确认本条的CCFA信息。{file.name}{RESET}")
                CCFA_flag = "否"
            logging.info(f"{CCFA_flag}")
            # Same row offset as every other column of this record.
            sheet.cell(row=row_no, column=5, value=CCFA_flag)  # column 5: CCF-A flag

            # Save after every match so progress survives a later failure.
            wb.save(target_path)

            logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
            logging.info(f"Change: {file.name} -> {new_pdf_name}")