Commit 91ab82b3 by Pengwei-Jin

Merge branch 'master' of http://62.234.201.16/nzy/papertools

parents 34bba471 382ba375
.gitignore:
.vscode/
Papers/
psrc/__pycache__/
psrc/stage1/__pycache__/
psrc/stage2/__pycache__/
json/
# Environment setup
- Make sure the cwd when running the Python scripts is the papertools repository folder
- All paths and parameters are configured in config.json
  - api_key
    - The current key was earned by our classmate Dongchen by running ads on Zhihu and only has a 100 RMB quota, so please use your own key whenever possible
    - If you use a key for a different API, remember to change the OpenAI client call accordingly; SiliconFlow is recommended here, since that is what I used to get the pipeline running
  - base_url
    - URL of the API endpoint
  - pdf_dir
    - Folder holding the paper PDFs
  - result_dir
    - Folder for the output JSON files containing the extracted key information
  - source_excel_path
    - The Excel workbook to be checked
    - Actual table entries start at row content_start + 1
    - Column 1: index
    - Column 3: paper title
    - Column 7: paper authors
  - target_excel_path
    - The formatted output workbook
  - ccfa_excel_path
    - The CCF-A reference sheet
  - logLevel
    - 10 means DEBUG level
    - 20 means INFO level
  - sheetNum: the number of worksheets to process
  - maxItem: the maximum number of entries per worksheet
- python3.12
  - Install any library that fails to import with pip, one by one (see the command after this list)
  - `openai`, `pypdf`, `openpyxl`, `fuzzywuzzy`, `bibtexparser`
  - `python-Levenshtein` (optional; speeds up fuzzywuzzy)
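For reference, the packages can also be installed in one go (package list taken from the imports in this repository; `python-Levenshtein` only speeds up `fuzzywuzzy`):

```
pip install openai pypdf openpyxl fuzzywuzzy python-Levenshtein bibtexparser
```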
# Usage
- Change row 3 of the Excel sheet to a BibTeX entry that contains the title of the cited paper, as shown in the image and the example below
  - ![](./others/bibtex.png)
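A minimal example of what row 3 can contain (hypothetical entry and field values; the pipeline only reads the `title` field via `bibtexparser`):

```
@inproceedings{citedpaper,
  title = {Title of the Cited Paper},
  author = {First Author and Second Author},
  booktitle = {Proceedings of the Example Conference},
  year = {2024}
}
```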
- Check config.json and set the parameters correctly so the program can find the files and parameters it needs
  - Default configuration
    - The paper PDFs are placed per sheet under the Papers/sheetname folder
    - The Excel workbook to be checked goes in the others folder
    - The output workbook is written to target_excel_path (./others/target.xlsx in the default config); the PDFs are renamed in place to standardized names
- Run `python main.py` to execute the program
  - While the program is running, do not open the target Excel file, otherwise the two will contend for file access and an error will occur
  - A sample log of a successful run is in the logs folder
- Resuming from a breakpoint: if the PDFs up to some index in the Excel sheet were extracted correctly and the sheet was updated correctly, but the PDF at the next index failed
  - Move the correctly processed PDFs to another folder; running the script again will then process only the remaining PDFs
- TODO: cross-validation with multiple models
- TODO: tuning the temperature setting
# Requirements and solutions
1. TODO: download the paper PDFs
   1. Download via agents for the common websites
   2. Output the entries that could not be downloaded
2. stage1: automated information extraction and formatting
   1. Read the configuration object from config.json
   2. **Iterate** over the sheets of the Excel workbook
      1. **Iterate** over the paper titles and indices in the sheet
         1. Use an **LLM** to read the paper title and key information from the first page of the PDF and store them in the json folder
         2. Read the citation information of the PDF from back to front, and use an **LLM** to find the citation index of the sheet's cited paper within the current PDF, storing it in the json folder
         3. **Iterate** over the paper titles in the Excel sheet and fuzzy-match them against the PDF title (see the sketch after this list); on a successful match:
            1. Save the key information from the PDF to a JSON file, including title, venue, author names, institutions, and countries.
            2. Use the paper title and index from the PDF to rename the PDF file to a standardized name and to standardize the paper title, venue, author names, institutions, and countries in the Excel sheet.
            3. First use an **LLM** to translate the English country names into Chinese, then write the corresponding country indices into the target Excel sheet.
            4. Hand the venue or journal name from the PDF together with the CCF-A venue/journal table to an **LLM** for matching, and write the result into the target Excel sheet as 是/否 (yes/no).
         4. On a failed match, log the unmatched entries as warnings so they can be handled later.
   3. This yields the information extracted from the PDFs and the formatted Excel sheet.
   4. Manual review
      1. There may be duplicate PDFs, which only become visible after the standardized renaming: when two PDFs get the same index in their names, a PDF was matched twice. Manually delete the duplicate PDF and Excel entry, judging the duplication against the information in the original sheet.
      2. Institutions and countries may contain duplicates; please check manually.
      3. Read the log and look for the red warning messages.
3. stage2: identifying well-known companies and notable researchers
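A minimal sketch of the fuzzy-matching step above, mirroring the normalization and the 85-point `partial_ratio` threshold used in `psrc/stage1/citationProcess.py` (the helper name `titles_match` is ours):

```python
from fuzzywuzzy import fuzz

def titles_match(pdf_title: str, excel_name: str, threshold: int = 85) -> bool:
    """Return True when the PDF title fuzzily matches an Excel entry."""
    def normalize(s: str) -> str:
        # Drop separators that vary between sources, then compare case-insensitively
        for ch in (" ", "_", ":", "-"):
            s = s.replace(ch, "")
        return s.lower()
    return fuzz.partial_ratio(normalize(pdf_title), normalize(excel_name)) >= threshold
```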
# Code structure
1. The psrc folder contains the library functions
2. config.json is the configuration file
3. main.py is the main program
4. The logs folder holds the log files
5. The json folder holds the JSON files with the extracted key information

config.json:
{
"api_key": "sk-otamesebhzzycgfynnssjkrkjlcoitdtstcruwbhohksdlel",
"base_url": "https://api.siliconflow.cn/v1",
"model": "Pro/deepseek-ai/DeepSeek-V3",
"pdf_dir": "./Papers",
"result_dir": "./json",
"source_excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
"content_start": 4,
"ccfa_excel_path": "./others/CCFA.xlsx",
"target_excel_path": "./others/target.xlsx",
"logLevel": 20,
"sheetNum": 1,
"maxItem": 333
}

main.py:
import json
import logging
import psrc.stage1.citationProcess as CP
from pathlib import Path

if __name__ == "__main__":
    cwd_dir = Path.cwd()
    # Build the full path to config.json
    config_path = (cwd_dir / "config.json").resolve()
    # Read the configuration parameters from config.json
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # The / operator on a Path object joins path components
    # Create the log directory
    log_dir = cwd_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    # Configure the logging system
    log_file = log_dir / "citation_process.log"
    logLevel = config["logLevel"]
    # logging.basicConfig() is the standard-library helper for quickly configuring
    # the root logger. With level=INFO only records at INFO and above are handled
    # (INFO, WARNING, ERROR, CRITICAL); logging.debug(...) would be suppressed.
    logging.basicConfig(
        # %(asctime)s: timestamp of the record (default format: YYYY-MM-DD HH:MM:SS)
        # %(levelname)s: level name (e.g. INFO, WARNING)
        # %(message)s: the log message itself
        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    logging.info(f"Program started; log file saved at: {log_file}")
    CP.citationProcess(config)

psrc/stage1/citationProcess.py:
from pathlib import Path
import logging
from openai import OpenAI
import pypdf
import openpyxl
from fuzzywuzzy import fuzz
import json
import bibtexparser
import psrc.stage1.country_to_idx as c2i

# ANSI color codes used to highlight log messages in the terminal
RED = '\033[91m'
GREEN = '\033[92m'
BLUE = '\033[94m'
RESET = '\033[0m'
def checkCCFA(conferenceJournal, CCFA, configModel, client):
    system_prompt = f"""
You are an expert academic conference/journal classifier. Your task is to determine if the given conference/journal name matches any entry in the provided CCF-A list.
CCF-A List (comma-separated): {CCFA}
Analysis Guidelines:
1. Perform fuzzy matching considering:
   - Abbreviations vs full names (e.g. 'PPoPP' vs 'ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming')
   - Common variations (e.g. 'IEEE Transactions' vs 'IEEE Trans.')
   - Minor spelling differences
2. Return JSON with:
   - "IsCCFA": true/false
   - "MatchedName": the matched name from CCF-A list (empty string if no match)
   - "Confidence": your confidence score (0-100)
Example Output:
{{
    "IsCCFA": true,
    "MatchedName": "IEEE International Symposium on High Performance Computer Architecture",
    "Confidence": 95,
    "Reason": "The input matches HPCA's full name"
}}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": conferenceJournal},
        ],
        temperature=0.2,
        max_tokens=4096,
        # stream=True,
        response_format={"type": "json_object"}
    )
    content = response.choices[0].message.content
    # The previous version returned the raw JSON string, which is always truthy,
    # so the caller's yes/no check always answered yes. Parse the reply instead.
    try:
        return str(json.loads(content).get("IsCCFA", "")).lower() == "true"
    except (json.JSONDecodeError, TypeError):
        logging.warning(f"{RED}Could not parse CCFA check response: {content}{RESET}")
        return False
def get_key_info(content, configModel, client):
    system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
  - Identify all listed authors. Maintain the order presented in the text if possible.
  - Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- **Institutions:**
  - Extract all institutions the authors are affiliated with.
  - Keep only one entry per institution (deduplicate).
  - Try to use full names. Only write the names of universities/companies, excluding departments, postal codes, countries, house numbers, cities, etc.
  - For foreign names: translate them first, then manually check them using Google (mark them).
  - In cases like the Chinese Academy of Sciences, specify the institute level. For example, the Institute of Computing Technology, Chinese Academy of Sciences.
  - In cases of branch campuses, specify the branch. For example, California State University, University of California, University of Maryland.
- **Countrys:**
  - Extract all countries the institutions belong to.
- **ISSUE:**
  - Extract where the paper is published, e.g. the journal or conference.
- Title, authors, institutions and countrys should be four separate keys, not nested together.
- Capitalize the first letter of each key.
- **Handling Missing Data:** If no data for a field can be identified in the text, the field in the JSON should be an empty list `[]`.
Example Output:
{
    "Title": "Laius: Towards Latency Awareness and Improved Utilization of Spatial Multitasking Accelerators in Datacenters",
    "Authors": [
        "Quan Chen",
        "Daniel Edward Mawhirter",
        "Bo Wu",
        "Chao Li"
    ],
    "Institutions": [
        "Shanghai Jiao Tong University",
        "Colorado School of Mines"
    ],
    "Countrys": [
        "China",
        "United States"
    ],
    "ISSUE": [
        "IEEE Transactions on Computers"
    ]
}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        temperature=0.25,
        max_tokens=4096,
        # stream=True,
        response_format={"type": "json_object"}
    )
    logging.debug(response.choices[0].message.content)
    return response.choices[0].message.content
    # Streaming variant, kept for reference:
    # for chunk in response:
    #     if not chunk.choices:
    #         continue
    #     if chunk.choices[0].delta.content:
    #         # Print the incremental tokens without a newline (streaming output)
    #         print(chunk.choices[0].delta.content, end="", flush=True)
# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
    try:
        # Try to read the PDF file
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) > 0:
            first_page = reader.pages[0]
            text = first_page.extract_text()
            if text:
                # text.split() splits on any whitespace (spaces/newlines/tabs) and
                # collapses consecutive runs; " ".join(...) rejoins with single
                # spaces, so stray whitespace becomes one space.
                cleaned_text = " ".join(text.split())
                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                    "utf-8"
                )
                return cleaned_text
            else:
                logging.warning(f"{RED}No text found on the first page of {pdf_path.name}{RESET}")
                return None
        else:
            logging.warning(f"{RED}PDF has no pages: {pdf_path.name}{RESET}")
            return None
    except Exception as e:
        # Catch and log any exception raised while reading the PDF
        logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
        return None
def get_citation_ids(pdf_path, title, configModel, client):
    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None
        system_prompt = f"""
You are a professional academic citation analysis assistant. Your task is to find, in the text of a PDF page, the citation number that refers to the given paper.
Paper title: {title}
Analysis guidelines:
1. Look for citation markers such as [1], [2] in the text
2. A citation marker is usually followed by the title of the cited paper
3. If a matching citation number is found, return that number
4. If no matching citation is found, return an empty string
Return only the citation number itself, nothing else.
"""
        # Check the pages one by one, starting from the last page
        for page_num in range(len(reader.pages) - 1, -1, -1):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                cleaned_text = " ".join(text.split())
                response = client.chat.completions.create(
                    model=configModel,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": cleaned_text},
                    ],
                    temperature=0.1,
                    max_tokens=10,
                    response_format={"type": "text"}
                )
                result = response.choices[0].message.content.strip()
                if result.isdigit():
                    return result
        logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
        return None
    except Exception as e:
        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
        return None
# Read the indices and paper titles from the Excel sheet, starting at row 4
def read_rough_nameIndex_from_excel(sheet, maxItem):
    index_list = []
    paperName_list = []
    # Iterate starting from row 4
    for idx, row in enumerate(sheet.iter_rows(min_row=4, values_only=True)):
        if idx >= maxItem:  # Limit the number of rows read
            break
        if row[0] and row[2]:  # Make sure both the index and the paper title exist
            index_list.append(row[0])
            paperName_list.append(row[2])
    return index_list, paperName_list
def translate_countries(countries_str: str, configModel, client):
    system_prompt = """
You are a professional translation assistant. Translate the given English country names into Chinese.
The input may contain several country names separated by semicolons.
Output the corresponding Chinese country names, also separated by semicolons.
Example input: United States; China
Example output: 美国; 中国
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": countries_str}
        ],
        temperature=0.2,
        max_tokens=4096,
        response_format={"type": "text"}
    )
    return response.choices[0].message.content.strip()
def citationProcess(config: dict):
    client = OpenAI(api_key=config["api_key"],
                    base_url=config["base_url"])
    excel_path = Path(config["source_excel_path"])
    target_path = Path(config["target_excel_path"])
    ccfa_excel_path = Path(config["ccfa_excel_path"])
    # Read the Excel workbook
    wb = openpyxl.load_workbook(excel_path)
    # Read the CCF-A list
    ccfa_wb = openpyxl.load_workbook(ccfa_excel_path)
    sheetCCF = ccfa_wb["CCF-A列表"]
    # Iterate over all worksheets in the workbook. The loop variable is named
    # sheet_idx so the fuzzy-matching loop below cannot shadow it (the original
    # code reused idx for both loops).
    for sheet_idx, sheet_name in enumerate(wb.sheetnames):
        if sheet_idx == config["sheetNum"]:
            break
        sheet = wb[sheet_name]
        logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")
        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])
        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)  # Make sure the result directory exists
        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
        pdf_files = pdf_directory.rglob("*.pdf")  # Recursive search yielding the paths of all PDF files
        # Read the BibTeX text from row 3
        bibtex_row = sheet[3]  # Row 3; row indices are 1-based
        # Convert each cell.value to a string and concatenate
        bibtex_text = "".join(map(lambda cell: str(cell.value) if cell.value is not None else "", bibtex_row))
        # Parse the BibTeX text
        try:
            bib_database = bibtexparser.loads(bibtex_text)
            if bib_database.entries:
                entry = bib_database.entries[0]
                cited_title_str = entry.get('title', '')
            else:
                cited_title_str = ""
                logging.warning(f"No BibTeX entry found in sheet {sheet_name} row 3")
        except Exception as e:
            cited_title_str = ""
            logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
        # Iterate over all PDF files belonging to the current worksheet
        for file in pdf_files:
            logging.info(f"{BLUE}Processing {file.name}{RESET}")
            first_page_text = extract_first_page_text(file)
            if first_page_text is None:
                logging.error(f"Failed to extract text from first page of {file.name}")
                continue  # Skip this file and move on to the next one
            configModel = config["model"]
            # Extract the key information
            result = get_key_info(first_page_text, configModel, client)
            cit_id = get_citation_ids(file, cited_title_str, configModel, client)
            if result is not None:
                # Parse the JSON result and pull out the paper title
                result_dict = json.loads(result)
                pdf_title = result_dict["Title"]
                pdf_issue = result_dict["ISSUE"]
                # Fuzzy-match against the Excel entries
                matched = False
                for idx, excel_name in zip(index_list, paperName_list):
                    # Preprocess the strings: split('.pdf')[0] keeps the part
                    # before any '.pdf' suffix
                    clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
                    clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
                    logging.debug(f"clean_excel_name: {clean_excel_name}")
                    logging.debug(f"clean_pdf_title: {clean_pdf_title}")
                    similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
                    if similarity >= 85:
                        # Rename the PDF file, replacing colons, spaces, question marks
                        # and slashes with underscores to avoid illegal filename characters
                        new_pdf_name = f"{idx}-{pdf_title.replace(': ', '_').replace(' ', '_').replace('?', '_').replace('/', '_')}.pdf"
                        new_pdf_path = file.parent / new_pdf_name
                        try:
                            file.rename(new_pdf_path)
                            logging.info(f"Renamed: {file.name} -> {new_pdf_name}")
                        except FileExistsError:
                            # A file with this index already exists: likely a duplicate PDF
                            logging.warning(f"Renamed failed: filename {new_pdf_name} already exists with idx {idx}.")
                            break
                        # Save the key information to a JSON file
                        rst_path = rst_dir / f"{idx}.json"
                        rst_path.write_text(result + "\n", encoding='utf-8')  # Explicitly use UTF-8 encoding
                        # Update the Excel entries
                        sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title)  # Column 3 is the paper title
                        issue = result_dict.get("ISSUE", [])
                        if issue and isinstance(issue, list) and len(issue) > 0:  # Make sure issue is a valid, non-empty list
                            sheet.cell(row=idx+config["content_start"], column=4, value=issue[0])
                            logging.info(f"Standardization issue info.")
                        else:
                            logging.warning(f"{RED}Invalid ISSUE data: {issue}{RESET}")
                        if cit_id is not None:
                            sheet.cell(row=idx+config["content_start"], column=6, value=cit_id)  # Column 6 is cit_id
                        else:
                            logging.warning(f"{RED}cit_id is None. There may be several PDF versions; please find the version that actually contains the citation (T_T){RESET}")
                        authors_list = result_dict.get("Authors", [])
                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
                        sheet.cell(row=idx+config["content_start"], column=7, value=authors)  # Column 7 is the author names
                        logging.info(f"Standardization author info.")
                        institution_list = result_dict.get("Institutions", [])
                        institutions = ";".join(institution_list) if isinstance(institution_list, list) else ""
                        sheet.cell(row=idx+config["content_start"], column=9, value=institutions)  # Column 9 is the institutions
                        logging.info(f"Standardization institution info.")
                        countrys_list = result_dict.get("Countrys", [])
                        countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
                        # Translate countrys into Chinese. The index lookup only makes
                        # sense when the translation exists (the original code hit a
                        # NameError here when countrys was empty).
                        if countrys:
                            translated_countrys = translate_countries(countrys, config["model"], client)
                            sheet.cell(row=idx+config["content_start"], column=11, value=translated_countrys)  # Column 11 is the Chinese country names
                            logging.debug(f"Translated countrys info: {translated_countrys}")
                            country_idxs = c2i.country_to_idx(translated_countrys)
                            sheet.cell(row=idx+config["content_start"], column=10, value=country_idxs)  # Column 10 is the country indices
                            logging.info(f"Standardization countrys info.")
                        else:
                            logging.warning(f"{RED}No countrys info to translate.{RESET}")
                        # CCF-A check
                        logging.info(f"Judge CCFA.")
                        CCFA_list = []
                        for row in sheetCCF.iter_rows(min_row=2, values_only=True):  # Iterate starting from row 2
                            if row[0] and row[1] and row[2]:  # Make sure the index and both name columns exist
                                CCFA_list.append(row[1])
                                CCFA_list.append(row[2])
                        # Join the list into one long comma-separated string
                        CCFA = ','.join(CCFA_list)
                        if pdf_issue:
                            conferenceJournal = pdf_issue[0]
                        else:
                            logging.warning(f"{RED}The LLM found no venue/journal info in the paper; the CCF-A flag defaults to 否. Please confirm this entry manually. {file.name}{RESET}")
                            conferenceJournal = ""
                        if conferenceJournal == "":
                            CCFA_flag = "否"
                        else:
                            CCFA_flag = "是" if checkCCFA(conferenceJournal, CCFA, configModel, client) else "否"
                        logging.info(f"{CCFA_flag}")
                        sheet.cell(row=idx+config["content_start"], column=5, value=CCFA_flag)  # Column 5 is the CCF-A flag
                        # Save the modified Excel workbook
                        wb.save(target_path)
                        logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
                        logging.info(f"Change: {file.name} -> {new_pdf_name}")
                        matched = True
                        break
                if not matched:
                    logging.warning(f"{RED}Not matched: {file.name}{RESET}")
            else:
                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")

psrc/stage1/country_to_idx.py:
country_to_id_map = {
"美国": 1,
"中国": 2,
"日本": 3,
"韩国": 4,
"新加坡": 5,
"台湾": 6,
"香港": 7,
"澳门": 8,
"中国台湾": 6,
"中国香港": 7,
"中国澳门": 8,
"法国": 9,
"英国": 10,
"德国": 11,
"意大利": 12,
"西班牙": 13,
"加拿大": 14,
"荷兰": 15,
"印度": 16,
"阿联酋": 17,
"比利时": 18,
"俄罗斯": 19,
"阿富汗": 20,
"亚美尼亚": 21,
"阿塞拜疆": 22,
"巴林": 23,
"孟加拉国": 24,
"不丹": 25,
"文莱": 26,
"缅甸": 27,
"柬埔寨": 28,
"塞浦路斯": 29,
"东帝汶": 30,
"格鲁吉亚": 31,
"印度尼西亚": 32,
"伊朗": 33,
"伊拉克": 34,
"以色列": 35,
"约旦": 36,
"哈萨克斯坦": 37,
"科威特": 38,
"吉尔吉斯斯坦": 39,
"老挝": 40,
"黎巴嫩": 41,
"马来西亚": 42,
"马尔代夫": 43,
"蒙古": 44,
"尼泊尔": 45,
"朝鲜": 46,
"阿曼": 47,
"巴基斯坦": 48,
"巴勒斯坦": 49,
"菲律宾": 50,
"卡塔尔": 51,
"沙特阿拉伯": 52,
"斯里兰卡": 53,
"叙利亚": 54,
"塔吉克斯坦": 55,
"泰国": 56,
"土库曼斯坦": 57,
"乌兹别克斯坦": 58,
"越南": 59,
"也门": 60,
"北塞浦路斯": 61,
"纳戈尔诺-卡拉巴 赫": 62,
"阿尔及利亚": 63,
"安哥拉": 64,
"贝宁": 65,
"博茨瓦纳": 66,
"布基纳法索": 67,
"布隆迪": 68,
"佛得角": 69,
"喀麦隆": 70,
"中非共和国": 71,
"乍得": 72,
"科摩罗": 73,
"刚果": 75,
"科特迪瓦": 76,
"吉布提": 77,
"埃及": 78,
"赤道几内亚": 79,
"厄立特里亚": 80,
"埃塞俄比亚": 81,
"加蓬": 82,
"冈比亚": 83,
"加纳": 84,
"几内亚": 85,
"几内亚比绍": 86,
"肯尼亚": 87,
"莱索托": 88,
"利比里亚": 89,
"利比亚": 90,
"马达加斯加": 91,
"马拉维": 92,
"马里": 93,
"毛里塔尼亚": 94,
"毛里求斯": 95,
"摩洛哥": 96,
"莫桑比克": 97,
"纳米比亚": 98,
"尼日尔": 99,
"尼日利亚": 100,
"卢旺达": 101,
"圣多美和普林西比": 102,
"塞内加尔": 103,
"塞舌尔": 104,
"塞拉利昂": 105,
"索马里": 106,
"南非": 107,
"南苏丹": 108,
"苏丹": 109,
"斯威士兰": 110,
"坦桑尼亚": 111,
"多哥": 112,
"突尼斯": 113,
"乌干达": 114,
"赞比亚": 115,
"津巴布韦": 116,
"西撒哈拉": 117,
"阿尔巴尼亚": 118,
"安道尔": 119,
"奥地利": 120,
"白俄罗斯": 121,
"波斯尼亚和黑塞哥维那": 122,
"保加利亚": 123,
"克罗地亚": 124,
"捷克": 125,
"丹麦": 126,
"爱沙尼亚": 127,
"芬兰": 128,
"希腊": 129,
"匈牙利": 130,
"冰岛": 131,
"爱尔兰": 132,
"拉脱维亚": 133,
"列支敦士登": 134,
"立陶宛": 135,
"卢森堡": 136,
"马耳他": 137,
"摩尔多瓦": 138,
"摩纳哥": 139,
"黑山": 140,
"北马其顿": 141,
"挪威": 142,
"波兰": 143,
"葡萄牙": 144,
"罗马尼亚": 145,
"圣马力诺": 146,
"塞尔维亚": 147,
"斯洛伐克": 148,
"斯洛文尼亚": 149,
"瑞典": 150,
"瑞士": 151,
"乌克兰": 152,
"梵蒂冈": 153,
"科索沃": 154,
"法罗群岛": 155,
"直布罗陀": 156,
"安提瓜和巴布达": 157,
"巴哈马": 158,
"巴巴多斯": 159,
"伯利兹": 160,
"哥斯达黎加": 161,
"古巴": 162,
"多米尼加": 163,
"多米尼加共和国": 164,
"萨尔瓦多": 165,
"格林纳达": 166,
"危地马拉": 167,
"海地": 168,
"洪都拉斯": 169,
"牙买加": 170,
"墨西哥": 171,
"尼加拉瓜": 172,
"巴拿马": 173,
"圣基茨和尼维斯": 174,
"圣卢西亚": 175,
"圣文森特和格林纳丁斯": 176,
"特立尼达和多巴哥": 177,
"百慕大": 178,
"格陵兰": 179,
"波多黎各": 180,
"美属维尔京群岛": 181,
"英属维尔京群岛": 182,
"开曼群岛": 183,
"安圭拉": 184,
"蒙特塞拉特": 185,
"阿根廷": 186,
"玻利维亚": 187,
"巴西": 188,
"智利": 189,
"哥伦比亚": 190,
"厄瓜多尔": 191,
"圭亚那": 192,
"巴拉圭": 193,
"秘鲁": 194,
"苏里南": 195,
"乌拉圭": 196,
"委内瑞拉": 197,
"法属圭亚那": 198,
"福克兰群岛": 199,
"澳大利亚": 200,
"斐济": 201,
"基里巴斯": 202,
"马绍尔群岛": 203,
"密克罗尼西亚": 204,
"瑙鲁": 205,
"新西兰": 206,
"帕劳": 207,
"巴布亚新几内亚": 208,
"萨摩亚": 209,
"所罗门群岛": 210,
"汤加": 211,
"图瓦卢": 212,
"瓦努阿图": 213,
"库克群岛": 214,
"纽埃": 215,
"法属波利尼西亚": 216,
"新喀里多尼亚": 217,
"瓦利斯和富图纳": 218,
"托克劳": 219,
"皮特凯恩群岛": 220,
"土耳其": 221,
}
def country_to_idx(country_str: str) -> str:
    # Map a semicolon-separated string of Chinese country names to a
    # semicolon-separated string of their indices.
    # Example: country_to_idx("美国; 中国") returns "1;2"
    result = []
    country_str = country_str.strip()
    for country in country_str.split(";"):
        clean_country = country.strip()
        if clean_country not in country_to_id_map:
            err_msg = f"Unknown Country: {clean_country}"
            print(err_msg)
            raise ValueError(err_msg)
        else:
            result.append(str(country_to_id_map[clean_country]))
    return ";".join(result)