update, comment in README

7db669c9 · jiangdongchen · c84a9230 · 7db669c9 · 7db669c9 · 7db669c9
Commit 7db669c9 authored May 08, 2025 by jiangdongchen
8 changed files
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
        - 输出关键信息json文件的文件夹
    - source_excel_path
        - 放置需要check的excel表格
-        - 第4行开始实际表项
+        - 第content_start+1行开始实际表项
        - 第一列索引
        - 第三列论文标题
        - 第七列论文作者
@@ -52,11 +52,13 @@
    2. **遍历**excel的sheet
        1. **遍历**sheet中的论文名称和索引
            1. 用**大模型**读取pdf中第一页的论文名称和关键信息，存储到json文件夹下
-            2. **遍历**excel表格中的论文名称进行模糊匹配, 匹配成功后
+            2. 读取pdf中从后向前的引用信息, 通过**正则表达式**找出sheetname对应文章在当前pdf文章中的索引，存储到json文件夹下
+            3. **遍历**excel表格中的论文名称进行模糊匹配, 匹配成功后
                1. 将pdf文件中的关键信息写入json文件中进行保存, 包括 标题 会议名称 作者姓名 机构 国家.
                2. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文标题、会议名称、作者姓名、机构、国家.
-                3. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
-            2. 匹配失败后，输出无法匹配的条目,使用warning记录无法匹配的条目，方便后续处理.
+                3. 首先用**大模型**将英文国家名翻译成中文名，再将国家对应的索引写入目标excel表格中.
+                4. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
+            4. 匹配失败后，输出无法匹配的条目,使用warning记录无法匹配的条目，方便后续处理.
    3. 得到从pdf中提取的信息和格式化的excel表格.
    4. 可能会有重复的pdf，只有标准化重命名之后才会发现，需要手动删除重复的pdf和excel表项，由人工判定.
 3. stage2: 国家机构索引、牛人判断

--- a/config.json
+++ b/config.json
@@ -5,9 +5,10 @@
    "pdf_dir": "./Papers",
    "result_dir": "./json",
    "source_excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
+    "content_start": 4,
    "ccfa_excel_path": "./others/CCFA.xlsx",
    "target_excel_path": "./others/target.xlsx",
    "logLevel": 20,
    "sheetNum": 1,
-    "maxItem": 64
+    "maxItem": 333
 }
\ No newline at end of file
--- a/logs/citation_process.log
+++ b/logs/citation_process.log
--- a/others/target.xlsx
+++ b/others/target.xlsx
--- a/others/论文被引用情况-陈老师-2025.05.01 copy.xlsx
+++ b/others/论文被引用情况-陈老师-2025.05.01 copy.xlsx
--- a/others/论文被引用情况-陈老师-2025.05.01.xlsx
+++ b/others/论文被引用情况-陈老师-2025.05.01.xlsx
--- a/psrc/stage1/citationProcess.py
+++ b/psrc/stage1/citationProcess.py
+from errno import ESTALE
 from pathlib import Path
 import logging
 from openai import OpenAI
@@ -5,6 +6,9 @@ import pypdf
 import openpyxl
 from fuzzywuzzy import fuzz
 import json
+import re
+import bibtexparser
+import psrc.stage1.country_to_idx as c2i

 RED = '\033[91m'
 GREEN = '\033[92m'
@@ -126,26 +130,79 @@ def get_key_info( content, configModel, client):

 # Extracts text content from the first page of a PDF.
 def extract_first_page_text(pdf_path):  # Return the whitespace-normalized text of the PDF's first page, or None (after logging a warning) on any failure; pdf_path must expose `.name` for the log messages.
+    try:
+        # Try to read the PDF file
+        reader = pypdf.PdfReader(pdf_path)
+        if len(reader.pages) > 0:
+            first_page = reader.pages[0]
+            text = first_page.extract_text()
+
+            if text:
+                # 1. text.split(): split on default whitespace (space/newline/tab), merging consecutive whitespace
+                # 2. " ".join(...): rejoin with single spaces, so every whitespace run collapses to one space
+                cleaned_text = " ".join(text.split())
+                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(  # UTF-8 round-trip: characters that cannot be encoded are replaced instead of raising
+                    "utf-8"
+                )
+                return cleaned_text
+            else:
+                logging.warning(f"{RED}No text found on the first page of {pdf_path.name}{RESET}")
+                return None
+        else:
+            logging.warning(f"{RED}PDF has no pages: {pdf_path.name}{RESET}")
+            return None
+    except Exception as e:
+        # Catch and log any exception raised while reading the PDF; the caller only ever sees None
+        logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
+        return None

-    reader = pypdf.PdfReader(pdf_path)
-    if len(reader.pages) > 0:
+def get_citation_ids(pdf_path, title):  # Find the bracketed reference number ("[12]") under which `title` is cited in the PDF; returns the digits as a string, or None if not found or on error.
+    try:
+        reader = pypdf.PdfReader(pdf_path)
+        if len(reader.pages) == 0:
+            logging.warning(f"PDF has no pages: {pdf_path.name}")
+            return None

-        first_page = reader.pages[0]
-        text = first_page.extract_text()
+        # Quote variants that are normalized to straight quotes before matching
+        quote_replacements = {
+            '“': '"', '”': '"', '‘': "'", '’': "'", '``': '"', "''": '"',
+            '〝': '"', '〞': '"', '＂': '"', '«': '"', '»': '"'
+        }
+
+        # Build the regular-expression pattern (compiled with re.VERBOSE below, so the in-pattern notes are ignored by the regex engine)
+        ref_pattern = r'''
+            \[\s*(\d+)\s*\]               # 引用编号
+            (?:(?!\[\s*\d+\s*\]).)*?      # 排除中间的其他引用编号
+            "                             # 开始引号
+            [^"]*{}[^"]*                  # 标题（允许前后有其他内容）
+            "                             # 结束引号
+        '''.format(re.escape(title))  # title is escaped so it matches literally; NOTE(review): an empty title would match any quoted span after a [n] marker - confirm callers never pass ""
+
+        # Scan page by page, starting from the last page
+        for page_num in range(len(reader.pages) - 1, -1, -1):
+            page = reader.pages[page_num]
+            text = page.extract_text()
+
+            if text:
+                cleaned_text = " ".join(text.split())  # collapse all whitespace runs (including line breaks) to single spaces
+                # Normalize every quote variant to a straight quote
+                for old, new in quote_replacements.items():
+                    cleaned_text = cleaned_text.replace(old, new)
+
+                match = re.search(
+                    ref_pattern,
+                    cleaned_text,
+                    flags=re.IGNORECASE | re.VERBOSE | re.DOTALL
+                )
+                if match:
+                    return match.group(1)  # return as soon as a page matches
+
+        # No page produced a match
+        logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
+        return None

-        if text:
-            # 1. text.split()：用默认空白符（空格/换行/制表符）分割字符串，自动合并连续空白
-            # 2. " ".join(...)：用单个空格连接，实现「多余空白→单空格」的清理效果
-            cleaned_text = " ".join(text.split())
-            cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
-                "utf-8"
-            )
-            return cleaned_text
-        else:
-            logging.warning(f"No text found on the first page of {pdf_path.name}")
-            return None
-    else:
-        logging.warning(f"PDF has no pages: {pdf_path.name}")
+    except Exception as e:
+        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")  # any failure (PDF read, regex, ...) is logged and turned into None
        return None

 # excel表格的第4行开始读取索引和论文名称
@@ -164,6 +221,28 @@ def read_rough_nameIndex_from_excel(sheet, maxItem):
 
    return index_list, paperName_list

+def translate_countries(countries_str: str, configModel, client):  # Ask the chat model `configModel` (via `client.chat.completions.create`) to translate ";"-separated English country names into Chinese; returns the stripped reply text.
+    system_prompt = """
+    你是一个专业的翻译助手，请将输入的英文国家名称翻译为中文。
+    输入可能是多个国家名称，以英文分号分隔。
+    输出同样以英文分号分隔对应的中文国家名称。
+    示例输入: United States; China
+    示例输出: 美国; 中国
+    """
+
+    response = client.chat.completions.create(
+        model=configModel,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": countries_str}  # the raw ";"-joined names are sent unchanged as the user message
+        ],
+        temperature=0.2,
+        max_tokens=4096,
+        response_format={"type": "text"}
+    )
+
+    return response.choices[0].message.content.strip()  # NOTE(review): the reply is not validated against the requested "a; b" format, and API errors propagate to the caller
+
 def citationProcess(config: dict):

    client = OpenAI(api_key=config["api_key"], 
@@ -176,12 +255,6 @@ def citationProcess(config: dict):
    # 读取Excel文件
    wb = openpyxl.load_workbook(excel_path)
    # 读取CCFA列表
-    # 序号	简称	全称
-    # 1	PPoPP	ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
-    # 2	FAST	USENIX Conference on File and Storage Technologies
-    # 3	DAC	Design Automation Conference
-    # 4	HPCA	IEEE International Symposium on High Performance Computer Architecture
-    # 5	MICRO	IEEE/ACM International Symposium on Microarchitecture
    ccfa_wb = openpyxl.load_workbook(ccfa_excel_path)
    sheetCCF = ccfa_wb["CCF-A列表"]

@@ -191,16 +264,34 @@ def citationProcess(config: dict):
            break
        sheet = wb[sheet_name]
        logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")
- 
+
        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])

        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)  # 确保结果目录存在
- 
+
        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
        
        pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索, 输出所有pdf文件的路径

+        # 读取第三行的BibTeX文本
+        bibtex_row = sheet[3]  # 第三行，索引从1开始
+        # 使用 map 函数将 cell.value 转换为字符串类型
+        bibtex_text = "".join(map(lambda cell: str(cell.value) if cell.value is not None else "", bibtex_row))
+
+        # 解析BibTeX文本
+        try:
+            bib_database = bibtexparser.loads(bibtex_text)
+            if bib_database.entries:
+                entry = bib_database.entries[0]
+                cited_title_str = entry.get('title', '')
+            else:
+                cited_title_str = ""
+                logging.warning(f"No BibTeX entry found in sheet {sheet_name} row 3")
+        except Exception as e:
+            cited_title_str = ""
+            logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
+
        # 遍历当前工作表对应的所有PDF文件
        for file in pdf_files:
 
@@ -217,6 +308,8 @@ def citationProcess(config: dict):
            # 提取关键信息
            result = get_key_info(first_page_text, configModel, client)

+            cit_id = get_citation_ids(file, cited_title_str)
+
            if result is not None:
                # 解析JSON结果, 提取论文标题
                result_dict = json.loads(result)
@@ -237,7 +330,8 @@ def citationProcess(config: dict):
                    
                    if similarity >= 85:
                        # 重命名PDF文件
-                        new_pdf_name = f"{idx}-{pdf_title.replace(':', '_').replace(' ', '_').replace('?', '_')}.pdf"  # 将冒号替换为连字符
+                        # 替换冒号、空格、问号和斜杠，避免文件名非法字符
+                        new_pdf_name = f"{idx}-{pdf_title.replace(': ', '_').replace(' ', '_').replace('?', '_').replace('/', '_')}.pdf"  # 将冒号、空格、问号和斜杠替换为下划线
                        new_pdf_path = file.parent / new_pdf_name
                        try:
                            file.rename(new_pdf_path)
@@ -251,21 +345,39 @@ def citationProcess(config: dict):
                        rst_path.write_text(result + "\n", encoding='utf-8')  # 明确指定UTF-8编码
                        
                        # 更新Excel中的表项
-                        sheet.cell(row=idx+4, column=3, value=pdf_title)  # 第3列是论文名称
+                        sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title)  # 第3列是论文名称
+
+                        issue = result_dict.get("ISSUE", [])
+                        if issue is not None:
+                            sheet.cell(row=idx+config["content_start"], column=4, value=issue[0])  # 第4列是国家
+                            logging.info(f"Standardization issue info.")
+                        else:
+                            logging.warning(f"{RED}ISSUE is None.{RESET}")
+
+                        sheet.cell(row=idx+config["content_start"], column=6, value=cit_id)  # 第7列是作者名称

                        authors_list = result_dict.get("Authors", [])
                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
-                        sheet.cell(row=idx+4, column=7, value=authors)  # 第7列是作者名称
+                        sheet.cell(row=idx+config["content_start"], column=7, value=authors)  # 第7列是作者名称
                        logging.info(f"Standardization author info.")

                        institution_list = result_dict.get("Institutions", [])
                        institutions = ";".join(institution_list) if isinstance( institution_list, list) else ""
-                        sheet.cell(row=idx+4, column=11, value=institutions)  # 第9列是机构
-                        logging.info(f"Standardization Institution info.")
+                        sheet.cell(row=idx+config["content_start"], column=9, value=institutions)  # 第9列是机构
+                        logging.info(f"Standardization institution info.")

                        countrys_list = result_dict.get("Countrys", [])
                        countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
-                        sheet.cell(row=idx+4, column=11, value=countrys)  # 第11列是国家
+                        # 翻译 countrys 为中文
+                        if countrys:
+                            translated_countrys = translate_countries(countrys, config["model"], client)
+                            sheet.cell(row=idx+config["content_start"], column=11, value=translated_countrys)  # 第11列是中文国家名称
+                            logging.debug(f"Translated countrys info: {translated_countrys}")
+                        else:
+                            logging.warning(f"No countrys info to translate.")
+
+                        countrys = c2i.country_to_idx(translated_countrys)
+                        sheet.cell(row=idx+config["content_start"], column=10, value=countrys)  # 第10列是国家索引
                        logging.info(f"Standardization countrys info.")

                        # CCFA判断
@@ -300,6 +412,6 @@ def citationProcess(config: dict):
                        matched = True
                        break 
                if matched == False:
-                    logging.warning(f"{RED}Not matched: {file.name} -> idx: {idx}, excel_name: {excel_name}{RESET}")
+                    logging.warning(f"{RED}Not matched: {file.name}{RESET}")
            else:
-                logging.error(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
+                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
--- a/psrc/stage1/country_to_idx.py
+++ b/psrc/stage1/country_to_idx.py
+country_to_id_map = {
+    "美国": 1,
+    "中国": 2,
+    "日本": 3,
+    "韩国": 4,
+    "新加坡": 5,
+    "中国台湾": 6,
+    "中国香港": 7,
+    "中国澳门": 8,
+    "法国": 9,
+    "英国": 10,
+    "德国": 11,
+    "意大利": 12,
+    "西班牙": 13,
+    "加拿大": 14,
+    "荷兰": 15,
+    "印度": 16,
+    "阿联酋": 17,
+    "比利时": 18,
+    "俄罗斯": 19,
+    "阿富汗": 20,
+    "亚美尼亚": 21,
+    "阿塞拜疆": 22,
+    "巴林": 23,
+    "孟加拉国": 24,
+    "不丹": 25,
+    "文莱": 26,
+    "缅甸": 27,
+    "柬埔寨": 28,
+    "塞浦路斯": 29,
+    "东帝汶": 30,
+    "格鲁吉亚": 31,
+    "印度尼西亚": 32,
+    "伊朗": 33,
+    "伊拉克": 34,
+    "以色列": 35,
+    "约旦": 36,
+    "哈萨克斯坦": 37,
+    "科威特": 38,
+    "吉尔吉斯斯坦": 39,
+    "老挝": 40,
+    "黎巴嫩": 41,
+    "马来西亚": 42,
+    "马尔代夫": 43,
+    "蒙古": 44,
+    "尼泊尔": 45,
+    "朝鲜": 46,
+    "阿曼": 47,
+    "巴基斯坦": 48,
+    "巴勒斯坦": 49,
+    "菲律宾": 50,
+    "卡塔尔": 51,
+    "沙特阿拉伯": 52,
+    "斯里兰卡": 53,
+    "叙利亚": 54,
+    "塔吉克斯坦": 55,
+    "泰国": 56,
+    "土库曼斯坦": 57,
+    "乌兹别克斯坦": 58,
+    "越南": 59,
+    "也门": 60,
+    "北塞浦路斯": 61,
+    "纳戈尔诺-卡拉巴 赫": 62,
+    "阿尔及利亚": 63,
+    "安哥拉": 64,
+    "贝宁": 65,
+    "博茨瓦纳": 66,
+    "布基纳法索": 67,
+    "布隆迪": 68,
+    "佛得角": 69,
+    "喀麦隆": 70,
+    "中非共和国": 71,
+    "乍得": 72,
+    "科摩罗": 73,
+    "刚果": 75,
+    "科特迪瓦": 76,
+    "吉布提": 77,
+    "埃及": 78,
+    "赤道几内亚": 79,
+    "厄立特里亚": 80,
+    "埃塞俄比亚": 81,
+    "加蓬": 82,
+    "冈比亚": 83,
+    "加纳": 84,
+    "几内亚": 85,
+    "几内亚比绍": 86,
+    "肯尼亚": 87,
+    "莱索托": 88,
+    "利比里亚": 89,
+    "利比亚": 90,
+    "马达加斯加": 91,
+    "马拉维": 92,
+    "马里": 93,
+    "毛里塔尼亚": 94,
+    "毛里求斯": 95,
+    "摩洛哥": 96,
+    "莫桑比克": 97,
+    "纳米比亚": 98,
+    "尼日尔": 99,
+    "尼日利亚": 100,
+    "卢旺达": 101,
+    "圣多美和普林西比": 102,
+    "塞内加尔": 103,
+    "塞舌尔": 104,
+    "塞拉利昂": 105,
+    "索马里": 106,
+    "南非": 107,
+    "南苏丹": 108,
+    "苏丹": 109,
+    "斯威士兰": 110,
+    "坦桑尼亚": 111,
+    "多哥": 112,
+    "突尼斯": 113,
+    "乌干达": 114,
+    "赞比亚": 115,
+    "津巴布韦": 116,
+    "西撒哈拉": 117,
+    "阿尔巴尼亚": 118,
+    "安道尔": 119,
+    "奥地利": 120,
+    "白俄罗斯": 121,
+    "波斯尼亚和黑塞哥维那": 122,
+    "保加利亚": 123,
+    "克罗地亚": 124,
+    "捷克": 125,
+    "丹麦": 126,
+    "爱沙尼亚": 127,
+    "芬兰": 128,
+    "希腊": 129,
+    "匈牙利": 130,
+    "冰岛": 131,
+    "爱尔兰": 132,
+    "拉脱维亚": 133,
+    "列支敦士登": 134,
+    "立陶宛": 135,
+    "卢森堡": 136,
+    "马耳他": 137,
+    "摩尔多瓦": 138,
+    "摩纳哥": 139,
+    "黑山": 140,
+    "北马其顿": 141,
+    "挪威": 142,
+    "波兰": 143,
+    "葡萄牙": 144,
+    "罗马尼亚": 145,
+    "圣马力诺": 146,
+    "塞尔维亚": 147,
+    "斯洛伐克": 148,
+    "斯洛文尼亚": 149,
+    "瑞典": 150,
+    "瑞士": 151,
+    "乌克兰": 152,
+    "梵蒂冈": 153,
+    "科索沃": 154,
+    "法罗群岛": 155,
+    "直布罗陀": 156,
+    "安提瓜和巴布达": 157,
+    "巴哈马": 158,
+    "巴巴多斯": 159,
+    "伯利兹": 160,
+    "哥斯达黎加": 161,
+    "古巴": 162,
+    "多米尼加": 163,
+    "多米尼加共和国": 164,
+    "萨尔瓦多": 165,
+    "格林纳达": 166,
+    "危地马拉": 167,
+    "海地": 168,
+    "洪都拉斯": 169,
+    "牙买加": 170,
+    "墨西哥": 171,
+    "尼加拉瓜": 172,
+    "巴拿马": 173,
+    "圣基茨和尼维斯": 174,
+    "圣卢西亚": 175,
+    "圣文森特和格林纳丁斯": 176,
+    "特立尼达和多巴哥": 177,
+    "百慕大": 178,
+    "格陵兰": 179,
+    "波多黎各": 180,
+    "美属维尔京群岛": 181,
+    "英属维尔京群岛": 182,
+    "开曼群岛": 183,
+    "安圭拉": 184,
+    "蒙特塞拉特": 185,
+    "阿根廷": 186,
+    "玻利维亚": 187,
+    "巴西": 188,
+    "智利": 189,
+    "哥伦比亚": 190,
+    "厄瓜多尔": 191,
+    "圭亚那": 192,
+    "巴拉圭": 193,
+    "秘鲁": 194,
+    "苏里南": 195,
+    "乌拉圭": 196,
+    "委内瑞拉": 197,
+    "法属圭亚那": 198,
+    "福克兰群岛": 199,
+    "澳大利亚": 200,
+    "斐济": 201,
+    "基里巴斯": 202,
+    "马绍尔群岛": 203,
+    "密克罗尼西亚": 204,
+    "瑙鲁": 205,
+    "新西兰": 206,
+    "帕劳": 207,
+    "巴布亚新几内亚": 208,
+    "萨摩亚": 209,
+    "所罗门群岛": 210,
+    "汤加": 211,
+    "图瓦卢": 212,
+    "瓦努阿图": 213,
+    "库克群岛": 214,
+    "纽埃": 215,
+    "法属波利尼西亚": 216,
+    "新喀里多尼亚": 217,
+    "瓦利斯和富图纳": 218,
+    "托克劳": 219,
+    "皮特凯恩群岛": 220,
+    "土耳其": 221,
+}
+
+
+def country_to_idx(country_str: str) -> str:  # Convert a ";"-separated string of country names (keys of country_to_id_map) into a ";"-joined string of their ids, preserving order.
+    result = []  # ids collected as strings, in input order
+    country_str = country_str.strip()
+    for country in country_str.split(";"):
+        clean_country = country.strip()  # tolerate spaces around each name, e.g. "美国; 中国"
+        if clean_country not in country_to_id_map:  # exact-match lookup; NOTE(review): the map key "纳戈尔诺-卡拉巴 赫" contains a space, so the usual spelling would land here - verify the table
+            err_msg = f"Unknown Country: {clean_country}"
+            print(err_msg)
+            raise ValueError(err_msg)  # one unmapped name (including an empty segment from "" or a trailing ";") aborts the whole conversion
+        else:
+            result.append(str(country_to_id_map[clean_country]))
+
+    return ";".join(result)