Commit ea9e27cc by jiangdongchen

stage 1

parent 7db669c9
......@@ -42,6 +42,7 @@
- Resume handling: if every PDF before a given index in the Excel sheet had its information extracted correctly and the Excel file was updated correctly, but a PDF from the next index onward failed
- it is recommended to move the already-correct PDFs to a different folder, so that rerunning the script only processes the remaining PDFs (see the sketch after this list)
- TODO: cross-validation across multiple models
- TODO: temperature settings
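A minimal sketch of the resume workflow described above, assuming the PDFs live in a `pdf/` folder and a hypothetical `pdf_done/` folder is used for the ones that were already processed correctly:

```python
import shutil
from pathlib import Path

# Hypothetical folder names; adjust to the project's actual layout.
SRC_DIR = Path("pdf")        # folder the script scans
DONE_DIR = Path("pdf_done")  # parking spot for correctly processed PDFs

def park_processed(processed_names):
    """Move already-processed PDFs aside so a rerun only sees the remaining ones."""
    DONE_DIR.mkdir(exist_ok=True)
    for name in processed_names:
        src = SRC_DIR / name
        if src.exists():
            shutil.move(str(src), str(DONE_DIR / name))

# Example: everything before the failing index was handled correctly.
park_processed(["0001_first_paper.pdf", "0002_second_paper.pdf"])
```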
# Requirements and Solution
1. TODO: download the paper PDFs
......@@ -52,7 +53,7 @@
2. **Iterate** over the sheets of the Excel workbook
1. **Iterate** over the paper titles and indices in each sheet
1. The **LLM** reads the paper title and key information from the first page of the PDF and stores them under the json folder
2. Read the reference information in the PDF from back to front and use a **regular expression** to find the citation index of the sheet-name paper within the current PDF, stored under the json folder
2. Read the reference information in the PDF from back to front and use the **LLM** to find the citation index of the sheet-name paper within the current PDF, stored under the json folder
3. **Iterate** over the paper titles in the Excel sheet and fuzzy-match them (see the sketch after this list); on a successful match
1. Save the key information from the PDF to a JSON file, including title, venue, author names, institutions, and countries.
2. Use the paper title and citation index from the PDF to rename the PDF file in a standardized way and to normalize the paper title, venue, author names, institutions, and countries in the Excel sheet.
......@@ -60,7 +61,9 @@
4. Hand the conference or journal name from the PDF and the table of CCF-A conference/journal names to the **LLM** for matching; the result is written into the target Excel sheet as 是/否 (yes/no).
4. When matching fails, output the unmatched entry and record it with a warning so it can be handled later.
3. The result is the information extracted from the PDFs plus a formatted Excel sheet.
4. There may be duplicate PDFs, which only become visible after standardized renaming; the duplicate PDFs and Excel entries have to be removed manually, with a human making the call.
4. Manual review
1. There may be duplicate PDFs, which only become visible after standardized renaming: when two PDFs end up with the same index name, the same paper was matched twice. Remove the duplicate PDF and Excel entry manually, judging the duplication against the original sheet.
2. Institutions and countries may also contain duplicates; please check them manually.
3. stage 2: country/institution indexing and identifying leading researchers
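The fuzzy matching in step 3 is not shown in this diff; a minimal sketch of one way it could be done with the standard library (the 0.85 threshold is an assumption, not the script's actual setting):

```python
from difflib import SequenceMatcher

def titles_match(excel_title: str, pdf_title: str, threshold: float = 0.85) -> bool:
    """Treat two titles as the same paper when their similarity ratio clears the threshold."""
    a = " ".join(excel_title.lower().split())
    b = " ".join(pdf_title.lower().split())
    return SequenceMatcher(None, a, b).ratio() >= threshold

# Minor case/whitespace differences still match:
print(titles_match("A Survey of Deep Learning", "a survey of  deep learning"))  # True
```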
# Code Structure
......
......@@ -64,14 +64,14 @@ def get_key_info( content, configModel, client):
- Identify all listed authors. Maintain the order presented in the text if possible.
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- **Institutions:**
- Extract all associated institutions of authors.
- Extract all associated institutions of authors
Keep only one entry for each duplicate institution
- **Countrys:**
- Extract all associated countrys of authors.
- Extract all associated countrys of institutions.
- Try to use full names. Only write the names of universities/companies, excluding departments, postal codes, countries, house numbers, cities, etc.
- For foreign names: Translate them first, and then manually check them using Google (mark them).
- In cases like the Chinese Academy of Sciences, specify the institute level. For example, the Institute of Computing Technology, Chinese Academy of Sciences.
- In cases of branch campuses, specify the branch. For example, California State University, University of California, University of Maryland.
- **ISSUE:**
- Extract where the paper is published like journal or session.
- Title, authors, institutions and countrys should be four separate keys, not nested together.
......@@ -90,14 +90,10 @@ def get_key_info( content, configModel, client):
"Institutions": [
"Shanghai Jiao Tong University",
"Colorado School of Mines",
"Colorado School of Mines",
"Shanghai Jiao Tong University",
],
"Countrys": [
"China",
"United States",
"United States",
"China",
],
"ISSUE": [
"IEEE Transactions on Computers"
......@@ -111,7 +107,7 @@ def get_key_info( content, configModel, client):
{"role": "system", "content": system_prompt},
{"role": "user", "content": content},
],
temperature=0.7,
temperature=0.25,
max_tokens=4096,
# stream=True,
response_format={"type": "json_object"}
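Because `response_format={"type": "json_object"}` only guarantees a JSON string, the caller still has to parse and sanity-check the reply; a minimal sketch of how that could look (the helper name `parse_key_info` is hypothetical, not part of this script):

```python
import json
import logging

# Keys the prompt above asks the model to return.
EXPECTED_KEYS = ("Title", "Authors", "Institutions", "Countrys", "ISSUE")

def parse_key_info(raw):
    """Parse the model's JSON reply and warn about any missing keys."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        logging.warning(f"Model reply is not valid JSON: {e}")
        return None
    for key in EXPECTED_KEYS:
        if key not in data:
            logging.warning(f"Key missing from model reply: {key}")
    return data

# Usage with the completion above (assumed, not shown in this hunk):
# result_dict = parse_key_info(response.choices[0].message.content)
```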
......@@ -156,27 +152,25 @@ def extract_first_page_text(pdf_path):
logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
return None
def get_citation_ids(pdf_path, title):
def get_citation_ids(pdf_path, title, configModel, client):
try:
reader = pypdf.PdfReader(pdf_path)
if len(reader.pages) == 0:
logging.warning(f"PDF has no pages: {pdf_path.name}")
return None
# Quotation-mark variants to normalize
quote_replacements = {
'“': '"', '”': '"', '‘': "'", '’': "'", '``': '"', "''": '"',
'〝': '"', '〞': '"', '"': '"', '«': '"', '»': '"'
}
# Build the regex pattern
ref_pattern = r'''
\[\s*(\d+)\s*\] # citation number
(?:(?!\[\s*\d+\s*\]).)*? # skip over any other citation numbers in between
" # opening quote
[^"]*{}[^"]* # the title (other text may surround it)
" # closing quote
'''.format(re.escape(title))
system_prompt = f"""
You are a professional assistant for analysing citations in academic papers. Your task is to find, in the text of a PDF page, the citation number under which the given paper is cited.
Paper title: {title}
Guidelines:
1. Look for citation markers such as [1], [2] in the text
2. A citation marker is usually followed by the title of the cited paper
3. If you find the matching citation number, return that number
4. If no matching citation is found, return an empty string
Return only the citation number itself, nothing else.
"""
# Check the pages one by one, starting from the last page
for page_num in range(len(reader.pages) - 1, -1, -1):
......@@ -185,22 +179,23 @@ def get_citation_ids(pdf_path, title):
if text:
cleaned_text = " ".join(text.split())
# Normalize all quote variants to straight quotes
for old, new in quote_replacements.items():
cleaned_text = cleaned_text.replace(old, new)
match = re.search(
ref_pattern,
cleaned_text,
flags=re.IGNORECASE | re.VERBOSE | re.DOTALL
response = client.chat.completions.create(
model=configModel,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": cleaned_text},
],
temperature=0.1,
max_tokens=10,
response_format={"type": "text"}
)
if match:
return match.group(1) # return as soon as a match is found
result = response.choices[0].message.content.strip()
if result.isdigit():
return result
# No page produced a match
logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
return None
except Exception as e:
logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
return None
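The README lists multi-model cross-validation as a TODO; one possible sketch of how `get_citation_ids` could be cross-checked across several models, reusing the module's `logging` import and the function defined above (the model names and the helper itself are placeholders, not part of the current script):

```python
def get_citation_id_cross_checked(pdf_path, title, client, models=("model-a", "model-b")):
    """Run the lookup with several models and keep the ID only when they all agree."""
    ids = {get_citation_ids(pdf_path, title, model, client) for model in models}
    ids.discard(None)
    if len(ids) == 1:
        return ids.pop()
    logging.warning(f"Models disagree on the citation id for {title}: {ids}")
    return None
```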
......@@ -308,7 +303,7 @@ def citationProcess(config: dict):
# Extract the key information
result = get_key_info(first_page_text, configModel, client)
cit_id = get_citation_ids(file, cited_title_str)
cit_id = get_citation_ids(file, cited_title_str, configModel, client)
if result is not None:
# Parse the JSON result and extract the paper title
......@@ -348,13 +343,16 @@ def citationProcess(config: dict):
sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title) # column 3 is the paper title
issue = result_dict.get("ISSUE", [])
if issue is not None:
sheet.cell(row=idx+config["content_start"], column=4, value=issue[0]) # column 4 is the venue (ISSUE)
if issue and isinstance(issue, list) and len(issue) > 0: # make sure issue is a valid, non-empty list
sheet.cell(row=idx+config["content_start"], column=4, value=issue[0])
logging.info("Standardized the ISSUE info.")
else:
logging.warning(f"{RED}ISSUE is None.{RESET}")
logging.warning(f"{RED}Invalid ISSUE data: {issue}{RESET}")
sheet.cell(row=idx+config["content_start"], column=6, value=cit_id) # column 7 is the author names
if cit_id is not None:
sheet.cell(row=idx+config["content_start"], column=6, value=cit_id) # column 6 is cit_id
else:
logging.warning(f"{RED}cit_id is None.{RESET}")
authors_list = result_dict.get("Authors", [])
authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
......
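The CCF-A matching from step 4 of the README (venue name against the CCF-A table, answered as 是/否) is not part of this hunk; a hedged sketch of how that call might look, assuming the same OpenAI-style `client` and `configModel` are reused and that `is_ccfa_venue` is a hypothetical helper:

```python
def is_ccfa_venue(venue, ccfa_names, configModel, client):
    """Ask the model whether `venue` is on the CCF-A list; returns '是' or '否'."""
    system_prompt = (
        "You are given a venue name and the CCF-A list of conferences and journals. "
        "Answer strictly with '是' if the venue is on the list, otherwise '否'."
    )
    user_prompt = f"Venue: {venue}\nCCF-A list: {', '.join(ccfa_names)}"
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
        max_tokens=5,
    )
    answer = response.choices[0].message.content.strip()
    return answer if answer in ("是", "否") else "否"
```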