update for deepseek and rename

763c9946 · jiangdongchen · ab7aabf8 · 763c9946 · 763c9946 · 763c9946
Commit 763c9946 authored May 07, 2025 by jiangdongchen
8 changed files
--- a/.gitignore
+++ b/.gitignore
+.vscode/
+others/
+Papers/
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# 环境配置
+- 确保运行 Python 脚本时的当前工作目录（cwd）是 papertools 仓库根目录
+- 路径和参数配置都在config.json文件中
+    - logLevel
+        - 取10表示DEBUG级别
+        - 取20表示INFO级别
+- python3.12.10
+- 无法import的库使用pip install逐个安装
+    - `openai`, `pypdf`
+
+# 使用方法
+- 多模型交叉验证
+
+# 需求与解决方案
+1. 下载论文pdf
+    1. 常用网站agent下载
+    2. 输出无法下载的条目
+2. 自动化重命名
+    1. 读取excel表格中的论文名称和索引
+    2. 循环：读取pdf中的论文名称
+        1. 和excel表格中的论文名称进行模糊匹配
+        2. 匹配成功后
\ No newline at end of file
--- a/RFtools.py
+++ b/RFtools.py
+import json
+import logging
+import psrc.rename_extractInfo as RE
+from openai import OpenAI
+from pathlib import Path
+
+if __name__ == "__main__":
+    # 获取当前脚本所在目录
+    # current_py_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # 获取CWD
+    cwd_dir = Path.cwd()
+
+    # 构建 config.json 的完整路径
+    config_path = (cwd_dir / "config.json").resolve()
+
+    # 读取config.json中的配置参数
+    with open( config_path, 'r', encoding='utf-8') as f:
+        config = json.load(f)
+
+    # Path对象后跟/用于连接地址
+    pdf_dir = (cwd_dir / config["pdf_dir"]).resolve()
+    rst_dir = (cwd_dir / config["result_path"]).resolve()
+    excel_path = (cwd_dir / config["excel_path"]).resolve()
+    sheet_name = config["sheet_name"]
+
+    # print(excel_path)
+
+    logLevel = config["logLevel"]
+    # logging.basicConfig(...) 是 Python 标准库 logging 模块中的一个函数，用于快速配置日志记录的基本设置.
+    # 设置日志记录的最低级别为 INFO, 只有日志级别大于等于 INFO 的日志记录才会被处理（例如 INFO、WARNING、ERROR、CRITICAL）.
+    # logging.debug("这是一个调试信息")    # 不会输出（低于 INFO）
+    # logging.info("这是一个普通信息")     # 会输出
+    logging.basicConfig(
+        # %(asctime)s：日志记录的时间戳（默认格式：YYYY-MM-DD HH:MM:SS）。
+        # %(levelname)s：日志级别名称（如 INFO, WARNING）。
+        # %(message)s：日志的具体内容。
+        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+
+    client = OpenAI(api_key=config["api_key"], 
+                    base_url=config["base_url"])
+
+    # RE.main( pdf_dir, rst_dir, config["model"], client)
+    RE.read_rough_nameIndex_from_excel(excel_path, sheet_name)
--- a/config.json
+++ b/config.json
+{
+    "api_key": "sk-otamesebhzzycgfynnssjkrkjlcoitdtstcruwbhohksdlel",
+    "base_url": "https://api.siliconflow.cn/v1",
+    "model": "Pro/deepseek-ai/DeepSeek-V3",
+    "pdf_dir": "./Papers",
+    "result_path": "./result.json",
+    "excel_path": "./others/reference.xlsx",
+    "sheet_name": "j24-DianNao family",
+    "logLevel": 20
+}
\ No newline at end of file
--- a/extract_authors_from_pdf/readme.md
+++ b/extract_authors_from_pdf/readme.md
-# Readme
-
-## 依赖
-`openai`, `pypdf`
-
-## 用法
-
-export OPENAI_API_KEY=...
-python extract_authors_info.py --paper ./papars --result result.jsonl
-
-
- 参数paper: 包含所有pdf的文件夹的路径
- 参数result: 结果文件路径
-
-## 注意
-本结果由GPT自动生成，准确性无法保证，需要人工再次审核。
-
--- a/psrc/__pycache__/rename_extractInfo.cpython-312.pyc
+++ b/psrc/__pycache__/rename_extractInfo.cpython-312.pyc
--- a/extract_authors_from_pdf/extract_authors_info.py
+++ b/extract_authors_from_pdf/extract_authors_info.py
-from pydantic import BaseModel
-from openai import OpenAI
 from pathlib import Path
-import pypdf
-import json
-import os
-
 import logging
+import pypdf
+import pandas as pd
+
+def get_authors( content, configModel, client):
+    system_prompt = """
+    Act as an expert metadata extraction assistant.
+    Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
+    Your goal is to extract the document title, all authors, and their corresponding affiliations.
+
+    Extraction Guidelines:
+    -   **Title:** Extract the main title of the document. If ambiguous or missing, use "".
+    -   **Authors:**
+        -   Identify all listed authors. Maintain the order presented in the text if possible.
+        -   For each author:
+            -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
+            -   Extract all associated institutions/affiliations mentioned for that specific author.
+            -   If an author has no listed institution, use an empty list `[]`.
+            -   If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
+    -   **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
+    """
+
+    response = client.chat.completions.create(  
+        model=configModel,  
+        messages=[  
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": content},
+        ],  
+        temperature=0.7,  
+        max_tokens=4096,
+        # stream=True,
+        response_format={"type": "json_object"}  
+    ) 

-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
+    logging.debug(response.choices[0].message.content)

-client = OpenAI(
-    api_key=os.environ.get("OPENAI_API_KEY"),
-)
+    return response.choices[0].message.content

+    # for chunk in response:
+    #     if not chunk.choices:
+    #         continue
+    #     if chunk.choices[0].delta.content:
+    #         # 增量输出返回值
+    #         print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出，流式输出

-class Author(BaseModel):
-    name: str
-    affiliations: list[str]

+# Extracts text content from the first page of a PDF.
+def extract_first_page_text(pdf_path):
+    """Return the whitespace-normalized text of the PDF's first page.
+
+    Args:
+        pdf_path: Location of the PDF handed to ``pypdf.PdfReader``. The
+            warning messages read ``pdf_path.name``, so a ``pathlib.Path``
+            (or an object with a ``name`` attribute) is expected.
+
+    Returns:
+        The cleaned first-page text, or ``None`` when the PDF has no pages or
+        the first page yields no text. Errors raised by pypdf while opening or
+        reading the file are not caught here.
+    """

-class Metadata(BaseModel):
-    title: str
-    authors: list[Author]
+    reader = pypdf.PdfReader(pdf_path)
+    if len(reader.pages) > 0:

+        first_page = reader.pages[0]
+        text = first_page.extract_text()

-system_prompt = """
-Act as an expert metadata extraction assistant.
-Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
-Your goal is to extract the document title, all authors, and their corresponding affiliations.
+        if text:
+            # 1. text.split(): split on any whitespace (space/newline/tab);
+            #    consecutive whitespace is treated as one separator.
+            # 2. " ".join(...): rejoin with single spaces, so every run of
+            #    whitespace collapses to exactly one space.
+            cleaned_text = " ".join(text.split())
+            # Round-trip through UTF-8 with errors="replace" so any character
+            # that cannot be encoded is substituted instead of raising later.
+            cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
+                "utf-8"
+            )
+            return cleaned_text
+        else:
+            logging.warning(f"No text found on the first page of {pdf_path.name}")
+            return None
+    else:
+        logging.warning(f"PDF has no pages: {pdf_path.name}")
+        return None

-Extraction Guidelines:
-   **Title:** Extract the main title of the document. If ambiguous or missing, use "".
-   **Authors:**
-    -   Identify all listed authors. Maintain the order presented in the text if possible.
-    -   For each author:
-        -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
-        -   Extract all associated institutions/affiliations mentioned for that specific author.
-        -   If an author has no listed institution, use an empty list `[]`.
-        -   If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
-   **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
-"""
+def read_rough_nameIndex_from_excel(excel_path: Path, sheet_name: str):
+    """Load one worksheet of an Excel file and print its first rows.
+
+    Args:
+        excel_path: Path of the Excel workbook.
+        sheet_name: Name of the worksheet to read.
+
+    The function currently only prints ``df.head()``; it has no return
+    statement, so it returns ``None``.
+    """

+    # Read the requested worksheet from the Excel file
+    df = pd.read_excel( excel_path, sheet_name)

-def get_authors(content):
-    response = client.responses.parse(
-        model="gpt-4o",
-        input=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": content},
-        ],
-        text_format=Metadata,
-    )
-    result = response.output_parsed
-    return result
+    # Show the first few rows
+    print(df.head())

+    # Alternative (disabled): read every worksheet at once
+    # excel_data = pd.read_excel('example.xlsx', sheet_name=None)
+    # for sheet_name, data in excel_data.items():
+    #     print(f"工作表: {sheet_name}")
+    #     print(data.head())

-def extract_first_page_text(pdf_path):
-    """Extracts text content from the first page of a PDF."""
-    try:
-        reader = pypdf.PdfReader(pdf_path)
-        if len(reader.pages) > 0:
-            first_page = reader.pages[0]
-            text = first_page.extract_text()
-            if text:
-                # Basic cleaning: remove excessive whitespace
-                cleaned_text = " ".join(text.split())
-                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
-                    "utf-8"
-                )
-                return cleaned_text
-            else:
-                logging.warning(f"No text found on the first page of {pdf_path.name}")
-                return None
-        else:
-            logging.warning(f"PDF has no pages: {pdf_path.name}")
-            return None
-    except pypdf.errors.PdfReadError as e:
-        logging.error(f"Error reading PDF file {pdf_path.name}: {e}")
-        return None
-    except FileNotFoundError:
-        logging.error(f"PDF file not found: {pdf_path}")
-        return None
-    except Exception as e:
-        logging.error(
-            f"An unexpected error occurred while processing {pdf_path.name}: {e}"
-        )
-        return None
+def main(pdf_directory: Path, result_path: Path, configModel: str, client):

+    with open(result_path, "w", encoding="utf-8") as f:
+        pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索 recursive glob

-def main(pdf_directory: Path, result_path: Path):
-    with open(result_path, "a", encoding="utf-8") as f:
-        pdf_files = pdf_directory.rglob("*.pdf")
        for file in pdf_files:
-            try:
-                logging.info(f"Extract {file.name}'s authors")
-                first_page_text = extract_first_page_text(file)
-                logging.info(first_page_text)
-                if first_page_text is not None:
-                    result = get_authors(first_page_text).model_dump()
-                    result["filename"] = file.name
-                    f.write(json.dumps(result) + "\n")
-            except Exception as e:
-                logging.error(f"{file.name}: {str(e)}")
-
-
-if __name__ == "__main__":
-    import argparse
-
-    argparser = argparse.ArgumentParser()
-    argparser.add_argument("--paper", type=str, required=True)
-    argparser.add_argument("--result", type=str, required=True)
-    args = argparser.parse_args()
-
-    main(Path(args.paper), Path(args.result))
+            logging.info(f"Extract {file.name}'s authors")
+
+            first_page_text = extract_first_page_text(file)
+            logging.debug(first_page_text)
+
+            if first_page_text is not None:
+                result = get_authors(first_page_text, configModel, client)
+
+                if result:
+                    f.write(result + "\n")
+            
+            exit()
\ No newline at end of file
--- a/result.json
+++ b/result.json
+{
+  "title": "A carbon-nanotube-based tensor processing unit",
+  "authors": [
+    {
+      "name": "Jia Si",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
+        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
+      ]
+    },
+    {
+      "name": "Panpan Zhang",
+      "affiliations": [
+        "State Key Laboratory of Information Photonics and Optical Communications, Beijing University of Posts and Telecommunications, Beijing, China"
+      ]
+    },
+    {
+      "name": "Chenyi Zhao",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
+      ]
+    },
+    {
+      "name": "Dongyi Lin",
+      "affiliations": [
+        "Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China"
+      ]
+    },
+    {
+      "name": "Lin Xu",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
+      ]
+    },
+    {
+      "name": "Haitao Xu",
+      "affiliations": [
+        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
+      ]
+    },
+    {
+      "name": "Lijun Liu",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
+      ]
+    },
+    {
+      "name": "Jianhua Jiang",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
+      ]
+    },
+    {
+      "name": "Lian-Mao Peng",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
+        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
+      ]
+    },
+    {
+      "name": "Zhiyong Zhang",
+      "affiliations": [
+        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
+        "Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China",
+        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
+      ]
+    }
+  ]
+}