rename and extract key information

00cc4554 · jiangdongchen · ae59d2a2 · 00cc4554 · 00cc4554 · 00cc4554
Commit 00cc4554 authored May 07, 2025 by jiangdongchen
Showing with 144 additions and 129 deletions

.gitignore
+4 -2

README.md
+17 -6

config.json
+6 -3

logs/citation_process.log
+0 -0

main.py
+14 -15

psrc/__pycache__/rename_extractInfo.cpython-312.pyc
+0 -0

psrc/citationProcess.py
+103 -35

result.json
+0 -68

No files found.
--- a/.gitignore
+++ b/.gitignore
 .vscode/
 others/
-Papers/
\ No newline at end of file
+Papers/
+psrc/__pycache__/
+json/
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -4,19 +4,30 @@
    - logLevel
        - 取10表示DEBUG级别
        - 取20表示INFO级别
+    - tableNum 需要处理的工作表数量
+    - maxItem 每个工作表的最大条目数
 - python3.12.10
 - 无法import的库使用pip install逐个安装
    - `openai`, `pypdf`
+    - `openpyxl`, `fuzzywuzzy`, `python-Levenshtein`

 # 使用方法
 - 多模型交叉验证
+- 成功后的日志样例在logs文件夹下

 # 需求与解决方案
 1. 下载论文pdf
    1. 常用网站agent下载
    2. 输出无法下载的条目
-2. 自动化重命名
-    1. 读取excel表格中的论文名称和索引
-    2. 循环：读取pdf中的论文名称
-        1. 和excel表格中的论文名称进行模糊匹配
-        2. 匹配成功后
\ No newline at end of file
+2. 自动化提取信息和格式化
+    1. 通过config.json读取配置对象
+    2. 遍历excel的工作表
+        1. 读取excel表格中的论文名称和索引
+        2. 循环：
+            1. 读取pdf中的论文名称和关键信息，存储到json文件夹下
+            2. 和excel表格中的论文名称进行模糊匹配
+            3. 匹配成功后
+                1. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文名称
+                2. 将pdf文件中的关键信息写入excel表格中, 包括作者姓名、机构、国家
+            4. 匹配失败后，输出无法匹配的条目
+                1. 使用warning记录无法匹配的条目，方便后续处理
\ No newline at end of file
--- a/config.json
+++ b/config.json
@@ -3,7 +3,9 @@
    "base_url": "https://api.siliconflow.cn/v1",
    "model": "Pro/deepseek-ai/DeepSeek-V3",
    "pdf_dir": "./Papers",
-    "result_path": "./result.json",
-    "excel_path": "./others/reference.xlsx",
-    "logLevel": 20
+    "result_dir": "./json",
+    "excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
+    "logLevel": 20,
+    "tableNum": 1,
+    "maxItem": 64
 }
\ No newline at end of file
--- a/logs/citation_process.log
+++ b/logs/citation_process.log
--- a/RFtools.py
+++ b/RFtools.py
 import json
 import logging
-import psrc.rename_extractInfo as RE
-from openai import OpenAI
+import psrc.citationProcess as CP
 from pathlib import Path

 if __name__ == "__main__":
-    # 获取当前脚本所在目录
-    # current_py_dir = os.path.dirname(os.path.abspath(__file__))
-
-    # 获取CWD
    cwd_dir = Path.cwd()

    # 构建 config.json 的完整路径
@@ -19,12 +14,15 @@ if __name__ == "__main__":
        config = json.load(f)

    # Path对象后跟/用于连接地址
-    pdf_dir = (cwd_dir / config["pdf_dir"]).resolve()
-    rst_dir = (cwd_dir / config["result_path"]).resolve()
-    excel_path = (cwd_dir / config["excel_path"]).resolve()

    # print(excel_path)

+     # 创建日志目录
+    log_dir = cwd_dir / "logs"
+    log_dir.mkdir(exist_ok=True)
+    
+     # 配置日志系统
+    log_file = log_dir / "citation_process.log"
    logLevel = config["logLevel"]
    # logging.basicConfig(...) 是 Python 标准库 logging 模块中的一个函数，用于快速配置日志记录的基本设置.
    # 设置日志记录的最低级别为 INFO, 只有日志级别大于等于 INFO 的日志记录才会被处理（例如 INFO、WARNING、ERROR、CRITICAL）.
@@ -34,11 +32,12 @@ if __name__ == "__main__":
        # %(asctime)s：日志记录的时间戳（默认格式：YYYY-MM-DD HH:MM:SS）。
        # %(levelname)s：日志级别名称（如 INFO, WARNING）。
        # %(message)s：日志的具体内容。
-        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s"
+        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[
+            logging.FileHandler(log_file, encoding='utf-8'),
+            logging.StreamHandler()
+        ]
    )

-    client = OpenAI(api_key=config["api_key"], 
-                    base_url=config["base_url"])
-
-    # RE.main( pdf_dir, rst_dir, config["model"], client)
-    RE.read_rough_nameIndex_from_excel(excel_path)
+    logging.info(f"程序启动，日志文件保存在: {log_file}")
+    CP.citationProcess(config)
--- a/psrc/__pycache__/rename_extractInfo.cpython-312.pyc
+++ b/psrc/__pycache__/rename_extractInfo.cpython-312.pyc
--- a/psrc/rename_extractInfo.py
+++ b/psrc/rename_extractInfo.py
 from pathlib import Path
 import logging
+from openai import OpenAI
 import pypdf
-import pandas as pd
+import openpyxl
+from fuzzywuzzy import fuzz
+import json

 def get_authors( content, configModel, client):
    system_prompt = """
@@ -15,10 +18,12 @@ def get_authors( content, configModel, client):
        -   Identify all listed authors. Maintain the order presented in the text if possible.
        -   For each author:
            -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
-            -   Extract all associated institutions/affiliations mentioned for that specific author.
-            -   If an author has no listed institution, use an empty list `[]`.
-            -   If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
-    -   **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
+    -   **Institutions:**
+        -   Extract all associated institutions of authors.
+    -   **Countrys:**
+        -   Extract all associated countrys of authors.
+    -   **Handling Missing Data:** If no data of a field can be identified in the text, the field in the JSON should be an empty list `[]`.
+    -   use highcase for first letter of key.
    """

    response = client.chat.completions.create(  
@@ -44,7 +49,6 @@ def get_authors( content, configModel, client):
    #         # 增量输出返回值
    #         print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出，流式输出

-
 # Extracts text content from the first page of a PDF.
 def extract_first_page_text(pdf_path):

@@ -70,38 +74,102 @@ def extract_first_page_text(pdf_path):
        return None

 # excel表格的第4行开始读取索引和论文名称
-def read_rough_nameIndex_from_excel(excel_path: Path):
-
-    # 读取 Excel 文件中的某个工作表
-    # 当你读取多个工作表时，pandas.read_excel(sheet_name=None) 会返回一个字典，其中：
-        # 键 是工作表的名称（sheet_name）；
-        # 值 是每个工作表对应的 DataFrame。
-    # 通过 items()，你可以在一个循环中轻松地访问这两个部分
-
-    # 获取工作表的数据
-    excel_data = pd.read_excel(excel_path, sheet_name=None)
-    for sname, data in excel_data.items():
-        df = data.iloc[2:]
-
-        for index, row in df.iterrows():
-            print(row.iloc[0])
-            print(row.iloc[1])
+def read_rough_nameIndex_from_excel(sheet, maxItem):
+
+    index_list = []
+    paperName_list = []
+
+    # 从第4行开始遍历
+    for idx, row in enumerate(sheet.iter_rows(min_row=4, values_only=True)):
+        if idx >= maxItem:  # 限制读取的行数
+            break
+        if row[0] and row[2]:  # 确保索引和论文名称都存在
+            index_list.append(row[0])
+            paperName_list.append(row[2])
 
-def main(pdf_directory: Path, result_path: Path, configModel: str, client):
+    return index_list, paperName_list

-    with open(result_path, "w", encoding="utf-8") as f:
-        pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索 recursive glob
+def citationProcess(config: dict):

-        for file in pdf_files:
-            logging.info(f"Extract {file.name}'s authors")
+    client = OpenAI(api_key=config["api_key"], 
+                    base_url=config["base_url"])

-            first_page_text = extract_first_page_text(file)
-            logging.debug(first_page_text)
+    excel_path = Path(config["excel_path"])
+
+    # 读取Excel文件
+    wb = openpyxl.load_workbook(excel_path)
+
+    # 遍历工作簿中的所有工作表
+    for idx, sheet_name in enumerate(wb.sheetnames):
+        if idx == config["tableNum"]:
+            break
+        sheet = wb[sheet_name]
+        logging.info(f"Processing sheet: {sheet_name}")
+ 
+        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])

-            if first_page_text is not None:
-                result = get_authors(first_page_text, configModel, client)
+        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
+        rst_dir.mkdir(parents=True, exist_ok=True)  # 确保结果目录存在

-                if result:
-                    f.write(result + "\n")
+        exit()
+         
+        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
+        
+        pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索, 输出所有pdf文件的路径
+
+        # 遍历当前工作表对应的所有PDF文件
+        for file in pdf_files:
+ 
+            logging.info(f"Processing {file.name}")
+
+            first_page_text = extract_first_page_text(file)
            
-            exit()
\ No newline at end of file
+            if first_page_text is None:
+                logging.error(f"Failed to extract text from first page of {file.name}")
+                continue  # 跳过当前文件继续处理下一个
+                
+            configModel = config["model"]
+
+            # 提取关键信息
+            result = get_authors(first_page_text, configModel, client)
+
+
+            if result is not None:
+                # 解析JSON结果, 提取论文标题
+                result_dict = json.loads(result)
+                pdf_title = result_dict["Title"]
+
+                # 遍历Excel表项进行模糊匹配
+                for idx, excel_name in zip(index_list, paperName_list):
+                    # 预处理字符串
+                    # 返回pdf字符前的字符串，所以加上索引0
+                    clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
+                    clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
+                    
+                    similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
+                    
+
+                    if similarity >= 85:
+                        # 重命名PDF文件
+                        new_pdf_name = f"{idx}-{pdf_title.replace(':', '-')}.pdf"  # 将冒号替换为连字符
+                        new_pdf_path = file.parent / new_pdf_name
+                        try:
+                            file.rename(new_pdf_path)
+                            logging.info(f"Renamed: {file.name} -> {new_pdf_name}")
+                        except FileExistsError:
+                            logging.warning(f"Renamed failed: filename {new_pdf_name} already exists with idx {idx}.")
+                            break
+
+                        # 存储关键信息到json文件中
+                        rst_path = rst_dir / (f"{idx}" + ".json")
+                        rst_path.write_text(result + "\n", encoding='utf-8')  # 明确指定UTF-8编码
+                        
+                        # 更新Excel中的表项
+                        sheet.cell(row=idx+4, column=3, value=pdf_title)  # 第3列是论文名称
+                        
+                        logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
+                        logging.info(f"Change: {file.name} -> {new_pdf_name}")
+                        break
+    
+    # 保存修改后的Excel文件
+    wb.save(excel_path)
\ No newline at end of file
--- a/result.json
+++ b/result.json
-{
-  "title": "A carbon-nanotube-based tensor processing unit",
-  "authors": [
-    {
-      "name": "Jia Si",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
-      ]
-    },
-    {
-      "name": "Panpan Zhang",
-      "affiliations": [
-        "State Key Laboratory of Information Photonics and Optical Communications, Beijing University of Posts and Telecommunications, Beijing, China"
-      ]
-    },
-    {
-      "name": "Chenyi Zhao",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
-      ]
-    },
-    {
-      "name": "Dongyi Lin",
-      "affiliations": [
-        "Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China"
-      ]
-    },
-    {
-      "name": "Lin Xu",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
-      ]
-    },
-    {
-      "name": "Haitao Xu",
-      "affiliations": [
-        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
-      ]
-    },
-    {
-      "name": "Lijun Liu",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
-      ]
-    },
-    {
-      "name": "Jianhua Jiang",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
-      ]
-    },
-    {
-      "name": "Lian-Mao Peng",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
-        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
-      ]
-    },
-    {
-      "name": "Zhiyong Zhang",
-      "affiliations": [
-        "Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
-        "Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China",
-        "Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
-      ]
-    }
-  ]
-}