README and author name standardize to xlsx

7e4247f9 · jiangdongchen · 00cc4554 · 7e4247f9 · 7e4247f9 · 7e4247f9
Commit 7e4247f9 authored May 08, 2025 by jiangdongchen
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 36 additions and 19 deletions

README.md
+16 -2

config.json
+4 -2

logs/citation_process.log
+0 -0

psrc/citationProcess.py
+16 -15

No files found.
--- a/README.md
+++ b/README.md
@@ -10,10 +10,17 @@
 - 无法import的库使用pip install逐个安装
    - `openai`, `pypdf`
    - `python-Levenshtein`
+- 目前的密钥是东辰同学自己从知乎上打广告赚来的，只有100块的额度，请尽量使用自己的密钥
+- 如果使用其他API平台的密钥，注意相应修改OpenAI的调用方式；这里推荐硅基流动，因为本项目就是用硅基流动跑通的

 # 使用方法
+- 查看config.json正确配置参数，让程序能够找到需要的文件位置和参数
+- python main.py 执行程序
+- 程序执行过程中，不要打开target excel文件，否则文件被占用，程序保存excel时会出错
 - 多模型交叉验证
 - 成功后的日志样例在logs文件夹下
+- 如果excel中某个序号之前的pdf都已正确提取信息并正确写入excel，而从下一个序号开始的pdf处理出错
+    - 建议将已正确处理的pdf转移到其他文件夹，这样再次运行脚本时只会处理剩下的pdf

 # 需求与解决方案
 1. 下载论文pdf
@@ -30,4 +37,11 @@
                1. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文名称
                2. 将pdf文件中的关键信息写入excel表格中, 包括作者姓名、机构、国家
            4. 匹配失败后，输出无法匹配的条目
-                o 使用warning记录无法匹配的条目，方便后续处理
\ No newline at end of file
+                1. 使用warning记录无法匹配的条目，方便后续处理
+
+# 代码结构说明
+1. psrc文件夹下是库函数
+2. config.json是配置文件
+3. main.py是主程序
+4. logs文件夹是日志文件
+5. json文件夹存放提取出的关键信息json文件
\ No newline at end of file
--- a/config.json
+++ b/config.json
@@ -4,8 +4,9 @@
    "model": "Pro/deepseek-ai/DeepSeek-V3",
    "pdf_dir": "./Papers",
    "result_dir": "./json",
-    "excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
+    "source_excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
+    "target_excel_path": "./others/target.xlsx",
    "logLevel": 20,
-    "tableNum": 1,
+    "sheetNum": 1,
    "maxItem": 64
 }
\ No newline at end of file
--- a/logs/citation_process.log
+++ b/logs/citation_process.log
--- a/psrc/citationProcess.py
+++ b/psrc/citationProcess.py
@@ -16,14 +16,14 @@ def get_authors( content, configModel, client):
    -   **Title:** Extract the main title of the document. If ambiguous or missing, use "".
    -   **Authors:**
        -   Identify all listed authors. Maintain the order presented in the text if possible.
-        -   For each author:
-            -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
+        -   Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
    -   **Institutions:**
        -   Extract all associated institutions of authors.
    -   **Countrys:**
        -   Extract all associated countrys of authors.
+    -   Title, authors, institutions and countrys should be four separate keys, not nested together.
+    -   Use highcase for first letter of key.
    -   **Handling Missing Data:** If no data of a field can be identified in the text, the field in the JSON should be an empty list `[]`.
-    -   use highcase for first letter of key.
    """

    response = client.chat.completions.create(  
@@ -94,14 +94,15 @@ def citationProcess(config: dict):
    client = OpenAI(api_key=config["api_key"], 
                    base_url=config["base_url"])

-    excel_path = Path(config["excel_path"])
+    excel_path = Path(config["source_excel_path"])
+    target_path = Path(config["target_excel_path"])

    # 读取Excel文件
    wb = openpyxl.load_workbook(excel_path)

    # 遍历工作簿中的所有工作表
    for idx, sheet_name in enumerate(wb.sheetnames):
-        if idx == config["tableNum"]:
+        if idx == config["sheetNum"]:
            break
        sheet = wb[sheet_name]
        logging.info(f"Processing sheet: {sheet_name}")
@@ -110,9 +111,7 @@ def citationProcess(config: dict):

        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)  # 确保结果目录存在
-
-        exit()
-         
+ 
        pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
        
        pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索, 输出所有pdf文件的路径
@@ -133,7 +132,6 @@ def citationProcess(config: dict):
            # 提取关键信息
            result = get_authors(first_page_text, configModel, client)

-
            if result is not None:
                # 解析JSON结果, 提取论文标题
                result_dict = json.loads(result)
@@ -151,7 +149,7 @@ def citationProcess(config: dict):

                    if similarity >= 85:
                        # 重命名PDF文件
-                        new_pdf_name = f"{idx}-{pdf_title.replace(':', '-')}.pdf"  # 将冒号替换为连字符
+                        new_pdf_name = f"{idx}-{pdf_title.replace(':', '_')}.pdf"  # 将冒号替换为下划线
                        new_pdf_path = file.parent / new_pdf_name
                        try:
                            file.rename(new_pdf_path)
@@ -166,10 +164,14 @@ def citationProcess(config: dict):
                        
                        # 更新Excel中的表项
                        sheet.cell(row=idx+4, column=3, value=pdf_title)  # 第3列是论文名称
+
+                        authors_list = result_dict.get("Authors", [])
+                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
+                        sheet.cell(row=idx+4, column=7, value=authors)  # 第7列是作者名称
+
+                        # 保存修改后的Excel文件
+                        wb.save(target_path)
                        
                        logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
                        logging.info(f"Change: {file.name} -> {new_pdf_name}")
-                        break
-    
-    # 保存修改后的Excel文件
-    wb.save(excel_path)
\ No newline at end of file
+                        break