Commit 763c9946 by jiangdongchen

update for deepseek and rename

parent ab7aabf8
.vscode/
others/
Papers/
\ No newline at end of file
# 环境配置
- 运行 Python 脚本时,请确保当前工作目录(cwd)是 papertools 仓库的根目录
- 路径和参数配置都在config.json文件中
  - `logLevel`:日志级别
    - 取 10 表示 DEBUG 级别
    - 取 20 表示 INFO 级别
- Python 3.12.10
- 无法 import 的库请使用 `pip install` 逐个安装
  - `openai`, `pypdf`, `pandas`(读取 .xlsx 还需要 `openpyxl`)
# 使用方法
- 多模型交叉验证
# 需求与解决方案
1. 下载论文 pdf
    1. 常用网站 agent 下载
    2. 输出无法下载的条目
2. 自动化重命名
    1. 读取 excel 表格中的论文名称和索引
    2. 循环:读取 pdf 中的论文名称
        1. 和 excel 表格中的论文名称进行模糊匹配
        2. 匹配成功后
\ No newline at end of file
import json
import logging
import psrc.rename_extractInfo as RE
from openai import OpenAI
from pathlib import Path
if __name__ == "__main__":
    # Entry script: load config.json from the current working directory,
    # configure logging, build the OpenAI-compatible client and run the
    # Excel-reading step of the rename workflow.
    import os  # local import: only needed for the API-key fallback below

    # All relative paths in config.json are resolved against the CWD, so the
    # script must be started from the repository root (not the script's dir).
    cwd_dir = Path.cwd()
    config_path = (cwd_dir / "config.json").resolve()

    # Fail with a readable message instead of a raw traceback when the script
    # is started from the wrong directory.
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError:
        raise SystemExit(
            f"config.json not found at {config_path}; "
            "run this script from the repository root"
        ) from None

    # Path objects join segments with the "/" operator.
    pdf_dir = (cwd_dir / config["pdf_dir"]).resolve()
    rst_dir = (cwd_dir / config["result_path"]).resolve()
    excel_path = (cwd_dir / config["excel_path"]).resolve()
    sheet_name = config["sheet_name"]

    # logLevel uses the numeric levels of the logging module
    # (10 = DEBUG, 20 = INFO); records below this level are dropped.
    # Default to INFO when the key is absent from config.json.
    logLevel = config.get("logLevel", logging.INFO)
    logging.basicConfig(
        # %(asctime)s   - timestamp of the record
        # %(levelname)s - level name (INFO, WARNING, ...)
        # %(message)s   - the log message itself
        level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # The key from config.json wins when present; otherwise fall back to the
    # OPENAI_API_KEY environment variable so the secret does not have to be
    # stored in a file that is committed to the repository.
    api_key = config.get("api_key") or os.environ.get("OPENAI_API_KEY")
    client = OpenAI(api_key=api_key,
                    base_url=config["base_url"])

    # Author-extraction step (currently disabled):
    # RE.main( pdf_dir, rst_dir, config["model"], client)
    RE.read_rough_nameIndex_from_excel(excel_path, sheet_name)
{
"api_key": "sk-otamesebhzzycgfynnssjkrkjlcoitdtstcruwbhohksdlel",
"base_url": "https://api.siliconflow.cn/v1",
"model": "Pro/deepseek-ai/DeepSeek-V3",
"pdf_dir": "./Papers",
"result_path": "./result.json",
"excel_path": "./others/reference.xlsx",
"sheet_name": "j24-DianNao family",
"logLevel": 20
}
\ No newline at end of file
# Readme
## 依赖
`openai`, `pypdf`
## 用法
```bash
export OPENAI_API_KEY=...
python extract_authors_info.py --paper ./papers --result result.jsonl
```
- 参数paper: 包含所有pdf的文件夹的路径
- 参数result: 结果文件路径
## 注意
本结果由GPT自动生成,准确性无法保证,需要人工再次审核。
from pydantic import BaseModel
from openai import OpenAI
from pathlib import Path from pathlib import Path
import pypdf
import json
import os
import logging import logging
import pypdf
import pandas as pd
def get_authors(content, configModel, client):
    """Ask the chat model to extract title, authors and affiliations.

    Args:
        content: Text of the document's first page.
        configModel: Model identifier passed as ``model`` to the API.
        client: Object exposing ``chat.completions.create(...)`` in the
            style of the OpenAI Python client.

    Returns:
        The ``content`` of the first choice's message, as returned by the
        API. The request asks for JSON-object output, so this is expected to
        be a JSON string; it is returned unparsed.
    """
    # JSON-object mode only guarantees syntactically valid JSON; the key
    # names must be spelled out in the prompt or they are left to the model.
    system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
  - Identify all listed authors. Maintain the order presented in the text if possible.
  - For each author:
    - Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
    - Extract all associated institutions/affiliations mentioned for that specific author.
    - If an author has no listed institution, use an empty list `[]`.
    - If there are many authors and only one affiliation, these authors all come from the same affiliation. Otherwise find the corresponding affiliation by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
Output format:
Return a single JSON object with exactly these keys:
{"title": "<string>", "authors": [{"name": "<string>", "affiliations": ["<string>", ...]}]}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        temperature=0.7,
        max_tokens=4096,
        # Streaming is disabled (stream=True not passed): the caller needs
        # the complete answer as one string.
        response_format={"type": "json_object"}
    )
    answer = response.choices[0].message.content
    logging.debug("%s", answer)
    return answer
class Author(BaseModel):
name: str
affiliations: list[str]
# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
class Metadata(BaseModel): reader = pypdf.PdfReader(pdf_path)
title: str if len(reader.pages) > 0:
authors: list[Author]
first_page = reader.pages[0]
text = first_page.extract_text()
system_prompt = """ if text:
Act as an expert metadata extraction assistant. # 1. text.split():用默认空白符(空格/换行/制表符)分割字符串,自动合并连续空白
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report). # 2. " ".join(...):用单个空格连接,实现「多余空白→单空格」的清理效果
Your goal is to extract the document title, all authors, and their corresponding affiliations. cleaned_text = " ".join(text.split())
cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
"utf-8"
)
return cleaned_text
else:
logging.warning(f"No text found on the first page of {pdf_path.name}")
return None
else:
logging.warning(f"PDF has no pages: {pdf_path.name}")
return None
Extraction Guidelines: def read_rough_nameIndex_from_excel(excel_path: Path, sheet_name: str):
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
- Identify all listed authors. Maintain the order presented in the text if possible.
- For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author.
- If an author has no listed institution, use an empty list `[]`.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
"""
# 读取 Excel 文件中的某个工作表
df = pd.read_excel( excel_path, sheet_name)
def get_authors(content): # 显示前几行数据
response = client.responses.parse( print(df.head())
model="gpt-4o",
input=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": content},
],
text_format=Metadata,
)
result = response.output_parsed
return result
# 获取所有工作表的数据
# excel_data = pd.read_excel('example.xlsx', sheet_name=None)
# for sheet_name, data in excel_data.items():
# print(f"工作表: {sheet_name}")
# print(data.head())
def extract_first_page_text(pdf_path): def main(pdf_directory: Path, result_path: Path, configModel: str, client):
"""Extracts text content from the first page of a PDF."""
try:
reader = pypdf.PdfReader(pdf_path)
if len(reader.pages) > 0:
first_page = reader.pages[0]
text = first_page.extract_text()
if text:
# Basic cleaning: remove excessive whitespace
cleaned_text = " ".join(text.split())
cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
"utf-8"
)
return cleaned_text
else:
logging.warning(f"No text found on the first page of {pdf_path.name}")
return None
else:
logging.warning(f"PDF has no pages: {pdf_path.name}")
return None
except pypdf.errors.PdfReadError as e:
logging.error(f"Error reading PDF file {pdf_path.name}: {e}")
return None
except FileNotFoundError:
logging.error(f"PDF file not found: {pdf_path}")
return None
except Exception as e:
logging.error(
f"An unexpected error occurred while processing {pdf_path.name}: {e}"
)
return None
with open(result_path, "w", encoding="utf-8") as f:
pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索 recursive glob
def main(pdf_directory: Path, result_path: Path):
with open(result_path, "a", encoding="utf-8") as f:
pdf_files = pdf_directory.rglob("*.pdf")
for file in pdf_files: for file in pdf_files:
try: logging.info(f"Extract {file.name}'s authors")
logging.info(f"Extract {file.name}'s authors")
first_page_text = extract_first_page_text(file) first_page_text = extract_first_page_text(file)
logging.info(first_page_text) logging.debug(first_page_text)
if first_page_text is not None:
result = get_authors(first_page_text).model_dump() if first_page_text is not None:
result["filename"] = file.name result = get_authors(first_page_text, configModel, client)
f.write(json.dumps(result) + "\n")
except Exception as e: if result:
logging.error(f"{file.name}: {str(e)}") f.write(result + "\n")
exit()
if __name__ == "__main__": \ No newline at end of file
import argparse
argparser = argparse.ArgumentParser()
argparser.add_argument("--paper", type=str, required=True)
argparser.add_argument("--result", type=str, required=True)
args = argparser.parse_args()
main(Path(args.paper), Path(args.result))
{
"title": "A carbon-nanotube-based tensor processing unit",
"authors": [
{
"name": "Jia Si",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Panpan Zhang",
"affiliations": [
"State Key Laboratory of Information Photonics and Optical Communications, Beijing University of Posts and Telecommunications, Beijing, China"
]
},
{
"name": "Chenyi Zhao",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Dongyi Lin",
"affiliations": [
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China"
]
},
{
"name": "Lin Xu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Haitao Xu",
"affiliations": [
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Lijun Liu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Jianhua Jiang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Lian-Mao Peng",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Zhiyong Zhang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
}
]
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment