Commit 763c9946 by jiangdongchen

update for deepseek and rename

parent ab7aabf8
.vscode/
others/
Papers/
\ No newline at end of file
# 环境配置
- 运行 Python 脚本时,请确保当前工作目录(cwd)是 papertools 仓库的根目录(程序会从 cwd 下读取 config.json)
- 路径和参数配置都在config.json文件中
- logLevel
- 取10表示DEBUG级别
- 取20表示INFO级别
- python3.12.10
- 无法import的库使用pip install逐个安装
- `openai`, `pypdf`
# 使用方法
- 多模型交叉验证
# 需求与解决方案
1. 下载论文pdf
1. 常用网站agent下载
2. 输出无法下载的条目
2. 自动化重命名
1. 读取excel表格中的论文名称和索引
2. 循环:读取pdf中的论文名称
1. 和excel表格中的论文名称进行模糊匹配
2. 匹配成功后
\ No newline at end of file
import json
import logging
import psrc.rename_extractInfo as RE
from openai import OpenAI
from pathlib import Path
if __name__ == "__main__":
    # All paths are resolved against the current working directory (not the
    # directory that holds this script), so config.json is looked up there.
    base_dir = Path.cwd()
    settings_file = (base_dir / "config.json").resolve()

    # Load the run-time parameters from config.json.
    with settings_file.open("r", encoding="utf-8") as handle:
        config = json.load(handle)

    # `Path / str` joins path segments; resolve() makes the result absolute.
    pdf_dir = (base_dir / config["pdf_dir"]).resolve()
    rst_dir = (base_dir / config["result_path"]).resolve()
    excel_path = (base_dir / config["excel_path"]).resolve()
    sheet_name = config["sheet_name"]
    logLevel = config["logLevel"]

    # Configure the root logger once. Records below `logLevel` are dropped,
    # e.g. with 20 (INFO) a logging.debug(...) call prints nothing while
    # logging.info(...) is shown.
    # Format fields: %(asctime)s = timestamp, %(levelname)s = level name,
    # %(message)s = the log text itself.
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logLevel, format=log_format)

    client = OpenAI(api_key=config["api_key"], base_url=config["base_url"])

    # The extraction run is currently switched off:
    # RE.main( pdf_dir, rst_dir, config["model"], client)
    RE.read_rough_nameIndex_from_excel(excel_path, sheet_name)
{
"api_key": "sk-otamesebhzzycgfynnssjkrkjlcoitdtstcruwbhohksdlel",
"base_url": "https://api.siliconflow.cn/v1",
"model": "Pro/deepseek-ai/DeepSeek-V3",
"pdf_dir": "./Papers",
"result_path": "./result.json",
"excel_path": "./others/reference.xlsx",
"sheet_name": "j24-DianNao family",
"logLevel": 20
}
\ No newline at end of file
# Readme
## 依赖
`openai`, `pypdf`
## 用法
export OPENAI_API_KEY=...
python extract_authors_info.py --paper ./papers --result result.jsonl
- 参数paper: 包含所有pdf的文件夹的路径
- 参数result: 结果文件路径
## 注意
本结果由GPT自动生成,准确性无法保证,需要人工再次审核。
from pydantic import BaseModel
from openai import OpenAI
from pathlib import Path
import pypdf
import json
import os
import logging
import pypdf
import pandas as pd
def get_authors(content: str, configModel: str, client):
    """Ask a chat-completion model for a document's title, authors and affiliations.

    Args:
        content: Text extracted from the first page of a document; sent as
            the user message.
        configModel: Model identifier forwarded as the ``model`` argument.
        client: Object exposing ``chat.completions.create`` (for example an
            ``openai.OpenAI`` instance). It is used as given.

    Returns:
        The ``content`` of the first choice's message. The request asks for
        the ``json_object`` response format; the reply is returned as-is and
        is not parsed or validated here.
    """
    system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
- Identify all listed authors. Maintain the order presented in the text if possible.
- For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author.
- If an author has no listed institution, use an empty list `[]`.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        temperature=0.7,
        max_tokens=4096,
        # stream=True,
        response_format={"type": "json_object"},
    )
    # Logging is configured by the caller, and the request has already been
    # made with the client passed in, so this function neither calls
    # logging.basicConfig nor builds a second client from OPENAI_API_KEY.
    reply = response.choices[0].message.content
    logging.debug(reply)
    return reply
# for chunk in response:
# if not chunk.choices:
# continue
# if chunk.choices[0].delta.content:
# # 增量输出返回值
# print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出,流式输出
class Author(BaseModel):
    """Pydantic model for one author: a name plus a list of affiliations."""

    # The author's name as a plain string.
    name: str
    # Affiliation strings for this author.
    affiliations: list[str]
# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
class Metadata(BaseModel):
    """Pydantic model for a document's metadata: its title and its authors."""

    # The document title as a plain string.
    title: str
    # The document's authors, each an `Author` entry.
    authors: list[Author]
reader = pypdf.PdfReader(pdf_path)
if len(reader.pages) > 0:
first_page = reader.pages[0]
text = first_page.extract_text()
system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
if text:
# 1. text.split():用默认空白符(空格/换行/制表符)分割字符串,自动合并连续空白
# 2. " ".join(...):用单个空格连接,实现「多余空白→单空格」的清理效果
cleaned_text = " ".join(text.split())
cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
"utf-8"
)
return cleaned_text
else:
logging.warning(f"No text found on the first page of {pdf_path.name}")
return None
else:
logging.warning(f"PDF has no pages: {pdf_path.name}")
return None
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
- Identify all listed authors. Maintain the order presented in the text if possible.
- For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author.
- If an author has no listed institution, use an empty list `[]`.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
"""
def read_rough_nameIndex_from_excel(excel_path: Path, sheet_name: str):
# 读取 Excel 文件中的某个工作表
df = pd.read_excel( excel_path, sheet_name)
def get_authors(content):
    """Send `content` to the "gpt-4o" model and return the parsed output.

    Relies on `client`, `system_prompt` and `Metadata` being available in the
    enclosing scope; `Metadata` is passed as the requested `text_format`.

    Args:
        content: Text sent as the user message.

    Returns:
        The `output_parsed` attribute of the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]
    return client.responses.parse(
        model="gpt-4o",
        input=conversation,
        text_format=Metadata,
    ).output_parsed
# 显示前几行数据
print(df.head())
# 获取所有工作表的数据
# excel_data = pd.read_excel('example.xlsx', sheet_name=None)
# for sheet_name, data in excel_data.items():
# print(f"工作表: {sheet_name}")
# print(data.head())
def extract_first_page_text(pdf_path):
    """Extracts text content from the first page of a PDF.

    Args:
        pdf_path: Location of the PDF, as a ``pathlib.Path`` or a string.

    Returns:
        The first page's text with every run of whitespace collapsed to a
        single space, or None when the PDF has no pages, the first page has
        no non-whitespace text, or reading fails. Every None case is logged;
        no exception from reading the file is propagated.
    """
    # Normalise to Path so the `.name` lookups in the log messages below also
    # work when the caller passes a plain string.
    pdf_path = Path(pdf_path)
    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None

        text = reader.pages[0].extract_text()
        # Basic cleaning: split() drops leading/trailing whitespace and
        # merges consecutive whitespace, join() puts single spaces back.
        cleaned_text = " ".join(text.split()) if text else ""
        if not cleaned_text:
            # Also covers a page made only of whitespace, which would
            # otherwise come back as "" and pass an `is not None` check.
            logging.warning(f"No text found on the first page of {pdf_path.name}")
            return None

        # Round-trip through UTF-8 so characters that cannot be encoded are
        # replaced instead of failing later when the text is written or sent.
        return cleaned_text.encode("utf-8", errors="replace").decode("utf-8")
    except pypdf.errors.PdfReadError as e:
        logging.error(f"Error reading PDF file {pdf_path.name}: {e}")
        return None
    except FileNotFoundError:
        logging.error(f"PDF file not found: {pdf_path}")
        return None
    except Exception as e:
        # Best-effort boundary: one unreadable file must not stop a batch.
        logging.error(
            f"An unexpected error occurred while processing {pdf_path.name}: {e}"
        )
        return None
def main(pdf_directory: Path, result_path: Path, configModel: str, client):
with open(result_path, "w", encoding="utf-8") as f:
pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索 recursive glob
def main(pdf_directory: Path, result_path: Path):
    """Extract author metadata for every PDF under a directory.

    Each PDF found recursively under `pdf_directory` has its first-page text
    extracted and passed to `get_authors`. The dumped result, with the PDF's
    file name added under "filename", is appended to `result_path` as one
    JSON document per line. A failure on one file is logged and the loop
    moves on to the next file.

    Args:
        pdf_directory: Directory searched recursively for "*.pdf" files.
        result_path: Output file, opened in append mode.
    """
    with open(result_path, "a", encoding="utf-8") as sink:
        for file in pdf_directory.rglob("*.pdf"):
            try:
                logging.info(f"Extract {file.name}'s authors")
                first_page_text = extract_first_page_text(file)
                logging.info(first_page_text)
                if first_page_text is None:
                    # Nothing usable was extracted from this file.
                    continue
                record = get_authors(first_page_text).model_dump()
                record["filename"] = file.name
                sink.write(json.dumps(record) + "\n")
            except Exception as e:
                logging.error(f"{file.name}: {str(e)}")
if __name__ == "__main__":
    import argparse

    # Both options are mandatory strings: --paper is the directory that is
    # searched for PDFs and --result is the output file path.
    cli = argparse.ArgumentParser()
    for option in ("--paper", "--result"):
        cli.add_argument(option, type=str, required=True)
    parsed = cli.parse_args()

    main(Path(parsed.paper), Path(parsed.result))
logging.info(f"Extract {file.name}'s authors")
first_page_text = extract_first_page_text(file)
logging.debug(first_page_text)
if first_page_text is not None:
result = get_authors(first_page_text, configModel, client)
if result:
f.write(result + "\n")
exit()
\ No newline at end of file
{
"title": "A carbon-nanotube-based tensor processing unit",
"authors": [
{
"name": "Jia Si",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Panpan Zhang",
"affiliations": [
"State Key Laboratory of Information Photonics and Optical Communications, Beijing University of Posts and Telecommunications, Beijing, China"
]
},
{
"name": "Chenyi Zhao",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Dongyi Lin",
"affiliations": [
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China"
]
},
{
"name": "Lin Xu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Haitao Xu",
"affiliations": [
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Lijun Liu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Jianhua Jiang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Lian-Mao Peng",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Zhiyong Zhang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
}
]
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment