Commit 00cc4554 by jiangdongchen

rename and extract key information

parent ae59d2a2
.vscode/ .vscode/
others/ others/
Papers/ Papers/
\ No newline at end of file psrc/__pycache__/
json/
\ No newline at end of file
...@@ -4,19 +4,30 @@ ...@@ -4,19 +4,30 @@
- logLevel - logLevel
- 取10表示DEBUG级别 - 取10表示DEBUG级别
- 取20表示INFO级别 - 取20表示INFO级别
- tableNum 需要处理的工作表数量
- maxItem 每个工作表的最大条目数
- python3.12.10 - python3.12.10
- 无法import的库使用pip install逐个安装 - 无法import的库使用pip install逐个安装
- `openai`, `pypdf` - `openai`, `pypdf`
- `python-Levenshtein`
# 使用方法 # 使用方法
- 多模型交叉验证 - 多模型交叉验证
- 成功后的日志样例在logs文件夹下
# 需求与解决方案 # 需求与解决方案
1. 下载论文pdf 1. 下载论文pdf
1. 常用网站agent下载 1. 常用网站agent下载
2. 输出无法下载的条目 2. 输出无法下载的条目
2. 自动化重命名 2. 自动化提取信息和格式化
1. 读取excel表格中的论文名称和索引 1. 通过config.json读取配置对象
2. 循环:读取pdf中的论文名称 2. 遍历excel的工作表
1. 和excel表格中的论文名称进行模糊匹配 1. 读取excel表格中的论文名称和索引
2. 匹配成功后 2. 循环:
\ No newline at end of file 1. 读取pdf中的论文名称和关键信息,存储到json文件夹下
2. 和excel表格中的论文名称进行模糊匹配
3. 匹配成功后
1. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文名称
2. 将pdf文件中的关键信息写入excel表格中, 包括作者姓名、机构、国家
4. 匹配失败后,输出无法匹配的条目
- 使用warning记录无法匹配的条目,方便后续处理
\ No newline at end of file
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
"base_url": "https://api.siliconflow.cn/v1", "base_url": "https://api.siliconflow.cn/v1",
"model": "Pro/deepseek-ai/DeepSeek-V3", "model": "Pro/deepseek-ai/DeepSeek-V3",
"pdf_dir": "./Papers", "pdf_dir": "./Papers",
"result_path": "./result.json", "result_dir": "./json",
"excel_path": "./others/reference.xlsx", "excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
"logLevel": 20 "logLevel": 20,
"tableNum": 1,
"maxItem": 64
} }
\ No newline at end of file
import json import json
import logging import logging
import psrc.rename_extractInfo as RE import psrc.citationProcess as CP
from openai import OpenAI
from pathlib import Path from pathlib import Path
if __name__ == "__main__": if __name__ == "__main__":
# 获取当前脚本所在目录
# current_py_dir = os.path.dirname(os.path.abspath(__file__))
# 获取CWD
cwd_dir = Path.cwd() cwd_dir = Path.cwd()
# 构建 config.json 的完整路径 # 构建 config.json 的完整路径
...@@ -19,12 +14,15 @@ if __name__ == "__main__": ...@@ -19,12 +14,15 @@ if __name__ == "__main__":
config = json.load(f) config = json.load(f)
# Path对象后跟/用于连接地址 # Path对象后跟/用于连接地址
pdf_dir = (cwd_dir / config["pdf_dir"]).resolve()
rst_dir = (cwd_dir / config["result_path"]).resolve()
excel_path = (cwd_dir / config["excel_path"]).resolve()
# print(excel_path) # print(excel_path)
# 创建日志目录
log_dir = cwd_dir / "logs"
log_dir.mkdir(exist_ok=True)
# 配置日志系统
log_file = log_dir / "citation_process.log"
logLevel = config["logLevel"] logLevel = config["logLevel"]
# logging.basicConfig(...) 是 Python 标准库 logging 模块中的一个函数,用于快速配置日志记录的基本设置. # logging.basicConfig(...) 是 Python 标准库 logging 模块中的一个函数,用于快速配置日志记录的基本设置.
# 设置日志记录的最低级别为 INFO, 只有日志级别大于等于 INFO 的日志记录才会被处理(例如 INFO、WARNING、ERROR、CRITICAL). # 设置日志记录的最低级别为 INFO, 只有日志级别大于等于 INFO 的日志记录才会被处理(例如 INFO、WARNING、ERROR、CRITICAL).
...@@ -34,11 +32,12 @@ if __name__ == "__main__": ...@@ -34,11 +32,12 @@ if __name__ == "__main__":
# %(asctime)s:日志记录的时间戳(默认格式:YYYY-MM-DD HH:MM:SS)。 # %(asctime)s:日志记录的时间戳(默认格式:YYYY-MM-DD HH:MM:SS)。
# %(levelname)s:日志级别名称(如 INFO, WARNING)。 # %(levelname)s:日志级别名称(如 INFO, WARNING)。
# %(message)s:日志的具体内容。 # %(message)s:日志的具体内容。
level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s" level=logLevel, format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(log_file, encoding='utf-8'),
logging.StreamHandler()
]
) )
client = OpenAI(api_key=config["api_key"], logging.info(f"程序启动,日志文件保存在: {log_file}")
base_url=config["base_url"]) CP.citationProcess(config)
# RE.main( pdf_dir, rst_dir, config["model"], client)
RE.read_rough_nameIndex_from_excel(excel_path)
from pathlib import Path from pathlib import Path
import logging import logging
from openai import OpenAI
import pypdf import pypdf
import pandas as pd import openpyxl
from fuzzywuzzy import fuzz
import json
def get_authors( content, configModel, client): def get_authors( content, configModel, client):
system_prompt = """ system_prompt = """
...@@ -15,10 +18,12 @@ def get_authors( content, configModel, client): ...@@ -15,10 +18,12 @@ def get_authors( content, configModel, client):
- Identify all listed authors. Maintain the order presented in the text if possible. - Identify all listed authors. Maintain the order presented in the text if possible.
- For each author: - For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry. - Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author. - **Institutions:**
- If an author has no listed institution, use an empty list `[]`. - Extract all associated institutions of authors.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator. - **Countrys:**
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`. - Extract all associated countrys of authors.
- **Handling Missing Data:** If no data of a field can be identified in the text, the field in the JSON should be an empty list `[]`.
- use highcase for first letter of key.
""" """
response = client.chat.completions.create( response = client.chat.completions.create(
...@@ -44,7 +49,6 @@ def get_authors( content, configModel, client): ...@@ -44,7 +49,6 @@ def get_authors( content, configModel, client):
# # 增量输出返回值 # # 增量输出返回值
# print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出,流式输出 # print(chunk.choices[0].delta.content, end="", flush=True) # 不换行刷新输出,流式输出
# Extracts text content from the first page of a PDF. # Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path): def extract_first_page_text(pdf_path):
...@@ -70,38 +74,102 @@ def extract_first_page_text(pdf_path): ...@@ -70,38 +74,102 @@ def extract_first_page_text(pdf_path):
return None return None
# excel表格的第4行开始读取索引和论文名称 # excel表格的第4行开始读取索引和论文名称
def read_rough_nameIndex_from_excel(excel_path: Path): def read_rough_nameIndex_from_excel(sheet, maxItem):
# 读取 Excel 文件中的某个工作表 index_list = []
# 当你读取多个工作表时,pandas.read_excel(sheet_name=None) 会返回一个字典,其中: paperName_list = []
# 键 是工作表的名称(sheet_name);
# 值 是每个工作表对应的 DataFrame。 # 从第4行开始遍历
# 通过 items(),你可以在一个循环中轻松地访问这两个部分 for idx, row in enumerate(sheet.iter_rows(min_row=4, values_only=True)):
if idx >= maxItem: # 限制读取的行数
# 获取工作表的数据 break
excel_data = pd.read_excel(excel_path, sheet_name=None) if row[0] and row[2]: # 确保索引和论文名称都存在
for sname, data in excel_data.items(): index_list.append(row[0])
df = data.iloc[2:] paperName_list.append(row[2])
for index, row in df.iterrows():
print(row.iloc[0])
print(row.iloc[1])
def main(pdf_directory: Path, result_path: Path, configModel: str, client): return index_list, paperName_list
with open(result_path, "w", encoding="utf-8") as f: def citationProcess(config: dict):
pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索 recursive glob
for file in pdf_files: client = OpenAI(api_key=config["api_key"],
logging.info(f"Extract {file.name}'s authors") base_url=config["base_url"])
first_page_text = extract_first_page_text(file) excel_path = Path(config["excel_path"])
logging.debug(first_page_text)
# 读取Excel文件
wb = openpyxl.load_workbook(excel_path)
# 遍历工作簿中的所有工作表
for idx, sheet_name in enumerate(wb.sheetnames):
if idx == config["tableNum"]:
break
sheet = wb[sheet_name]
logging.info(f"Processing sheet: {sheet_name}")
index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])
if first_page_text is not None: rst_dir = Path.cwd() / config["result_dir"] / sheet_name
result = get_authors(first_page_text, configModel, client) rst_dir.mkdir(parents=True, exist_ok=True) # 确保结果目录存在
if result: exit()
f.write(result + "\n")
pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索, 输出所有pdf文件的路径
# 遍历当前工作表对应的所有PDF文件
for file in pdf_files:
logging.info(f"Processing {file.name}")
first_page_text = extract_first_page_text(file)
exit() if first_page_text is None:
\ No newline at end of file logging.error(f"Failed to extract text from first page of {file.name}")
continue # 跳过当前文件继续处理下一个
configModel = config["model"]
# 提取关键信息
result = get_authors(first_page_text, configModel, client)
if result is not None:
# 解析JSON结果, 提取论文标题
result_dict = json.loads(result)
pdf_title = result_dict["Title"]
# 遍历Excel表项进行模糊匹配
for idx, excel_name in zip(index_list, paperName_list):
# 预处理字符串
# 返回pdf字符前的字符串,所以加上索引0
clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
if similarity >= 85:
# 重命名PDF文件
new_pdf_name = f"{idx}-{pdf_title.replace(':', '-')}.pdf" # 将冒号替换为连字符
new_pdf_path = file.parent / new_pdf_name
try:
file.rename(new_pdf_path)
logging.info(f"Renamed: {file.name} -> {new_pdf_name}")
except FileExistsError:
logging.warning(f"Renamed failed: filename {new_pdf_name} already exists with idx {idx}.")
break
# 存储关键信息到json文件中
rst_path = rst_dir / (f"{idx}" + ".json")
rst_path.write_text(result + "\n", encoding='utf-8') # 明确指定UTF-8编码
# 更新Excel中的表项
sheet.cell(row=idx+4, column=3, value=pdf_title) # 第3列是论文名称
logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
logging.info(f"Change: {file.name} -> {new_pdf_name}")
break
# 保存修改后的Excel文件
wb.save(excel_path)
\ No newline at end of file
{
"title": "A carbon-nanotube-based tensor processing unit",
"authors": [
{
"name": "Jia Si",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Panpan Zhang",
"affiliations": [
"State Key Laboratory of Information Photonics and Optical Communications, Beijing University of Posts and Telecommunications, Beijing, China"
]
},
{
"name": "Chenyi Zhao",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Dongyi Lin",
"affiliations": [
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China"
]
},
{
"name": "Lin Xu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Haitao Xu",
"affiliations": [
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Lijun Liu",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Jianhua Jiang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China"
]
},
{
"name": "Lian-Mao Peng",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
},
{
"name": "Zhiyong Zhang",
"affiliations": [
"Key Laboratory for the Physics and Chemistry of Nanodevices and Center for Carbon-based Electronics, School of Electronics, Peking University, Beijing, China",
"Hunan Institute of Advanced Sensing and Information Technology, Xiangtan University, Xiangtan, China",
"Beijing Institute of Carbon-based Integrated Circuits, Beijing, China"
]
}
]
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment