Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
papertools
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
papertools
Commits
f999529f
Commit
f999529f
authored
May 15, 2025
by
jiangdongchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
excel未匹配项输出log以及清空表项
parent
1a389f1e
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
40 additions
and
9 deletions
+40
-9
README.md
+2
-1
others/target.xlsx
+0
-0
psrc/stage1/citationProcess.py
+38
-8
No files found.
README.md
View file @
f999529f
...
@@ -57,7 +57,8 @@
...
@@ -57,7 +57,8 @@
-
将论文标题、会议名称、作者姓名、通讯作者姓名、机构写入excel.
-
将论文标题、会议名称、作者姓名、通讯作者姓名、机构写入excel.
-
用
**大模型**
将英文国家名翻译成中文名,将国家对应的索引写入目标excel表格中.
-
用
**大模型**
将英文国家名翻译成中文名,将国家对应的索引写入目标excel表格中.
-
将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给
**大模型**
匹配,匹配结果以“是/否”的形式写入目标excel表格中.
-
将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给
**大模型**
匹配,匹配结果以“是/否”的形式写入目标excel表格中.
-
匹配失败后,输出无法匹配的条目,使用warning记录无法匹配的条目,方便后续处理.
-
匹配失败后,log输出无法匹配的条目,使用warning记录无法匹配的pdf条目,方便后续处理.
-
遍历结束后,log输出excel中未被匹配的条目集合,并且将其对应表项清空.
3.
得到从pdf中提取的信息json和格式化的excel表格.
3.
得到从pdf中提取的信息json和格式化的excel表格.
4.
人工复核
4.
人工复核
1.
关键信息可能会提取失败,详见log,一般不会有问题, 如果出现了需要人工加一下.
1.
关键信息可能会提取失败,详见log,一般不会有问题, 如果出现了需要人工加一下.
...
...
others/target.xlsx
View file @
f999529f
No preview for this file type
psrc/stage1/citationProcess.py
View file @
f999529f
from
errno
import
ESTALE
from
errno
import
ESTALE
from
logging
import
config
from
pathlib
import
Path
from
pathlib
import
Path
import
logging
import
logging
from
openai
import
OpenAI
from
openai
import
OpenAI
...
@@ -205,14 +206,14 @@ def get_citation_ids(pdf_path, title, configModel, client):
...
@@ -205,14 +206,14 @@ def get_citation_ids(pdf_path, title, configModel, client):
logging
.
error
(
f
"An error occurred while processing {pdf_path.name}: {str(e)}"
)
logging
.
error
(
f
"An error occurred while processing {pdf_path.name}: {str(e)}"
)
return
None
return
None
# excel表格的第
4
行开始读取索引和论文名称
# excel表格的第
content_start
行开始读取索引和论文名称
def
read_rough_nameIndex_from_excel
(
sheet
,
maxItem
):
def
read_rough_nameIndex_from_excel
(
sheet
,
m
in_row
,
m
axItem
):
index_list
=
[]
index_list
=
[]
paperName_list
=
[]
paperName_list
=
[]
# 从第4行开始遍历
# 从第4行开始遍历
for
idx
,
row
in
enumerate
(
sheet
.
iter_rows
(
min_row
=
4
,
values_only
=
True
)):
for
idx
,
row
in
enumerate
(
sheet
.
iter_rows
(
min_row
,
values_only
=
True
)):
if
idx
>=
maxItem
:
# 限制读取的行数
if
idx
>=
maxItem
:
# 限制读取的行数
break
break
if
row
[
0
]
and
row
[
2
]:
# 确保索引和论文名称都存在
if
row
[
0
]
and
row
[
2
]:
# 确保索引和论文名称都存在
...
@@ -274,7 +275,7 @@ def citationProcess(config: dict):
...
@@ -274,7 +275,7 @@ def citationProcess(config: dict):
sheet
=
wb
[
sheet_name
]
sheet
=
wb
[
sheet_name
]
logging
.
info
(
f
"{BLUE}Processing sheet: {sheet_name}{RESET}"
)
logging
.
info
(
f
"{BLUE}Processing sheet: {sheet_name}{RESET}"
)
index_list
,
paperName_list
=
read_rough_nameIndex_from_excel
(
sheet
,
config
[
"maxItem"
])
index_list
,
paperName_list
=
read_rough_nameIndex_from_excel
(
sheet
,
config
[
"
content_start"
]
+
1
,
config
[
"
maxItem"
])
rst_dir
=
Path
.
cwd
()
/
config
[
"result_dir"
]
/
sheet_name
rst_dir
=
Path
.
cwd
()
/
config
[
"result_dir"
]
/
sheet_name
rst_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# 确保结果目录存在
rst_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# 确保结果目录存在
...
@@ -302,6 +303,11 @@ def citationProcess(config: dict):
...
@@ -302,6 +303,11 @@ def citationProcess(config: dict):
logging
.
error
(
f
"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}"
)
logging
.
error
(
f
"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}"
)
# 遍历当前工作表下的所有项目
# 遍历当前工作表下的所有项目
# 在遍历PDF文件前,先记录所有Excel索引
all_excel_indices
:
set
[
int
]
=
set
(
index_list
)
matched_indices
:
set
[
int
]
=
set
()
# 遍历当前工作表下的所有项目
for
file
in
pdf_files
:
for
file
in
pdf_files
:
logging
.
info
(
f
"{BLUE}Processing PDF file: {file.name}{RESET}"
)
# 添加蓝色日志输出
logging
.
info
(
f
"{BLUE}Processing PDF file: {file.name}{RESET}"
)
# 添加蓝色日志输出
...
@@ -317,16 +323,20 @@ def citationProcess(config: dict):
...
@@ -317,16 +323,20 @@ def citationProcess(config: dict):
# 直接遍历index_list查找匹配的索引
# 直接遍历index_list查找匹配的索引
for
i
,
excel_idx
in
enumerate
(
index_list
):
for
i
,
excel_idx
in
enumerate
(
index_list
):
if
excel_idx
==
file_idx
:
if
excel_idx
==
file_idx
:
excel_row_idx
=
i
+
config
[
"content_start"
]
excel_row_idx
=
i
+
config
[
"content_start"
]
+
1
excel_name
=
paperName_list
[
i
]
excel_name
=
paperName_list
[
i
]
logging
.
info
(
f
"{BLUE}Matched - Excel Row: {excel_row_idx}, Index: {excel_idx}, Title: {excel_name}{RESET}"
)
logging
.
info
(
f
"{BLUE}Matched - Excel Row: {excel_row_idx}, Index: {excel_idx}, Title: {excel_name}{RESET}"
)
# 在成功匹配索引后记录
matched_indices
.
add
(
excel_idx
)
break
break
else
:
else
:
raise
ValueError
(
f
"Index {file_idx} not found"
)
raise
ValueError
(
f
"Index {file_idx} not found"
)
except
ValueError
:
except
ValueError
:
logging
.
warning
(
f
"{RED}Index {file_idx} not found in Excel sheet{RESET}"
)
logging
.
warning
(
f
"{RED}Index {file_idx} not found in Excel sheet{RESET}"
)
continue
continue
first_page_text
=
extract_first_page_text
(
file
)
first_page_text
=
extract_first_page_text
(
file
)
if
first_page_text
is
None
:
if
first_page_text
is
None
:
...
@@ -420,4 +430,24 @@ def citationProcess(config: dict):
...
@@ -420,4 +430,24 @@ def citationProcess(config: dict):
# 保存修改后的Excel文件
# 保存修改后的Excel文件
wb
.
save
(
target_path
)
wb
.
save
(
target_path
)
else
:
else
:
logging
.
warning
(
f
"{RED}Failed to extract key info from {file.name}{RESET}"
)
\ No newline at end of file
logging
.
warning
(
f
"{RED}Failed to extract key info from {file.name}{RESET}"
)
# 在处理完所有PDF文件后,检查未匹配的索引
unmatched_indices
=
all_excel_indices
-
matched_indices
if
unmatched_indices
:
# 对未匹配的索引排序后再输出
sorted_indices
=
sorted
(
unmatched_indices
)
logging
.
warning
(
f
"{RED}以下索引在Excel中存在但没有对应的PDF文件: {sorted_indices}{RESET}"
)
# 清空未匹配索引对应的行
for
excel_idx
in
unmatched_indices
:
for
i
,
idx
in
enumerate
(
index_list
):
if
idx
==
excel_idx
:
row_idx
=
i
+
config
[
"content_start"
]
+
1
# 清空从第4列开始的内容(保留索引和原始名称)
for
col
in
range
(
4
,
sheet
.
max_column
+
1
):
sheet
.
cell
(
row
=
row_idx
,
column
=
col
,
value
=
""
)
break
# 保存修改后的Excel文件
wb
.
save
(
target_path
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment