Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
papertools
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
papertools
Commits
6e3eacb1
Commit
6e3eacb1
authored
May 15, 2025
by
Ziyuan Nan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload compare
parent
f5fcb116
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
216 additions
and
0 deletions
+216
-0
compare.py
+216
-0
No files found.
compare.py
0 → 100644
View file @
6e3eacb1
# python .\compare.py --human Diannao.xlsx --sheet 0 --deepseek diannao_rengong --skip_rows 3
# human: 人工标注表格数据
# sheet: 人工sheet的序号
# deepseek: deepseek产生json结果的文件夹,注意文件名要与人表格中序号对应
# skip_rows: 跳过表格中的前几行
# 生成结果: 比较结果.xlsx, highlighted.xlsx
import
json
from
pathlib
import
Path
import
difflib
from
openpyxl
import
load_workbook
from
openpyxl.styles
import
PatternFill
import
pandas
as
pd
from
fuzzywuzzy
import
fuzz
def load_human_excel(path, worksheet_index, skip_rows):
    """Read the human-annotated records from one worksheet of an Excel file.

    Args:
        path: Path of the workbook to open.
        worksheet_index: Zero-based position of the worksheet in the workbook.
        skip_rows: Number of leading rows to ignore.

    Returns:
        A list with one dict per kept row. Each dict holds ``"row"`` (the
        zero-based position of the row in the sheet), ``"index"`` (text of
        the first column), ``"authors"`` (text of the seventh column) and
        ``"institutions"`` (text of the ninth column). Empty cells yield
        ``""``. Rows whose index cell is empty are dropped because they
        cannot be matched to anything.
    """
    index_col, authors_col, institutions_col = 0, 6, 8

    def as_text(value):
        # str(None) would produce the literal "None", which later code
        # cannot tell apart from real cell content.
        return "" if value is None else str(value)

    def as_index(value):
        # A whole-number float such as 12.0 must read "12" so that it can
        # match an identifier written without a decimal part.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return as_text(value).strip()

    workbook = load_workbook(path)
    worksheet = workbook.worksheets[worksheet_index]

    results = []
    for row_idx, row in enumerate(worksheet.rows):
        if row_idx < skip_rows:
            continue
        values = [cell.value for cell in row]
        # Pad narrow sheets so every record carries all keys.
        values += [None] * (institutions_col + 1 - len(values))
        index = as_index(values[index_col])
        if not index:
            continue
        results.append(
            {
                "row": row_idx,
                "index": index,
                "authors": as_text(values[authors_col]),
                "institutions": as_text(values[institutions_col]),
            }
        )
    return results
def load_deepseek_json(path):
    """Collect the DeepSeek results stored as JSON files under a directory.

    Every ``*.json`` file below ``path`` (searched recursively, visited in
    sorted path order so the output is reproducible) becomes one record.

    Args:
        path: Directory that contains the JSON result files.

    Returns:
        A list of dicts with ``"index"`` (the file name without extension,
        stripped of surrounding whitespace), ``"authors"`` and
        ``"institutions"`` (the ``"Authors"`` / ``"Institutions"`` values of
        the file). A file that cannot be parsed, is not a JSON object, or
        whose value is missing or not a list yields an empty list for the
        affected field instead of aborting the whole run.
    """

    def as_list(value):
        # Anything but a list cannot be compared item by item.
        return value if isinstance(value, list) else []

    results = []
    for json_path in sorted(Path(path).rglob("*.json")):
        try:
            # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM.
            with open(json_path, "r", encoding="utf-8-sig") as f:
                item = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            print("ERROR: 无法解析JSON文件", json_path, exc)
            item = {}
        if not isinstance(item, dict):
            item = {}
        results.append(
            {
                "index": json_path.stem.strip(),
                "authors": as_list(item.get("Authors")),
                "institutions": as_list(item.get("Institutions")),
            }
        )
    return results
def empty_check(string_list):
    """Tell whether a list is usable for comparison.

    Despite its name, the function returns ``True`` for *valid* input.

    Args:
        string_list: The list of strings to validate.

    Returns:
        ``True`` when ``string_list`` is non-empty and every item is a
        string with at least one non-whitespace character; ``False``
        otherwise, including for ``None``, an empty list, blank items, and
        items that are not strings.
    """
    if not string_list:
        return False
    return all(isinstance(item, str) and item.strip() for item in string_list)
def compare_list(deepseek, human_str, key):
    """Compare a DeepSeek list with a ``;``-separated human annotation.

    Args:
        deepseek: List of strings produced by DeepSeek.
        human_str: Human annotation; items are separated by ``;``.
        key: Label stored in every finding to identify the compared column.

    Returns:
        A list of findings, each a dict with ``"status"``, ``"key"`` and
        ``"reason"``. A single ``"skip"`` finding is returned when the
        DeepSeek list fails ``empty_check``. A single ``"error"`` finding
        is returned when the human list fails it or the two lengths differ.
        Otherwise items are compared pairwise, ignoring case and
        surrounding whitespace: equal pairs produce nothing, similar pairs
        a ``"warning"``, dissimilar pairs an ``"error"``.
    """

    def finding(status, reason):
        return {"status": status, "key": key, "reason": reason}

    if not empty_check(deepseek):
        return [finding("skip", "deepseek结果错误")]

    human_items = [part.strip() for part in human_str.split(";")]
    if not empty_check(human_items):
        return [finding("error", "人类数据为空,或者含有空字符串")]

    n_deepseek, n_human = len(deepseek), len(human_items)
    if n_deepseek != n_human:
        return [
            finding("error", f"人类数据长度 {n_human} 与 Deepseek长度{n_deepseek}不同。")
        ]

    findings = []
    for d, h in zip(deepseek, human_items):
        norm_d = d.strip().lower()
        norm_h = h.strip().lower()
        if norm_d == norm_h:
            continue
        # A pair counts as similar as soon as one of the three measures
        # reaches its threshold.
        similar = (
            difflib.SequenceMatcher(None, norm_h, norm_d).ratio() > 0.8,
            fuzz.partial_ratio(norm_h, norm_d) >= 75,
            fuzz.partial_ratio(norm_d, norm_h) >= 75,
        )
        if any(similar):
            findings.append(finding("warning", f"人类 {h} 与 Deepseek {d} 不完全匹配"))
        else:
            findings.append(finding("error", f"人类 {h} 与 Deepseek {d} 相似度过低"))
    return findings
def compare_item(deepseek, human):
    """Compare the authors and institutions of one paper.

    Args:
        deepseek: Record with list-valued ``"authors"`` and
            ``"institutions"`` entries.
        human: Record with the same keys holding the human annotation text.

    Returns:
        The findings of ``compare_list`` for the authors (labelled
        ``"author"``) followed by those for the institutions (labelled
        ``"institution"``).
    """
    findings = []
    for field, label in (("authors", "author"), ("institutions", "institution")):
        findings += compare_list(deepseek[field], human[field], label)
    return findings
def jsonline_to_dict(jsonline):
    """Index a sequence of records by their ``"index"`` value.

    Args:
        jsonline: Iterable of dicts, each containing an ``"index"`` key.

    Returns:
        A dict mapping each index to the first record that carries it.
        Later records with an index already seen are reported on standard
        output and ignored.
    """
    by_index = {}
    for record in jsonline:
        identifier = record["index"]
        if identifier not in by_index:
            by_index[identifier] = record
            continue
        print("ERROR: 唯一编号", identifier, "在数据中多次出现")
    return by_index
def main(humans, deepseeks):
    """Compare every DeepSeek record with the human record of the same index.

    Args:
        humans: Human records; each needs ``"index"``, ``"row"``,
            ``"authors"`` and ``"institutions"``.
        deepseeks: DeepSeek records; each needs ``"index"``, ``"authors"``
            and ``"institutions"``.

    Returns:
        A pair ``(all_errs, fillings)``. ``all_errs`` lists every finding,
        extended with the ``"index"`` it belongs to. ``fillings`` holds one
        dict per compared record with ``"row"``, the highlight level for
        ``"author"`` and ``"institution"`` (``None``, ``"warning"`` or
        ``"error"``) and ``"msg"``, the newline-joined reasons of all
        warnings and errors (an empty string when there are none).
    """
    print("转化人类数据")
    human_by_index = jsonline_to_dict(humans)
    print("转化DeepSeek数据")
    deepseek_by_index = jsonline_to_dict(deepseeks)

    report = []
    cell_marks = []
    for index, extracted in deepseek_by_index.items():
        if index not in human_by_index:
            print("ERROR: 编号", index, "存在PDF,但Excel中不存在对应数据")
            continue

        annotated = human_by_index[index]
        findings = compare_item(extracted, annotated)
        mark = {
            "row": annotated["row"],
            "author": None,
            "institution": None,
            "msg": None,
        }
        reasons = []
        for finding in findings:
            finding["index"] = index
            report.append(finding)
            status = finding["status"]
            if status not in ("warning", "error"):
                # "skip" findings are reported but never highlighted.
                continue
            column = finding["key"]
            if status == "error":
                # An error always wins over an earlier warning.
                mark[column] = "error"
            elif mark[column] is None:
                mark[column] = "warning"
            reasons.append(finding["reason"])
        mark["msg"] = '\n'.join(reasons)
        cell_marks.append(mark)
    return report, cell_marks
def highlight_mismatched_cells(
    path,
    sheet,
    fillings,
    output_path='highlighted.xlsx',
    author_column=7,
    institution_column=9,
    message_column=24,
):
    """Colour the mismatching cells of a workbook and save it as a copy.

    Args:
        path: Path of the workbook to open.
        sheet: Zero-based position of the worksheet to mark.
        fillings: Iterable of dicts with ``"row"`` (zero-based row
            position), ``"author"`` and ``"institution"`` (``None`` for no
            highlight, ``"warning"`` for yellow, anything else for red)
            and ``"msg"`` (text for the message column, or ``None``).
        output_path: File the marked workbook is saved to.
        author_column: One-based column of the author cell.
        institution_column: One-based column of the institution cell.
        message_column: One-based column that receives the message text.
    """
    workbook = load_workbook(path)
    worksheet = workbook.worksheets[sheet]

    red_fill = PatternFill(
        start_color='FFFF0000', end_color='FFFF0000', fill_type='solid'
    )
    yellow_fill = PatternFill(
        start_color='FFFFFF00', end_color='FFFFFF00', fill_type='solid'
    )
    highlighted = (
        ("author", author_column),
        ("institution", institution_column),
    )

    # Mark every row that has a finding: yellow for warnings, red otherwise.
    for row_info in fillings:
        # Stored rows are zero-based, worksheet rows are one-based.
        row_num = row_info["row"] + 1
        for key, column in highlighted:
            level = row_info[key]
            if level is None:
                continue
            fill = yellow_fill if level == "warning" else red_fill
            worksheet.cell(row=row_num, column=column).fill = fill
        if row_info["msg"] is not None:
            worksheet.cell(row_num, column=message_column).value = row_info["msg"]

    workbook.save(output_path)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: load both data sources, compare them, write
    # the findings to an Excel report and save a highlighted workbook copy.
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--human", type=str, required=True, help="path of human excel"
    )
    argparser.add_argument(
        "--sheet", type=int, default=0, help="index of the sheet in excel"
    )
    argparser.add_argument(
        "--deepseek", type=str, required=True, help="path of deepseek json"
    )
    argparser.add_argument(
        "--skip_rows", type=int, default=3, help="skipping first n rows"
    )
    args = argparser.parse_args()

    humans = load_human_excel(args.human, args.sheet, args.skip_rows)
    deepseeks = load_deepseek_json(args.deepseek)
    errs, fillings = main(humans, deepseeks)

    # Naming the columns up front keeps the report well-formed when there
    # are no findings; selecting them from an empty frame raises KeyError.
    df = pd.DataFrame(errs, columns=["index", "status", "key", "reason"])
    df = df.rename(
        columns={
            "index": "序号",
            "status": "错误级别",
            "key": "错误列",
            "reason": "错误原因",
        }
    )
    df.to_excel("比较结果.xlsx", index=False)
    highlight_mismatched_cells(args.human, args.sheet, fillings)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment