Commit 6e3eacb1 by Ziyuan Nan

Upload compare

parent f5fcb116
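# Usage: python .\compare.py --human Diannao.xlsx --sheet 0 --deepseek diannao_rengong --skip_rows 3
# human: Excel workbook containing the human-annotated data
# sheet: index of the human-annotated sheet inside the workbook
# deepseek: folder holding the JSON results produced by DeepSeek; each file name must match the index column of the human-annotated sheet
# skip_rows: number of leading rows of the sheet to skip
# Outputs: 比较结果.xlsx, highlighted.xlsx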
import json
from pathlib import Path
import difflib
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
import pandas as pd
from fuzzywuzzy import fuzz
def load_human_excel(path, worksheet_index, skip_rows):
    """Load the human-annotated rows from one worksheet of an Excel workbook.

    Args:
        path: Path of the workbook to open.
        worksheet_index: Zero-based position of the worksheet in the workbook.
        skip_rows: Number of leading rows to ignore.

    Returns:
        A list with one dict per remaining row. Each dict holds ``row`` (the
        zero-based position of the row in the sheet), ``index`` (first column
        as text), ``authors`` (7th column as text) and ``institutions``
        (9th column as text).
    """

    def cell_text(row, position):
        # An empty cell carries the value None; str(None) would produce the
        # literal text "None", which would later be compared as if it were a
        # real author or institution. Report such cells as empty text instead.
        if position >= len(row):
            return ""
        value = row[position].value
        return "" if value is None else str(value)

    workbook = load_workbook(path)
    worksheet = workbook.worksheets[worksheet_index]
    results = []
    for row_idx, row in enumerate(worksheet.rows):
        if row_idx < skip_rows:
            continue
        row = tuple(row)
        results.append(
            {
                "row": row_idx,
                # The index keeps its original str() conversion so that the
                # lookup keys seen by callers do not change.
                "index": str(row[0].value) if row else "",
                "authors": cell_text(row, 6),
                "institutions": cell_text(row, 8),
            }
        )
    return results
def load_deepseek_json(path):
    """Collect the DeepSeek results stored as JSON files below *path*.

    Every ``*.json`` file found recursively is parsed as UTF-8 JSON. The file
    name without its extension (stripped of surrounding whitespace) becomes
    ``index``; the document's ``Authors`` and ``Institutions`` entries are
    copied over unchanged as ``authors`` and ``institutions``.

    Returns:
        A list with one dict per JSON file.
    """
    records = []
    for json_file in Path(path).rglob("*.json"):
        payload = json.loads(json_file.read_text(encoding="utf-8"))
        records.append(
            {
                "index": json_file.stem.strip(),
                "authors": payload["Authors"],
                "institutions": payload["Institutions"],
            }
        )
    return records
def empty_check(string_list):
    """Return True when *string_list* is a non-empty list of non-blank strings.

    Despite its name, a True result means the list is usable. False is
    returned for ``None``, for an empty list, and for a list that contains a
    blank (whitespace-only) string or an item that is not a string at all,
    so malformed input is reported as unusable instead of raising.
    """
    if not string_list:
        return False
    return all(isinstance(item, str) and item.strip() for item in string_list)
def compare_list(deepseek, human_str, key):
    """Compare DeepSeek's list of values with a ';'-separated human cell.

    Args:
        deepseek: List of strings produced by DeepSeek.
        human_str: Human annotation; entries are separated by ';'.
        key: Label copied into every finding (the caller's column name).

    Returns:
        A list of finding dicts with ``status`` ("skip", "error" or
        "warning"), ``key`` and ``reason``. The list is empty when every pair
        is equal after stripping whitespace and lower-casing.
    """
    if not empty_check(deepseek):
        return [{"status": "skip", "key": key, "reason": "deepseek结果错误"}]

    human_items = [part.strip() for part in human_str.split(";")]
    if not empty_check(human_items):
        return [
            {"status": "error", "key": key, "reason": "人类数据为空,或者含有空字符串"}
        ]

    n_deepseek, n_human = len(deepseek), len(human_items)
    if n_deepseek != n_human:
        reason = f"人类数据长度 {n_human} 与 Deepseek长度{n_deepseek}不同。"
        return [{"status": "error", "key": key, "reason": reason}]

    findings = []
    for d, h in zip(deepseek, human_items):
        norm_d = d.strip().lower()
        norm_h = h.strip().lower()
        if norm_d == norm_h:
            continue
        # Any one of the three similarity measures is enough to downgrade the
        # mismatch from an error to a warning.
        similar = (
            difflib.SequenceMatcher(None, norm_h, norm_d).ratio() > 0.8,
            fuzz.partial_ratio(norm_h, norm_d) >= 75,
            fuzz.partial_ratio(norm_d, norm_h) >= 75,
        )
        if any(similar):
            status = "warning"
            reason = f"人类 {h} 与 Deepseek {d} 不完全匹配"
        else:
            status = "error"
            reason = f"人类 {h} 与 Deepseek {d} 相似度过低"
        findings.append({"status": status, "key": key, "reason": reason})
    return findings
def compare_item(deepseek, human):
    """Compare one DeepSeek record with its human counterpart.

    Authors are checked first, then institutions; the findings of both
    comparisons are returned as a single list in that order.
    """
    author_findings = compare_list(deepseek["authors"], human["authors"], "author")
    institution_findings = compare_list(
        deepseek["institutions"], human["institutions"], "institution"
    )
    return [*author_findings, *institution_findings]
def jsonline_to_dict(jsonline):
    """Index records by their ``"index"`` value.

    The first record seen for an index is kept; each later record with the
    same index is dropped and reported on standard output.
    """
    by_index = {}
    for record in jsonline:
        record_id = record["index"]
        if record_id not in by_index:
            by_index[record_id] = record
            continue
        print("ERROR: 唯一编号", record_id, "在数据中多次出现")
    return by_index
def main(humans, deepseeks):
    """Compare every DeepSeek record with the human row sharing its index.

    Args:
        humans: Human records; each needs "index", "row", "authors" and
            "institutions".
        deepseeks: DeepSeek records; each needs "index", "authors" and
            "institutions".

    Returns:
        A tuple ``(all_errs, fillings)``. ``all_errs`` lists every finding
        (including "skip" ones) with its "index" added. ``fillings`` holds one
        dict per compared row with "row", the worst level seen for "author"
        and "institution" ("warning", "error" or None) and "msg", the
        newline-joined reasons, or None when there is nothing to report.
    """
    print("转化人类数据")
    humans = jsonline_to_dict(humans)
    print("转化DeepSeek数据")
    deepseeks = jsonline_to_dict(deepseeks)
    all_errs = []
    fillings = []
    for idx, deep_seek_item in deepseeks.items():
        if idx not in humans:
            print("ERROR: 编号", idx, "存在PDF,但Excel中不存在对应数据")
            continue
        human_item = humans[idx]
        fill = {"row": human_item["row"], "author": None, "institution": None, "msg": None}
        err_messages = []
        for err in compare_item(deep_seek_item, human_item):
            err["index"] = idx
            all_errs.append(err)
            status = err["status"]
            if status == "warning":
                # A warning never downgrades a cell already marked as error.
                if fill[err["key"]] is None:
                    fill[err["key"]] = "warning"
            elif status == "error":
                fill[err["key"]] = "error"
            else:
                # "skip" findings are recorded but do not mark the row.
                continue
            err_messages.append(err["reason"])
        # Leave "msg" as None when nothing was collected: joining an empty
        # list would give "", which consumers testing `is not None` would
        # still write into the sheet.
        if err_messages:
            fill["msg"] = "\n".join(err_messages)
        fillings.append(fill)
    return all_errs, fillings
def highlight_mismatched_cells(path, sheet, fillings, output_path='highlighted.xlsx'):
    """Colour mismatched cells of a workbook and save the result as a copy.

    Args:
        path: Workbook to read.
        sheet: Zero-based position of the worksheet to mark.
        fillings: Dicts with "row", "author", "institution" and "msg". A level
            of "warning" paints the cell yellow, any other non-None level
            paints it red.
        output_path: Where the marked workbook is written. Defaults to
            'highlighted.xlsx' in the current directory.
    """
    workbook = load_workbook(path)
    worksheet = workbook.worksheets[sheet]
    red_fill = PatternFill(start_color='FFFF0000', end_color='FFFF0000', fill_type='solid')
    yellow_fill = PatternFill(start_color='FFFFFF00', end_color='FFFFFF00', fill_type='solid')
    # Worksheet column (1-based) that holds each compared field.
    marked_columns = (("author", 7), ("institution", 9))
    # Mark every row that carries a finding.
    for row_info in fillings:
        # Worksheet rows are addressed from 1; the stored row is one less.
        row_num = row_info["row"] + 1
        for field, column in marked_columns:
            level = row_info[field]
            if level is None:
                continue
            fill = yellow_fill if level == "warning" else red_fill
            worksheet.cell(row=row_num, column=column).fill = fill
        if row_info["msg"] is not None:
            worksheet.cell(row=row_num, column=24).value = row_info["msg"]
    workbook.save(output_path)
if __name__ == "__main__":
    # Command-line entry point: load both data sets, compare them, then write
    # the findings table and the highlighted copy of the workbook.
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--human", type=str, required=True, help="path of human excel")
    # Default to the first sheet so that omitting --sheet does not index the
    # workbook with None.
    argparser.add_argument("--sheet", type=int, default=0, help="index of the sheet in excel")
    argparser.add_argument("--deepseek", type=str, required=True, help="path of deepseek json")
    argparser.add_argument(
        "--skip_rows", type=int, default=3, help="skipping first n rows"
    )
    args = argparser.parse_args()

    humans = load_human_excel(args.human, args.sheet, args.skip_rows)
    deepseeks = load_deepseek_json(args.deepseek)
    errs, fillings = main(humans, deepseeks)

    # Naming the columns up front keeps the frame well-formed when there are
    # no findings at all; selecting them afterwards from an empty frame
    # raises KeyError.
    df = pd.DataFrame(errs, columns=["index", "status", "key", "reason"])
    df = df.rename(
        columns={
            "index": "序号",
            "status": "错误级别",
            "key": "错误列",
            "reason": "错误原因",
        }
    )
    df.to_excel("比较结果.xlsx", index=False)
    highlight_mismatched_cells(args.human, args.sheet, fillings)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment