Upload compare

6e3eacb1 · Ziyuan Nan · f5fcb116 · 6e3eacb1
Commit 6e3eacb1 authored May 15, 2025 by Ziyuan Nan
Hide whitespace changes
Inline Side-by-side

Showing with 216 additions and 0 deletions

compare.py
+216 -0

No files found.
--- a/compare.py
+++ b/compare.py
+# python .\compare.py --human Diannao.xlsx --sheet 0 --deepseek diannao_rengong --skip_rows 3
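+# human: Excel workbook containing the human annotations
+# sheet: index of the worksheet that holds the human annotations
+# deepseek: folder containing the JSON results produced by DeepSeek; each file name must match the index (序号) column of the human workbook
+# skip_rows: number of leading rows of the worksheet to skip
+# Outputs: 比较结果.xlsx, highlighted.xlsx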
+import json
+from pathlib import Path
+import difflib
+from openpyxl import load_workbook
+from openpyxl.styles import PatternFill
+import pandas as pd
+from fuzzywuzzy import fuzz
+def load_human_excel(path, worksheet_index, skip_rows):
+    """Load the human-annotated rows from one worksheet of an Excel workbook.
+
+    Args:
+        path: Path of the workbook, passed to ``load_workbook``.
+        worksheet_index: Zero-based position of the worksheet in the workbook.
+        skip_rows: Number of leading rows to ignore.
+
+    Returns:
+        A list with one dict per kept row, holding ``row`` (zero-based row
+        position in the sheet), ``index`` (first column, stripped),
+        ``authors`` (seventh column) and ``institutions`` (ninth column),
+        all as strings. Empty cells, and columns missing from a narrow
+        sheet, yield ``""`` rather than the literal text ``"None"``. Rows
+        whose index cell is empty are dropped because they cannot be
+        looked up by index.
+    """
+    # Zero-based column positions of the fields read from each row.
+    columns = {"index": 0, "authors": 6, "institutions": 8}
+    workbook = load_workbook(path)
+    worksheet = workbook.worksheets[worksheet_index]
+    results = []
+    for row_idx, row in enumerate(worksheet.rows):
+        if row_idx < skip_rows:
+            continue
+        result = {"row": row_idx}
+        for field, col in columns.items():
+            # Guard against sheets that have fewer columns than expected.
+            value = row[col].value if col < len(row) else None
+            result[field] = "" if value is None else str(value)
+        # The JSON side strips its file stem, so strip here as well.
+        result["index"] = result["index"].strip()
+        if not result["index"]:
+            continue
+        results.append(result)
+    return results
+def load_deepseek_json(path):
+    """Collect the DeepSeek results stored as JSON files under a folder.
+
+    Args:
+        path: Folder that is searched recursively for ``*.json`` files.
+
+    Returns:
+        A list of dicts, one per file in sorted path order, with ``index``
+        (the stripped file stem), ``authors`` (the file's ``Authors`` value)
+        and ``institutions`` (its ``Institutions`` value). A missing or null
+        key, or a document that is not a JSON object, yields an empty list
+        for the affected field instead of raising.
+
+    Raises:
+        json.JSONDecodeError: If a file does not contain valid JSON.
+    """
+    results = []
+    # rglob order depends on the file system; sort for reproducible output.
+    for json_path in sorted(Path(path).rglob("*.json")):
+        with open(json_path, "r", encoding="utf-8") as f:
+            item = json.load(f)
+        if not isinstance(item, dict):
+            # Not a JSON object: treat every field as absent.
+            item = {}
+        results.append(
+            {
+                "index": json_path.stem.strip(),
+                "authors": item.get("Authors") or [],
+                "institutions": item.get("Institutions") or [],
+            }
+        )
+    return results
+def empty_check(string_list):
+    """Tell whether ``string_list`` is a usable list of non-blank strings.
+
+    Args:
+        string_list: The value to validate, normally a list of strings.
+
+    Returns:
+        ``True`` only when ``string_list`` is a non-empty list or tuple and
+        every item is a string with at least one non-whitespace character.
+        ``None``, a bare string, an empty sequence, and sequences holding
+        blank or non-string items all give ``False`` instead of raising.
+    """
+    # A bare string would be iterated character by character, so reject it.
+    if not isinstance(string_list, (list, tuple)) or not string_list:
+        return False
+    return all(isinstance(item, str) and bool(item.strip()) for item in string_list)
+def compare_list(deepseek, human_str, key):
+    """Compare a DeepSeek list against a ``;``-separated human string.
+
+    Args:
+        deepseek: List of strings produced by DeepSeek.
+        human_str: Human annotation; items are separated by ``;``. A value
+            that is not a string is treated as empty.
+        key: Label copied into the ``key`` field of every reported issue.
+
+    Returns:
+        A list of issue dicts with ``status``, ``key`` and ``reason``:
+        a single ``skip`` entry when the DeepSeek list fails
+        ``empty_check``; a single ``error`` entry when the human data is
+        empty or the two lengths differ; otherwise one ``warning`` (similar
+        but not equal) or ``error`` (too dissimilar) per mismatching pair.
+        Pairs that are equal after stripping and lower-casing report
+        nothing.
+    """
+    if not empty_check(deepseek):
+        return [{"status": "skip", "key": key, "reason": "deepseek结果错误"}]
+    if isinstance(human_str, str):
+        lst = [x.strip() for x in human_str.split(";")]
+    else:
+        # e.g. None from an empty cell: report it as empty human data.
+        lst = []
+    if not empty_check(lst):
+        return [
+            {"status": "error", "key": key, "reason": "人类数据为空，或者含有空字符串"}
+        ]
+    deepseek_len = len(deepseek)
+    human_len = len(lst)
+    if deepseek_len != human_len:
+        err_msg = f"人类数据长度 {human_len} 与 Deepseek长度{deepseek_len}不同。"
+        return [{"status": "error", "key": key, "reason": err_msg}]
+    errors = []
+    for d, h in zip(deepseek, lst):
+        processed_d = d.strip().lower()
+        processed_h = h.strip().lower()
+        if processed_d == processed_h:
+            continue
+        # Evaluated lazily: the first passing test decides. partial_ratio is
+        # tried in both argument orders, as before, in case the scorer is
+        # not symmetric.
+        similar = (
+            difflib.SequenceMatcher(None, processed_h, processed_d).ratio() > 0.8
+            or fuzz.partial_ratio(processed_h, processed_d) >= 75
+            or fuzz.partial_ratio(processed_d, processed_h) >= 75
+        )
+        if similar:
+            errors.append(
+                {
+                    "status": "warning",
+                    "key": key,
+                    "reason": f"人类 {h} 与 Deepseek {d} 不完全匹配",
+                }
+            )
+        else:
+            errors.append(
+                {
+                    "status": "error",
+                    "key": key,
+                    "reason": f"人类 {h} 与 Deepseek {d} 相似度过低",
+                }
+            )
+    return errors
+def compare_item(deepseek, human):
+    """Compare the authors and institutions of one record.
+
+    Args:
+        deepseek: Dict with ``authors`` and ``institutions`` from DeepSeek.
+        human: Dict with ``authors`` and ``institutions`` from the workbook.
+
+    Returns:
+        The issues reported by ``compare_list`` for the authors, followed
+        by those for the institutions.
+    """
+    author_issues = compare_list(deepseek["authors"], human["authors"], "author")
+    institution_issues = compare_list(
+        deepseek["institutions"], human["institutions"], "institution"
+    )
+    return [*author_issues, *institution_issues]
+def jsonline_to_dict(jsonline):
+    """Index a sequence of records by their ``index`` value.
+
+    Args:
+        jsonline: Iterable of dicts, each carrying an ``index`` key.
+
+    Returns:
+        A dict mapping each index to the first record seen with it. Later
+        records with the same index are not stored; an error line is
+        printed for each of them.
+    """
+    by_index = {}
+    for record in jsonline:
+        record_id = record["index"]
+        if record_id not in by_index:
+            by_index[record_id] = record
+            continue
+        print("ERROR: 唯一编号", record_id, "在数据中多次出现")
+    return by_index
+def main(humans, deepseeks):
+    """Compare every DeepSeek record with the human row of the same index.
+
+    Args:
+        humans: Records loaded from the workbook (dicts with ``index``,
+            ``row``, ``authors`` and ``institutions``).
+        deepseeks: Records loaded from the JSON files (dicts with ``index``,
+            ``authors`` and ``institutions``).
+
+    Returns:
+        A pair ``(all_errs, fillings)``. ``all_errs`` holds every issue dict,
+        each tagged with its ``index``. ``fillings`` holds one dict per
+        matched record with the sheet ``row``, the worst level found for
+        ``author`` and ``institution`` (``None``, ``"warning"`` or
+        ``"error"``) and ``msg``, the newline-joined warning/error reasons.
+    """
+    print("转化人类数据")
+    human_by_index = jsonline_to_dict(humans)
+    print("转化DeepSeek数据")
+    deepseek_by_index = jsonline_to_dict(deepseeks)
+    all_errs, fillings = [], []
+    for idx, deepseek_item in deepseek_by_index.items():
+        if idx not in human_by_index:
+            print("ERROR: 编号", idx, "存在PDF，但Excel中不存在对应数据")
+            continue
+        human_item = human_by_index[idx]
+        issues = compare_item(deepseek_item, human_item)
+        fill = {"row": human_item["row"], "author": None, "institution": None, "msg": None}
+        reasons = []
+        for issue in issues:
+            issue["index"] = idx
+            all_errs.append(issue)
+            level = issue["status"]
+            if level == "warning":
+                # A warning never downgrades a level that is already set.
+                if fill[issue["key"]] is None:
+                    fill[issue["key"]] = "warning"
+            elif level == "error":
+                fill[issue["key"]] = "error"
+            else:
+                # "skip" (and anything else) is reported but not highlighted.
+                continue
+            reasons.append(issue["reason"])
+        fill["msg"] = '\n'.join(reasons)
+        fillings.append(fill)
+    return all_errs, fillings
+def highlight_mismatched_cells(path, sheet, fillings, output_path='highlighted.xlsx'):
+    """Write a copy of the workbook with mismatching cells colour-coded.
+
+    Args:
+        path: Path of the workbook to read.
+        sheet: Zero-based position of the worksheet to annotate.
+        fillings: Dicts with ``row`` (zero-based row position), ``author``
+            and ``institution`` (``None``, ``"warning"`` or another level)
+            and ``msg``.
+        output_path: Where the annotated workbook is saved; defaults to
+            ``highlighted.xlsx`` in the current directory.
+
+    Returns:
+        None. The authors cell (column 7) and institutions cell (column 9)
+        are filled yellow for ``"warning"`` and red for any other non-None
+        level, and ``msg`` is written to column 24 when it is not None.
+    """
+    workbook = load_workbook(path)
+    worksheet = workbook.worksheets[sheet]
+    red_fill = PatternFill(start_color='FFFF0000', end_color='FFFF0000', fill_type='solid')
+    yellow_fill = PatternFill(start_color='FFFFFF00', end_color='FFFFFF00', fill_type='solid')
+    # One-based worksheet column of each highlighted field.
+    field_columns = {"author": 7, "institution": 9}
+    for row_info in fillings:
+        # Stored rows are zero-based; worksheet rows are one-based.
+        row_num = row_info["row"] + 1
+        for field, column in field_columns.items():
+            level = row_info[field]
+            if level is None:
+                continue
+            fill = yellow_fill if level == "warning" else red_fill
+            worksheet.cell(row=row_num, column=column).fill = fill
+        if row_info["msg"] is not None:
+            worksheet.cell(row=row_num, column=24).value = row_info["msg"]
+    workbook.save(output_path)
+if __name__ == "__main__":
+    # Command-line entry point: load both data sources, compare them, then
+    # write the issue report (比较结果.xlsx) and the highlighted workbook.
+    import argparse
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument(
+        "--human", type=str, required=True, help="path of human excel"
+    )
+    argparser.add_argument(
+        "--sheet", type=int, default=0, help="index of the sheet in excel"
+    )
+    argparser.add_argument(
+        "--deepseek", type=str, required=True, help="path of deepseek json"
+    )
+    argparser.add_argument(
+        "--skip_rows", type=int, default=3, help="skipping first n rows"
+    )
+    args = argparser.parse_args()
+    humans = load_human_excel(args.human, args.sheet, args.skip_rows)
+    deepseeks = load_deepseek_json(args.deepseek)
+    errs, fillings = main(humans, deepseeks)
+    # Naming the columns up front keeps the report writable when no issue
+    # was found; selecting them from an empty frame would raise KeyError.
+    df = pd.DataFrame(errs, columns=["index", "status", "key", "reason"])
+    df = df.rename(
+        columns={
+            "index": "序号",
+            "status": "错误级别",
+            "key": "错误列",
+            "reason": "错误原因",
+        }
+    )
+    df.to_excel("比较结果.xlsx", index=False)
+    highlight_mismatched_cells(args.human, args.sheet, fillings)