
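# Usage: python .\compare.py --human Diannao.xlsx --sheet 0 --deepseek diannao_rengong --skip_rows 3
# human: Excel workbook containing the human-annotated data
# sheet: index of the worksheet that holds the human annotations
# deepseek: folder containing the JSON results produced by DeepSeek; each file name must match the index column of the human spreadsheet
# skip_rows: number of leading rows of the worksheet to skip
# Output files: 比较结果.xlsx, highlighted.xlsx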

import json
from pathlib import Path
import difflib
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
import pandas as pd
from fuzzywuzzy import fuzz


def load_human_excel(path, worksheet_index, skip_rows):
    """Load the human-annotated rows from one worksheet of an Excel workbook.

    Args:
        path: Path of the workbook to open.
        worksheet_index: Zero-based position of the worksheet in the workbook.
        skip_rows: Number of leading rows to ignore.

    Returns:
        A list with one dict per remaining row. Each dict holds ``row`` (the
        zero-based position of the row in the sheet), ``index`` (first column,
        converted with ``str``), ``authors`` (7th column) and ``institutions``
        (9th column). Empty author/institution cells are returned as ``""``.
    """

    def text_or_empty(value):
        # str(None) would yield the literal text "None", which downstream
        # checks cannot tell apart from real data; use "" for empty cells.
        return "" if value is None else str(value)

    workbook = load_workbook(path)
    worksheet = workbook.worksheets[worksheet_index]

    results = []
    for row_idx, row in enumerate(worksheet.rows):
        if row_idx < skip_rows:
            continue
        result = {"row": row_idx}
        for cell_idx, cell in enumerate(row):
            if cell_idx == 0:
                result["index"] = str(cell.value)
            elif cell_idx == 6:
                result["authors"] = text_or_empty(cell.value)
            elif cell_idx == 8:
                result["institutions"] = text_or_empty(cell.value)

        # Sheets narrower than nine columns never reach the branches above;
        # make sure the keys exist so later lookups do not raise KeyError.
        result.setdefault("authors", "")
        result.setdefault("institutions", "")

        results.append(result)
    return results


def load_deepseek_json(path):
    """Collect the DeepSeek results stored as ``*.json`` files under *path*.

    The folder is searched recursively. For every file the returned record
    holds ``index`` (the file name without extension, stripped of surrounding
    whitespace), ``authors`` (the file's ``"Authors"`` value) and
    ``institutions`` (the file's ``"Institutions"`` value).

    Raises:
        KeyError: If a file lacks the ``"Authors"`` or ``"Institutions"`` key.
    """

    def read_record(json_file):
        # One JSON document per file, always decoded as UTF-8.
        with open(json_file, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return {
            "index": str(json_file.stem.strip()),
            "authors": payload["Authors"],
            "institutions": payload["Institutions"],
        }

    return [read_record(json_file) for json_file in Path(path).rglob("*.json")]


def empty_check(string_list):
    """Return True when *string_list* is non-empty and has no blank entries.

    An entry counts as blank when nothing is left after ``strip()``.
    """
    has_entries = len(string_list) != 0
    return has_entries and not any(
        len(entry.strip()) == 0 for entry in string_list
    )


def compare_list(deepseek, human_str, key):
    """Compare a DeepSeek list of strings with a ``;``-separated human string.

    Args:
        deepseek: Values produced by DeepSeek; expected to be a list (or
            tuple) of non-blank strings.
        human_str: Human annotation with the entries separated by ``;``.
        key: Label copied into the ``"key"`` field of every returned record.

    Returns:
        A list of dicts with ``status``, ``key`` and ``reason``:

        * one ``"skip"`` record when the DeepSeek value is not a list of
          non-blank strings;
        * one ``"error"`` record when the human data is empty, contains a
          blank entry, or has a different number of entries;
        * otherwise one record per pair that differs after stripping and
          lower-casing: ``"warning"`` when the pair is still similar,
          ``"error"`` when it is not.

        An empty list means every pair matched.
    """
    skipped = [{"status": "skip", "key": key, "reason": "deepseek结果错误"}]

    # DeepSeek output is parsed JSON and may be null, a bare string or a list
    # holding non-strings. Treat all of those as an unusable result instead of
    # crashing (None) or comparing a string character by character.
    if not isinstance(deepseek, (list, tuple)) or not all(
        isinstance(item, str) for item in deepseek
    ):
        return skipped
    if not empty_check(deepseek):
        return skipped

    human_text = "" if human_str is None else str(human_str)
    lst = [x.strip() for x in human_text.split(";")]
    if not empty_check(lst):
        return [
            {"status": "error", "key": key, "reason": "人类数据为空，或者含有空字符串"}
        ]

    deepseek_len = len(deepseek)
    human_len = len(lst)
    if deepseek_len != human_len:
        err_msg = f"人类数据长度 {human_len} 与 Deepseek长度{deepseek_len}不同。"
        return [{"status": "error", "key": key, "reason": err_msg}]

    errors = []
    for d, h in zip(deepseek, lst):
        processed_d = d.strip().lower()
        processed_h = h.strip().lower()
        if processed_d == processed_h:
            continue

        # Any one of the three measures is enough to call the pair similar;
        # `or` skips the remaining measures once one of them passes.
        similar = (
            difflib.SequenceMatcher(None, processed_h, processed_d).ratio() > 0.8
            or fuzz.partial_ratio(processed_h, processed_d) >= 75
            or fuzz.partial_ratio(processed_d, processed_h) >= 75
        )
        if similar:
            errors.append(
                {
                    "status": "warning",
                    "key": key,
                    "reason": f"人类 {h} 与 Deepseek {d} 不完全匹配",
                }
            )
        else:
            errors.append(
                {
                    "status": "error",
                    "key": key,
                    "reason": f"人类 {h} 与 Deepseek {d} 相似度过低",
                }
            )
    return errors


def compare_item(deepseek, human):
    """Compare the authors and the institutions of one record pair.

    Both arguments are indexed with ``"authors"`` and ``"institutions"``; the
    values are handed to ``compare_list`` unchanged. Returns the author
    findings (labelled ``"author"``) followed by the institution findings
    (labelled ``"institution"``) in a single list.
    """
    author_findings = compare_list(deepseek["authors"], human["authors"], "author")
    institution_findings = compare_list(
        deepseek["institutions"], human["institutions"], "institution"
    )
    return [*author_findings, *institution_findings]


def jsonline_to_dict(jsonline):
    """Index a sequence of records by their ``"index"`` value.

    When an index occurs more than once, the first record is kept and an
    error line is printed for every later occurrence.
    """
    by_index = {}
    for record in jsonline:
        record_id = record["index"]
        if record_id not in by_index:
            by_index[record_id] = record
            continue
        print("ERROR: 唯一编号", record_id, "在数据中多次出现")
    return by_index


def main(humans, deepseeks):
    """Compare every DeepSeek record with the human record of the same index.

    Args:
        humans: Records loaded from the human spreadsheet.
        deepseeks: Records loaded from the DeepSeek JSON files.

    Returns:
        A ``(all_errs, fillings)`` tuple. ``all_errs`` contains every finding
        returned by ``compare_item`` with its ``"index"`` set. ``fillings``
        has one dict per compared record with the human ``row``, the worst
        level (``None``, ``"warning"`` or ``"error"``) for ``author`` and
        ``institution``, and ``msg``, the warning/error reasons joined by
        newlines.
    """
    print("转化人类数据")
    human_by_index = jsonline_to_dict(humans)
    print("转化DeepSeek数据")
    deepseek_by_index = jsonline_to_dict(deepseeks)

    all_errs, fillings = [], []

    for idx, deepseek_record in deepseek_by_index.items():
        if idx not in human_by_index:
            print("ERROR: 编号", idx, "存在PDF，但Excel中不存在对应数据")
            continue

        human_record = human_by_index[idx]
        findings = compare_item(deepseek_record, human_record)
        marks = {
            "row": human_record["row"],
            "author": None,
            "institution": None,
            "msg": None,
        }
        reasons = []
        for finding in findings:
            finding["index"] = idx
            all_errs.append(finding)

            level = finding["status"]
            if level not in ("warning", "error"):
                # "skip" (and any unknown level) is reported but never marked.
                continue

            column = finding["key"]
            if level == "error":
                marks[column] = "error"
            elif marks[column] is None:
                # A warning must not downgrade an error recorded earlier.
                marks[column] = "warning"
            reasons.append(finding["reason"])

        marks["msg"] = '\n'.join(reasons)
        fillings.append(marks)

    return all_errs, fillings

def highlight_mismatched_cells(path, sheet, fillings, output_path='highlighted.xlsx'):
    """Colour the mismatching cells of the human workbook and save a copy.

    Args:
        path: Path of the workbook to read.
        sheet: Zero-based position of the worksheet to mark.
        fillings: Dicts with ``row`` (zero-based row position), ``author`` and
            ``institution`` (``None``, ``"warning"`` or another value that is
            treated as an error) and ``msg``.
        output_path: Where the marked workbook is written; defaults to
            ``highlighted.xlsx`` in the current working directory.
    """
    workbook = load_workbook(path)
    worksheet = workbook.worksheets[sheet]

    red_fill = PatternFill(start_color='FFFF0000', end_color='FFFF0000', fill_type='solid')
    yellow_fill = PatternFill(start_color='FFFFFF00', end_color='FFFFFF00', fill_type='solid')

    # Worksheet column (1-based) that holds each compared field.
    field_columns = {"author": 7, "institution": 9}

    # Mark warnings in yellow and errors in red, and write the collected
    # messages into column 24 of the same row.
    for row_info in fillings:
        # "row" is a zero-based position; worksheet rows are 1-based.
        row_num = row_info["row"] + 1

        for field, column in field_columns.items():
            level = row_info[field]
            if level is not None:
                fill = yellow_fill if level == "warning" else red_fill
                worksheet.cell(row=row_num, column=column).fill = fill

        if row_info["msg"] is not None:
            worksheet.cell(row=row_num, column=24).value = row_info["msg"]

    workbook.save(output_path)


if __name__ == "__main__":
    # Command-line entry point: load both data sets, compare them, then write
    # the report workbook and the highlighted copy of the human workbook.
    import argparse

    argparser = argparse.ArgumentParser()
    # The two paths are mandatory; without them the loaders would fail with
    # an opaque traceback instead of a usage message.
    argparser.add_argument(
        "--human", type=str, required=True, help="path of human excel"
    )
    argparser.add_argument(
        "--sheet", type=int, default=0, help="index of the sheet in excel"
    )
    argparser.add_argument(
        "--deepseek", type=str, required=True, help="path of deepseek json"
    )
    argparser.add_argument(
        "--skip_rows", type=int, default=3, help="skipping first n rows"
    )

    args = argparser.parse_args()
    humans = load_human_excel(args.human, args.sheet, args.skip_rows)
    deepseeks = load_deepseek_json(args.deepseek)

    errs, fillings = main(humans, deepseeks)
    # Naming the columns up front keeps an empty result (no findings at all)
    # from raising KeyError, so both output files are still written.
    df = pd.DataFrame(errs, columns=["index", "status", "key", "reason"])
    df = df.rename(
        columns={
            "index": "序号",
            "status": "错误级别",
            "key": "错误列",
            "reason": "错误原因",
        }
    )
    df.to_excel("比较结果.xlsx", index=False)

    highlight_mismatched_cells(args.human, args.sheet, fillings)


