import openpyxl
from openpyxl.styles import PatternFill
from openpyxl.comments import Comment
import re
import os

# ============ User configuration ==============
excel_path = "测试输入.xlsx"  # path of the workbook to check
sheet_names = ["Dadiannao"]  # names of the sheets to check, e.g. ["Dadiannao", "Diannao"]
# ==============================================
# ============ Outputs =========================
# File 1: <excel_path without extension>_标记<extension>
#         (copy of the workbook; badly formatted cells are filled red and
#         carry a comment explaining what is wrong)
# File 2: <excel_path without extension>_格式问题报告.txt
#         (plain-text report, one problem per line)
# ==============================================

# Header titles that every checked sheet must contain. check_sheet compares
# them against the header row after stripping surrounding whitespace; note
# that two titles contain an embedded newline.
EXPECTED_COLS = [
    "序号",
    "引文链接",
    "引文名称",
    "期刊/会议名称",
    "是否是CCF-A",
    "引用序号",
    "引文作者",
    "通讯作者顺序",
    "引文机构",
    "引文机构所属国家索引",
    "引文机构所属国家",
    "引文机构在知名企业中的索引",
    "知名企业名称\n（参考知名企业列表）",
    "牛人\n(参考全局牛人列表)",
    "牛人署名顺序",
]

# Solid red fill that check_sheet applies to cells (or header cells) that
# fail validation.
red_fill = PatternFill(start_color='FFFF6666', end_color='FFFF6666', fill_type='solid')

def check_not_empty(value):
    """Require a non-blank cell.

    Returns ``["不能为空"]`` when *value* is ``None`` or is blank once
    converted to ``str`` and stripped; otherwise returns an empty list.
    """
    is_blank = value is None or not str(value).strip()
    return ["不能为空"] if is_blank else []

def check_url(value):
    """Accept any citation-link cell; URL validation is disabled for this report.

    A stricter rule (non-empty and starting with an http/https/ftp scheme)
    used to live here but is intentionally switched off, so *value* is
    ignored and an empty error list is always returned.
    """
    return []

def check_citation_title(value):
    """Validate a citation title cell.

    Returns a list of error messages (empty when the title is fine):

    * ``"为空"`` when the cell is ``None`` or blank (nothing else is checked);
    * one ``"前缀不合法：..."`` entry per forbidden search-engine tag the title
      starts with (case-insensitive, compared against the unstripped text);
    * ``"含有换行符"`` when the title contains a newline.
    """
    if value is None or not str(value).strip():
        return ["为空"]

    title = str(value)
    lowered = title.lower()
    problems = [
        f"前缀不合法：{tag}"
        for tag in ("[PDF]", "[HTML]", "[citation]", "[BOOK]")
        if lowered.startswith(tag.lower())
    ]
    if "\n" in title:
        problems.append("含有换行符")
    return problems

def check_ccfa(value):
    """Require the CCF-A flag cell to be exactly "是" or "否".

    The comparison is exact (no stripping), so ``None``, blanks and padded
    values are all rejected. Returns a list with one message on failure,
    otherwise an empty list.
    """
    allowed = ("是", "否")
    return [] if value in allowed else ["只能为'是'或'否'"]

def check_reference_index(value):
    """Require the reference-index cell to have some content.

    Only presence is checked; a stricter "digits or '无'" rule is
    deliberately not enforced. Returns ``["不能为空"]`` for ``None`` or a
    blank value, otherwise an empty list.
    """
    text = "" if value is None else str(value).strip()
    if text:
        return []
    return ["不能为空"]
def check_authors(value):
    """Validate the citation-authors cell (names separated by ASCII ';').

    Surrounding whitespace and trailing ASCII semicolons are ignored. Returns
    a list of error messages, in this order, empty when the cell is valid:

    * ``"为空"`` for ``None``/blank (nothing else is checked);
    * the literal `` and `` used as a separator;
    * two semicolons in a row (optionally with whitespace between them);
    * an ASCII or full-width comma;
    * a full-width semicolon;
    * a dot-like character other than the ASCII period;
    * one entry per name that contains anything besides ASCII letters,
      characters in U+00C0-U+024F, '.', '-' and whitespace.
    """
    if value is None or not str(value).strip():
        return ["为空"]

    text = str(value).strip().rstrip(";")  # drop trailing ASCII semicolons

    # Dot look-alikes that are rejected in favour of the ASCII period:
    # U+2024, U+2027, U+2022, U+00B7, U+2219, U+22C5, U+30FB, U+FF0E.
    dot_lookalikes = '[\u2024\u2027\u2022\u00B7\u2219\u22C5\u30FB\uFF0E]'

    # (condition, message) pairs, listed in the order they are reported.
    rules = (
        (" and " in text, "不能出现 ' and '"),
        # consecutive semicolons, allowing any whitespace in between
        (re.search(r';\s*;', text) is not None, "不能有连续分号（分号间不能有空格）"),
        (',' in text or '，' in text, "不能有逗号"),
        ("；" in text, "不能用中文分号，请使用英文分号"),
        (re.search(dot_lookalikes, text) is not None, "请使用英文句点（.），不要用其它点符号"),
    )
    problems = [message for triggered, message in rules if triggered]

    name_pattern = re.compile(r'^[A-Za-z\u00C0-\u024F.\-\s]+$')
    problems.extend(
        f"作者名称不规范: {piece}"
        for piece in text.split(";")
        if not name_pattern.fullmatch(piece.strip())
    )
    return problems

def check_comm_author_order(value):
    """Validate the corresponding-author-order cell.

    After stripping whitespace and trailing ASCII semicolons the cell must be
    either the literal "无" or a ';'-separated list of digit strings.

    Returns a list of error messages (empty when valid): ``"为空"`` for
    ``None``/blank, then, in order, a full-width-semicolon error, a newline
    error, a double-space error, and one entry per item that is not all
    digits.
    """
    if value is None or not str(value).strip():
        return ["为空"]

    text = str(value).strip().rstrip(";")  # drop trailing ASCII semicolons
    if text == "无":
        # "无" cannot trip any of the checks below, so it is accepted as-is.
        return []

    problems = []
    if "；" in text:
        problems.append("不能用中文分号，请使用英文分号")
    if '\n' in text:
        problems.append("有换行符")
    if "  " in text:
        problems.append("不能有连续2个以上空格")
    problems.extend(
        f"不是数字: {piece}"
        for piece in text.split(";")
        if not piece.strip().isdigit()
    )
    return problems

def check_affiliation(value):
    """Validate the citation-affiliation cell.

    Returns a list of error messages (empty when valid): ``"为空"`` for
    ``None``/blank, otherwise, in order, an entry when the text mentions
    "department" (case-insensitive), contains a newline, or contains a
    full-width semicolon.
    """
    if value is None or not str(value).strip():
        return ["为空"]

    text = str(value)
    # (condition, message) pairs, listed in the order they are reported.
    rules = (
        ("department" in text.lower(), "请写大学或者公司，不能含department等二级单位"),
        ('\n' in text, "含有换行符"),
        ("；" in text, "不能用中文分号，请使用英文分号"),
    )
    return [message for triggered, message in rules if triggered]

def check_aff_country_index(value):
    """Validate the affiliation-country-index cell (digits separated by ';').

    Surrounding whitespace and trailing ASCII semicolons are ignored. Returns
    a list of error messages (empty when valid): ``"为空"`` for
    ``None``/blank, otherwise, in order, a full-width-semicolon error, a
    newline error, and one entry per item that is not all digits.
    """
    if value is None or not str(value).strip():
        return ["为空"]

    text = str(value).strip().rstrip(";")  # drop trailing ASCII semicolons
    problems = []
    if "；" in text:
        problems.append("不能用中文分号，请使用英文分号")
    if '\n' in text:
        problems.append("含有换行符")
    problems.extend(
        f"不是数字或者没有用分号隔开: {piece}"
        for piece in text.split(";")
        if not piece.strip().isdigit()
    )
    return problems

def check_aff_country(value):
    """Validate the affiliation-country cell.

    Returns ``["为空"]`` for ``None``/blank, a single full-width-semicolon
    error when the text contains "；", and an empty list otherwise.
    """
    text = "" if value is None else str(value).strip()
    if not text:
        return ["为空"]
    # Trailing ASCII semicolons are irrelevant to the only remaining check.
    if "；" in text.rstrip(";"):
        return ["不能用中文分号，请使用英文分号"]
    return []

def check_enterprise_index(value):
    """Validate the "index within well-known enterprises" cell.

    After stripping whitespace and trailing ASCII semicolons the cell must be
    a ';'-separated list whose items are each all digits or the literal "无",
    with no spaces and no full-width semicolons.

    Returns a list of error messages (empty when valid): ``"不能为空"`` for
    ``None``/blank, otherwise, in order, a full-width-semicolon error, a
    space error, and a single item-format error. The item-format message
    does not name the offending item, so it is reported once per cell rather
    than once per bad item (which only produced repeated identical text in
    the cell comment and the report).
    """
    if value is None or str(value).strip() == "":
        return ["不能为空"]

    val = str(value).strip().rstrip(";")  # drop trailing ASCII semicolons
    errors = []
    if "；" in val:
        errors.append("不能用中文分号，请使用英文分号")
    if " " in val:
        errors.append("不能出现空格")

    items = (part.strip() for part in val.split(";"))
    if any(not (item == "无" or item.isdigit()) for item in items):
        errors.append("只能用英文分号分隔，每项只能为数字或'无'")
    return errors

# Maps a header title to the function that validates cells in that column.
# Each function takes the raw cell value and returns a list of error
# messages (empty when the value is acceptable). Columns that are required
# by EXPECTED_COLS but have no entry here are not validated by check_sheet.
col_check_map = {
    "序号": check_not_empty,
    "引文链接": check_url,
    "引文名称": check_citation_title,
    "是否是CCF-A": check_ccfa,
    "引用序号": check_reference_index,
    "引文作者": check_authors,
    "通讯作者顺序": check_comm_author_order,
    "引文机构": check_affiliation,
    "引文机构所属国家索引": check_aff_country_index,
    "引文机构所属国家": check_aff_country,
    "引文机构在知名企业中的索引": check_enterprise_index,
    # Checks for the remaining columns can be added here.
}

def check_sheet(ws, sheet_name, report):
    """Validate one worksheet in place and append every finding to *report*.

    The header is expected on row 4 and data from row 5 to ``ws.max_row``.

    * If the sheet has no row 4 at all, a "数据为空" line is reported.
    * If any title from ``EXPECTED_COLS`` is missing from the header row, one
      report line per missing title is added, header cells in columns 1-10
      are filled red, a comment listing the missing titles is attached to
      the first header cell, and no data rows are checked.
    * Otherwise every data cell in a column that has a validator in
      ``col_check_map`` is checked; a failing cell is filled red, gets a
      comment with the joined error messages, and produces one report line.

    Args:
        ws: openpyxl worksheet to check; it is modified (fills/comments).
        sheet_name: Name used as the prefix of report lines.
        report: List of strings that findings are appended to.

    Returns:
        None.
    """
    header_row_idx = 4
    data_start_row = header_row_idx + 1

    # Only the header row is needed here, so take the first yielded row
    # instead of materialising every row of the sheet.
    header_cells = next(
        iter(ws.iter_rows(min_row=header_row_idx, values_only=False)), None
    )
    if header_cells is None:
        report.append(f"{sheet_name}: 数据为空")
        return

    # Title -> 1-based column number; the first occurrence wins when a title
    # appears more than once.
    first_col_by_title = {}
    for col_num, cell in enumerate(header_cells, start=1):
        title = str(cell.value).strip() if cell.value else ""
        first_col_by_title.setdefault(title, col_num)

    col_map = {}
    missing_cols = []
    for expect in EXPECTED_COLS:
        col_num = first_col_by_title.get(expect)
        if col_num is None:
            missing_cols.append(expect)
            # The header is read from row 4 (header_row_idx), so the hint
            # must name the fourth row, matching the cell comment below.
            report.append(f"标题行应该在第四行！{sheet_name}：缺少列：{expect}")
        else:
            col_map[expect] = col_num

    if missing_cols:
        # Fill header cells in columns 1-10 red.
        for col in range(1, 11):
            ws.cell(row=header_row_idx, column=col).fill = red_fill
        # Attach the list of missing titles to the first header cell.
        miss_txt = "标题应该在第四行\n"
        miss_txt += "缺少列: " + "; ".join(missing_cols)
        ws.cell(row=header_row_idx, column=1).comment = Comment(miss_txt, "checker")
        return

    # Resolve each column's validator once instead of once per cell.
    checked_cols = [
        (col_name, col_num, col_check_map[col_name])
        for col_name, col_num in col_map.items()
        if col_check_map.get(col_name)
    ]

    max_row = ws.max_row
    for row_idx in range(data_start_row, max_row + 1):
        for col_name, col_num, checker in checked_cols:
            cell = ws.cell(row=row_idx, column=col_num)
            errors = checker(cell.value)
            if not errors:
                continue
            err_text = "; ".join(errors)
            cell.fill = red_fill
            cell.comment = Comment(err_text, "checker")
            report.append(f"{sheet_name}: 第{row_idx}行[{col_name}] 错误: {err_text}")

def main():
    """Check the configured sheets and write the two output files.

    Loads ``excel_path``, runs ``check_sheet`` on every name in
    ``sheet_names`` (a missing sheet is reported instead of checked), saves
    the annotated workbook next to the input with a ``_标记`` suffix, and
    writes the collected findings, one per line, to
    ``<base>_格式问题报告.txt`` in UTF-8.
    """
    workbook = openpyxl.load_workbook(excel_path)
    findings = []

    for name in sheet_names:
        if name in workbook.sheetnames:
            check_sheet(workbook[name], name, findings)
        else:
            findings.append(f"{name}: sheet不存在")

    stem, suffix = os.path.splitext(excel_path)

    marked_path = f"{stem}_标记{suffix}"
    workbook.save(marked_path)
    print(f"已保存标记文件: {marked_path}")

    report_path = f"{stem}_格式问题报告.txt"
    with open(report_path, "w", encoding="utf-8") as out:
        out.writelines(f"{entry}\n" for entry in findings)
    print(f"已输出报告至: {report_path}")

# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()