import os
import json
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from utils import firstlast2lastfirst, name_in_niuren_list

# Excel workbook the script entry point reads the citation rows from.
input_file_path = '测试输入3.xlsx'
# Excel workbook the annotated result is written to.
output_file_path = '测试输出3.xlsx'


def load_niuren_pool():
    """Load the candidate-name pool from the merged CSV file.

    Returns:
        A list with one entry per CSV row, in file order. Each entry is the
        row's ``name`` value with non-breaking spaces (U+00A0) replaced by
        ordinary spaces and then lower-cased.
    """
    pool_frame = pd.read_csv("info/new_niuren_format-merged_turing.csv", encoding='utf-8-sig')

    normalized_names = []
    for raw_name in pool_frame["name"].tolist():
        # Swap NBSP for a regular space so names compare equal regardless of
        # which kind of space the source data used.
        with_plain_spaces = raw_name.replace("\xa0", " ")
        normalized_names.append(with_plain_spaces.lower())
    return normalized_names


def load_true_niuren():
    """Load the confirmed-niuren roster and each person's known paper titles.

    Reads the "全局牛人" sheet of the statistics workbook. For the i-th roster
    row (1-based) it then looks for ``info/niuren_papers/<i>.xlsx`` and takes
    the first column of that file as the person's paper titles.

    Returns:
        A ``(names, papers)`` tuple of two lists with one entry per roster row:

        * ``names[i]`` is the person's name passed through
          ``firstlast2lastfirst`` and lower-cased. When the row lists aliases
          it is instead a list ``[name, alias_1, ...]`` built the same way.
        * ``papers[i]`` is the list of that person's paper titles, stripped
          and lower-cased, or ``[]`` when no paper file exists.
    """
    alias_column = "别名列表（各种奇奇怪怪的名字格式，比如first name和second name的顺序，以;分隔）"
    true_niuren = pd.read_excel("info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx", sheet_name="全局牛人")

    def parse_aliases(cell):
        # A non-string cell (e.g. NaN for an empty cell) means "no aliases".
        if not isinstance(cell, str):
            return []
        # Strip BEFORE filtering: otherwise a trailing "; " or a
        # whitespace-only piece survives the emptiness check and ends up as
        # an empty alias.
        pieces = (piece.strip() for piece in cell.split(";"))
        return [firstlast2lastfirst(piece).lower() for piece in pieces if piece]

    # Build the name entries: a plain string, or [name, *aliases] when the
    # row has aliases.
    true_niuren_names = []
    for raw_name, alias_cell in zip(true_niuren["姓名"], true_niuren[alias_column]):
        primary_name = firstlast2lastfirst(raw_name).lower()
        aliases = parse_aliases(alias_cell)
        if aliases:
            true_niuren_names.append([primary_name] + aliases)
        else:
            true_niuren_names.append(primary_name)

    # Build the per-person paper lists; files are numbered by 1-based roster
    # position.
    true_niuren_papers = []
    for position in range(1, len(true_niuren_names) + 1):
        paper_file_path = os.path.join("info/niuren_papers", f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            true_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        raw_titles = papers_df.iloc[:, 0].dropna().tolist()
        # Titles are looked up with a stripped, lower-cased key, so store them
        # in the same normal form. str() keeps a non-text cell (e.g. a purely
        # numeric title) from crashing the load.
        true_niuren_papers.append([str(paper).strip().lower() for paper in raw_titles])
    return true_niuren_names, true_niuren_papers



def load_fake_niuren():
    """Load the known non-niuren roster and each person's known paper titles.

    Reads the "全局非牛人" sheet of the statistics workbook. For the i-th
    roster row (1-based) it then looks for ``info/feiniuren_papers/<i>.xlsx``
    and takes the first column of that file as the person's paper titles.

    Returns:
        A ``(names, papers)`` tuple of two lists with one entry per roster row:

        * ``names[i]`` is the person's name passed through
          ``firstlast2lastfirst`` and lower-cased.
        * ``papers[i]`` is the list of that person's paper titles, stripped
          and lower-cased, or ``[]`` when no paper file exists.
    """
    fake_niuren = pd.read_excel("info/新表-论文被引用统计-陈老师-截止2025年X月XX日.xlsx", sheet_name="全局非牛人")

    # Only the name column is needed, so iterate it directly.
    fake_niuren_names = [
        firstlast2lastfirst(raw_name).lower() for raw_name in fake_niuren["姓名"]
    ]

    # Paper files are numbered by 1-based roster position.
    fake_niuren_papers = []
    for position in range(1, len(fake_niuren_names) + 1):
        paper_file_path = os.path.join("info/feiniuren_papers", f"{position}.xlsx")
        if not os.path.exists(paper_file_path):
            fake_niuren_papers.append([])
            continue
        papers_df = pd.read_excel(paper_file_path, usecols=[0])
        raw_titles = papers_df.iloc[:, 0].dropna().tolist()
        # Titles are looked up with a stripped, lower-cased key, so store them
        # in the same normal form. str() keeps a non-text cell (e.g. a purely
        # numeric title) from crashing the load.
        fake_niuren_papers.append([str(paper).strip().lower() for paper in raw_titles])
    return fake_niuren_names, fake_niuren_papers


def check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Classify the authors of one cited paper as confirmed or suspected niuren.

    Each non-empty author is handled as follows:

    1. Skipped if the name is not found in ``niuren_pool_names``.
    2. Recorded as confirmed if the name is found in ``true_niuren_names`` and
       the title is among that person's papers.
    3. Skipped if the name is found in ``fake_niuren_names`` and the title is
       among that person's papers.
    4. Otherwise recorded as suspected.

    Args:
        authors: List of author name strings; empty strings are ignored but
            still occupy a position for the reported author order.
        title: Title of the paper. A non-string value (e.g. NaN from a blank
            spreadsheet cell) is treated as an empty title, which matches no
            stored paper.
        niuren_pool_names: Candidate-pool names.
        true_niuren_names: Confirmed-niuren name entries.
        true_niuren_papers: Per-confirmed-person lists of paper titles,
            parallel to ``true_niuren_names``.
        fake_niuren_names: Known non-niuren names.
        fake_niuren_papers: Per-non-niuren lists of paper titles, parallel to
            ``fake_niuren_names``.

    Returns:
        A tuple of three ``";"``-joined strings: the 1-based confirmed-niuren
        indices, the matching 1-based author positions, and the suspected
        authors formatted as ``name(pool_index)`` with a 1-based pool index.
    """
    # Normalise the title once. Guarding on the type avoids an AttributeError
    # when the title is not a string.
    title_key = title.strip().lower() if isinstance(title, str) else ""

    niuren = []
    order = []
    suspected_niuren = []

    for author_idx, author in enumerate(authors):
        if author == "":
            continue

        # The same normalised key is used for all three lookups.
        author_key = firstlast2lastfirst(author).lower()

        # Is the author in the candidate pool at all?
        name_index = name_in_niuren_list(author_key, niuren_pool_names)
        if name_index == -1:
            continue

        # Is the author a confirmed niuren who wrote this paper?
        true_name_index = name_in_niuren_list(author_key, true_niuren_names)
        if true_name_index != -1 and title_key in true_niuren_papers[true_name_index]:
            niuren.append(true_name_index + 1)
            order.append(author_idx + 1)
            continue

        # Is the author a known non-niuren who wrote this paper?
        fake_name_index = name_in_niuren_list(author_key, fake_niuren_names)
        if fake_name_index != -1 and title_key in fake_niuren_papers[fake_name_index]:
            continue

        # In the pool but neither confirmed nor ruled out for this paper.
        suspected_niuren.append([author, name_index + 1])

    niuren_str = ";".join(str(i) for i in niuren)
    order_str = ";".join(str(i) for i in order)
    suspected_niuren_str = ";".join(f"{name}({index})" for name, index in suspected_niuren)
    return niuren_str, order_str, suspected_niuren_str


def process_row(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers):
    """Run the niuren check for one citation row.

    Args:
        index: Row label, returned unchanged so the caller can write the
            result back to the right row.
        row: Mapping-like row (anything with ``.get``) holding the
            ``'引文作者'`` (``";"``-separated authors) and ``'引文名称'``
            (title) fields.
        niuren_pool_names, true_niuren_names, true_niuren_papers,
        fake_niuren_names, fake_niuren_papers: Lookup data forwarded
            unchanged to ``check_niuren``.

    Returns:
        ``(index, niuren, niuren_order, suspected_niuren)`` where the last
        three are the strings produced by ``check_niuren``, or empty strings
        when the row has no usable author list.
    """
    authors = row.get('引文作者', '')
    title = row.get('引文名称', '')
    if not isinstance(authors, str):
        return index, "", "", ""

    # Strip BEFORE dropping empties: a piece such as " " in "A; ;B" would
    # otherwise survive as a blank author and shift the reported 1-based
    # position of every author after it.
    stripped_pieces = (piece.strip() for piece in authors.split(";"))
    authors = [piece for piece in stripped_pieces if piece]
    if not authors:
        # Nothing to classify; same result check_niuren yields for no authors.
        return index, "", "", ""

    niuren_true, niuren_true_order, suspected_niuren = check_niuren(authors, title, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
    return index, niuren_true, niuren_true_order, suspected_niuren


if __name__ == "__main__":
    # Read the first three rows of the input sheet as raw cells (no header)
    # so they can be written back unchanged above the processed table.
    original_header = pd.read_excel(input_file_path, nrows=3, header=None)

    niuren_pool_names = load_niuren_pool()
    true_niuren_names, true_niuren_papers = load_true_niuren()
    fake_niuren_names, fake_niuren_papers = load_fake_niuren()

    # Read the data table, skipping those same three leading rows; the next
    # row supplies the column names.
    input_df = pd.read_excel(input_file_path, skiprows=3)

    # Result columns, filled in below.
    input_df['牛人'] = None
    input_df['牛人署名顺序'] = None
    input_df['疑似牛人'] = None

    # iterrows() is a generator with no length, so pass the row count to tqdm
    # explicitly; without it the bar can only show a running counter.
    row_iter = tqdm(input_df.iterrows(), total=len(input_df))
    results = Parallel(n_jobs=-1)(
        delayed(process_row)(index, row, niuren_pool_names, true_niuren_names, true_niuren_papers, fake_niuren_names, fake_niuren_papers)
        for index, row in row_iter
    )
    for index, niuren_true, niuren_true_order, suspected_niuren in results:
        input_df.at[index, '牛人'] = niuren_true
        input_df.at[index, '牛人署名顺序'] = niuren_true_order
        input_df.at[index, '疑似牛人'] = suspected_niuren

    # All pieces must have the same width before they can be stacked.
    max_cols = max(original_header.shape[1], input_df.shape[1])

    def pad_df(df, max_cols):
        """Return a copy of *df* with exactly *max_cols* columns named 0..max_cols-1.

        Missing columns are appended filled with empty strings.
        """
        df = df.copy()
        for i in range(df.shape[1], max_cols):
            df[i] = ""
        df = df.iloc[:, :max_cols]  # keep at most max_cols columns
        df.columns = range(max_cols)  # positional names so concat aligns by position
        return df

    # Bring the leading rows, the column-name row and the data to one width.
    original_header_padded = pad_df(original_header, max_cols)
    input_header = pd.DataFrame([input_df.columns.tolist()])
    input_header_padded = pad_df(input_header, max_cols)
    input_data_padded = pad_df(input_df, max_cols)

    # Stack: original leading rows, then the column names, then the data.
    output_df = pd.concat([original_header_padded, input_header_padded, input_data_padded], ignore_index=True)

    # Write without pandas' own index/header; the header row is already part
    # of the stacked frame.
    output_df.to_excel(output_file_path, index=False, header=False)
    print(f"处理后的数据已保存到 {output_file_path}")