from utils import contains_chinese, chinese_to_english_name, firstlast2lastfirst, name_in_niuren_list
import json
import pandas as pd
# Load the niuren table once; every later lookup works on this in-memory frame.
niuren_df = pd.read_csv("new_niuren_format-merged_turing.csv", encoding='utf-8-sig')
# Every column from the third one onward is treated as a title column.
niuren_title_names = niuren_df.columns[2:].tolist()
# Take the names from the frame that is already loaded instead of parsing the
# CSV a second time. Non-breaking spaces (U+00A0) are normalized to plain
# spaces. The list keeps the row order of niuren_df, because the index found
# in this list is later used with niuren_df.iloc.
niuren_name = [name.replace("\xa0", " ") for name in niuren_df["name"].tolist()]
# Input JSON Lines file: one JSON object per line.
# Alternative inputs (commented out):
# file_path = 'allpapers_authors.jsonl'
# file_path = '任务三pdf提取作者信息.jsonl'
# file_path = 'good_papers-威.jsonl'
file_path = 'good_papers.jsonl'  # replace with your own file path

# Parse every non-blank line into a JSON object. Blank or whitespace-only
# lines are skipped, because json.loads would raise JSONDecodeError on them.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]


# Annotate every paper with the authors that are found in the niuren list.
# Each paper dict gains:
#   - 'niuren_authors': 1-based positions (within the paper's author list) of
#     the matched authors;
#   - one key per title column of niuren_df, holding a "; "-joined string of
#     ordinals for the matched authors whose value in that column equals 1.0.
# Every paper is kept in the output, including papers without any match.
papers_with_niuren = []

for item in data:
    author_names = [author['name'] for author in item['authors']]
    # Non-string names (e.g. NaN from a missing value) become empty strings.
    author_names = [name if isinstance(name, str) else "" for name in author_names]
    # Names containing Chinese characters are converted by the project helper.
    author_names = [
        chinese_to_english_name(name) if contains_chinese(name) else name
        for name in author_names
    ]
    author_names = [firstlast2lastfirst(name) for name in author_names]

    matched_positions = []
    title_ordinals = {title: [] for title in niuren_title_names}
    for position, author_name in enumerate(author_names, start=1):
        name_index = name_in_niuren_list(author_name, niuren_name)
        if name_index == -1:
            # -1 is treated as "no match" for this author.
            continue
        matched_positions.append(position)
        # NOTE(review): title columns record the ordinal of the author among
        # the matched authors (1 for the first match, 2 for the second, ...),
        # not the position in the full author list -- confirm this is intended.
        match_ordinal = len(matched_positions)
        title_values = niuren_df.iloc[name_index][2:].tolist()
        for title, value in zip(niuren_title_names, title_values):
            if value == 1.0:
                title_ordinals[title].append(match_ordinal)

    item['niuren_authors'] = matched_positions
    for title, ordinals in title_ordinals.items():
        item[title] = "; ".join(map(str, ordinals))
    papers_with_niuren.append(item)

# Write the annotated papers as JSON Lines: one JSON object per line, with
# non-ASCII characters kept as-is rather than escaped.
# Alternative outputs (commented out):
# with open('good_papers-威-final.jsonl', 'w', encoding='utf-8') as f:
# with open('任务三pdf提取作者信息-final.jsonl', 'w', encoding='utf-8') as f:
with open('good_papers-niuren.jsonl', 'w', encoding='utf-8') as f:
    for paper in papers_with_niuren:
        f.write(json.dumps(paper, ensure_ascii=False) + '\n')

