from copy import deepcopy
from utils import get_continent
import pandas as pd
# Load the institution/country summary sheet. The real column titles live in
# the first data row of the sheet rather than in the header pandas picked up.
df = pd.read_excel("机构国家汇总-初版.xlsx")

# Promote that first data row to be the column labels...
df.columns = df.iloc[0].values

# ...and discard it from the data, renumbering the remaining rows from 0.
df = df.iloc[1:].reset_index(drop=True)

# Institution names as a plain list; position i corresponds to row i of df.
institution_name = df["Institution Name"].tolist()

# Papers table to annotate; utf-8-sig strips a leading BOM if one is present.
file_name = "good_papers-niuren.csv"
final = pd.read_csv(file_name, encoding="utf-8-sig")

# Start every row with an empty continent_country value; it is filled in below.
final = final.assign(continent_country="")

# Map every institution name to its country once, so each affiliation fragment
# is resolved with a dict lookup instead of a linear scan of the name list.
# setdefault keeps the first row for a duplicated name, as list.index() would.
institution_country = {}
for inst_name, inst_country in zip(institution_name, df["Country"].tolist()):
    institution_country.setdefault(inst_name, inst_country)

# For each paper, collect one "<continent>-<country>" label per recognised
# affiliation and store the de-duplicated labels, joined by "; ", in the
# continent_country column.
for row_idx, row in final.iterrows():
    affiliations = row["affiliations"]
    if isinstance(affiliations, str):
        # Affiliations are separated by ";" within a single cell.
        aff_split = [x.strip() for x in affiliations.split(";")]
    else:
        # Non-string cells (presumably NaN for empty CSV fields) carry no
        # affiliation data; report them and leave the label list empty.
        print("没有机构信息：", affiliations, "type:", type(affiliations))
        aff_split = []

    continent_country_row = []
    for aff in aff_split:
        # An affiliation is a comma-separated list of fragments; the first
        # fragment that exactly matches a known institution decides the country.
        for x in (part.strip() for part in aff.split(",")):
            if x in institution_country:
                country = institution_country[x]
                continent = get_continent(country)
                continent_country_row.append(f"{continent}-{country}")
                break

    # De-duplicate while keeping first-seen order. A set would order the
    # labels by randomised string hashes, making the output differ between runs.
    final.at[row_idx, "continent_country"] = "; ".join(
        dict.fromkeys(continent_country_row)
    )
        
        
# Write the annotated table to a CSV file, without the index column and with a
# UTF-8 BOM (utf-8-sig).
final.to_csv(
    path_or_buf="good_papers-niuren-task_10.csv",
    encoding="utf-8-sig",
    index=False,
)