from utils import get_continent
import pandas as pd
from copy import deepcopy

# Source sheet with one row per paper.
hpca = pd.read_excel("hpca.xlsx")

# Column layout of the output table, in the order it is written out.
new_hpca_columns = [
    "届数",
    "论文名称",
    "第一作者",
    "第一作者机构",
    "第一作者洲-国家",
    "通讯作者",
    "通讯作者机构",
    "通讯作者洲-国家",
]
new_hpca = pd.DataFrame(columns=new_hpca_columns)

# Copy every column straight from the source sheet except the two
# "洲-国家" (continent-country) columns: those are not read from hpca but
# are populated further down in the script.
for column in new_hpca_columns:
    if column.endswith("洲-国家"):
        continue
    new_hpca[column] = hpca[column]

# Institution/country lookup sheet. Its first data row holds the values that
# are used as column labels, so promote that row to the header and then
# remove it from the data.
df = pd.read_excel("机构国家汇总-初版.xlsx")
new_columns = df.iloc[0].values
df.columns = new_columns
df = df.iloc[1:].reset_index(drop=True)

# Parallel lists: entry i of each list comes from row i of df.
institution_name = df["Institution Name"].tolist()
country_name = df["Country"].tolist()


def _split_affiliations(raw):
    """Return the stripped, comma-separated parts of one affiliation cell.

    Args:
        raw: The cell value. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as "no affiliation".

    Returns:
        A list of affiliation parts. The list is empty when ``raw`` is not a
        string or contains ``";"``; such cells are deliberately skipped, so
        the corresponding output cell becomes an empty string.
    """
    # Explicit checks instead of assert/except: asserts are stripped under
    # ``python -O``, which would silently change which rows are skipped.
    if not isinstance(raw, str) or ";" in raw:
        return []
    # Non-breaking spaces are normalised to plain spaces before splitting.
    return [part.strip() for part in raw.replace("\xa0", " ").split(",")]


def _continent_country_labels(raw, institution_to_country, known_countries):
    """Build the "; "-joined ``continent-country`` labels for one cell.

    Every comma-separated part of the affiliation, and then every
    space-separated word of those parts, is looked up both as an institution
    name and as a country name.

    Args:
        raw: The affiliation cell value (see ``_split_affiliations``).
        institution_to_country: Mapping from institution name to its country.
        known_countries: Set of country names that may appear verbatim.

    Returns:
        The distinct labels joined with ``"; "`` in first-seen order, or an
        empty string when nothing matched.
    """
    parts = _split_affiliations(raw)
    words = [word.strip() for part in parts for word in part.split(" ")]

    # A dict keeps insertion order, so duplicates are dropped while the
    # output order stays the same on every run (a set would not guarantee it).
    labels = {}
    for candidate in parts + words:
        if candidate in institution_to_country:
            country = institution_to_country[candidate]
            continent = get_continent(country)
            labels[f"{continent}-{country}"] = None
        if candidate in known_countries:
            continent = get_continent(candidate)
            labels[f"{continent}-{candidate}"] = None
    return "; ".join(labels)


# Build the lookup structures once instead of scanning the lists per candidate.
# setdefault keeps the first row for a repeated institution name, which is the
# row list.index() would have returned.
institution_to_country = {}
for _name, _country in zip(institution_name, country_name):
    institution_to_country.setdefault(_name, _country)
known_countries = set(country_name)

# Fill the two derived columns from their corresponding affiliation columns.
new_hpca["第一作者洲-国家"] = [
    _continent_country_labels(cell, institution_to_country, known_countries)
    for cell in new_hpca["第一作者机构"]
]
new_hpca["通讯作者洲-国家"] = [
    _continent_country_labels(cell, institution_to_country, known_countries)
    for cell in new_hpca["通讯作者机构"]
]
        
        
# Save the result as a CSV file without the index column; the "utf-8-sig"
# encoding prepends a BOM to the output.
output_csv = "hpca-洲-国家.csv"
new_hpca.to_csv(output_csv, encoding="utf-8-sig", index=False)