import os
import re
import string
import json
import pandas as pd
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import time
import random
import argparse
from bs4 import BeautifulSoup
import requests
import re
import string

# Author profile (JSON) loaded by main(); its "publications" list supplies
# titles, citation counts and "cited by" URLs.
JSON_FILE = "yunjichen.json"
# Text file read by the command-line entry point: one publication title per line.
PUB_FILE = "papers.txt"
# Result-page URLs that were already fetched, one per line; main() skips them.
URL_FILE = "urls.txt"
# Excel workbook that receives the crawled citations, one sheet per publication.
DATA_FILE = "citations.xlsx"


def get_cited_url_list(citedby_url, year=None):
    """
    Lazily generate the paginated result-page URLs for a "cited by" link.

    The input URL is cut at the marker "oi=bibs&hl=en"; a "start=<offset>"
    parameter is inserted at that point and the remainder of the original
    query is re-attached after a fixed set of search parameters.

    Args:
        citedby_url (str): "Cited by" URL containing the marker exactly once
            (otherwise the unpacking raises ValueError on first iteration).
        year (int, optional): When truthy, restricts both the lower and the
            upper year bound of the query to this value.

    Yields:
        str: One URL per page, for offsets 0, 10, 20, ..., 9990.
    """
    head, tail = citedby_url.split("oi=bibs&hl=en")

    # Parameters shared by both variants; the year filter is appended on top.
    query_tail = f"&hl=en&as_sdt=2005&sciodt=0,5{tail}&scipsc="
    if year:
        query_tail += f"&as_ylo={year}&as_yhi={year}&scisbd="

    offset = 0
    while offset < 10000:
        yield f"{head}start={offset}{query_tail}"
        offset += 10


# Ellipsis character that parse_html() looks for in the author string to flag
# an author list that was cut short.
SYMBOL_MORE_AUTHORS = "…"

# Request headers sent with every page fetch in parse_html().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}


def parse_scholar_results(html_content):
    """
    Parses HTML content to extract Google Scholar results.

    Every ``<div class="gs_r">`` is treated as one result. The title comes
    from the ``gs_rt`` heading (link text when a link exists), the authors and
    venue from the ``gs_a`` byline. Missing or unexpected pieces are replaced
    by fixed placeholder strings instead of raising.

    Args:
        html_content (str): The HTML content to parse.

    Returns:
        list: A list of dictionaries containing '标题', '作者', and '期刊信息'.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    extracted_data = []

    for result in soup.find_all("div", class_="gs_r"):
        # Extract title
        title_tag = result.find(["h2", "h3"], class_="gs_rt")
        if title_tag is None:
            title = "未找到标题"
        else:
            a_tag = title_tag.find("a")
            title = (a_tag if a_tag is not None else title_tag).get_text()
            title = title.replace("[CITATION][C] ", "")

        # Extract authors and journal information
        authors_journal_tag = result.find("div", class_="gs_a")
        if authors_journal_tag is None:
            authors = "未找到作者和期刊信息"
            journal_info = "未找到作者和期刊信息"
        else:
            # Split only on a hyphen surrounded by whitespace (\s also matches
            # a non-breaking space). Splitting on every "-" cut hyphenated
            # author or venue names in half.
            # NOTE(review): assumes the byline separates its fields with a
            # whitespace-delimited hyphen — confirm against live markup.
            parts = re.split(r"\s+-\s+", authors_journal_tag.get_text())
            if len(parts) >= 2:
                authors = parts[0].strip()
                # Keep only the text before the first comma of the 2nd field.
                journal_info = parts[1].split(",")[0].strip()
            else:
                authors = "格式不符合预期"
                journal_info = "格式不符合预期"

        # Add extracted data to list
        extracted_data.append(
            {"标题": title, "作者": authors, "期刊信息": journal_info}
        )

    return extracted_data


def parse_html(url, timeout=30):
    """
    Download one result page and extract the citing papers listed on it.

    Args:
        url (str): URL of the result page to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple: ``(ok, results)``.
            ok (bool): False when the request fails, the status code is not
                200, or the page contains no result entries; the caller uses
                this as the signal to stop paging.
            results (list): ``(title, new_url, authors, more_authors)`` tuples
                where ``new_url`` is the link to the paper, ``authors`` is a
                ";"-joined string and ``more_authors`` is 1 when the author
                list contains the "more authors" ellipsis, else 0.
    """
    results = []
    try:
        # The context manager closes the session's connections; the timeout
        # keeps a stalled server from hanging the crawl forever.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported like any other unusable page.
        return False, results
    if response.status_code != 200:
        return False, results

    soup = BeautifulSoup(response.content, "html.parser")
    papers = soup.find_all("div", class_="gs_r gs_or gs_scl")
    if not papers:
        return False, results

    for paper in papers:
        div_title = paper.find("h3", class_="gs_rt")
        link = div_title.find("a") if div_title is not None else None
        new_url = link.get("href") if link is not None else None
        if new_url is None:
            # Entries without a linked title (span-only blocks) are skipped.
            continue
        title = link.get_text()

        byline = paper.find("div", class_="gs_a")
        byline_text = byline.get_text() if byline is not None else ""
        # Authors are the part before the first whitespace-delimited hyphen;
        # splitting on a bare "-" truncated hyphenated names.
        # NOTE(review): assumes the byline separates its fields with a
        # whitespace-delimited hyphen — confirm against live markup.
        authors = re.split(r"\s+-\s+", byline_text, maxsplit=1)[0].split(",")
        # Collapse whitespace (incl. non-breaking spaces) in the last name.
        authors[-1] = " ".join(authors[-1].split())
        authors = ";".join(authors)
        # Test the whole string: after the join, authors[-1] is only the last
        # character, which found the ellipsis only when it came last.
        more_authors = 1 if SYMBOL_MORE_AUTHORS in authors else 0
        results.append((title, new_url, authors, more_authors))
    return True, results


def add_scisbd_sort(url):
    """
    Return *url* with the query parameter ``scisbd`` forced to 1 (sort by date).

    The query string is parsed and re-encoded, so an existing ``scisbd`` value
    is overwritten, parameters with blank values are dropped, and values are
    percent-encoded. The path parameters and the fragment are discarded.
    """
    pieces = urlparse(url)
    params = parse_qs(pieces.query)
    params["scisbd"] = ["1"]
    encoded = urlencode(params, doseq=True)
    # scheme, netloc and path are kept; params and fragment are emptied.
    return urlunparse((*pieces[:3], "", encoded, ""))


def _sheet_name_for(publication):
    """Build the sheet name: spaces -> "_", first 20 chars, punctuation removed."""
    name = "_".join(publication.split())
    # "_" is itself in string.punctuation, so the joiners are stripped too.
    # re.escape keeps "\" and "]" from being read as regex syntax.
    return re.sub(f"[{re.escape(string.punctuation)}]", "", name[:20])


def _load_existing_sheet(name, columns):
    """Return ``(data, last_cite_idx)`` for sheet *name* of DATA_FILE, if any."""
    empty = pd.DataFrame(columns=columns)
    if not os.path.exists(DATA_FILE):
        return empty, 0
    # The context manager closes the workbook handle after reading.
    with pd.ExcelFile(DATA_FILE) as workbook:
        if name not in workbook.sheet_names:
            return empty, 0
        old_df = pd.read_excel(workbook, sheet_name=name)

    if old_df.empty or "Cite idx" not in old_df.columns:
        last_idx = 0
    else:
        last_idx = old_df["Cite idx"].max()
        last_idx = 0 if pd.isna(last_idx) else int(last_idx)
    # Rows are reused only when the sheet has exactly the expected columns.
    data = old_df if set(old_df.columns) == set(columns) else empty
    return data, last_idx


def main(publications, year):
    """
    Crawl the citing papers of the requested publications into DATA_FILE.

    Each entry of *publications* is matched, case-insensitively and as a
    substring, against the publication titles stored in JSON_FILE; the first
    matching title wins. For every matched publication the "cited by" result
    pages are fetched one by one (pages listed in URL_FILE are skipped), the
    citing papers are appended to the publication's sheet in DATA_FILE, and
    URL_FILE is rewritten with the pages fetched so far.

    Args:
        publications (list): Title fragments of the publications to process.
        year (int or None): Optional year filter passed to the URL generator.
    """
    if not os.path.exists(URL_FILE):
        with open(URL_FILE, "w") as file:
            file.write("")
    publications = [pub.lower() for pub in publications]
    with open(JSON_FILE, "r", encoding="utf-8") as file:
        author = json.load(file)
    author_publications = [
        pub for pub in author["publications"] if pub["container_type"] == "Publication"
    ]
    titles = [pub["bib"]["title"] for pub in author_publications]
    lowered_titles = [title.lower() for title in titles]

    index_publications = []
    for i, query in enumerate(publications):
        # An empty query is a substring of every title, so it never matches.
        match = next(
            (
                pos
                for pos, lowered in enumerate(lowered_titles)
                if query and query in lowered
            ),
            None,
        )
        index_publications.append(match)
        if match is not None:
            publications[i] = titles[match]

    columns = [
        "paper idx",
        "paper Title",
        "Cite idx",
        "Cite Title",
        "URL",
        "Authors",
        "More Authors",
    ]

    for idx, publication in enumerate(publications):
        with open(URL_FILE, "a+") as file:
            file.seek(0)
            know_urls = {line.strip() for line in file}
        name = _sheet_name_for(publication)
        print(f"Processing {publication}...to {name}")

        index_publication = index_publications[idx]
        if index_publication is None:
            # Not found in the profile: nothing to crawl or write.
            continue

        data, current_start_idx = _load_existing_sheet(name, columns)
        entry = author_publications[index_publication]
        citations = entry["num_citations"]
        citedby_url = entry["citedby_url"]

        citation_count = current_start_idx
        rows = []
        for i, url in tqdm(enumerate(get_cited_url_list(citedby_url, year))):
            if url in know_urls:
                continue
            # Stop once the page offset is past the recorded citation count.
            if i * 10 > citations:
                break
            # Random delay between requests.
            time.sleep(random.uniform(0.5, 2.0))
            mark, results = parse_html(url)
            if not mark:
                break
            for title, cite_url, authors, more_authors in results:
                citation_count += 1
                rows.append(
                    {
                        "paper idx": idx + 1,
                        "paper Title": publication,
                        "Cite idx": citation_count,
                        "Cite Title": title,
                        "URL": cite_url,
                        "Authors": authors,
                        "More Authors": more_authors,
                    }
                )
            know_urls.add(url)

        if rows:
            # Build the frame once instead of appending row by row through the
            # private DataFrame._append.
            new_rows = pd.DataFrame(rows, columns=columns)
            data = (
                new_rows
                if data.empty
                else pd.concat([data, new_rows], ignore_index=True)
            )

        if os.path.exists(DATA_FILE):
            with pd.ExcelWriter(
                DATA_FILE, mode="a", if_sheet_exists="replace", engine="openpyxl"
            ) as writer:
                data.to_excel(writer, sheet_name=name, index=False)
        else:
            data.to_excel(DATA_FILE, sheet_name=name, index=False)

        with open(URL_FILE, "w") as file:
            file.writelines(f"{known}\n" for known in sorted(know_urls))


if __name__ == "__main__":
    # Command-line entry point: parse the optional --year filter, read the
    # publication titles from PUB_FILE and start the crawl.
    cli = argparse.ArgumentParser(
        description="Crawl citations with optional year filtering"
    )
    cli.add_argument(
        "--year", type=int, help="The year to filter citations by (optional)"
    )
    options = cli.parse_args()

    # One title per line; surrounding whitespace is removed and case folded.
    with open(PUB_FILE, "r", encoding="utf-8") as pub_file:
        wanted = [line.strip().lower() for line in pub_file]
    main(wanted, options.year)
