from downloadFactory import DownloaderFactory
from driver import ChromeDriver
import csv
import os
import pandas as pd

# Function to write to CSV
def write_to_csv(file_name, keys, detail, is_modified=False):
    """Append one record to a CSV file, writing the header if the file is empty.

    Args:
        file_name: Path of the CSV file. It is opened in append mode, so it is
            created when missing and extended otherwise.
        keys: Column names, handed to ``csv.DictWriter`` as ``fieldnames``.
        detail: Mapping of column name to value for the row. It is never
            modified by this function.
        is_modified: When true, a list-valued ``"authors"`` entry is joined
            with ``"; "`` and a list-valued ``"institutions"`` entry is joined
            with a semicolon followed by a newline before the row is written.
    """
    row = detail
    if is_modified:
        # Format a shallow copy so the caller's dict keeps its original lists
        # and can still be written elsewhere in unformatted form.
        row = dict(detail)
        if isinstance(row.get("authors"), list):
            row["authors"] = "; ".join(row["authors"])
        if isinstance(row.get("institutions"), list):
            row["institutions"] = ";\n".join(row["institutions"])

    with open(file_name, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        # In append mode the position is the current file size, so 0 means
        # the file is empty and still needs its header row.
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerow(row)


def process_txt_files_in_directory(input_dir):
    """Return ``(input_dir, name)`` pairs for each ``.txt`` entry in a directory.

    The entries come from ``os.listdir(input_dir)`` and keep that order; only
    names ending in ``.txt`` are included.
    """
    return [
        (input_dir, entry)
        for entry in os.listdir(input_dir)
        if entry.endswith(".txt")
    ]


def _status_is_set(status):
    """Return True when *status* is a present, truthy flag value."""
    # pandas reads an empty Excel cell back as NaN, and bool(NaN) is True, so
    # a plain truthiness test would treat "no value yet" as "done".
    if pd.isna(status):
        return False
    return bool(status)


def update_xlsx(file_name, row_index, author_status, pdf_status):
    """Mark a row of the Excel sheet as having author and/or PDF data.

    The workbook is re-read from disk, updated, and written back in place
    (without the index column).

    Args:
        file_name: Path of the Excel file to read and overwrite.
        row_index: Index label of the row to update (used with ``df.at``).
        author_status: When set (truthy and not missing/NaN), the row's
            ``'author'`` cell is set to 1.
        pdf_status: When set (truthy and not missing/NaN), the row's
            ``'pdf'`` cell is set to 1.

    Cells are only ever set to 1 here; an unset status leaves the existing
    cell value untouched.
    """
    df = pd.read_excel(file_name)
    if _status_is_set(author_status):
        df.at[row_index, 'author'] = 1
    if _status_is_set(pdf_status):
        df.at[row_index, 'pdf'] = 1
    df.to_excel(file_name, index=False)

if __name__ == "__main__":
    # Script entry point: for every row of the input workbook, fetch the
    # paper's metadata and (for supported sites) its PDF, record progress back
    # into the workbook, and append the metadata to the CSV outputs.

    driver = ChromeDriver()
    input_file = "data/input.xlsx"  # The input Excel file
    supported_pdf_sites = ["mdpi", "acm", "springer", "ieee", "proquest", "patents"]
    df = pd.read_excel(input_file)

    for idx, row in df.iterrows():
        # TODO: add real idx
        url = row['url']
        author_status = row['author']
        pdf_status = row['pdf']
        detail = {}
        # Skip if already processed
        if author_status == 1 and pdf_status == 1:
            continue

        # 1. get author, institution, journal info
        try:
            if url.endswith(".pdf"):
                print(f"Skipping PDF file: {url}")
                continue

            print(f"Crawling {url}..................... ")
            downloaderClass = DownloaderFactory.get_downloader(url)
            downloader = downloaderClass(driver)
            updated_author_status = author_status
            updated_pdf_status = pdf_status

            if author_status != 1:
                detail = downloader.get_author_institution_journal(url)
            # Metadata only counts as fetched when a title came back.
            if detail.get("title"):
                updated_author_status = 1

        except Exception as e:
            # Without a downloader there is nothing more to do for this row.
            print(e)
            continue

        # 2. get pdf
        try:
            # Download PDF if it's not already done
            if updated_pdf_status != 1 and any(site in url for site in supported_pdf_sites):
                state = downloader.download(url, str(idx + 1))
                updated_pdf_status = 1 if state == True else 0
        except Exception as e:
            # A failed download must not discard the metadata fetched in
            # step 1: log it and fall through so the author status and CSV
            # rows are still recorded. The PDF is retried on the next run
            # because its status stays unset.
            print(e)

        # Update the Excel file for this URL. Pass explicit booleans: a blank
        # status cell is read as NaN, which is truthy and would otherwise be
        # recorded as done.
        update_xlsx(
            input_file,
            idx,
            updated_author_status == 1,
            updated_pdf_status == 1,
        )

        if detail:
            keys = detail.keys()

            # Write each detail as it's processed to CSV
            write_to_csv("output.csv", keys, detail)
            write_to_csv(
                "modified_output.csv",
                keys,
                detail,
                is_modified=True,
            )

        print(f"Crawling {url} done.")
# TODO: quit the driver once all rows have been processed; it is currently never closed.
# university of science and technology of china
