Commit f6128fef by nzy

first commit

from pydantic import BaseModel
from openai import OpenAI
from pathlib import Path
import pypdf
import json
import os
import logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


class Author(BaseModel):
    name: str
    affiliations: list[str]


class Metadata(BaseModel):
    title: str
    authors: list[Author]


system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
- Identify all listed authors. Maintain the order presented in the text if possible.
- For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author.
- If an author has no listed institution, use an empty list `[]`.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
"""
def get_authors(content):
    """Ask the model to parse title/author metadata out of the page text."""
    response = client.responses.parse(
        model="gpt-4o",
        input=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        text_format=Metadata,
    )
    return response.output_parsed


def extract_first_page_text(pdf_path):
    """Extracts text content from the first page of a PDF."""
    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) > 0:
            first_page = reader.pages[0]
            text = first_page.extract_text()
            if text:
                # Basic cleaning: remove excessive whitespace
                cleaned_text = " ".join(text.split())
                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                    "utf-8"
                )
                return cleaned_text
            else:
                logging.warning(f"No text found on the first page of {pdf_path.name}")
                return None
        else:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None
    except pypdf.errors.PdfReadError as e:
        logging.error(f"Error reading PDF file {pdf_path.name}: {e}")
        return None
    except FileNotFoundError:
        logging.error(f"PDF file not found: {pdf_path}")
        return None
    except Exception as e:
        logging.error(
            f"An unexpected error occurred while processing {pdf_path.name}: {e}"
        )
        return None


def main(pdf_directory: Path, result_path: Path):
    with open(result_path, "a", encoding="utf-8") as f:
        pdf_files = pdf_directory.rglob("*.pdf")
        for file in pdf_files:
            try:
                logging.info(f"Extracting authors from {file.name}")
                first_page_text = extract_first_page_text(file)
                logging.info(first_page_text)
                if first_page_text is not None:
                    result = get_authors(first_page_text).model_dump()
                    result["filename"] = file.name
                    f.write(json.dumps(result) + "\n")
            except Exception as e:
                logging.error(f"{file.name}: {e}")


if __name__ == "__main__":
import argparse
argparser = argparse.ArgumentParser()
argparser.add_argument("--paper", type=str, required=True)
argparser.add_argument("--result", type=str, required=True)
args = argparser.parse_args()
main(Path(args.paper), Path(args.result))
# Readme
## Dependencies
`openai`, `pypdf`
## Usage
```bash
export OPENAI_API_KEY=...
python extract_authors_info.py --paper ./papers --result result.jsonl
```
- `--paper`: path to the folder containing all the PDFs
- `--result`: path to the result file
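Each line of `result.jsonl` is one JSON record: the `Metadata` fields (`title`, plus `authors` with `name` and `affiliations`) and the source `filename`. A minimal sketch for loading the results, assuming the file already exists:
```python
import json

# One JSON record per line (JSONL)
with open("result.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

for rec in records:
    names = [author["name"] for author in rec["authors"]]
    print(rec["filename"], "|", rec["title"], "|", "; ".join(names))
```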
## Notes
The results are generated automatically by GPT; their accuracy cannot be guaranteed, so manual review is required.
# crawler
## update
Refresh Prof. Yunji Chen's publication list `yunjichen.json` with the update script; `main.py` later relies on this file to look up citing papers.
## main
Create `papers.txt` and add the titles of the papers you want to search, one per line. Entering the full title is not recommended; a partial title is enough, for example:
```
Reproducing Concurrency Bugs Using Local Clocks
binary translator with post-optimization
timing error mitigation for hardware neural networks
A Polyvalent Machine Learning Accelerator
```
URLs that have already been crawled are recorded in `urls.txt` to avoid duplicate crawling, so `main.py` can safely be run repeatedly; if crawling fails, check your network connection first.
### 2. **Run the script**
Make sure the required dependencies (e.g. `pandas`, `openpyxl`, `tqdm`) are installed in your environment.
Run from the command line:
```bash
python main.py
```
**Specify a year** (only crawl citations from that year):
```bash
python main.py --year 2023
```
### 3. **View the results**
After crawling, `citations.xlsx` is generated, with one worksheet per paper listing each citing paper's Title, URL, author information, and more.
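To inspect the workbook programmatically, a minimal sketch (sheet names are derived from the first 20 punctuation-stripped characters of each matched title):
```python
import pandas as pd

xlsx = pd.ExcelFile("citations.xlsx")
print(xlsx.sheet_names)  # one sheet per crawled paper
df = pd.read_excel(xlsx, sheet_name=xlsx.sheet_names[0])
print(df[["Cite Title", "URL", "Authors"]].head())
```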
------
## **Parameters**
- `--year`: only crawl citations from the given year. Without this flag, citations from **all years** are crawled.
**Examples:**
- Crawl everything (default): `python main.py`
- Only crawl 2022 citations: `python main.py --year 2022`
import os
import re
import string
import json
import time
import random
import argparse
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

JSON_FILE = "yunjichen.json"
PUB_FILE = "papers.txt"
URL_FILE = "urls.txt"
DATA_FILE = "citations.xlsx"
def get_cited_url_list(citedby_url, year=None):
    """Yield paginated Google Scholar 'cited by' URLs, 10 results per page."""
    prefix, suffix = citedby_url.split("oi=bibs&hl=en")
    prefix += "start="
    if year:
        # Use the year parameter for filtering
        suffix = f"&hl=en&as_sdt=2005&sciodt=0,5{suffix}&scipsc=&as_ylo={year}&as_yhi={year}&scisbd="
    else:
        # No year filter, fetch all
        suffix = f"&hl=en&as_sdt=2005&sciodt=0,5{suffix}&scipsc="
    for i in range(0, 10000, 10):
        yield prefix + str(i) + suffix


SYMBOL_MORE_AUTHORS = "…"  # Scholar's marker for a truncated author list

# Request headers so Google Scholar serves us like a normal browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}


def parse_scholar_results(html_content):
    """
    Parses HTML content to extract Google Scholar results.
    Args:
        html_content (str): The HTML content to parse.
    Returns:
        list: A list of dictionaries containing '标题' (title), '作者' (authors),
        and '期刊信息' (venue info).
    """
    soup = BeautifulSoup(html_content, "html.parser")
    results = soup.find_all("div", class_="gs_r")
    extracted_data = []
    for result in results:
        # Extract title
        title_tag = result.find(["h2", "h3"], class_="gs_rt")
        if title_tag:
            a_tag = title_tag.find("a")
            title = a_tag.get_text() if a_tag else title_tag.get_text()
            title = title.replace("[CITATION][C] ", "")
        else:
            title = "未找到标题"  # title not found
        # Extract authors and journal information
        authors_journal_tag = result.find("div", class_="gs_a")
        if authors_journal_tag:
            authors_journal_text = authors_journal_tag.get_text()
            parts = authors_journal_text.split("-")
            if len(parts) >= 2:
                authors = parts[0].strip()
                journal_info = parts[1].split(",")[0].strip()
            else:
                authors = "格式不符合预期"  # unexpected format
                journal_info = "格式不符合预期"
        else:
            authors = "未找到作者和期刊信息"  # authors/venue not found
            journal_info = "未找到作者和期刊信息"
        # Add extracted data to list
        extracted_data.append(
            {"标题": title, "作者": authors, "期刊信息": journal_info}
        )
    return extracted_data


def parse_html(url):
    """
    Fetch one 'cited by' page and return (ok, results), where each result is a
    tuple (title, url_to_paper, authors, more_authors_flag).
    """
    results = []
    session = requests.Session()
    response = session.get(url, headers=headers)
    if response.status_code != 200:
        return False, results
    soup = BeautifulSoup(response.content, "html.parser")
    papers = soup.find_all("div", class_="gs_r gs_or gs_scl")
    if len(papers) == 0:
        return False, results
    for paper in papers:
        div_title = paper.find("h3", class_="gs_rt")
        try:
            title = div_title.find("a").get_text()
            new_url = div_title.find("a")["href"]
        except (AttributeError, TypeError):
            # [CITATION]-style entries render the title in spans with no link; skip them
            continue
        authors = paper.find("div", class_="gs_a").get_text().split("-")[0].split(",")
        authors[-1] = " ".join(authors[-1].split())
        authors = ";".join(authors)
        # The ellipsis marks that Scholar truncated the author list
        more_authors = 1 if SYMBOL_MORE_AUTHORS in authors else 0
        results.append((title, new_url, authors, more_authors))
    return True, results


def add_scisbd_sort(url):
    """Ensure the URL has scisbd=1 so results are sorted by date."""
    parsed_url = urlparse(url)
    query = parse_qs(parsed_url.query)
    query["scisbd"] = ["1"]
    new_query = urlencode(query, doseq=True)
    new_url = urlunparse(
        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", new_query, "")
    )
    return new_url


def main(publications, year):
    if not os.path.exists(URL_FILE):
        with open(URL_FILE, "w") as file:
            file.write("")
    publications = [pub.lower() for pub in publications]
    with open(JSON_FILE, "r") as file:
        author = json.load(file)
    author_publications = [
        pub for pub in author["publications"] if pub["container_type"] == "Publication"
    ]
    titles = [pub["bib"]["title"] for pub in author_publications]
    # Match each (partial) query title against the author's publication list
    index_publications = []
    for i, publication in enumerate(publications):
        found = False
        for idx, title in enumerate(titles):
            if publication in title.lower():
                found = True
                index_publications.append(idx)
                publications[i] = title
                break
        if not found:
            index_publications.append(None)
    for idx, publication in enumerate(publications):
        with open(URL_FILE, "a+") as file:
            file.seek(0)
            known_urls = set(line.strip() for line in file.readlines())
        name = "_".join(publication.split())
        name = re.sub(f"[{string.punctuation}]", "", name[:20])
        print(f"Processing {publication}... sheet name: {name}")
        columns = [
            "paper idx",
            "paper Title",
            "Cite idx",
            "Cite Title",
            "URL",
            "Authors",
            "More Authors",
        ]
        # Resume from an existing sheet if the workbook already has one
        if os.path.exists(DATA_FILE) and name in pd.ExcelFile(DATA_FILE).sheet_names:
            old_df = pd.read_excel(DATA_FILE, sheet_name=name)
            data = (
                old_df
                if set(old_df.columns) == set(columns)
                else pd.DataFrame(columns=columns)
            )
            current_start_idx = old_df["Cite idx"].max() if not old_df.empty else 0
        else:
            data = pd.DataFrame(columns=columns)
            current_start_idx = 0
        index_publication = index_publications[idx]
        if index_publication is None:
            continue
        citations = author_publications[index_publication]["num_citations"]
        citedby_url = author_publications[index_publication]["citedby_url"]
        citation_count = current_start_idx
        for i, url in tqdm(enumerate(get_cited_url_list(citedby_url, year))):
            # url = add_scisbd_sort(url)
            if url in known_urls:
                continue
            if i * 10 > citations:
                break
            # Random delay to avoid being rate-limited
            time.sleep(random.uniform(0.5, 2.0))
            mark, results = parse_html(url)
            if not mark:
                break
            for title, cite_url, authors, more_authors in results:
                citation_count += 1
                row = {
                    "paper idx": idx + 1,
                    "paper Title": publication,
                    "Cite idx": citation_count,
                    "Cite Title": title,
                    "URL": cite_url,
                    "Authors": authors,
                    "More Authors": more_authors,
                }
                data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
            known_urls.add(url)
        if os.path.exists(DATA_FILE):
            with pd.ExcelWriter(
                DATA_FILE, mode="a", if_sheet_exists="replace", engine="openpyxl"
            ) as writer:
                data.to_excel(writer, sheet_name=name, index=False)
        else:
            data.to_excel(DATA_FILE, sheet_name=name, index=False)
        with open(URL_FILE, "w") as file:
            file.writelines(url + "\n" for url in known_urls)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Crawl citations with optional year filtering"
)
parser.add_argument(
"--year", type=int, help="The year to filter citations by (optional)"
)
args = parser.parse_args()
# Load the publication file
with open(PUB_FILE, "r", encoding="utf-8") as file:
publications = file.readlines()
publications = [pub.strip().lower() for pub in publications]
main(publications, args.year)
DaDianNao: A Machine-Learning Supercomputer
import json

from scholarly import ProxyGenerator, scholarly

# Activate a proxy, because Google Scholar might otherwise block the IP address.
# The generator needs a proxy source configured; FreeProxies() is one option.
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg, pg)


def main(name):
    author = next(scholarly.search_author(name))
    author = scholarly.fill(author, sections=["publications"])
    with open(f"{name}.json", "w") as f:
        json.dump(author, f, indent=4)


if __name__ == "__main__":
    main("yunji chen")
abbr,fullname
PPoPP,ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
FAST,USENIX Conference on File and Storage Technologies
DAC,Design Automation Conference
HPCA,IEEE International Symposium on High Performance Computer Architecture
MICRO,IEEE/ACM International Symposium on Microarchitecture
SC,"International Conference for High Performance Computing, Networking, Storage, and Analysis"
ASPLOS,International Conference on Architectural Support for Programming Languages and Operating Systems
ISCA,International Symposium on Computer Architecture
USENIX ATC,USENIX Annual Technical Conference
EuroSys,European Conference on Computer Systems
SIGCOMM,"ACM International Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication"
MobiCom,ACM International Conference on Mobile Computing and Networking
INFOCOM,IEEE International Conference on Computer Communications
NSDI,Symposium on Network System Design and Implementation
CCS,ACM Conference on Computer and Communications Security
EUROCRYPT,International Conference on the Theory and Applications of Cryptographic Techniques
S&P,IEEE Symposium on Security and Privacy
CRYPTO,International Cryptology Conference
USENIX Security,USENIX Security Symposium
NDSS,Network and Distributed System Security Symposium
PLDI,ACM SIGPLAN Conference on Programming Language Design and Implementation
POPL,ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages
FSE,ACM International Conference on the Foundations of Software Engineering
SOSP,ACM Symposium on Operating Systems Principles
OOPSLA,"Conference on Object-Oriented Programming Systems, Languages,and Applications"
ASE,International Conference on Automated Software Engineering
ICSE,International Conference on Software Engineering
ISSTA,International Symposium on Software Testing and Analysis
OSDI,USENIX Symposium on Operating Systems Design and Implementations
FM,International Symposium on Formal Methods
SIGMOD,ACM SIGMOD Conference
SIGKDD,ACM SIGKDD Conference on Knowledge Discovery and Data Mining
ICDE,IEEE International Conference on Data Engineering
SIGIR,International ACM SIGIR Conference on Research and Development in Information Retrieval
VLDB,International Conference on Very Large Data Bases
STOC,ACM Symposium on Theory of Computing
SODA,ACM-SIAM Symposium on Discrete Algorithms
CAV,International Conference on Computer Aided Verification
FOCS,IEEE Annual Symposium on Foundations of Computer Science
LICS,ACM/IEEE Symposium on Logic in Computer Science
ACM MM,ACM International Conference on Multimedia
SIGGRAPH,ACM Special Interest Group on Computer Graphics
VR,IEEE Virtual Reality
IEEE VIS,IEEE Visualization Conference
AAAI,AAAI Conference on Artificial Intelligence
NeurIPS,Conference on Neural Information Processing Systems
ACL,Annual Meeting of the Association for Computational Linguistics
CVPR,IEEE/CVF Computer Vision and Pattern Recognition Conference
ICCV,International Conference on Computer Vision
ICML,International Conference on Machine Learning
IJCAI,International Joint Conference on Artificial Intelligence
CSCW,ACM Conference on Computer Supported Cooperative Work and Social Computing
CHI,ACM Conference on Human Factors in Computing Systems
UbiComp/IMWUT,"ACM international joint conference on Pervasive and Ubiquitous Computing/ Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"
UIST,ACM Symposium on User Interface Software and Technology
WWW,International World Wide Web Conference
RTSS,IEEE Real-Time Systems Symposium
WINE,Conference on Web and Internet Economics
TOCS,ACM Transactions on Computer Systems
TOS,ACM Transactions on Storage
TCAD,IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems
TC,IEEE Transactions on Computers
TPDS,IEEE Transactions on Parallel and Distributed Systems
TACO,ACM Transactions on Architecture and Code Optimization
JSAC,IEEE Journal on Selected Areas in Communications
TMC,IEEE Transactions on Mobile Computing
TON,IEEE/ACM Transactions on Networking
TDSC,IEEE Transactions on Dependable and Secure Computing
TIFS,IEEE Transactions on Information Forensics and Security
,Journal of Cryptology
TOPLAS,ACM Transactions on Programming Languages and Systems
TOSEM,ACM Transactions on Software Engineering and Methodology
TSE,IEEE Transactions on Software Engineering
TSC,IEEE Transactions on Services Computing
TODS,ACM Transactions on Database Systems
TOIS,ACM Transactions on Information Systems
TKDE,IEEE Transactions on Knowledge and Data Engineering
VLDBJ,The VLDB Journal
TIT,IEEE Transactions on Information Theory
IANDC,Information and Computation
SICOMP,SIAM Journal on Computing
TOG,ACM Transactions on Graphics
TIP,IEEE Transactions on Image Processing
TVCG,IEEE Transactions on Visualization and Computer Graphics
AI,Artificial Intelligence
TPAMI,IEEE Transactions on Pattern Analysis and Machine Intelligence
IJCV,International Journal of Computer Vision
JMLR,Journal of Machine Learning Research
TOCHI,ACM Transactions on Computer-Human Interaction
IJHCS,International Journal of Human-Computer Studies
JACM,Journal of the ACM
Proc. IEEE,Proceedings of the IEEE
SCIS,Science China Information Sciences
title,venue
Learning to Generalize With Object-Centric Agents in the Open World Survival Game Crafter,"IEEE Transactions on Games ( Volume: 16, Issue: 2, June 2024)"
Focus-Then-Decide: Segmentation-Assisted Reinforcement Learning,"Proceedings of the AAAI Conference on Artificial Intelligence, 3"
"Advancing DRL Agents in Commercial Fighting Games: Training, Integration,
and Agent-Human Alignment",Proceedings of the 41st International Conference on Machine Learning (ICML 2024)
Discovering and Using Structure in Autonomous Machine Learning,ETH Zurich thesis
Jose Luis Flores Campana,PhD thesis
Usage: python is_ccfa.py
Function: determine whether each paper appeared in a CCF-A venue
Input file: CCF_A_list.csv, with two columns: abbr,fullname
Input folder: data/title_venue, containing CSV files whose header row is title,venue
Output folder: data/is_ccfa; each CSV under title_venue is processed and written out with the same filename and the header row title,venue,is_ccf_a
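A minimal sketch of that processing, assuming `pandas` and a naive substring match against abbr/fullname (the real is_ccfa.py may match differently; short abbreviations such as "SC" will over-match and need extra care):
```python
from pathlib import Path

import pandas as pd

# Collect every CCF-A abbreviation and full name, lowercased
ccf = pd.read_csv("CCF_A_list.csv").fillna("")
ccf_names = {str(n).lower() for n in ccf["abbr"].tolist() + ccf["fullname"].tolist() if n}


def is_ccf_a(venue) -> bool:
    """True if any CCF-A abbreviation or full name appears in the venue string."""
    venue = str(venue).lower()
    return any(name in venue for name in ccf_names)


out_dir = Path("data/is_ccfa")
out_dir.mkdir(parents=True, exist_ok=True)
for csv_path in Path("data/title_venue").glob("*.csv"):
    df = pd.read_csv(csv_path)
    df["is_ccf_a"] = df["venue"].map(is_ccf_a)
    df.to_csv(out_dir / csv_path.name, index=False)
```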