Commit f6128fef by nzy

first commit

from pydantic import BaseModel
from openai import OpenAI
from pathlib import Path
import pypdf
import json
import os
import logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


class Author(BaseModel):
    name: str
    affiliations: list[str]


class Metadata(BaseModel):
    title: str
    authors: list[Author]


system_prompt = """
Act as an expert metadata extraction assistant.
Analyze the following text, which is extracted from the first page of a document (likely a scientific paper or report).
Your goal is to extract the document title, all authors, and their corresponding affiliations.
Extraction Guidelines:
- **Title:** Extract the main title of the document. If ambiguous or missing, use "".
- **Authors:**
- Identify all listed authors. Maintain the order presented in the text if possible.
- For each author:
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- Extract all associated institutions/affiliations mentioned for that specific author.
- If an author has no listed institution, use an empty list `[]`.
- If there are many authors and only one afflication, these authors all come from the same afflication. other wise find the corresponding afflication by indicator.
- **Handling Missing Data:** If no authors can be identified in the text, the "authors" field in the JSON should be an empty list `[]`.
"""
def get_authors(content):
    """Ask the model to parse title/author metadata out of the page text."""
    response = client.responses.parse(
        model="gpt-4o",
        input=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
        text_format=Metadata,
    )
    return response.output_parsed


def extract_first_page_text(pdf_path):
    """Extracts text content from the first page of a PDF."""
    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) > 0:
            first_page = reader.pages[0]
            text = first_page.extract_text()
            if text:
                # Basic cleaning: remove excessive whitespace
                cleaned_text = " ".join(text.split())
                cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                    "utf-8"
                )
                return cleaned_text
            else:
                logging.warning(f"No text found on the first page of {pdf_path.name}")
                return None
        else:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None
    except pypdf.errors.PdfReadError as e:
        logging.error(f"Error reading PDF file {pdf_path.name}: {e}")
        return None
    except FileNotFoundError:
        logging.error(f"PDF file not found: {pdf_path}")
        return None
    except Exception as e:
        logging.error(
            f"An unexpected error occurred while processing {pdf_path.name}: {e}"
        )
        return None


def main(pdf_directory: Path, result_path: Path):
    with open(result_path, "a", encoding="utf-8") as f:
        pdf_files = pdf_directory.rglob("*.pdf")
        for file in pdf_files:
            try:
                logging.info(f"Extracting authors from {file.name}")
                first_page_text = extract_first_page_text(file)
                logging.info(first_page_text)
                if first_page_text is not None:
                    result = get_authors(first_page_text).model_dump()
                    result["filename"] = file.name
                    f.write(json.dumps(result) + "\n")
            except Exception as e:
                logging.error(f"{file.name}: {e}")


if __name__ == "__main__":
import argparse
argparser = argparse.ArgumentParser()
argparser.add_argument("--paper", type=str, required=True)
argparser.add_argument("--result", type=str, required=True)
args = argparser.parse_args()
main(Path(args.paper), Path(args.result))
# Readme
## Dependencies
`openai`, `pypdf`
## Usage
```bash
export OPENAI_API_KEY=...
python extract_authors_info.py --paper ./papers --result result.jsonl
```
- `--paper`: path to the folder containing all the PDFs
- `--result`: path to the result file
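Each line of `result.jsonl` is one JSON record: the `Metadata` fields (`title`, plus `authors` with `name` and `affiliations`) and the source `filename`. A minimal sketch for loading the results, assuming the file already exists:
```python
import json

# One JSON record per line (JSONL)
with open("result.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

for rec in records:
    names = [author["name"] for author in rec["authors"]]
    print(rec["filename"], "|", rec["title"], "|", "; ".join(names))
```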
## Notes
The results are generated automatically by GPT; their accuracy cannot be guaranteed, so manual review is required.
# crawler
## update
Refresh Prof. Yunji Chen's publication list `yunjichen.json` with the update script; `main.py` later relies on this file to look up citing papers.
## main
Create `papers.txt` and add the titles of the papers you want to search, one per line. Entering the full title is not recommended; a partial title is enough, for example:
```
Reproducing Concurrency Bugs Using Local Clocks
binary translator with post-optimization
timing error mitigation for hardware neural networks
A Polyvalent Machine Learning Accelerator
```
URLs that have already been crawled are recorded in `urls.txt` to avoid duplicate crawling, so `main.py` can safely be run repeatedly; if crawling fails, check your network connection first.
### 2. **Run the script**
Make sure the required dependencies (e.g. `pandas`, `openpyxl`, `tqdm`) are installed in your environment.
Run from the command line:
```bash
python main.py
```
**Specify a year** (only crawl citations from that year):
```bash
python main.py --year 2023
```
### 3. **View the results**
After crawling, `citations.xlsx` is generated, with one worksheet per paper listing each citing paper's Title, URL, author information, and more.
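To inspect the workbook programmatically, a minimal sketch (sheet names are derived from the first 20 punctuation-stripped characters of each matched title):
```python
import pandas as pd

xlsx = pd.ExcelFile("citations.xlsx")
print(xlsx.sheet_names)  # one sheet per crawled paper
df = pd.read_excel(xlsx, sheet_name=xlsx.sheet_names[0])
print(df[["Cite Title", "URL", "Authors"]].head())
```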
------
## **Parameters**
- `--year`: only crawl citations from the given year. Without this flag, citations from **all years** are crawled.
**Examples:**
- Crawl everything (default): `python main.py`
- Only crawl 2022 citations: `python main.py --year 2022`
import os
import re
import string
import json
import time
import random
import argparse
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

JSON_FILE = "yunjichen.json"
PUB_FILE = "papers.txt"
URL_FILE = "urls.txt"
DATA_FILE = "citations.xlsx"
def get_cited_url_list(citedby_url, year=None):
    """Yield paginated Google Scholar 'cited by' URLs, 10 results per page."""
    prefix, suffix = citedby_url.split("oi=bibs&hl=en")
    prefix += "start="
    if year:
        # Use the year parameter for filtering
        suffix = f"&hl=en&as_sdt=2005&sciodt=0,5{suffix}&scipsc=&as_ylo={year}&as_yhi={year}&scisbd="
    else:
        # No year filter, fetch all
        suffix = f"&hl=en&as_sdt=2005&sciodt=0,5{suffix}&scipsc="
    for i in range(0, 10000, 10):
        yield prefix + str(i) + suffix


SYMBOL_MORE_AUTHORS = "…"  # Scholar's marker for a truncated author list

# Request headers so Google Scholar serves us like a normal browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}


def parse_scholar_results(html_content):
    """
    Parses HTML content to extract Google Scholar results.
    Args:
        html_content (str): The HTML content to parse.
    Returns:
        list: A list of dictionaries containing '标题' (title), '作者' (authors),
        and '期刊信息' (venue info).
    """
    soup = BeautifulSoup(html_content, "html.parser")
    results = soup.find_all("div", class_="gs_r")
    extracted_data = []
    for result in results:
        # Extract title
        title_tag = result.find(["h2", "h3"], class_="gs_rt")
        if title_tag:
            a_tag = title_tag.find("a")
            title = a_tag.get_text() if a_tag else title_tag.get_text()
            title = title.replace("[CITATION][C] ", "")
        else:
            title = "未找到标题"  # title not found
        # Extract authors and journal information
        authors_journal_tag = result.find("div", class_="gs_a")
        if authors_journal_tag:
            authors_journal_text = authors_journal_tag.get_text()
            parts = authors_journal_text.split("-")
            if len(parts) >= 2:
                authors = parts[0].strip()
                journal_info = parts[1].split(",")[0].strip()
            else:
                authors = "格式不符合预期"  # unexpected format
                journal_info = "格式不符合预期"
        else:
            authors = "未找到作者和期刊信息"  # authors/venue not found
            journal_info = "未找到作者和期刊信息"
        # Add extracted data to list
        extracted_data.append(
            {"标题": title, "作者": authors, "期刊信息": journal_info}
        )
    return extracted_data


def parse_html(url):
    """
    Fetch one 'cited by' page and return (ok, results), where each result is a
    tuple (title, url_to_paper, authors, more_authors_flag).
    """
    results = []
    session = requests.Session()
    response = session.get(url, headers=headers)
    if response.status_code != 200:
        return False, results
    soup = BeautifulSoup(response.content, "html.parser")
    papers = soup.find_all("div", class_="gs_r gs_or gs_scl")
    if len(papers) == 0:
        return False, results
    for paper in papers:
        div_title = paper.find("h3", class_="gs_rt")
        try:
            title = div_title.find("a").get_text()
            new_url = div_title.find("a")["href"]
        except (AttributeError, TypeError):
            # [CITATION]-style entries render the title in spans with no link; skip them
            continue
        authors = paper.find("div", class_="gs_a").get_text().split("-")[0].split(",")
        authors[-1] = " ".join(authors[-1].split())
        authors = ";".join(authors)
        # The ellipsis marks that Scholar truncated the author list
        more_authors = 1 if SYMBOL_MORE_AUTHORS in authors else 0
        results.append((title, new_url, authors, more_authors))
    return True, results


def add_scisbd_sort(url):
    """Ensure the URL has scisbd=1 so results are sorted by date."""
    parsed_url = urlparse(url)
    query = parse_qs(parsed_url.query)
    query["scisbd"] = ["1"]
    new_query = urlencode(query, doseq=True)
    new_url = urlunparse(
        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", new_query, "")
    )
    return new_url


def main(publications, year):
    if not os.path.exists(URL_FILE):
        with open(URL_FILE, "w") as file:
            file.write("")
    publications = [pub.lower() for pub in publications]
    with open(JSON_FILE, "r") as file:
        author = json.load(file)
    author_publications = [
        pub for pub in author["publications"] if pub["container_type"] == "Publication"
    ]
    titles = [pub["bib"]["title"] for pub in author_publications]
    # Match each (partial) query title against the author's publication list
    index_publications = []
    for i, publication in enumerate(publications):
        found = False
        for idx, title in enumerate(titles):
            if publication in title.lower():
                found = True
                index_publications.append(idx)
                publications[i] = title
                break
        if not found:
            index_publications.append(None)
    for idx, publication in enumerate(publications):
        with open(URL_FILE, "a+") as file:
            file.seek(0)
            known_urls = set(line.strip() for line in file.readlines())
        name = "_".join(publication.split())
        name = re.sub(f"[{string.punctuation}]", "", name[:20])
        print(f"Processing {publication}... sheet name: {name}")
        columns = [
            "paper idx",
            "paper Title",
            "Cite idx",
            "Cite Title",
            "URL",
            "Authors",
            "More Authors",
        ]
        # Resume from an existing sheet if the workbook already has one
        if os.path.exists(DATA_FILE) and name in pd.ExcelFile(DATA_FILE).sheet_names:
            old_df = pd.read_excel(DATA_FILE, sheet_name=name)
            data = (
                old_df
                if set(old_df.columns) == set(columns)
                else pd.DataFrame(columns=columns)
            )
            current_start_idx = old_df["Cite idx"].max() if not old_df.empty else 0
        else:
            data = pd.DataFrame(columns=columns)
            current_start_idx = 0
        index_publication = index_publications[idx]
        if index_publication is None:
            continue
        citations = author_publications[index_publication]["num_citations"]
        citedby_url = author_publications[index_publication]["citedby_url"]
        citation_count = current_start_idx
        for i, url in tqdm(enumerate(get_cited_url_list(citedby_url, year))):
            # url = add_scisbd_sort(url)
            if url in known_urls:
                continue
            if i * 10 > citations:
                break
            # Random delay to avoid being rate-limited
            time.sleep(random.uniform(0.5, 2.0))
            mark, results = parse_html(url)
            if not mark:
                break
            for title, cite_url, authors, more_authors in results:
                citation_count += 1
                row = {
                    "paper idx": idx + 1,
                    "paper Title": publication,
                    "Cite idx": citation_count,
                    "Cite Title": title,
                    "URL": cite_url,
                    "Authors": authors,
                    "More Authors": more_authors,
                }
                data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
            known_urls.add(url)
        if os.path.exists(DATA_FILE):
            with pd.ExcelWriter(
                DATA_FILE, mode="a", if_sheet_exists="replace", engine="openpyxl"
            ) as writer:
                data.to_excel(writer, sheet_name=name, index=False)
        else:
            data.to_excel(DATA_FILE, sheet_name=name, index=False)
        with open(URL_FILE, "w") as file:
            file.writelines(url + "\n" for url in known_urls)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Crawl citations with optional year filtering"
)
parser.add_argument(
"--year", type=int, help="The year to filter citations by (optional)"
)
args = parser.parse_args()
# Load the publication file
with open(PUB_FILE, "r", encoding="utf-8") as file:
publications = file.readlines()
publications = [pub.strip().lower() for pub in publications]
main(publications, args.year)
DaDianNao: A Machine-Learning Supercomputer
import json

from scholarly import ProxyGenerator, scholarly

# Activate a proxy, because Google Scholar might otherwise block the IP address.
# The generator needs a proxy source configured; FreeProxies() is one option.
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg, pg)


def main(name):
    author = next(scholarly.search_author(name))
    author = scholarly.fill(author, sections=["publications"])
    with open(f"{name}.json", "w") as f:
        json.dump(author, f, indent=4)


if __name__ == "__main__":
    main("yunji chen")
abbr,fullname
PPoPP,ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
FAST,USENIX Conference on File and Storage Technologies
DAC,Design Automation Conference
HPCA,IEEE International Symposium on High Performance Computer Architecture
MICRO,IEEE/ACM International Symposium on Microarchitecture
SC,"International Conference for High Performance Computing, Networking, Storage, and Analysis"
ASPLOS,International Conference on Architectural Support for Programming Languages and Operating Systems
ISCA,International Symposium on Computer Architecture
USENIX ATC,USENIX Annual Technical Conference
EuroSys,European Conference on Computer Systems
SIGCOMM,"ACM International Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication"
MobiCom,ACM International Conference on Mobile Computing and Networking
INFOCOM,IEEE International Conference on Computer Communications
NSDI,Symposium on Network System Design and Implementation
CCS,ACM Conference on Computer and Communications Security
EUROCRYPT,International Conference on the Theory and Applications of Cryptographic Techniques
S&P,IEEE Symposium on Security and Privacy
CRYPTO,International Cryptology Conference
USENIX Security,USENIX Security Symposium
NDSS,Network and Distributed System Security Symposium
PLDI,ACM SIGPLAN Conference on Programming Language Design and Implementation
POPL,ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages
FSE,ACM International Conference on the Foundations of Software Engineering
SOSP,ACM Symposium on Operating Systems Principles
OOPSLA,"Conference on Object-Oriented Programming Systems, Languages,and Applications"
ASE,International Conference on Automated Software Engineering
ICSE,International Conference on Software Engineering
ISSTA,International Symposium on Software Testing and Analysis
OSDI,USENIX Symposium on Operating Systems Design and Implementations
FM,International Symposium on Formal Methods
SIGMOD,ACM SIGMOD Conference
SIGKDD,ACM SIGKDD Conference on Knowledge Discovery and Data Mining
ICDE,IEEE International Conference on Data Engineering
SIGIR,International ACM SIGIR Conference on Research and Development in Information Retrieval
VLDB,International Conference on Very Large Data Bases
STOC,ACM Symposium on Theory of Computing
SODA,ACM-SIAM Symposium on Discrete Algorithms
CAV,International Conference on Computer Aided Verification
FOCS,IEEE Annual Symposium on Foundations of Computer Science
LICS,ACM/IEEE Symposium on Logic in Computer Science
ACM MM,ACM International Conference on Multimedia
SIGGRAPH,ACM Special Interest Group on Computer Graphics
VR,IEEE Virtual Reality
IEEE VIS,IEEE Visualization Conference
AAAI,AAAI Conference on Artificial Intelligence
NeurIPS,Conference on Neural Information Processing Systems
ACL,Annual Meeting of the Association for Computational Linguistics
CVPR,IEEE/CVF Computer Vision and Pattern Recognition Conference
ICCV,International Conference on Computer Vision
ICML,International Conference on Machine Learning
IJCAI,International Joint Conference on Artificial Intelligence
CSCW,ACM Conference on Computer Supported Cooperative Work and Social Computing
CHI,ACM Conference on Human Factors in Computing Systems
UbiComp/IMWUT,"ACM international joint conference on Pervasive and Ubiquitous Computing/ Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"
UIST,ACM Symposium on User Interface Software and Technology
WWW,International World Wide Web Conference
RTSS,IEEE Real-Time Systems Symposium
WINE,Conference on Web and Internet Economics
TOCS,ACM Transactions on Computer Systems
TOS,ACM Transactions on Storage
TCAD,IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems
TC,IEEE Transactions on Computers
TPDS,IEEE Transactions on Parallel and Distributed Systems
TACO,ACM Transactions on Architecture and Code Optimization
JSAC,IEEE Journal on Selected Areas in Communications
TMC,IEEE Transactions on Mobile Computing
TON,IEEE/ACM Transactions on Networking
TDSC,IEEE Transactions on Dependable and Secure Computing
TIFS,IEEE Transactions on Information Forensics and Security
,Journal of Cryptology
TOPLAS,ACM Transactions on Programming Languages and Systems
TOSEM,ACM Transactions on Software Engineering and Methodology
TSE,IEEE Transactions on Software Engineering
TSC,IEEE Transactions on Services Computing
TODS,ACM Transactions on Database Systems
TOIS,ACM Transactions on Information Systems
TKDE,IEEE Transactions on Knowledge and Data Engineering
VLDBJ,The VLDB Journal
TIT,IEEE Transactions on Information Theory
IANDC,Information and Computation
SICOMP,SIAM Journal on Computing
TOG,ACM Transactions on Graphics
TIP,IEEE Transactions on Image Processing
TVCG,IEEE Transactions on Visualization and Computer Graphics
AI,Artificial Intelligence
TPAMI,IEEE Transactions on Pattern Analysis and Machine Intelligence
IJCV,International Journal of Computer Vision
JMLR,Journal of Machine Learning Research
TOCHI,ACM Transactions on Computer-Human Interaction
IJHCS,International Journal of Human-Computer Studies
JACM,Journal of the ACM
Proc. IEEE,Proceedings of the IEEE
SCIS,Science China Information Sciences
title,venue
Learning to Generalize With Object-Centric Agents in the Open World Survival Game Crafter,"IEEE Transactions on Games ( Volume: 16, Issue: 2, June 2024)"
Focus-Then-Decide: Segmentation-Assisted Reinforcement Learning,"Proceedings of the AAAI Conference on Artificial Intelligence, 3"
"Advancing DRL Agents in Commercial Fighting Games: Training, Integration,
and Agent-Human Alignment",Proceedings of the 41st International Conference on Machine Learning (ICML 2024)
Discovering and Using Structure in Autonomous Machine Learning,ETH Zurich thesis
Jose Luis Flores Campana,PhD thesis
Usage: python is_ccfa.py
Function: determine whether each paper appeared in a CCF-A venue
Input file: CCF_A_list.csv, with two columns: abbr,fullname
Input folder: data/title_venue, containing CSV files whose header row is title,venue
Output folder: data/is_ccfa; each CSV under title_venue is processed and written out with the same filename and the header row title,venue,is_ccf_a
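A minimal sketch of that processing, assuming `pandas` and a naive substring match against abbr/fullname (the real is_ccfa.py may match differently; short abbreviations such as "SC" will over-match and need extra care):
```python
from pathlib import Path

import pandas as pd

# Collect every CCF-A abbreviation and full name, lowercased
ccf = pd.read_csv("CCF_A_list.csv").fillna("")
ccf_names = {str(n).lower() for n in ccf["abbr"].tolist() + ccf["fullname"].tolist() if n}


def is_ccf_a(venue) -> bool:
    """True if any CCF-A abbreviation or full name appears in the venue string."""
    venue = str(venue).lower()
    return any(name in venue for name in ccf_names)


out_dir = Path("data/is_ccfa")
out_dir.mkdir(parents=True, exist_ok=True)
for csv_path in Path("data/title_venue").glob("*.csv"):
    df = pd.read_csv(csv_path)
    df["is_ccf_a"] = df["venue"].map(is_ccf_a)
    df.to_csv(out_dir / csv_path.name, index=False)
```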