Commit 7db669c9 by jiangdongchen

update, comment in README

parent c84a9230
......@@ -12,7 +12,7 @@
- 输出关键信息json文件的文件夹
- source_excel_path
- 放置需要check的excel表格
-4行开始实际表项
-content_start+1行开始实际表项
- 第一列索引
- 第三列论文标题
- 第七列论文作者
......@@ -52,11 +52,13 @@
2. **遍历**excel的sheet
1. **遍历**sheet中的论文名称和索引
1. **大模型**读取pdf中第一页的论文名称和关键信息,存储到json文件夹下
2. **遍历**excel表格中的论文名称进行模糊匹配, 匹配成功后
2. 读取pdf中从后向前的引用信息, 通过**正则表达式**找出sheetname对应文章在当前pdf文章中的索引,存储到json文件夹下
3. **遍历**excel表格中的论文名称进行模糊匹配, 匹配成功后
1. 将pdf文件中的关键信息写入json文件中进行保存, 包括 标题 会议名称 作者姓名 机构 国家.
2. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文标题、会议名称、作者姓名、机构、国家.
3. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
2. 匹配失败后,输出无法匹配的条目,使用warning记录无法匹配的条目,方便后续处理.
3. 首先用**大模型**将英文国家名翻译成中文名,再将国家对应的索引写入目标excel表格中.
4. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
4. 匹配失败后,输出无法匹配的条目,使用warning记录无法匹配的条目,方便后续处理.
3. 得到从pdf中提取的信息和格式化的excel表格.
4. 可能会有重复的pdf,只有标准化重命名之后才会发现,需要手动删除重复的pdf和excel表项,由人工判定.
3. stage2: 国家机构索引、牛人判断
......
......@@ -5,9 +5,10 @@
"pdf_dir": "./Papers",
"result_dir": "./json",
"source_excel_path": "./others/论文被引用情况-陈老师-2025.05.01.xlsx",
"content_start": 4,
"ccfa_excel_path": "./others/CCFA.xlsx",
"target_excel_path": "./others/target.xlsx",
"logLevel": 20,
"sheetNum": 1,
"maxItem": 64
"maxItem": 333
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
from errno import ESTALE
from pathlib import Path
import logging
from openai import OpenAI
......@@ -5,6 +6,9 @@ import pypdf
import openpyxl
from fuzzywuzzy import fuzz
import json
import re
import bibtexparser
import psrc.stage1.country_to_idx as c2i
RED = '\033[91m'
GREEN = '\033[92m'
......@@ -126,26 +130,79 @@ def get_key_info( content, configModel, client):
# Extracts text content from the first page of a PDF.
def extract_first_page_text(pdf_path):
    """Return the whitespace-normalised text of a PDF's first page.

    Args:
        pdf_path: Location of the PDF. It is handed to ``pypdf.PdfReader``
            and its ``.name`` attribute is used in log messages
            (presumably a ``pathlib.Path`` -- confirm against the caller).

    Returns:
        The first page's text with every run of whitespace collapsed to a
        single space, or ``None`` when the PDF has no pages, the first page
        yields no text, or reading the file raises. Every ``None`` outcome
        is logged as a warning.
    """
    try:
        # Attempt to read the PDF file.
        document = pypdf.PdfReader(pdf_path)

        if not len(document.pages) > 0:
            logging.warning(f"{RED}PDF has no pages: {pdf_path.name}{RESET}")
            return None

        raw_text = document.pages[0].extract_text()
        if not raw_text:
            logging.warning(f"{RED}No text found on the first page of {pdf_path.name}{RESET}")
            return None

        # split() breaks on any whitespace (spaces, newlines, tabs) and
        # merges consecutive runs; joining with " " leaves single spaces.
        collapsed = " ".join(raw_text.split())
        # Round-trip through UTF-8 so that characters which cannot be
        # encoded are substituted instead of surfacing later.
        utf8_bytes = collapsed.encode("utf-8", errors="replace")
        return utf8_bytes.decode("utf-8")
    except Exception as e:
        # Catch and record any failure raised while reading the PDF.
        logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
        return None
reader = pypdf.PdfReader(pdf_path)
if len(reader.pages) > 0:
def get_citation_ids(pdf_path, title):
    """Find the reference number under which ``title`` is cited in a PDF.

    Pages are scanned from the last one towards the first. On each page the
    text is whitespace-collapsed and quote-normalised, then searched for a
    bracketed number (``[12]``) followed, before any other bracketed number,
    by a double-quoted string that contains ``title``. Matching is
    case-insensitive.

    Args:
        pdf_path: Location of the citing PDF. It is handed to
            ``pypdf.PdfReader`` and its ``.name`` is used in log messages.
        title: Title of the cited paper. It receives the same whitespace and
            quote normalisation as the page text.

    Returns:
        The digits of the first matching reference number as a string, or
        ``None`` when the title is empty, the PDF has no pages, no page
        matches, or reading the PDF raises (each case is logged).
    """
    # Quote variants that are rewritten to straight quotes before matching.
    quote_replacements = {
        '“': '"', '”': '"', '‘': "'", '’': "'", '``': '"', "''": '"',
        '〝': '"', '〞': '"', '"': '"', '«': '"', '»': '"'
    }

    def _normalize(raw_text):
        # Collapse whitespace runs, then unify the quote characters.
        normalized = " ".join(raw_text.split())
        for old, new in quote_replacements.items():
            normalized = normalized.replace(old, new)
        return normalized

    # An empty title would reduce the pattern to "any quoted string" and
    # return the number of an unrelated reference, so refuse to search.
    cited_title = _normalize(title) if title else ""
    if not cited_title:
        logging.warning(f"Empty cited title, cannot look up citation ID in {pdf_path.name}")
        return None

    # Built once for all pages. re.escape also escapes spaces and '#', so
    # the title stays literal under re.VERBOSE.
    ref_pattern = re.compile(
        r'''
        \[\s*(\d+)\s*\] # 引用编号
        (?:(?!\[\s*\d+\s*\]).)*? # 排除中间的其他引用编号
        " # 开始引号
        [^"]*{}[^"]* # 标题(允许前后有其他内容)
        " # 结束引号
        '''.format(re.escape(cited_title)),
        flags=re.IGNORECASE | re.VERBOSE | re.DOTALL
    )

    try:
        reader = pypdf.PdfReader(pdf_path)
        if len(reader.pages) == 0:
            logging.warning(f"PDF has no pages: {pdf_path.name}")
            return None

        # Check page by page, starting from the last page.
        for page_num in range(len(reader.pages) - 1, -1, -1):
            text = reader.pages[page_num].extract_text()
            if not text:
                continue
            match = ref_pattern.search(_normalize(text))
            if match:
                return match.group(1)  # Return as soon as a page matches.

        # No page contained a matching reference.
        logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
        return None
    except Exception as e:
        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
        return None
# excel表格的第4行开始读取索引和论文名称
......@@ -164,6 +221,28 @@ def read_rough_nameIndex_from_excel(sheet, maxItem):
return index_list, paperName_list
def translate_countries(countries_str: str, configModel, client):
    """Ask the chat model to translate English country names into Chinese.

    Args:
        countries_str: Country names to translate; the system prompt tells
            the model they are separated by English semicolons.
        configModel: Model identifier passed as ``model`` to the request.
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in ``citationProcess``).

    Returns:
        The content of the first returned choice with surrounding
        whitespace stripped.
    """
    system_prompt = """
你是一个专业的翻译助手,请将输入的英文国家名称翻译为中文。
输入可能是多个国家名称,以英文分号分隔。
输出同样以英文分号分隔对应的中文国家名称。
示例输入: United States; China
示例输出: 美国; 中国
"""
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": countries_str},
    ]
    completion = client.chat.completions.create(
        model=configModel,
        messages=chat_messages,
        temperature=0.2,
        max_tokens=4096,
        response_format={"type": "text"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
def citationProcess(config: dict):
client = OpenAI(api_key=config["api_key"],
......@@ -176,12 +255,6 @@ def citationProcess(config: dict):
# 读取Excel文件
wb = openpyxl.load_workbook(excel_path)
# 读取CCFA列表
# 序号 简称 全称
# 1 PPoPP ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
# 2 FAST USENIX Conference on File and Storage Technologies
# 3 DAC Design Automation Conference
# 4 HPCA IEEE International Symposium on High Performance Computer Architecture
# 5 MICRO IEEE/ACM International Symposium on Microarchitecture
ccfa_wb = openpyxl.load_workbook(ccfa_excel_path)
sheetCCF = ccfa_wb["CCF-A列表"]
......@@ -191,16 +264,34 @@ def citationProcess(config: dict):
break
sheet = wb[sheet_name]
logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")
index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])
rst_dir = Path.cwd() / config["result_dir"] / sheet_name
rst_dir.mkdir(parents=True, exist_ok=True) # 确保结果目录存在
pdf_directory = Path.cwd() / config["pdf_dir"] / sheet_name
pdf_files = pdf_directory.rglob("*.pdf") # 递归搜索, 输出所有pdf文件的路径
# 读取第三行的BibTeX文本
bibtex_row = sheet[3] # 第三行,索引从1开始
# 使用 map 函数将 cell.value 转换为字符串类型
bibtex_text = "".join(map(lambda cell: str(cell.value) if cell.value is not None else "", bibtex_row))
# 解析BibTeX文本
try:
bib_database = bibtexparser.loads(bibtex_text)
if bib_database.entries:
entry = bib_database.entries[0]
cited_title_str = entry.get('title', '')
else:
cited_title_str = ""
logging.warning(f"No BibTeX entry found in sheet {sheet_name} row 3")
except Exception as e:
cited_title_str = ""
logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
# 遍历当前工作表对应的所有PDF文件
for file in pdf_files:
......@@ -217,6 +308,8 @@ def citationProcess(config: dict):
# 提取关键信息
result = get_key_info(first_page_text, configModel, client)
cit_id = get_citation_ids(file, cited_title_str)
if result is not None:
# 解析JSON结果, 提取论文标题
result_dict = json.loads(result)
......@@ -237,7 +330,8 @@ def citationProcess(config: dict):
if similarity >= 85:
# 重命名PDF文件
new_pdf_name = f"{idx}-{pdf_title.replace(':', '_').replace(' ', '_').replace('?', '_')}.pdf" # 将冒号替换为连字符
# 替换冒号、空格、问号和斜杠,避免文件名非法字符
new_pdf_name = f"{idx}-{pdf_title.replace(': ', '_').replace(' ', '_').replace('?', '_').replace('/', '_')}.pdf" # 将冒号、空格、问号和斜杠替换为下划线
new_pdf_path = file.parent / new_pdf_name
try:
file.rename(new_pdf_path)
......@@ -251,21 +345,39 @@ def citationProcess(config: dict):
rst_path.write_text(result + "\n", encoding='utf-8') # 明确指定UTF-8编码
# 更新Excel中的表项
sheet.cell(row=idx+4, column=3, value=pdf_title) # 第3列是论文名称
sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title) # 第3列是论文名称
issue = result_dict.get("ISSUE", [])
if issue is not None:
sheet.cell(row=idx+config["content_start"], column=4, value=issue[0]) # 第4列是国家
logging.info(f"Standardization issue info.")
else:
logging.warning(f"{RED}ISSUE is None.{RESET}")
sheet.cell(row=idx+config["content_start"], column=6, value=cit_id) # 第7列是作者名称
authors_list = result_dict.get("Authors", [])
authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
sheet.cell(row=idx+4, column=7, value=authors) # 第7列是作者名称
sheet.cell(row=idx+config["content_start"], column=7, value=authors) # 第7列是作者名称
logging.info(f"Standardization author info.")
institution_list = result_dict.get("Institutions", [])
institutions = ";".join(institution_list) if isinstance( institution_list, list) else ""
sheet.cell(row=idx+4, column=11, value=institutions) # 第9列是机构
logging.info(f"Standardization Institution info.")
sheet.cell(row=idx+config["content_start"], column=9, value=institutions) # 第9列是机构
logging.info(f"Standardization institution info.")
countrys_list = result_dict.get("Countrys", [])
countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
sheet.cell(row=idx+4, column=11, value=countrys) # 第11列是国家
# 翻译 countrys 为中文
if countrys:
translated_countrys = translate_countries(countrys, config["model"], client)
sheet.cell(row=idx+config["content_start"], column=11, value=translated_countrys) # 第11列是中文国家名称
logging.debug(f"Translated countrys info: {translated_countrys}")
else:
logging.warning(f"No countrys info to translate.")
countrys = c2i.country_to_idx(translated_countrys)
sheet.cell(row=idx+config["content_start"], column=10, value=countrys) # 第10列是国家索引
logging.info(f"Standardization countrys info.")
# CCFA判断
......@@ -300,6 +412,6 @@ def citationProcess(config: dict):
matched = True
break
if matched == False:
logging.warning(f"{RED}Not matched: {file.name} -> idx: {idx}, excel_name: {excel_name}{RESET}")
logging.warning(f"{RED}Not matched: {file.name}{RESET}")
else:
logging.error(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
# Maps a Chinese country/region name to its numeric index.
# Keys must match the looked-up name exactly, so they carry no whitespace.
# NOTE(review): id 74 is not assigned to any name -- confirm whether an
# entry is missing between 73 and 75.
country_to_id_map = {
    "美国": 1,
    "中国": 2,
    "日本": 3,
    "韩国": 4,
    "新加坡": 5,
    "中国台湾": 6,
    "中国香港": 7,
    "中国澳门": 8,
    "法国": 9,
    "英国": 10,
    "德国": 11,
    "意大利": 12,
    "西班牙": 13,
    "加拿大": 14,
    "荷兰": 15,
    "印度": 16,
    "阿联酋": 17,
    "比利时": 18,
    "俄罗斯": 19,
    "阿富汗": 20,
    "亚美尼亚": 21,
    "阿塞拜疆": 22,
    "巴林": 23,
    "孟加拉国": 24,
    "不丹": 25,
    "文莱": 26,
    "缅甸": 27,
    "柬埔寨": 28,
    "塞浦路斯": 29,
    "东帝汶": 30,
    "格鲁吉亚": 31,
    "印度尼西亚": 32,
    "伊朗": 33,
    "伊拉克": 34,
    "以色列": 35,
    "约旦": 36,
    "哈萨克斯坦": 37,
    "科威特": 38,
    "吉尔吉斯斯坦": 39,
    "老挝": 40,
    "黎巴嫩": 41,
    "马来西亚": 42,
    "马尔代夫": 43,
    "蒙古": 44,
    "尼泊尔": 45,
    "朝鲜": 46,
    "阿曼": 47,
    "巴基斯坦": 48,
    "巴勒斯坦": 49,
    "菲律宾": 50,
    "卡塔尔": 51,
    "沙特阿拉伯": 52,
    "斯里兰卡": 53,
    "叙利亚": 54,
    "塔吉克斯坦": 55,
    "泰国": 56,
    "土库曼斯坦": 57,
    "乌兹别克斯坦": 58,
    "越南": 59,
    "也门": 60,
    "北塞浦路斯": 61,
    "纳戈尔诺-卡拉巴赫": 62,
    "阿尔及利亚": 63,
    "安哥拉": 64,
    "贝宁": 65,
    "博茨瓦纳": 66,
    "布基纳法索": 67,
    "布隆迪": 68,
    "佛得角": 69,
    "喀麦隆": 70,
    "中非共和国": 71,
    "乍得": 72,
    "科摩罗": 73,
    "刚果": 75,
    "科特迪瓦": 76,
    "吉布提": 77,
    "埃及": 78,
    "赤道几内亚": 79,
    "厄立特里亚": 80,
    "埃塞俄比亚": 81,
    "加蓬": 82,
    "冈比亚": 83,
    "加纳": 84,
    "几内亚": 85,
    "几内亚比绍": 86,
    "肯尼亚": 87,
    "莱索托": 88,
    "利比里亚": 89,
    "利比亚": 90,
    "马达加斯加": 91,
    "马拉维": 92,
    "马里": 93,
    "毛里塔尼亚": 94,
    "毛里求斯": 95,
    "摩洛哥": 96,
    "莫桑比克": 97,
    "纳米比亚": 98,
    "尼日尔": 99,
    "尼日利亚": 100,
    "卢旺达": 101,
    "圣多美和普林西比": 102,
    "塞内加尔": 103,
    "塞舌尔": 104,
    "塞拉利昂": 105,
    "索马里": 106,
    "南非": 107,
    "南苏丹": 108,
    "苏丹": 109,
    "斯威士兰": 110,
    "坦桑尼亚": 111,
    "多哥": 112,
    "突尼斯": 113,
    "乌干达": 114,
    "赞比亚": 115,
    "津巴布韦": 116,
    "西撒哈拉": 117,
    "阿尔巴尼亚": 118,
    "安道尔": 119,
    "奥地利": 120,
    "白俄罗斯": 121,
    "波斯尼亚和黑塞哥维那": 122,
    "保加利亚": 123,
    "克罗地亚": 124,
    "捷克": 125,
    "丹麦": 126,
    "爱沙尼亚": 127,
    "芬兰": 128,
    "希腊": 129,
    "匈牙利": 130,
    "冰岛": 131,
    "爱尔兰": 132,
    "拉脱维亚": 133,
    "列支敦士登": 134,
    "立陶宛": 135,
    "卢森堡": 136,
    "马耳他": 137,
    "摩尔多瓦": 138,
    "摩纳哥": 139,
    "黑山": 140,
    "北马其顿": 141,
    "挪威": 142,
    "波兰": 143,
    "葡萄牙": 144,
    "罗马尼亚": 145,
    "圣马力诺": 146,
    "塞尔维亚": 147,
    "斯洛伐克": 148,
    "斯洛文尼亚": 149,
    "瑞典": 150,
    "瑞士": 151,
    "乌克兰": 152,
    "梵蒂冈": 153,
    "科索沃": 154,
    "法罗群岛": 155,
    "直布罗陀": 156,
    "安提瓜和巴布达": 157,
    "巴哈马": 158,
    "巴巴多斯": 159,
    "伯利兹": 160,
    "哥斯达黎加": 161,
    "古巴": 162,
    "多米尼加": 163,
    "多米尼加共和国": 164,
    "萨尔瓦多": 165,
    "格林纳达": 166,
    "危地马拉": 167,
    "海地": 168,
    "洪都拉斯": 169,
    "牙买加": 170,
    "墨西哥": 171,
    "尼加拉瓜": 172,
    "巴拿马": 173,
    "圣基茨和尼维斯": 174,
    "圣卢西亚": 175,
    "圣文森特和格林纳丁斯": 176,
    "特立尼达和多巴哥": 177,
    "百慕大": 178,
    "格陵兰": 179,
    "波多黎各": 180,
    "美属维尔京群岛": 181,
    "英属维尔京群岛": 182,
    "开曼群岛": 183,
    "安圭拉": 184,
    "蒙特塞拉特": 185,
    "阿根廷": 186,
    "玻利维亚": 187,
    "巴西": 188,
    "智利": 189,
    "哥伦比亚": 190,
    "厄瓜多尔": 191,
    "圭亚那": 192,
    "巴拉圭": 193,
    "秘鲁": 194,
    "苏里南": 195,
    "乌拉圭": 196,
    "委内瑞拉": 197,
    "法属圭亚那": 198,
    "福克兰群岛": 199,
    "澳大利亚": 200,
    "斐济": 201,
    "基里巴斯": 202,
    "马绍尔群岛": 203,
    "密克罗尼西亚": 204,
    "瑙鲁": 205,
    "新西兰": 206,
    "帕劳": 207,
    "巴布亚新几内亚": 208,
    "萨摩亚": 209,
    "所罗门群岛": 210,
    "汤加": 211,
    "图瓦卢": 212,
    "瓦努阿图": 213,
    "库克群岛": 214,
    "纽埃": 215,
    "法属波利尼西亚": 216,
    "新喀里多尼亚": 217,
    "瓦利斯和富图纳": 218,
    "托克劳": 219,
    "皮特凯恩群岛": 220,
    "土耳其": 221,
}
def country_to_idx(country_str: str) -> str:
    """Translate a ';'-separated list of country names into their ids.

    Args:
        country_str: Country names separated by ';'. Whitespace around the
            whole string and around each name is ignored.

    Returns:
        The ids from ``country_to_id_map``, in input order, joined by ';'.

    Raises:
        ValueError: On the first name that is not a key of
            ``country_to_id_map``; the message is also printed.
    """
    indices = []
    for raw_name in country_str.strip().split(";"):
        name = raw_name.strip()
        if name in country_to_id_map:
            indices.append(str(country_to_id_map[name]))
            continue
        # Unknown name: report it on stdout, then abort the conversion.
        err_msg = f"Unknown Country: {name}"
        print(err_msg)
        raise ValueError(err_msg)
    return ";".join(indices)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment