Commit ea9e27cc by jiangdongchen

stage 1

parent 7db669c9
......@@ -42,6 +42,7 @@
- Resume handling: if every PDF before a given index in the Excel sheet had its information extracted correctly and the Excel file was updated correctly, but a PDF from the next index onward failed
- it is recommended to move the already-correct PDFs to a different folder, so that rerunning the script only processes the remaining PDFs (see the sketch after this list)
- TODO: cross-validation across multiple models
- TODO: temperature settings
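A minimal sketch of the resume workflow described above, assuming the PDFs live in a `pdf/` folder and a hypothetical `pdf_done/` folder is used for the ones that were already processed correctly:

```python
import shutil
from pathlib import Path

# Hypothetical folder names; adjust to the project's actual layout.
SRC_DIR = Path("pdf")        # folder the script scans
DONE_DIR = Path("pdf_done")  # parking spot for correctly processed PDFs

def park_processed(processed_names):
    """Move already-processed PDFs aside so a rerun only sees the remaining ones."""
    DONE_DIR.mkdir(exist_ok=True)
    for name in processed_names:
        src = SRC_DIR / name
        if src.exists():
            shutil.move(str(src), str(DONE_DIR / name))

# Example: everything before the failing index was handled correctly.
park_processed(["0001_first_paper.pdf", "0002_second_paper.pdf"])
```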
# Requirements and Solution
1. TODO: download the paper PDFs
......@@ -52,7 +53,7 @@
2. **Iterate** over the sheets of the Excel workbook
1. **Iterate** over the paper titles and indices in each sheet
1. The **LLM** reads the paper title and key information from the first page of the PDF and stores them under the json folder
2. Read the reference information in the PDF from back to front and use a **regular expression** to find the citation index of the sheet-name paper within the current PDF, stored under the json folder
2. Read the reference information in the PDF from back to front and use the **LLM** to find the citation index of the sheet-name paper within the current PDF, stored under the json folder
3. **Iterate** over the paper titles in the Excel sheet and fuzzy-match them (see the sketch after this list); on a successful match
1. Save the key information from the PDF to a JSON file, including title, venue, author names, institutions, and countries.
2. Use the paper title and citation index from the PDF to rename the PDF file in a standardized way and to normalize the paper title, venue, author names, institutions, and countries in the Excel sheet.
......@@ -60,7 +61,9 @@
4. Hand the conference or journal name from the PDF and the table of CCF-A conference/journal names to the **LLM** for matching; the result is written into the target Excel sheet as 是/否 (yes/no).
4. When matching fails, output the unmatched entry and record it with a warning so it can be handled later.
3. The result is the information extracted from the PDFs plus a formatted Excel sheet.
4. There may be duplicate PDFs, which only become visible after standardized renaming; the duplicate PDFs and Excel entries have to be removed manually, with a human making the call.
4. Manual review
1. There may be duplicate PDFs, which only become visible after standardized renaming: when two PDFs end up with the same index name, the same paper was matched twice. Remove the duplicate PDF and Excel entry manually, judging the duplication against the original sheet.
2. Institutions and countries may also contain duplicates; please check them manually.
3. stage 2: country/institution indexing and identifying leading researchers
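The fuzzy matching in step 3 is not shown in this diff; a minimal sketch of one way it could be done with the standard library (the 0.85 threshold is an assumption, not the script's actual setting):

```python
from difflib import SequenceMatcher

def titles_match(excel_title: str, pdf_title: str, threshold: float = 0.85) -> bool:
    """Treat two titles as the same paper when their similarity ratio clears the threshold."""
    a = " ".join(excel_title.lower().split())
    b = " ".join(pdf_title.lower().split())
    return SequenceMatcher(None, a, b).ratio() >= threshold

# Minor case/whitespace differences still match:
print(titles_match("A Survey of Deep Learning", "a survey of  deep learning"))  # True
```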
# Code Structure
......
......@@ -64,14 +64,14 @@ def get_key_info( content, configModel, client):
- Identify all listed authors. Maintain the order presented in the text if possible.
- Extract their full name as accurately as possible. Use "" if a name cannot be clearly identified for an entry.
- **Institutions:**
- Extract all associated institutions of authors.
- Extract all associated institutions of authors
Keep only one entry for each duplicate institution
- **Countrys:**
- Extract all associated countrys of authors.
- Extract all associated countrys of institutions.
- Try to use full names. Only write the names of universities/companies, excluding departments, postal codes, countries, house numbers, cities, etc.
- For foreign names: Translate them first, and then manually check them using Google (mark them).
- In cases like the Chinese Academy of Sciences, specify the institute level. For example, the Institute of Computing Technology, Chinese Academy of Sciences.
- In cases of branch campuses, specify the branch. For example, California State University, University of California, University of Maryland.
- **ISSUE:**
- Extract where the paper is published like journal or session.
- Title, authors, institutions and countrys should be four separate keys, not nested together.
......@@ -90,14 +90,10 @@ def get_key_info( content, configModel, client):
"Institutions": [
"Shanghai Jiao Tong University",
"Colorado School of Mines",
"Colorado School of Mines",
"Shanghai Jiao Tong University",
],
"Countrys": [
"China",
"United States",
"United States",
"China",
],
"ISSUE": [
"IEEE Transactions on Computers"
......@@ -111,7 +107,7 @@ def get_key_info( content, configModel, client):
{"role": "system", "content": system_prompt},
{"role": "user", "content": content},
],
temperature=0.7,
temperature=0.25,
max_tokens=4096,
# stream=True,
response_format={"type": "json_object"}
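Because `response_format={"type": "json_object"}` only guarantees a JSON string, the caller still has to parse and sanity-check the reply; a minimal sketch of how that could look (the helper name `parse_key_info` is hypothetical, not part of this script):

```python
import json
import logging

# Keys the prompt above asks the model to return.
EXPECTED_KEYS = ("Title", "Authors", "Institutions", "Countrys", "ISSUE")

def parse_key_info(raw):
    """Parse the model's JSON reply and warn about any missing keys."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        logging.warning(f"Model reply is not valid JSON: {e}")
        return None
    for key in EXPECTED_KEYS:
        if key not in data:
            logging.warning(f"Key missing from model reply: {key}")
    return data

# Usage with the completion above (assumed, not shown in this hunk):
# result_dict = parse_key_info(response.choices[0].message.content)
```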
......@@ -156,27 +152,25 @@ def extract_first_page_text(pdf_path):
logging.warning(f"{RED}Failed to read PDF {pdf_path.name}: {str(e)}{RESET}")
return None
def get_citation_ids(pdf_path, title):
def get_citation_ids(pdf_path, title, configModel, client):
try:
reader = pypdf.PdfReader(pdf_path)
if len(reader.pages) == 0:
logging.warning(f"PDF has no pages: {pdf_path.name}")
return None
# Quotation-mark variants to normalize
quote_replacements = {
'“': '"', '”': '"', '‘': "'", '’': "'", '``': '"', "''": '"',
'〝': '"', '〞': '"', '"': '"', '«': '"', '»': '"'
}
# Build the regex pattern
ref_pattern = r'''
\[\s*(\d+)\s*\] # citation number
(?:(?!\[\s*\d+\s*\]).)*? # skip over any other citation numbers in between
" # opening quote
[^"]*{}[^"]* # the title (other text may surround it)
" # closing quote
'''.format(re.escape(title))
system_prompt = f"""
You are a professional assistant for analysing citations in academic papers. Your task is to find, in the text of a PDF page, the citation number under which the given paper is cited.
Paper title: {title}
Guidelines:
1. Look for citation markers such as [1], [2] in the text
2. A citation marker is usually followed by the title of the cited paper
3. If you find the matching citation number, return that number
4. If no matching citation is found, return an empty string
Return only the citation number itself, nothing else.
"""
# Check the pages one by one, starting from the last page
for page_num in range(len(reader.pages) - 1, -1, -1):
......@@ -185,22 +179,23 @@ def get_citation_ids(pdf_path, title):
if text:
cleaned_text = " ".join(text.split())
# Normalize all quote variants to straight quotes
for old, new in quote_replacements.items():
cleaned_text = cleaned_text.replace(old, new)
match = re.search(
ref_pattern,
cleaned_text,
flags=re.IGNORECASE | re.VERBOSE | re.DOTALL
response = client.chat.completions.create(
model=configModel,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": cleaned_text},
],
temperature=0.1,
max_tokens=10,
response_format={"type": "text"}
)
if match:
return match.group(1) # return as soon as a match is found
result = response.choices[0].message.content.strip()
if result.isdigit():
return result
# No page produced a match
logging.warning(f"Citation ID for {title} not found in {pdf_path.name}")
return None
except Exception as e:
logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
return None
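The README lists multi-model cross-validation as a TODO; one possible sketch of how `get_citation_ids` could be cross-checked across several models, reusing the module's `logging` import and the function defined above (the model names and the helper itself are placeholders, not part of the current script):

```python
def get_citation_id_cross_checked(pdf_path, title, client, models=("model-a", "model-b")):
    """Run the lookup with several models and keep the ID only when they all agree."""
    ids = {get_citation_ids(pdf_path, title, model, client) for model in models}
    ids.discard(None)
    if len(ids) == 1:
        return ids.pop()
    logging.warning(f"Models disagree on the citation id for {title}: {ids}")
    return None
```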
......@@ -308,7 +303,7 @@ def citationProcess(config: dict):
# Extract the key information
result = get_key_info(first_page_text, configModel, client)
cit_id = get_citation_ids(file, cited_title_str)
cit_id = get_citation_ids(file, cited_title_str, configModel, client)
if result is not None:
# Parse the JSON result and extract the paper title
......@@ -348,13 +343,16 @@ def citationProcess(config: dict):
sheet.cell(row=idx+config["content_start"], column=3, value=pdf_title) # column 3 is the paper title
issue = result_dict.get("ISSUE", [])
if issue is not None:
sheet.cell(row=idx+config["content_start"], column=4, value=issue[0]) # column 4 is the venue (ISSUE)
if issue and isinstance(issue, list) and len(issue) > 0: # make sure issue is a valid, non-empty list
sheet.cell(row=idx+config["content_start"], column=4, value=issue[0])
logging.info("Standardized the ISSUE info.")
else:
logging.warning(f"{RED}ISSUE is None.{RESET}")
logging.warning(f"{RED}Invalid ISSUE data: {issue}{RESET}")
sheet.cell(row=idx+config["content_start"], column=6, value=cit_id) # column 7 is the author names
if cit_id is not None:
sheet.cell(row=idx+config["content_start"], column=6, value=cit_id) # column 6 is cit_id
else:
logging.warning(f"{RED}cit_id is None.{RESET}")
authors_list = result_dict.get("Authors", [])
authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
......
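The CCF-A matching from step 4 of the README (venue name against the CCF-A table, answered as 是/否) is not part of this hunk; a hedged sketch of how that call might look, assuming the same OpenAI-style `client` and `configModel` are reused and that `is_ccfa_venue` is a hypothetical helper:

```python
def is_ccfa_venue(venue, ccfa_names, configModel, client):
    """Ask the model whether `venue` is on the CCF-A list; returns '是' or '否'."""
    system_prompt = (
        "You are given a venue name and the CCF-A list of conferences and journals. "
        "Answer strictly with '是' if the venue is on the list, otherwise '否'."
    )
    user_prompt = f"Venue: {venue}\nCCF-A list: {', '.join(ccfa_names)}"
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
        max_tokens=5,
    )
    answer = response.choices[0].message.content.strip()
    return answer if answer in ("是", "否") else "否"
```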