Commit a4679d73 by jiangdongchen

README

parent 2b8dc5d4
......@@ -53,12 +53,7 @@
2. **遍历**excel表格中的论文名称进行模糊匹配
1. 匹配成功后
1. 用pdf文件中的论文名称和索引标准化重命名pdf文件和excel表格中的论文名称
2. 将pdf文件中的关键信息写入json文件中进行保存, 包括
- 标题
- 会议名称
- 作者姓名
- 机构
- 国家
2. 将pdf文件中的关键信息写入json文件中进行保存, 包括: 标题、会议名称、作者姓名、机构、国家
2. 匹配失败后,输出无法匹配的条目
1. 使用warning记录无法匹配的条目,方便后续处理
......
from openai import OpenAI
from pathlib import Path
import json
import openpyxl
def chechCCFA(conferenceJournal: str, CCFA: str, configModel: str, client):
    """Ask a chat model whether a venue name matches an entry of the CCF-A list.

    Args:
        conferenceJournal: Conference or journal name to classify; sent as the
            user message.
        CCFA: Comma-separated CCF-A names (abbreviations and full names) that
            are embedded into the system prompt.
        configModel: Model identifier passed to the chat-completions endpoint.
        client: Object exposing ``chat.completions.create(...)`` (the script
            passes an ``openai.OpenAI`` instance).

    Returns:
        The raw message content of the first choice. The request asks for a
        JSON object, but the text is returned unparsed; the caller decides
        how to decode it.
    """
    # The prompt is the contract with the model, so the field list and the
    # example must agree: a real JSON boolean for IsCCFA, a 0-100 confidence,
    # and a Reason field that is both specified and demonstrated.
    system_prompt = f"""
You are an expert academic conference/journal classifier. Your task is to determine if the given conference/journal name matches any entry in the provided CCF-A list.
CCF-A List (comma-separated): {CCFA}
Analysis Guidelines:
1. Perform fuzzy matching considering:
  - Abbreviations vs full names (e.g. 'PPoPP' vs 'ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming')
  - Common variations (e.g. 'IEEE Transactions' vs 'IEEE Trans.')
  - Minor spelling differences
2. Return JSON with:
  - "IsCCFA": true/false
  - "MatchedName": the matched name from CCF-A list (empty string if no match)
  - "Confidence": your confidence score (0-100)
  - "Reason": a short explanation of the decision
Example Output:
{{
  "IsCCFA": true,
  "MatchedName": "IEEE International Symposium on High Performance Computer Architecture",
  "Confidence": 95,
  "Reason": "The input matches HPCA's full name"
}}
"""
    response = client.chat.completions.create(
        model=configModel,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": conferenceJournal},
        ],
        # Low temperature keeps the classification close to deterministic.
        temperature=0.2,
        max_tokens=4096,
        # Request a JSON object so the reply can be decoded by the caller.
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content
if __name__ == "__main__":
    # Script entry point: load settings, collect the CCF-A names from the
    # workbook, and classify one sample venue name.
    cwd_dir = Path.cwd()
    # Build the absolute path of config.json in the current working directory.
    config_path = (cwd_dir / "config.json").resolve()
    # Read the configuration parameters from config.json.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    client = OpenAI(api_key=config["api_key"], base_url=config["base_url"])
    configModel = config["model"]

    excel_path2 = Path(config["excel_path2"])
    wb = openpyxl.load_workbook(excel_path2)
    sheetCCF = wb["CCF-A列表"]
    # Sheet layout (row 1 is the header):
    #   index | abbreviation | full name
    #   1 PPoPP ACM SIGPLAN Symposium on Principles & Practice of Parallel Programming
    #   2 FAST  USENIX Conference on File and Storage Technologies
    #   3 DAC   Design Automation Conference
    #   4 HPCA  IEEE International Symposium on High Performance Computer Architecture
    #   5 MICRO IEEE/ACM International Symposium on Microarchitecture
    CCFA_list = []
    for row in sheetCCF.iter_rows(min_row=2, values_only=True):  # skip the header row
        # Ignore rows without an index; they are blank or trailing filler.
        if not row or not row[0]:
            continue
        # Collect the abbreviation and the full name, keeping only cells that
        # hold a value. An empty cell would put None into the list and make
        # ','.join() raise TypeError; slicing also tolerates sheets that have
        # fewer than three columns.
        for name in row[1:3]:
            if name is None:
                continue
            text = str(name).strip()
            if text:
                CCFA_list.append(text)

    # Flatten the names into one long comma-separated string for the prompt.
    CCFA = ','.join(CCFA_list)

    conferenceJournal = "IEEE Journal of Solid-State Circuits"
    result = chechCCFA(conferenceJournal, CCFA, configModel, client)
    print(result)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment