Commit 4aee3217 by jiangdongchen

update

parent 68c6acc7
.vscode/
Papers/
psrc/__pycache__/
psrc/stage1/__pycache__/
psrc/stage2/__pycache__/
json/
\ No newline at end of file
......@@ -58,6 +58,7 @@
3. 将从pdf文件中提取的会议或期刊名称,连同CCFA会议与期刊名称表格,一并交给**大模型**进行匹配,匹配结果以“是/否”的形式写入目标excel表格中.
2. 匹配失败时,以warning级别的日志输出并记录无法匹配的条目,方便后续处理.
3. 得到从pdf中提取的信息和格式化的excel表格.
4. 可能存在重复的pdf,这类重复只有在标准化重命名之后才能发现;需由人工判定后,手动删除重复的pdf及其对应的excel表项.
3. stage2: 国家机构索引、牛人判断
# 代码结构说明
......
......@@ -9,5 +9,5 @@
"target_excel_path": "./others/target.xlsx",
"logLevel": 20,
"sheetNum": 1,
"maxItem": 10
"maxItem": 64
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
No preview for this file type
......@@ -224,15 +224,17 @@ def citationProcess(config: dict):
pdf_issue = result_dict["ISSUE"]
# 遍历Excel表项进行模糊匹配
matched = False
for idx, excel_name in zip(index_list, paperName_list):
# 预处理字符串
# 返回pdf字符前的字符串,所以加上索引0
clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
logging.debug(f"clean_excel_name: {clean_excel_name}")
logging.debug(f"clean_pdf_title: {clean_pdf_title}")
similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
if similarity >= 85:
# 重命名PDF文件
new_pdf_name = f"{idx}-{pdf_title.replace(':', '_').replace(' ', '_').replace('?', '_')}.pdf" # 将冒号替换为连字符
......@@ -254,14 +256,17 @@ def citationProcess(config: dict):
authors_list = result_dict.get("Authors", [])
authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
sheet.cell(row=idx+4, column=7, value=authors) # 第7列是作者名称
logging.info(f"Standardization author info.")
institution_list = result_dict.get("Institutions", [])
institutions = ";".join(institution_list) if isinstance( institution_list, list) else ""
sheet.cell(row=idx+4, column=11, value=institutions) # 第9列是机构
logging.info(f"Standardization Institution info.")
countrys_list = result_dict.get("Countrys", [])
countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
sheet.cell(row=idx+4, column=11, value=countrys) # 第11列是国家
logging.info(f"Standardization countrys info.")
# CCFA判断
logging.info(f"Judge CCFA.")
......@@ -290,4 +295,10 @@ def citationProcess(config: dict):
logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
logging.info(f"Change: {file.name} -> {new_pdf_name}")
break
\ No newline at end of file
matched = True
break
if matched == False:
logging.warning(f"{RED}Not matched: {file.name} -> idx: {idx}, excel_name: {excel_name}{RESET}")
else:
logging.error(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment