update

4aee3217 · jiangdongchen · 68c6acc7 · 4aee3217 · 4aee3217 · 4aee3217
Commit 4aee3217 authored May 08, 2025 by jiangdongchen
8 changed files
--- a/.gitignore
+++ b/.gitignore
 .vscode/
 Papers/
 psrc/__pycache__/
+psrc/stage1/__pycache__/
+psrc/stage2/__pycache__/
 json/
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@
                3. 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
            2. 匹配失败时，使用warning级别的日志记录无法匹配的条目，方便后续处理。
    3. 得到从pdf中提取的信息和格式化的excel表格.
+    4. 同一篇论文可能以不同的文件名重复出现，这种重复只有在标准化重命名之后才能发现；需由人工判定，并手动删除重复的pdf及其对应的excel表项。
 3. stage2: 国家机构索引、牛人判断

 # 代码结构说明

--- a/config.json
+++ b/config.json
@@ -9,5 +9,5 @@
    "target_excel_path": "./others/target.xlsx",
    "logLevel": 20,
    "sheetNum": 1,
-    "maxItem": 10
+    "maxItem": 64
 }
\ No newline at end of file
--- a/logs/citation_process.log
+++ b/logs/citation_process.log
--- a/others/target.xlsx
+++ b/others/target.xlsx
--- a/others/论文被引用情况-陈老师-2025.05.01.xlsx
+++ b/others/论文被引用情况-陈老师-2025.05.01.xlsx
--- a/psrc/stage1/__pycache__/citationProcess.cpython-312.pyc
+++ b/psrc/stage1/__pycache__/citationProcess.cpython-312.pyc
--- a/psrc/stage1/citationProcess.py
+++ b/psrc/stage1/citationProcess.py
@@ -224,15 +224,17 @@ def citationProcess(config: dict):
                pdf_issue = result_dict["ISSUE"]

                # 遍历Excel表项进行模糊匹配
+                matched = False
                for idx, excel_name in zip(index_list, paperName_list):
                    # 预处理字符串
                    # 返回pdf字符前的字符串，所以加上索引0
                    clean_excel_name = excel_name.split('.pdf')[0].replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
                    clean_pdf_title = pdf_title.replace(" ", "").replace("_", "").replace(":", "").replace("-", "")
+                    logging.debug(f"clean_excel_name: {clean_excel_name}")
+                    logging.debug(f"clean_pdf_title: {clean_pdf_title}")
                    
                    similarity = fuzz.partial_ratio(clean_pdf_title.lower(), clean_excel_name.lower())
                    
-
                    if similarity >= 85:
                        # 重命名PDF文件
                        new_pdf_name = f"{idx}-{pdf_title.replace(':', '_').replace(' ', '_').replace('?', '_')}.pdf"  # 将冒号替换为连字符
@@ -254,14 +256,17 @@ def citationProcess(config: dict):
                        authors_list = result_dict.get("Authors", [])
                        authors = ";".join(authors_list) if isinstance(authors_list, list) else ""
                        sheet.cell(row=idx+4, column=7, value=authors)  # 第7列是作者名称
+                        logging.info(f"Standardization author info.")

                        institution_list = result_dict.get("Institutions", [])
                        institutions = ";".join(institution_list) if isinstance( institution_list, list) else ""
                        sheet.cell(row=idx+4, column=11, value=institutions)  # 第9列是机构
+                        logging.info(f"Standardization Institution info.")

                        countrys_list = result_dict.get("Countrys", [])
                        countrys = ";".join(countrys_list) if isinstance(countrys_list, list) else ""
                        sheet.cell(row=idx+4, column=11, value=countrys)  # 第11列是国家
+                        logging.info(f"Standardization countrys info.")

                        # CCFA判断
                        logging.info(f"Judge CCFA.")
@@ -290,4 +295,10 @@ def citationProcess(config: dict):
                        
                        logging.info(f"Matched: {file.name} -> idx: {idx}, excel_name: {excel_name}")
                        logging.info(f"Change: {file.name} -> {new_pdf_name}")
-                        break 
\ No newline at end of file
+
+                        matched = True
+                        break 
+                if matched == False:
+                    logging.warning(f"{RED}Not matched: {file.name} -> idx: {idx}, excel_name: {excel_name}{RESET}")
+            else:
+                logging.error(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file