excel未匹配项输出log以及清空表项

f999529f · jiangdongchen · 1a389f1e · f999529f · f999529f · f999529f
Commit f999529f authored May 15, 2025 by jiangdongchen
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 9 deletions

README.md
+2 -1

others/target.xlsx
+0 -0

psrc/stage1/citationProcess.py
+38 -8

No files found.
--- a/README.md
+++ b/README.md
@@ -57,7 +57,8 @@
                - 将论文标题、会议名称、作者姓名、通讯作者姓名、机构写入excel.
                - 用**大模型**将英文国家名翻译成中文名，将国家对应的索引写入目标excel表格中.
                - 将pdf文件中的会议或者期刊名称和CCFA的会议或者期刊名称的表格交给**大模型**匹配,匹配结果以“是/否”的形式写入目标excel表格中.
-            - 匹配失败后，输出无法匹配的条目,使用warning记录无法匹配的条目，方便后续处理.
+            - 匹配失败后，以warning级别在log中输出无法匹配的pdf条目，方便后续处理.
+        - 遍历结束后，log输出excel中未被匹配的条目集合，并且将其对应表项清空.
    3. 得到从pdf中提取的信息json和格式化的excel表格.
    4. 人工复核
        1. 关键信息可能会提取失败，详见log，一般不会有问题, 如果出现了需要人工加一下.

--- a/others/target.xlsx
+++ b/others/target.xlsx
--- a/psrc/stage1/citationProcess.py
+++ b/psrc/stage1/citationProcess.py
 from errno import ESTALE
+from logging import config
 from pathlib import Path
 import logging
 from openai import OpenAI
@@ -205,14 +206,14 @@ def get_citation_ids(pdf_path, title, configModel, client):
        logging.error(f"An error occurred while processing {pdf_path.name}: {str(e)}")
        return None
-# excel表格的第4行开始读取索引和论文名称
+# excel表格的第content_start行开始读取索引和论文名称
-def read_rough_nameIndex_from_excel(sheet, maxItem):
+def read_rough_nameIndex_from_excel(sheet, min_row, maxItem):
    index_list = []
    paperName_list = []
    # 从第min_row行开始遍历
-    for idx, row in enumerate(sheet.iter_rows(min_row=4, values_only=True)):
+    for idx, row in enumerate(sheet.iter_rows(min_row, values_only=True)):
        if idx >= maxItem:  # 限制读取的行数
            break
        if row[0] and row[2]:  # 确保索引和论文名称都存在
@@ -274,7 +275,7 @@ def citationProcess(config: dict):
        sheet = wb[sheet_name]
        logging.info(f"{BLUE}Processing sheet: {sheet_name}{RESET}")
-        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["maxItem"])
+        index_list, paperName_list = read_rough_nameIndex_from_excel(sheet, config["content_start"]+1,config["maxItem"])
        rst_dir = Path.cwd() / config["result_dir"] / sheet_name
        rst_dir.mkdir(parents=True, exist_ok=True)  # 确保结果目录存在
@@ -302,6 +303,11 @@ def citationProcess(config: dict):
            logging.error(f"Error parsing BibTeX in sheet {sheet_name} row 3: {str(e)}")
        # 遍历当前工作表下的所有项目
+        # 在遍历PDF文件前，先记录所有Excel索引
+        all_excel_indices: set[int] = set(index_list)
+        matched_indices: set[int] = set()
+        # 遍历当前工作表下的所有项目
        for file in pdf_files:
            logging.info(f"{BLUE}Processing PDF file: {file.name}{RESET}")  # 添加蓝色日志输出
@@ -317,16 +323,20 @@ def citationProcess(config: dict):
                # 直接遍历index_list查找匹配的索引
                for i, excel_idx in enumerate(index_list):
                    if excel_idx == file_idx:
-                        excel_row_idx = i + config["content_start"]
+                        excel_row_idx = i + config["content_start"] + 1
                        excel_name = paperName_list[i]
                        logging.info(f"{BLUE}Matched - Excel Row: {excel_row_idx}, Index: {excel_idx}, Title: {excel_name}{RESET}")
+                         # 在成功匹配索引后记录
+                        matched_indices.add(excel_idx)
                        break
                else:
                    raise ValueError(f"Index {file_idx} not found")
            except ValueError:
                logging.warning(f"{RED}Index {file_idx} not found in Excel sheet{RESET}")
                continue
            first_page_text = extract_first_page_text(file)
            if first_page_text is None:
@@ -420,4 +430,24 @@ def citationProcess(config: dict):
                # 保存修改后的Excel文件
                wb.save(target_path)
            else:
-                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")
\ No newline at end of file
+                logging.warning(f"{RED}Failed to extract key info from {file.name}{RESET}")
+        # 在处理完所有PDF文件后，检查未匹配的索引
+        unmatched_indices = all_excel_indices - matched_indices
+        if unmatched_indices:
+            # 对未匹配的索引排序，便于在日志中查看（此处不做类型转换）
+            sorted_indices = sorted( unmatched_indices)
+            logging.warning(f"{RED}以下索引在Excel中存在但没有对应的PDF文件: {sorted_indices}{RESET}")
+            # 清空未匹配索引对应的行
+            for excel_idx in unmatched_indices:
+                for i, idx in enumerate(index_list):
+                    if idx == excel_idx:
+                        row_idx = i + config["content_start"] + 1
+                        # 清空从第4列开始的内容（保留索引和原始名称）
+                        for col in range(4, sheet.max_column + 1):
+                            sheet.cell(row=row_idx, column=col, value="")
+                        break
+        # 保存修改后的Excel文件
+        wb.save(target_path)
\ No newline at end of file